From e7f1f6875543a48d8e1697e742d5ffd9e426f31c Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Thu, 24 Jul 2025 15:15:04 +0200 Subject: [PATCH 01/56] Stricter template on RleDecoder --- cpp/src/arrow/util/rle_encoding_internal.h | 100 +++++++++++---------- cpp/src/arrow/util/rle_encoding_test.cc | 16 ++-- cpp/src/parquet/column_reader.cc | 6 +- cpp/src/parquet/column_reader.h | 3 +- cpp/src/parquet/decoder.cc | 15 ++-- 5 files changed, 72 insertions(+), 68 deletions(-) diff --git a/cpp/src/arrow/util/rle_encoding_internal.h b/cpp/src/arrow/util/rle_encoding_internal.h index fab8c505120d..4cc961039856 100644 --- a/cpp/src/arrow/util/rle_encoding_internal.h +++ b/cpp/src/arrow/util/rle_encoding_internal.h @@ -85,8 +85,12 @@ namespace util { // /// Decoder class for RLE encoded data. +template class RleDecoder { public: + /// The type in which the data should be decoded. + using value_type = T; + /// Create a decoder object. buffer/buffer_len is the decoded data. /// bit_width is the width of each value (before encoding). RleDecoder(const uint8_t* buffer, int buffer_len, int bit_width) @@ -118,29 +122,26 @@ class RleDecoder { /// input with zeros. Since the encoding does not differentiate between /// input values and padding, Get() returns true even for these padding /// values. - template - bool Get(T* val); + bool Get(value_type* val); /// Gets a batch of values. Returns the number of decoded elements. 
- template - int GetBatch(T* values, int batch_size); + int GetBatch(value_type* values, int batch_size); /// Like GetBatch but add spacing for null entries - template int GetBatchSpaced(int batch_size, int null_count, const uint8_t* valid_bits, - int64_t valid_bits_offset, T* out); + int64_t valid_bits_offset, value_type* out); /// Like GetBatch but the values are then decoded using the provided dictionary - template - int GetBatchWithDict(const T* dictionary, int32_t dictionary_length, T* values, + template + int GetBatchWithDict(const V* dictionary, int32_t dictionary_length, V* values, int batch_size); /// Like GetBatchWithDict but add spacing for null entries /// /// Null entries will be zero-initialized in `values` to avoid leaking /// private data. - template - int GetBatchWithDictSpaced(const T* dictionary, int32_t dictionary_length, T* values, + template + int GetBatchWithDictSpaced(const V* dictionary, int32_t dictionary_length, V* values, int batch_size, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset); @@ -155,13 +156,12 @@ class RleDecoder { private: /// Fills literal_count_ and repeat_count_ with next values. Returns false if there /// are no more. - template bool NextCounts(); /// Utility methods for retrieving spaced values. - template + template int GetSpaced(Converter converter, int batch_size, int null_count, - const uint8_t* valid_bits, int64_t valid_bits_offset, T* out); + const uint8_t* valid_bits, int64_t valid_bits_offset, V* out); }; /// Class to incrementally build the rle data. This class does not allocate any memory. 
@@ -300,12 +300,12 @@ class RleEncoder { }; template -inline bool RleDecoder::Get(T* val) { +inline bool RleDecoder::Get(value_type* val) { return GetBatch(val, 1) == 1; } template -inline int RleDecoder::GetBatch(T* values, int batch_size) { +inline int RleDecoder::GetBatch(value_type* values, int batch_size) { ARROW_DCHECK_GE(bit_width_, 0); int values_read = 0; @@ -316,7 +316,7 @@ inline int RleDecoder::GetBatch(T* values, int batch_size) { if (repeat_count_ > 0) { // Repeated value case. int repeat_batch = std::min(remaining, repeat_count_); - std::fill(out, out + repeat_batch, static_cast(current_value_)); + std::fill(out, out + repeat_batch, static_cast(current_value_)); repeat_count_ -= repeat_batch; values_read += repeat_batch; @@ -332,17 +332,18 @@ inline int RleDecoder::GetBatch(T* values, int batch_size) { values_read += literal_batch; out += literal_batch; } else { - if (!NextCounts()) return values_read; + if (!NextCounts()) return values_read; } } return values_read; } -template -inline int RleDecoder::GetSpaced(Converter converter, int batch_size, int null_count, - const uint8_t* valid_bits, int64_t valid_bits_offset, - T* out) { +template +template +inline int RleDecoder::GetSpaced(Converter converter, int batch_size, int null_count, + const uint8_t* valid_bits, int64_t valid_bits_offset, + V* out) { if (ARROW_PREDICT_FALSE(null_count == batch_size)) { converter.FillZero(out, out + batch_size); return batch_size; @@ -366,7 +367,7 @@ inline int RleDecoder::GetSpaced(Converter converter, int batch_size, int null_c if (valid_run.set) { if ((repeat_count_ == 0) && (literal_count_ == 0)) { - if (!NextCounts()) return values_read; + if (!NextCounts()) return values_read; ARROW_DCHECK((repeat_count_ > 0) ^ (literal_count_ > 0)); } @@ -394,7 +395,7 @@ inline int RleDecoder::GetSpaced(Converter converter, int batch_size, int null_c valid_run = bit_reader.NextRun(); } } - RunType current_value = static_cast(current_value_); + value_type current_value = 
static_cast(current_value_); if (ARROW_PREDICT_FALSE(!converter.IsValid(current_value))) { return values_read; } @@ -407,7 +408,7 @@ inline int RleDecoder::GetSpaced(Converter converter, int batch_size, int null_c // Decode the literals constexpr int kBufferSize = 1024; - RunType indices[kBufferSize]; + value_type indices[kBufferSize]; literal_batch = std::min(literal_batch, kBufferSize); int actual_read = bit_reader_.GetBatch(bit_width_, indices, literal_batch); if (ARROW_PREDICT_FALSE(actual_read != literal_batch)) { @@ -469,14 +470,14 @@ struct PlainRleConverter { }; template -inline int RleDecoder::GetBatchSpaced(int batch_size, int null_count, - const uint8_t* valid_bits, - int64_t valid_bits_offset, T* out) { +inline int RleDecoder::GetBatchSpaced(int batch_size, int null_count, + const uint8_t* valid_bits, + int64_t valid_bits_offset, value_type* out) { if (null_count == 0) { - return GetBatch(out, batch_size); + return GetBatch(out, batch_size); } - PlainRleConverter converter; + PlainRleConverter converter; arrow::internal::BitBlockCounter block_counter(valid_bits, valid_bits_offset, batch_size); @@ -490,12 +491,12 @@ inline int RleDecoder::GetBatchSpaced(int batch_size, int null_count, break; } if (block.AllSet()) { - processed = GetBatch(out, block.length); + processed = GetBatch(out, block.length); } else if (block.NoneSet()) { converter.FillZero(out, out + block.length); processed = block.length; } else { - processed = GetSpaced>( + processed = GetSpaced>( converter, block.length, block.length - block.popcount, valid_bits, valid_bits_offset, out); } @@ -545,12 +546,13 @@ struct DictionaryConverter { }; template -inline int RleDecoder::GetBatchWithDict(const T* dictionary, int32_t dictionary_length, - T* values, int batch_size) { +template +inline int RleDecoder::GetBatchWithDict(const V* dictionary, int32_t dictionary_length, + V* values, int batch_size) { // Per https://github.com/apache/parquet-format/blob/master/Encodings.md, // the maximum 
dictionary index width in Parquet is 32 bits. - using IndexType = int32_t; - DictionaryConverter converter; + using IndexType = value_type; + DictionaryConverter converter; converter.dictionary = dictionary; converter.dictionary_length = dictionary_length; @@ -567,7 +569,7 @@ inline int RleDecoder::GetBatchWithDict(const T* dictionary, int32_t dictionary_ if (ARROW_PREDICT_FALSE(!IndexInRange(idx, dictionary_length))) { return values_read; } - T val = dictionary[idx]; + V val = dictionary[idx]; int repeat_batch = std::min(remaining, repeat_count_); std::fill(out, out + repeat_batch, val); @@ -597,7 +599,7 @@ inline int RleDecoder::GetBatchWithDict(const T* dictionary, int32_t dictionary_ values_read += literal_batch; out += literal_batch; } else { - if (!NextCounts()) return values_read; + if (!NextCounts()) return values_read; } } @@ -605,18 +607,18 @@ inline int RleDecoder::GetBatchWithDict(const T* dictionary, int32_t dictionary_ } template -inline int RleDecoder::GetBatchWithDictSpaced(const T* dictionary, - int32_t dictionary_length, T* out, - int batch_size, int null_count, - const uint8_t* valid_bits, - int64_t valid_bits_offset) { +template +inline int RleDecoder::GetBatchWithDictSpaced(const V* dictionary, + int32_t dictionary_length, V* out, + int batch_size, int null_count, + const uint8_t* valid_bits, + int64_t valid_bits_offset) { if (null_count == 0) { - return GetBatchWithDict(dictionary, dictionary_length, out, batch_size); + return GetBatchWithDict(dictionary, dictionary_length, out, batch_size); } arrow::internal::BitBlockCounter block_counter(valid_bits, valid_bits_offset, batch_size); - using IndexType = int32_t; - DictionaryConverter converter; + DictionaryConverter converter; converter.dictionary = dictionary; converter.dictionary_length = dictionary_length; @@ -629,12 +631,12 @@ inline int RleDecoder::GetBatchWithDictSpaced(const T* dictionary, break; } if (block.AllSet()) { - processed = GetBatchWithDict(dictionary, dictionary_length, out, 
block.length); + processed = GetBatchWithDict(dictionary, dictionary_length, out, block.length); } else if (block.NoneSet()) { converter.FillZero(out, out + block.length); processed = block.length; } else { - processed = GetSpaced>( + processed = GetSpaced>( converter, block.length, block.length - block.popcount, valid_bits, valid_bits_offset, out); } @@ -646,7 +648,7 @@ inline int RleDecoder::GetBatchWithDictSpaced(const T* dictionary, } template -bool RleDecoder::NextCounts() { +bool RleDecoder::NextCounts() { // Read the next run's indicator int, it could be a literal or repeated run. // The int is encoded as a vlq-encoded value. uint32_t indicator_value = 0; @@ -666,7 +668,7 @@ bool RleDecoder::NextCounts() { } repeat_count_ = count; T value = {}; - if (!bit_reader_.GetAligned( + if (!bit_reader_.GetAligned( static_cast(::arrow::bit_util::CeilDiv(bit_width_, 8)), &value)) { return false; } diff --git a/cpp/src/arrow/util/rle_encoding_test.cc b/cpp/src/arrow/util/rle_encoding_test.cc index 0cc0a276a25f..c185edc4ea02 100644 --- a/cpp/src/arrow/util/rle_encoding_test.cc +++ b/cpp/src/arrow/util/rle_encoding_test.cc @@ -240,7 +240,7 @@ void ValidateRle(const std::vector& values, int bit_width, // Verify read { - RleDecoder decoder(buffer, len, bit_width); + RleDecoder decoder(buffer, len, bit_width); for (size_t i = 0; i < values.size(); ++i) { uint64_t val; bool result = decoder.Get(&val); @@ -251,7 +251,7 @@ void ValidateRle(const std::vector& values, int bit_width, // Verify batch read { - RleDecoder decoder(buffer, len, bit_width); + RleDecoder decoder(buffer, len, bit_width); std::vector values_read(values.size()); ASSERT_EQ(values.size(), decoder.GetBatch(values_read.data(), static_cast(values.size()))); @@ -282,7 +282,7 @@ bool CheckRoundTrip(const std::vector& values, int bit_width) { int out = 0; { - RleDecoder decoder(buffer, encoded_len, bit_width); + RleDecoder decoder(buffer, encoded_len, bit_width); for (size_t i = 0; i < values.size(); ++i) { 
EXPECT_TRUE(decoder.Get(&out)); if (values[i] != out) { @@ -293,7 +293,7 @@ bool CheckRoundTrip(const std::vector& values, int bit_width) { // Verify batch read { - RleDecoder decoder(buffer, encoded_len, bit_width); + RleDecoder decoder(buffer, encoded_len, bit_width); std::vector values_read(values.size()); if (static_cast(values.size()) != decoder.GetBatch(values_read.data(), static_cast(values.size()))) { @@ -419,7 +419,7 @@ TEST(Rle, BitWidthZeroRepeated) { uint8_t buffer[1]; const int num_values = 15; buffer[0] = num_values << 1; // repeated indicator byte - RleDecoder decoder(buffer, sizeof(buffer), 0); + RleDecoder decoder(buffer, sizeof(buffer), 0); uint8_t val; for (int i = 0; i < num_values; ++i) { bool result = decoder.Get(&val); @@ -433,7 +433,7 @@ TEST(Rle, BitWidthZeroLiteral) { uint8_t buffer[1]; const int num_groups = 4; buffer[0] = num_groups << 1 | 1; // literal indicator byte - RleDecoder decoder = RleDecoder(buffer, sizeof(buffer), 0); + RleDecoder decoder = {buffer, sizeof(buffer), 0}; const int num_values = num_groups * 8; uint8_t val; for (int i = 0; i < num_values; ++i) { @@ -538,7 +538,7 @@ TEST(BitRle, Overflow) { EXPECT_LE(bytes_written, len); EXPECT_GT(num_added, 0); - RleDecoder decoder(buffer.data(), bytes_written, bit_width); + RleDecoder decoder(buffer.data(), bytes_written, bit_width); parity = true; uint32_t v; for (int i = 0; i < num_added; ++i) { @@ -575,7 +575,7 @@ void CheckRoundTripSpaced(const Array& data, int bit_width) { int encoded_size = encoder.Flush(); // Verify batch read - RleDecoder decoder(buffer.data(), encoded_size, bit_width); + RleDecoder decoder(buffer.data(), encoded_size, bit_width); std::vector values_read(num_values); if (num_values != decoder.GetBatchSpaced( diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc index f40e3cae54ef..9ad2abf1c6dc 100644 --- a/cpp/src/parquet/column_reader.cc +++ b/cpp/src/parquet/column_reader.cc @@ -113,8 +113,8 @@ int 
LevelDecoder::SetData(Encoding::type encoding, int16_t max_level, } const uint8_t* decoder_data = data + 4; if (!rle_decoder_) { - rle_decoder_ = std::make_unique<::arrow::util::RleDecoder>(decoder_data, - num_bytes, bit_width_); + rle_decoder_ = std::make_unique<::arrow::util::RleDecoder>( + decoder_data, num_bytes, bit_width_); } else { rle_decoder_->Reset(decoder_data, num_bytes, bit_width_); } @@ -158,7 +158,7 @@ void LevelDecoder::SetDataV2(int32_t num_bytes, int16_t max_level, if (!rle_decoder_) { rle_decoder_ = - std::make_unique<::arrow::util::RleDecoder>(data, num_bytes, bit_width_); + std::make_unique<::arrow::util::RleDecoder>(data, num_bytes, bit_width_); } else { rle_decoder_->Reset(data, num_bytes, bit_width_); } diff --git a/cpp/src/parquet/column_reader.h b/cpp/src/parquet/column_reader.h index 0bff52f79299..57130f30d185 100644 --- a/cpp/src/parquet/column_reader.h +++ b/cpp/src/parquet/column_reader.h @@ -39,6 +39,7 @@ class BitReader; } // namespace bit_util namespace util { +template class RleDecoder; } // namespace util @@ -95,7 +96,7 @@ class PARQUET_EXPORT LevelDecoder { int bit_width_; int num_values_remaining_; Encoding::type encoding_; - std::unique_ptr<::arrow::util::RleDecoder> rle_decoder_; + std::unique_ptr<::arrow::util::RleDecoder> rle_decoder_; std::unique_ptr<::arrow::bit_util::BitReader> bit_packed_decoder_; int16_t max_level_; }; diff --git a/cpp/src/parquet/decoder.cc b/cpp/src/parquet/decoder.cc index fc191e8ded35..a563f520e093 100644 --- a/cpp/src/parquet/decoder.cc +++ b/cpp/src/parquet/decoder.cc @@ -861,7 +861,7 @@ class DictDecoderImpl : public TypedDecoderImpl, public DictDecoder this->num_values_ = num_values; if (len == 0) { // Initialize dummy decoder to avoid crashes later on - idx_decoder_ = ::arrow::util::RleDecoder(data, len, /*bit_width=*/1); + idx_decoder_ = ::arrow::util::RleDecoder(data, len, /*bit_width=*/1); return; } uint8_t bit_width = *data; @@ -869,7 +869,7 @@ class DictDecoderImpl : public 
TypedDecoderImpl, public DictDecoder throw ParquetException("Invalid or corrupted bit_width " + std::to_string(bit_width) + ". Maximum allowed is 32."); } - idx_decoder_ = ::arrow::util::RleDecoder(++data, --len, bit_width); + idx_decoder_ = ::arrow::util::RleDecoder(++data, --len, bit_width); } int Decode(T* buffer, int num_values) override { @@ -1003,7 +1003,7 @@ class DictDecoderImpl : public TypedDecoderImpl, public DictDecoder // BinaryDictionary32Builder std::shared_ptr indices_scratch_space_; - ::arrow::util::RleDecoder idx_decoder_; + ::arrow::util::RleDecoder idx_decoder_; }; template @@ -1810,8 +1810,9 @@ class RleBooleanDecoder : public TypedDecoderImpl, public BooleanDe auto decoder_data = data + 4; if (decoder_ == nullptr) { - decoder_ = std::make_shared<::arrow::util::RleDecoder>(decoder_data, num_bytes, - /*bit_width=*/1); + decoder_ = + std::make_shared<::arrow::util::RleDecoder>(decoder_data, num_bytes, + /*bit_width=*/1); } else { decoder_->Reset(decoder_data, num_bytes, /*bit_width=*/1); } @@ -1898,7 +1899,7 @@ class RleBooleanDecoder : public TypedDecoderImpl, public BooleanDe } private: - std::shared_ptr<::arrow::util::RleDecoder> decoder_; + std::shared_ptr<::arrow::util::RleDecoder> decoder_; }; // ---------------------------------------------------------------------- @@ -2123,7 +2124,7 @@ class DeltaByteArrayDecoderImpl : public TypedDecoderImpl { int num_valid_values_{0}; uint32_t prefix_len_offset_{0}; std::shared_ptr buffered_prefix_length_; - // buffer for decoded strings, which gurantees the lifetime of the decoded strings + // buffer for decoded strings, which guarantees the lifetime of the decoded strings // until the next call of Decode. 
std::shared_ptr buffered_data_; }; From 6f5670c6ef402b2b8e048b6de5b8a1be6c1f13ec Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Thu, 24 Jul 2025 18:40:17 +0200 Subject: [PATCH 02/56] Refactor number of vlq bytes needed --- cpp/src/arrow/util/bit_stream_utils_internal.h | 6 +++--- cpp/src/arrow/util/bit_util.h | 5 +++++ cpp/src/arrow/util/bit_util_test.cc | 9 ++++++++- cpp/src/arrow/util/rle_encoding_internal.h | 2 +- 4 files changed, 17 insertions(+), 5 deletions(-) diff --git a/cpp/src/arrow/util/bit_stream_utils_internal.h b/cpp/src/arrow/util/bit_stream_utils_internal.h index 2b5ec3830eee..4fad5da4199e 100644 --- a/cpp/src/arrow/util/bit_stream_utils_internal.h +++ b/cpp/src/arrow/util/bit_stream_utils_internal.h @@ -190,10 +190,10 @@ class BitReader { } /// Maximum byte length of a vlq encoded int - static constexpr int kMaxVlqByteLength = 5; + static constexpr int kMaxVlqByteLengthForInt32 = MaxLEB128ByteLenFor; /// Maximum byte length of a vlq encoded int64 - static constexpr int kMaxVlqByteLengthForInt64 = 10; + static constexpr int kMaxVlqByteLengthForInt64 = MaxLEB128ByteLenFor; private: const uint8_t* buffer_; @@ -452,7 +452,7 @@ inline bool BitWriter::PutVlqInt(uint32_t v) { inline bool BitReader::GetVlqInt(uint32_t* v) { uint32_t tmp = 0; - for (int i = 0; i < kMaxVlqByteLength; i++) { + for (int i = 0; i < kMaxVlqByteLengthForInt32; i++) { uint8_t byte = 0; if (ARROW_PREDICT_FALSE(!GetAligned(1, &byte))) { return false; diff --git a/cpp/src/arrow/util/bit_util.h b/cpp/src/arrow/util/bit_util.h index 13d265f0be55..2836debfba9c 100644 --- a/cpp/src/arrow/util/bit_util.h +++ b/cpp/src/arrow/util/bit_util.h @@ -365,5 +365,10 @@ void PackBits(const uint32_t* values, uint8_t* out) { } } +constexpr int64_t MaxLEB128ByteLen(int64_t n_bits) { return CeilDiv(n_bits, 7); } + +template +constexpr int64_t MaxLEB128ByteLenFor = MaxLEB128ByteLen(sizeof(Int) * 8); + } // namespace bit_util } // namespace arrow diff --git a/cpp/src/arrow/util/bit_util_test.cc 
b/cpp/src/arrow/util/bit_util_test.cc index fcaeb49261d1..b2dc7b6b9bf8 100644 --- a/cpp/src/arrow/util/bit_util_test.cc +++ b/cpp/src/arrow/util/bit_util_test.cc @@ -1997,8 +1997,15 @@ TEST(BitUtil, RoundUpToPowerOf2) { #undef U64 #undef S64 +/// Test the maximum number of bytes needed to write a LEB128 of a given size. +TEST(BitStreamUtil, MaxLEB128ByteLenFor) { + EXPECT_EQ(bit_util::MaxLEB128ByteLenFor, 3); + EXPECT_EQ(bit_util::MaxLEB128ByteLenFor, 5); + EXPECT_EQ(bit_util::MaxLEB128ByteLenFor, 10); +} + static void TestZigZag(int32_t v, std::array buffer_expect) { - uint8_t buffer[bit_util::BitReader::kMaxVlqByteLength] = {}; + uint8_t buffer[bit_util::BitReader::kMaxVlqByteLengthForInt32] = {}; bit_util::BitWriter writer(buffer, sizeof(buffer)); bit_util::BitReader reader(buffer, sizeof(buffer)); writer.PutZigZagVlqInt(v); diff --git a/cpp/src/arrow/util/rle_encoding_internal.h b/cpp/src/arrow/util/rle_encoding_internal.h index 4cc961039856..9072e10a2a4f 100644 --- a/cpp/src/arrow/util/rle_encoding_internal.h +++ b/cpp/src/arrow/util/rle_encoding_internal.h @@ -196,7 +196,7 @@ class RleEncoder { MAX_VALUES_PER_LITERAL_RUN * bit_width)); /// Up to kMaxVlqByteLength indicator and a single 'bit_width' value.
int max_repeated_run_size = - ::arrow::bit_util::BitReader::kMaxVlqByteLength + + ::arrow::bit_util::BitReader::kMaxVlqByteLengthForInt32 + static_cast(::arrow::bit_util::BytesForBits(bit_width)); return std::max(max_literal_run_size, max_repeated_run_size); } From dfa3b7106b16072854ab56f46ee2091f7b6645fc Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Fri, 25 Jul 2025 10:37:49 +0200 Subject: [PATCH 03/56] Refactor LEB128 reading --- .../arrow/util/bit_stream_utils_internal.h | 39 ++++++++++------ cpp/src/arrow/util/bit_util.h | 40 +++++++++++++++++ cpp/src/arrow/util/bit_util_test.cc | 44 ++++++++++++++++++- 3 files changed, 109 insertions(+), 14 deletions(-) diff --git a/cpp/src/arrow/util/bit_stream_utils_internal.h b/cpp/src/arrow/util/bit_stream_utils_internal.h index 4fad5da4199e..fb8b8654a60f 100644 --- a/cpp/src/arrow/util/bit_stream_utils_internal.h +++ b/cpp/src/arrow/util/bit_stream_utils_internal.h @@ -450,22 +450,35 @@ inline bool BitWriter::PutVlqInt(uint32_t v) { } inline bool BitReader::GetVlqInt(uint32_t* v) { - uint32_t tmp = 0; - - for (int i = 0; i < kMaxVlqByteLengthForInt32; i++) { - uint8_t byte = 0; - if (ARROW_PREDICT_FALSE(!GetAligned(1, &byte))) { - return false; - } - tmp |= static_cast(byte & 0x7F) << (7 * i); + // The data that we will pass to the LEB128 parser + // In all cases, we read a byte-aligned value, skipping remaining bits + uint8_t const* data = NULLPTR; + int max_size = 0; + + // Number of bytes left in the buffered values, not including the current + // byte (i.e., there may be an additional fraction of a byte).
+ int const bytes_left_in_cache = + sizeof(buffered_values_) - static_cast(bit_util::BytesForBits(bit_offset_)); + + // If there are clearly enough bytes left we can try to parse from the cache + if (bytes_left_in_cache >= kMaxVlqByteLengthForInt32) { + max_size = bytes_left_in_cache; + data = reinterpret_cast(&buffered_values_) + + bit_util::BytesForBits(bit_offset_); + // Otherwise, we try straight from buffer (ignoring few bytes that may be cached) + } else { + max_size = bytes_left(); + data = buffer_ + (max_bytes_ - max_size); + } - if ((byte & 0x80) == 0) { - *v = tmp; - return true; - } + auto const read = bit_util::ParseLeadingLEB128(data, max_size, v); + if (ARROW_PREDICT_FALSE(read == 0)) { + // Corrupt LEB128 + return false; } - return false; + // Advance for the bytes we have read + the bit we skipped + return Advance((8 * read) + (bit_offset_ % 8)); } inline bool BitWriter::PutZigZagVlqInt(int32_t v) { diff --git a/cpp/src/arrow/util/bit_util.h b/cpp/src/arrow/util/bit_util.h index 2836debfba9c..91bb49c356a2 100644 --- a/cpp/src/arrow/util/bit_util.h +++ b/cpp/src/arrow/util/bit_util.h @@ -370,5 +370,45 @@ constexpr int64_t MaxLEB128ByteLen(int64_t n_bits) { return CeilDiv(n_bits, 7); template constexpr int64_t MaxLEB128ByteLenFor = MaxLEB128ByteLen(sizeof(Int) * 8); +/// Parse a leading LEB128 +/// +/// Take as input a data pointer and the maximum number of bytes that can be read from it +/// (typically the array size). +/// When a valid LEB128 is found at the start of the data, the function writes it to the +/// out pointer and return the number of bytes read. +/// Otherwise, the out pointer is unmodified and zero is returned. 
+/// +/// \see https://en.wikipedia.org/wiki/LEB128 +/// \see MaxLEB128ByteLenFor +template +constexpr int32_t ParseLeadingLEB128(uint8_t const* data, int32_t max_data_size, + Int* out) { + constexpr uint8_t kLow7Mask = 0x7F; + constexpr uint8_t kContinuationBit = 0x80; + + // Iteratively building the value + Int value = 0; + + // Read as many bytes as there could be for the given output + for (int32_t i = 0; i < MaxLEB128ByteLenFor; i++) { + // We have not finished reading a valid LEB128, yet we run out of data + if (i >= max_data_size) { + return 0; + } + + // Read the byte and set its 7 LSBs in the final value + uint8_t const byte = data[i]; + value |= static_cast(byte & kLow7Mask) << (7 * i); + + // Check for lack of continuation flag in MSB + if ((byte & kContinuationBit) == 0) { + *out = value; + return i + 1; + } + } + + // There is still data + return 0; +} } // namespace bit_util } // namespace arrow diff --git a/cpp/src/arrow/util/bit_util_test.cc b/cpp/src/arrow/util/bit_util_test.cc index b2dc7b6b9bf8..1a80c8ed35e3 100644 --- a/cpp/src/arrow/util/bit_util_test.cc +++ b/cpp/src/arrow/util/bit_util_test.cc @@ -2004,11 +2004,53 @@ TEST(BitStreamUtil, MaxLEB128ByteLenFor) { EXPECT_EQ(bit_util::MaxLEB128ByteLenFor, 10); } +/// Utility function to test LEB128 decoding with known byte array and expected result +template +void TestLEB128Decode(std::array const& data, Int expected_value, + int32_t expected_bytes_read) { + Int result = 0; + auto bytes_read = bit_util::ParseLeadingLEB128( + data.data(), static_cast(data.size()), &result); + EXPECT_EQ(bytes_read, expected_bytes_read); + if (expected_bytes_read > 0) { + EXPECT_EQ(result, expected_value); + } +} + +/// Test decoding from known LEB128 byte sequences +TEST(BitStreamUtil, LEB128) { + // Single byte value 0 + TestLEB128Decode(std::array{0x00}, 0U, 1); + // Single byte value 127 + TestLEB128Decode(std::array{0x7F}, 127U, 1); + // Two byte value 128 + TestLEB128Decode(std::array{0x80, 0x01}, 128U, 2); + //
Two byte value 300 + TestLEB128Decode(std::array{0xAC, 0x02}, 300U, 2); + // Three byte value 16384 + TestLEB128Decode(std::array{0x80, 0x80, 0x01}, 16384U, 3); + // Three byte value 16384, with remaining data + TestLEB128Decode(std::array{0x80, 0x80, 0x01, 0x80, 0x00}, 16384U, 3); + // Four byte value 268435455 + TestLEB128Decode(std::array{0xFF, 0xFF, 0xFF, 0x7F}, 268435455U, 4); + // Five byte uint32_t max value + TestLEB128Decode(std::array{0xFF, 0xFF, 0xFF, 0xFF, 0x0F}, 4294967295U, 5); + // uint64_t value requiring 10 bytes + TestLEB128Decode( + std::array{0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x01}, + 18446744073709551615ULL, 10); + // Error case: Truncated sequence (continuation bit set but no more data) + TestLEB128Decode(std::array{0x80}, 0U, 0); + // Error case: Oversized sequence for uint32_t (too many bytes) + TestLEB128Decode(std::array{0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x01}, 0U, 0); +} + static void TestZigZag(int32_t v, std::array buffer_expect) { uint8_t buffer[bit_util::BitReader::kMaxVlqByteLengthForInt32] = {}; bit_util::BitWriter writer(buffer, sizeof(buffer)); - bit_util::BitReader reader(buffer, sizeof(buffer)); writer.PutZigZagVlqInt(v); + // WARN reader buffer input on creation so it must be created after the data is written + bit_util::BitReader reader(buffer, sizeof(buffer)); EXPECT_THAT(buffer, testing::ElementsAreArray(buffer_expect)); int32_t result; EXPECT_TRUE(reader.GetZigZagVlqInt(&result)); From 77fc9e0f4ec9d1ce08aaeaa14bebf168958f55c2 Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Fri, 25 Jul 2025 12:10:48 +0200 Subject: [PATCH 04/56] Refactor LEB128 writing --- .../arrow/util/bit_stream_utils_internal.h | 16 +++--- cpp/src/arrow/util/bit_util.h | 43 +++++++++++++++ cpp/src/arrow/util/bit_util_test.cc | 52 +++++++++++++++++++ 3 files changed, 105 insertions(+), 6 deletions(-) diff --git a/cpp/src/arrow/util/bit_stream_utils_internal.h b/cpp/src/arrow/util/bit_stream_utils_internal.h index fb8b8654a60f..0e62e8a40d6d 
100644 --- a/cpp/src/arrow/util/bit_stream_utils_internal.h +++ b/cpp/src/arrow/util/bit_stream_utils_internal.h @@ -496,13 +496,17 @@ inline bool BitReader::GetZigZagVlqInt(int32_t* v) { } inline bool BitWriter::PutVlqInt(uint64_t v) { - bool result = true; - while ((v & 0xFFFFFFFFFFFFFF80ULL) != 0ULL) { - result &= PutAligned(static_cast((v & 0x7F) | 0x80), 1); - v >>= 7; + constexpr auto kMaxBytes = bit_util::MaxLEB128ByteLenFor; + + uint8_t leb128[kMaxBytes] = {}; + auto const bytes_written = bit_util::WriteLEB128(v, leb128, kMaxBytes); + ARROW_DCHECK_NE(bytes_written, 0); + + if (auto* out = GetNextBytePtr(bytes_written)) { + std::memcpy(out, leb128, bytes_written); + return true; } - result &= PutAligned(static_cast(v & 0x7F), 1); - return result; + return false; } inline bool BitReader::GetVlqInt(uint64_t* v) { diff --git a/cpp/src/arrow/util/bit_util.h b/cpp/src/arrow/util/bit_util.h index 91bb49c356a2..bf63f740f58e 100644 --- a/cpp/src/arrow/util/bit_util.h +++ b/cpp/src/arrow/util/bit_util.h @@ -370,6 +370,49 @@ constexpr int64_t MaxLEB128ByteLen(int64_t n_bits) { return CeilDiv(n_bits, 7); template constexpr int64_t MaxLEB128ByteLenFor = MaxLEB128ByteLen(sizeof(Int) * 8); +/// Write an integer as LEB128 +/// +/// Write the input value as LEB128 into the output buffer and return the number of bytes +/// written. +/// If the output buffer size is insufficient, return 0 but the output may have been +/// written to.
+/// +/// \see https://en.wikipedia.org/wiki/LEB128 +/// \see MaxLEB128ByteLenFor +template +constexpr int32_t WriteLEB128(Int value, uint8_t* out, int32_t max_out_size) { + constexpr Int kLow7Mask = Int(0x7F); + constexpr Int kHigh7Mask = ~kLow7Mask; + constexpr uint8_t kContinuationBit = 0x80; + + auto const out_first = out; + + // Write as many bytes as there could be for the given input + while ((value & kHigh7Mask) != Int(0)) { + // We do not have enough room to write the LEB128 + if (out - out_first >= max_out_size) { + return 0; + } + + // Write the encoded byte with continuation bit + *out = static_cast(value & kLow7Mask) | kContinuationBit; + ++out; + // Shift remaining data + value >>= 7; + } + + // We do not have enough room to write the LEB128 + if (out - out_first >= max_out_size) { + return 0; + } + + // Write last non-continuing byte + *out = static_cast(value & kLow7Mask); + ++out; + + return static_cast(out - out_first); +} + /// Parse a leading LEB128 /// /// Take as input a data pointer and the maximum number of bytes that can be read from it diff --git a/cpp/src/arrow/util/bit_util_test.cc b/cpp/src/arrow/util/bit_util_test.cc index 1a80c8ed35e3..9790669443ea 100644 --- a/cpp/src/arrow/util/bit_util_test.cc +++ b/cpp/src/arrow/util/bit_util_test.cc @@ -2004,6 +2004,58 @@ TEST(BitStreamUtil, MaxLEB128ByteLenFor) { EXPECT_EQ(bit_util::MaxLEB128ByteLenFor, 10); } +/// Utility function to test LEB128 encoding with known input value and expected byte +/// array +template +void TestLEB128Encode(Int input_value, std::array const& expected_data, + int32_t expected_bytes_written) { + std::array buffer{}; + auto bytes_written = bit_util::WriteLEB128(input_value, buffer.data(), + static_cast(buffer.size())); + EXPECT_EQ(bytes_written, expected_bytes_written); + if (bytes_written > 0) { + EXPECT_EQ(buffer, expected_data); + } +} + +/// Test encoding to known LEB128 byte sequences +TEST(WriteLEB128Test, KnownArrayValues) { + // Single byte value 0 +
TestLEB128Encode(0U, std::array{0x00}, 1); + // Single byte value 127 + TestLEB128Encode(127U, std::array{0x7F}, 1); + // Two byte value 128 + TestLEB128Encode(128U, std::array{0x80, 0x01}, 2); + // Two byte value 300 + TestLEB128Encode(300U, std::array{0xAC, 0x02}, 2); + // Three byte value 16384 + TestLEB128Encode(16384U, std::array{0x80, 0x80, 0x01}, 3); + // Four byte value 268435455 + TestLEB128Encode(268435455U, std::array{0xFF, 0xFF, 0xFF, 0x7F}, 4); + // Five byte uint32_t max value + TestLEB128Encode(4294967295U, std::array{0xFF, 0xFF, 0xFF, 0xFF, 0x0F}, 5); + // uint64_t value requiring 10 bytes + TestLEB128Encode( + 18446744073709551615ULL, + std::array{0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x01}, + 10); + // Edge case: Exact buffer size match + TestLEB128Encode(16384U, std::array{0x80, 0x80, 0x01}, 3); + // Various single byte values + TestLEB128Encode(1U, std::array{0x01}, 1); + TestLEB128Encode(63U, std::array{0x3F}, 1); + TestLEB128Encode(64U, std::array{0x40}, 1); + // Two byte boundary values + TestLEB128Encode(129U, std::array{0x81, 0x01}, 2); + TestLEB128Encode(16383U, std::array{0xFF, 0x7F}, 2); + // Error case: Buffer too small for value 128 (needs 2 bytes but only 1 provided) + TestLEB128Encode(128U, std::array{}, 0); + // Error case: Buffer too small for uint32_t max (needs 5 bytes but only 4 provided) + TestLEB128Encode(4294967295U, std::array{}, 0); + // Error case: Zero buffer size + TestLEB128Encode(52U, std::array{}, 0); +} + /// Utility function to test LEB128 decoding with known byte array and expected result template void TestLEB128Decode(std::array const& data, Int expected_value, From 22e628260915fb94d360bedd4148cd77635a5f2a Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Mon, 4 Aug 2025 10:25:12 +0200 Subject: [PATCH 05/56] Simple Run classes --- cpp/src/arrow/util/rle_encoding_internal.h | 87 +++++++++++++++++++++- 1 file changed, 85 insertions(+), 2 deletions(-) diff --git 
a/cpp/src/arrow/util/rle_encoding_internal.h b/cpp/src/arrow/util/rle_encoding_internal.h index 9072e10a2a4f..9a5220063dd5 100644 --- a/cpp/src/arrow/util/rle_encoding_internal.h +++ b/cpp/src/arrow/util/rle_encoding_internal.h @@ -21,9 +21,9 @@ #pragma once #include -#include +#include #include -#include +#include #include "arrow/util/bit_block_counter.h" #include "arrow/util/bit_run_reader.h" @@ -84,6 +84,89 @@ namespace util { /// (total 26 bytes, 1 byte overhead) // +class RleRun { + public: + using byte = uint8_t; + /// Enough space to store a 64bit value + using raw_data_storage = std::array; + using raw_data_const_pointer = const byte*; + using raw_data_size_type = int32_t; + /// The type of the size of either run, between 1 and 2^31-1 as per Parquet spec + using values_count_type = int32_t; + /// The type to represent a size in bits + using bit_size_type = int32_t; + + constexpr RleRun() noexcept = default; + constexpr RleRun(RleRun const&) noexcept = default; + constexpr RleRun(RleRun&&) noexcept = default; + + explicit RleRun(raw_data_const_pointer data, values_count_type values_count, + bit_size_type value_bit_width) noexcept; + + constexpr RleRun& operator=(RleRun const&) noexcept = default; + constexpr RleRun& operator=(RleRun&&) noexcept = default; + + /// The number of repeated values in this run. + [[nodiscard]] constexpr values_count_type ValuesCount() const noexcept; + + /// The size in bits of each encoded value. + [[nodiscard]] constexpr bit_size_type ValuesBitWidth() const noexcept; + + /// A pointer to the repeated value raw bytes. + [[nodiscard]] constexpr raw_data_const_pointer RawDataPtr() const noexcept; + + /// The number of bytes used for the raw repeated value. 
+ [[nodiscard]] constexpr raw_data_size_type RawDataSize() const noexcept; + + private: + /// The repeated value raw bytes stored inside the class + raw_data_storage data_ = {}; + /// The number of time the value is repeated + values_count_type values_count_ = 0; + /// The size in bit of a packed value in the run + bit_size_type value_bit_width_ = 0; +}; + +class BitPackedRun { + public: + using byte = uint8_t; + using raw_data_const_pointer = const byte*; + /// According to the Parquet thrift definition the page size can be written into an + /// int32_t. + using raw_data_size_type = int32_t; + /// The type of the size of either run, between 1 and 2^31-1 as per Parquet spec + using values_count_type = int32_t; + /// The type to represent a size in bits + using bit_size_type = int32_t; + + constexpr BitPackedRun() noexcept = default; + constexpr BitPackedRun(BitPackedRun const&) noexcept = default; + constexpr BitPackedRun(BitPackedRun&&) noexcept = default; + + constexpr BitPackedRun(raw_data_const_pointer data, values_count_type values_count, + bit_size_type value_bit_width) noexcept; + + constexpr BitPackedRun& operator=(BitPackedRun const&) noexcept = default; + constexpr BitPackedRun& operator=(BitPackedRun&&) noexcept = default; + + [[nodiscard]] constexpr values_count_type ValuesCount() const noexcept; + + /// The size in bits of each encoded value. + [[nodiscard]] constexpr bit_size_type ValuesBitWidth() const noexcept; + + [[nodiscard]] constexpr raw_data_const_pointer RawDataPtr() const noexcept; + + [[nodiscard]] constexpr raw_data_size_type RawDataSize() const noexcept; + + private: + /// The pointer to the beginning of the run + raw_data_const_pointer data_ = nullptr; + /// Number of values in this run. + raw_data_size_type values_count_ = 0; + /// The size in bit of a packed value in the run + bit_size_type value_bit_width_ = 0; +}; + /// Decoder class for RLE encoded data. 
template class RleDecoder { From 15cde241fb776b8d4b66d6f7d3fcb546c1ab9003 Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Mon, 4 Aug 2025 10:44:20 +0200 Subject: [PATCH 06/56] Rename RleDecoder > RleBitPackedDecoder --- cpp/src/arrow/util/rle_encoding_internal.h | 38 +++++++++++----------- cpp/src/arrow/util/rle_encoding_test.cc | 16 ++++----- cpp/src/parquet/column_reader.cc | 6 ++-- cpp/src/parquet/column_reader.h | 4 +-- cpp/src/parquet/decoder.cc | 15 +++++---- 5 files changed, 40 insertions(+), 39 deletions(-) diff --git a/cpp/src/arrow/util/rle_encoding_internal.h b/cpp/src/arrow/util/rle_encoding_internal.h index 9a5220063dd5..0b2b603ce6dd 100644 --- a/cpp/src/arrow/util/rle_encoding_internal.h +++ b/cpp/src/arrow/util/rle_encoding_internal.h @@ -169,14 +169,14 @@ class BitPackedRun { /// Decoder class for RLE encoded data. template -class RleDecoder { +class RleBitPackedDecoder { public: /// The type in which the data should be decoded. using value_type = T; /// Create a decoder object. buffer/buffer_len is the decoded data. /// bit_width is the width of each value (before encoding). 
- RleDecoder(const uint8_t* buffer, int buffer_len, int bit_width) + RleBitPackedDecoder(const uint8_t* buffer, int buffer_len, int bit_width) : bit_reader_(buffer, buffer_len), bit_width_(bit_width), current_value_(0), @@ -186,7 +186,7 @@ class RleDecoder { ARROW_DCHECK_LE(bit_width_, 64); } - RleDecoder() : bit_width_(-1) {} + RleBitPackedDecoder() : bit_width_(-1) {} void Reset(const uint8_t* buffer, int buffer_len, int bit_width) { ARROW_DCHECK_GE(bit_width, 0); @@ -383,12 +383,12 @@ class RleEncoder { }; template -inline bool RleDecoder::Get(value_type* val) { +inline bool RleBitPackedDecoder::Get(value_type* val) { return GetBatch(val, 1) == 1; } template -inline int RleDecoder::GetBatch(value_type* values, int batch_size) { +inline int RleBitPackedDecoder::GetBatch(value_type* values, int batch_size) { ARROW_DCHECK_GE(bit_width_, 0); int values_read = 0; @@ -424,9 +424,9 @@ inline int RleDecoder::GetBatch(value_type* values, int batch_size) { template template -inline int RleDecoder::GetSpaced(Converter converter, int batch_size, int null_count, - const uint8_t* valid_bits, int64_t valid_bits_offset, - V* out) { +inline int RleBitPackedDecoder::GetSpaced(Converter converter, int batch_size, + int null_count, const uint8_t* valid_bits, + int64_t valid_bits_offset, V* out) { if (ARROW_PREDICT_FALSE(null_count == batch_size)) { converter.FillZero(out, out + batch_size); return batch_size; @@ -553,9 +553,10 @@ struct PlainRleConverter { }; template -inline int RleDecoder::GetBatchSpaced(int batch_size, int null_count, - const uint8_t* valid_bits, - int64_t valid_bits_offset, value_type* out) { +inline int RleBitPackedDecoder::GetBatchSpaced(int batch_size, int null_count, + const uint8_t* valid_bits, + int64_t valid_bits_offset, + value_type* out) { if (null_count == 0) { return GetBatch(out, batch_size); } @@ -630,8 +631,9 @@ struct DictionaryConverter { template template -inline int RleDecoder::GetBatchWithDict(const V* dictionary, int32_t dictionary_length, - 
V* values, int batch_size) { +inline int RleBitPackedDecoder::GetBatchWithDict(const V* dictionary, + int32_t dictionary_length, V* values, + int batch_size) { // Per https://github.com/apache/parquet-format/blob/master/Encodings.md, // the maximum dictionary index width in Parquet is 32 bits. using IndexType = value_type; @@ -691,11 +693,9 @@ inline int RleDecoder::GetBatchWithDict(const V* dictionary, int32_t dictiona template template -inline int RleDecoder::GetBatchWithDictSpaced(const V* dictionary, - int32_t dictionary_length, V* out, - int batch_size, int null_count, - const uint8_t* valid_bits, - int64_t valid_bits_offset) { +inline int RleBitPackedDecoder::GetBatchWithDictSpaced( + const V* dictionary, int32_t dictionary_length, V* out, int batch_size, + int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset) { if (null_count == 0) { return GetBatchWithDict(dictionary, dictionary_length, out, batch_size); } @@ -731,7 +731,7 @@ inline int RleDecoder::GetBatchWithDictSpaced(const V* dictionary, } template -bool RleDecoder::NextCounts() { +bool RleBitPackedDecoder::NextCounts() { // Read the next run's indicator int, it could be a literal or repeated run. // The int is encoded as a vlq-encoded value. 
uint32_t indicator_value = 0; diff --git a/cpp/src/arrow/util/rle_encoding_test.cc b/cpp/src/arrow/util/rle_encoding_test.cc index c185edc4ea02..ce9e2bd4fbb1 100644 --- a/cpp/src/arrow/util/rle_encoding_test.cc +++ b/cpp/src/arrow/util/rle_encoding_test.cc @@ -240,7 +240,7 @@ void ValidateRle(const std::vector& values, int bit_width, // Verify read { - RleDecoder decoder(buffer, len, bit_width); + RleBitPackedDecoder decoder(buffer, len, bit_width); for (size_t i = 0; i < values.size(); ++i) { uint64_t val; bool result = decoder.Get(&val); @@ -251,7 +251,7 @@ void ValidateRle(const std::vector& values, int bit_width, // Verify batch read { - RleDecoder decoder(buffer, len, bit_width); + RleBitPackedDecoder decoder(buffer, len, bit_width); std::vector values_read(values.size()); ASSERT_EQ(values.size(), decoder.GetBatch(values_read.data(), static_cast(values.size()))); @@ -282,7 +282,7 @@ bool CheckRoundTrip(const std::vector& values, int bit_width) { int out = 0; { - RleDecoder decoder(buffer, encoded_len, bit_width); + RleBitPackedDecoder decoder(buffer, encoded_len, bit_width); for (size_t i = 0; i < values.size(); ++i) { EXPECT_TRUE(decoder.Get(&out)); if (values[i] != out) { @@ -293,7 +293,7 @@ bool CheckRoundTrip(const std::vector& values, int bit_width) { // Verify batch read { - RleDecoder decoder(buffer, encoded_len, bit_width); + RleBitPackedDecoder decoder(buffer, encoded_len, bit_width); std::vector values_read(values.size()); if (static_cast(values.size()) != decoder.GetBatch(values_read.data(), static_cast(values.size()))) { @@ -419,7 +419,7 @@ TEST(Rle, BitWidthZeroRepeated) { uint8_t buffer[1]; const int num_values = 15; buffer[0] = num_values << 1; // repeated indicator byte - RleDecoder decoder(buffer, sizeof(buffer), 0); + RleBitPackedDecoder decoder(buffer, sizeof(buffer), 0); uint8_t val; for (int i = 0; i < num_values; ++i) { bool result = decoder.Get(&val); @@ -433,7 +433,7 @@ TEST(Rle, BitWidthZeroLiteral) { uint8_t buffer[1]; const int 
num_groups = 4; buffer[0] = num_groups << 1 | 1; // literal indicator byte - RleDecoder decoder = {buffer, sizeof(buffer), 0}; + RleBitPackedDecoder decoder = {buffer, sizeof(buffer), 0}; const int num_values = num_groups * 8; uint8_t val; for (int i = 0; i < num_values; ++i) { @@ -538,7 +538,7 @@ TEST(BitRle, Overflow) { EXPECT_LE(bytes_written, len); EXPECT_GT(num_added, 0); - RleDecoder decoder(buffer.data(), bytes_written, bit_width); + RleBitPackedDecoder decoder(buffer.data(), bytes_written, bit_width); parity = true; uint32_t v; for (int i = 0; i < num_added; ++i) { @@ -575,7 +575,7 @@ void CheckRoundTripSpaced(const Array& data, int bit_width) { int encoded_size = encoder.Flush(); // Verify batch read - RleDecoder decoder(buffer.data(), encoded_size, bit_width); + RleBitPackedDecoder decoder(buffer.data(), encoded_size, bit_width); std::vector values_read(num_values); if (num_values != decoder.GetBatchSpaced( diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc index 9ad2abf1c6dc..9c314cf81817 100644 --- a/cpp/src/parquet/column_reader.cc +++ b/cpp/src/parquet/column_reader.cc @@ -113,7 +113,7 @@ int LevelDecoder::SetData(Encoding::type encoding, int16_t max_level, } const uint8_t* decoder_data = data + 4; if (!rle_decoder_) { - rle_decoder_ = std::make_unique<::arrow::util::RleDecoder>( + rle_decoder_ = std::make_unique<::arrow::util::RleBitPackedDecoder>( decoder_data, num_bytes, bit_width_); } else { rle_decoder_->Reset(decoder_data, num_bytes, bit_width_); @@ -157,8 +157,8 @@ void LevelDecoder::SetDataV2(int32_t num_bytes, int16_t max_level, bit_width_ = bit_util::Log2(max_level + 1); if (!rle_decoder_) { - rle_decoder_ = - std::make_unique<::arrow::util::RleDecoder>(data, num_bytes, bit_width_); + rle_decoder_ = std::make_unique<::arrow::util::RleBitPackedDecoder>( + data, num_bytes, bit_width_); } else { rle_decoder_->Reset(data, num_bytes, bit_width_); } diff --git a/cpp/src/parquet/column_reader.h 
b/cpp/src/parquet/column_reader.h index 57130f30d185..ac4469b1904f 100644 --- a/cpp/src/parquet/column_reader.h +++ b/cpp/src/parquet/column_reader.h @@ -40,7 +40,7 @@ class BitReader; namespace util { template -class RleDecoder; +class RleBitPackedDecoder; } // namespace util } // namespace arrow @@ -96,7 +96,7 @@ class PARQUET_EXPORT LevelDecoder { int bit_width_; int num_values_remaining_; Encoding::type encoding_; - std::unique_ptr<::arrow::util::RleDecoder> rle_decoder_; + std::unique_ptr<::arrow::util::RleBitPackedDecoder> rle_decoder_; std::unique_ptr<::arrow::bit_util::BitReader> bit_packed_decoder_; int16_t max_level_; }; diff --git a/cpp/src/parquet/decoder.cc b/cpp/src/parquet/decoder.cc index a563f520e093..46d1c201e927 100644 --- a/cpp/src/parquet/decoder.cc +++ b/cpp/src/parquet/decoder.cc @@ -861,7 +861,8 @@ class DictDecoderImpl : public TypedDecoderImpl, public DictDecoder this->num_values_ = num_values; if (len == 0) { // Initialize dummy decoder to avoid crashes later on - idx_decoder_ = ::arrow::util::RleDecoder(data, len, /*bit_width=*/1); + idx_decoder_ = + ::arrow::util::RleBitPackedDecoder(data, len, /*bit_width=*/1); return; } uint8_t bit_width = *data; @@ -869,7 +870,7 @@ class DictDecoderImpl : public TypedDecoderImpl, public DictDecoder throw ParquetException("Invalid or corrupted bit_width " + std::to_string(bit_width) + ". 
Maximum allowed is 32."); } - idx_decoder_ = ::arrow::util::RleDecoder(++data, --len, bit_width); + idx_decoder_ = ::arrow::util::RleBitPackedDecoder(++data, --len, bit_width); } int Decode(T* buffer, int num_values) override { @@ -1003,7 +1004,7 @@ class DictDecoderImpl : public TypedDecoderImpl, public DictDecoder // BinaryDictionary32Builder std::shared_ptr indices_scratch_space_; - ::arrow::util::RleDecoder idx_decoder_; + ::arrow::util::RleBitPackedDecoder idx_decoder_; }; template @@ -1810,9 +1811,9 @@ class RleBooleanDecoder : public TypedDecoderImpl, public BooleanDe auto decoder_data = data + 4; if (decoder_ == nullptr) { - decoder_ = - std::make_shared<::arrow::util::RleDecoder>(decoder_data, num_bytes, - /*bit_width=*/1); + decoder_ = std::make_shared<::arrow::util::RleBitPackedDecoder>( + decoder_data, num_bytes, + /*bit_width=*/1); } else { decoder_->Reset(decoder_data, num_bytes, /*bit_width=*/1); } @@ -1899,7 +1900,7 @@ class RleBooleanDecoder : public TypedDecoderImpl, public BooleanDe } private: - std::shared_ptr<::arrow::util::RleDecoder> decoder_; + std::shared_ptr<::arrow::util::RleBitPackedDecoder> decoder_; }; // ---------------------------------------------------------------------- From 1faac12c8586641a7488dd1e48c8fe12a166ae98 Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Tue, 5 Aug 2025 10:39:32 +0200 Subject: [PATCH 07/56] Add RleDecoder --- cpp/src/arrow/util/rle_encoding_internal.h | 103 +++++++++++++++++++++ 1 file changed, 103 insertions(+) diff --git a/cpp/src/arrow/util/rle_encoding_internal.h b/cpp/src/arrow/util/rle_encoding_internal.h index 0b2b603ce6dd..b1e109a0621f 100644 --- a/cpp/src/arrow/util/rle_encoding_internal.h +++ b/cpp/src/arrow/util/rle_encoding_internal.h @@ -167,6 +167,46 @@ class BitPackedRun { bit_size_type value_bit_width_ = 0; }; +/// Decoder class for RLE encoded data. +template +class RleDecoder { + public: + /// The type in which the data should be decoded. 
+ using value_type = T; + /// The type of run that can be decoded. + using run_type = RleRun; + using values_count_type = run_type::values_count_type; + + constexpr RleDecoder() noexcept = default; + + explicit RleDecoder(run_type const& run) noexcept; + + void Reset(run_type const& run) noexcept; + + /// Return the number of values that can be advanced. + [[nodiscard]] values_count_type Remaining() const; + + /// Return the repeated value of this decoder. + [[nodiscard]] constexpr value_type Value() const; + + /// Try to advance by as many values as provided. + /// Return the number of values skipped. + [[nodiscard]] values_count_type Advance(values_count_type batch_size); + + /// Get the next value and return false if there are no more. + [[nodiscard]] constexpr bool Get(value_type* out_value); + + /// Get a batch of values return the number of decoded elements. + [[nodiscard]] values_count_type GetBatch(value_type* out, values_count_type batch_size); + + private: + value_type value_ = {}; + values_count_type remaining_count_ = 0; + + static_assert(std::is_integral_v, + "This class makes assumptions about integer endianness and padding"); +}; + /// Decoder class for RLE encoded data. template class RleBitPackedDecoder { @@ -760,6 +800,69 @@ bool RleBitPackedDecoder::NextCounts() { return true; } +/**************** + * RleDecoder * + ****************/ + +template +RleDecoder::RleDecoder(run_type const& run) noexcept { + Reset(run); +} + +template +void RleDecoder::Reset(run_type const& run) noexcept { + remaining_count_ = run.ValuesCount(); + if constexpr (std::is_same_v) { + // ARROW-18031: just check the LSB of the next byte and move on. + // If we memcpy + FromLittleEndian, we have potential undefined behavior + // if the bool value isn't 0 or 1. + value_ = *run.RawDataPtr() & 1; + } + // Memcopy is required to avoid undefined behavior. 
+ std::memset(&value_, 0, sizeof(value_type)); + std::memcpy(&value_, run.RawDataPtr(), run.RawDataSize()); + value_ = ::arrow::bit_util::FromLittleEndian(value_); +} + +template +auto RleDecoder::Remaining() const -> values_count_type { + return remaining_count_; +} + +template +auto constexpr RleDecoder::Value() const -> value_type { + return value_; +} + +template +auto RleDecoder::Advance(values_count_type batch_size) -> values_count_type { + auto const steps = std::min(batch_size, remaining_count_); + remaining_count_ -= steps; + return steps; +} + +template +constexpr bool RleDecoder::Get(value_type* out_value) { + return GetBatch(out_value, 1) == 1; +} + +template +auto RleDecoder::GetBatch(value_type* out, values_count_type batch_size) + -> values_count_type { + if (remaining_count_ == 0) { + return 0; + } + + auto const to_read = std::min(remaining_count_, batch_size); + std::fill(out, out + to_read, value_); + remaining_count_ -= to_read; + return to_read; +} + +/**************** + * RleEncoder * + ****************/ + /// This function buffers input values 8 at a time. After seeing all 8 values, /// it decides whether they should be encoded as a literal or repeated run. inline bool RleEncoder::Put(uint64_t value) { From 33bb556a6c16fa81a1ce54c84edc9288fd360c92 Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Tue, 5 Aug 2025 11:55:23 +0200 Subject: [PATCH 08/56] Add BitPackedDecoder --- .../arrow/util/bit_stream_utils_internal.h | 4 +- cpp/src/arrow/util/rle_encoding_internal.h | 100 ++++++++++++++++++ 2 files changed, 102 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/util/bit_stream_utils_internal.h b/cpp/src/arrow/util/bit_stream_utils_internal.h index 0e62e8a40d6d..3f8577944d7f 100644 --- a/cpp/src/arrow/util/bit_stream_utils_internal.h +++ b/cpp/src/arrow/util/bit_stream_utils_internal.h @@ -128,14 +128,14 @@ inline uint64_t ReadLittleEndianWord(const uint8_t* buffer, int bytes_remaining) /// bytes in one read (e.g. encoded int). 
class BitReader { public: - BitReader() = default; + BitReader() noexcept = default; /// 'buffer' is the buffer to read from. The buffer's length is 'buffer_len'. BitReader(const uint8_t* buffer, int buffer_len) : BitReader() { Reset(buffer, buffer_len); } - void Reset(const uint8_t* buffer, int buffer_len) { + void Reset(const uint8_t* buffer, int buffer_len) noexcept { buffer_ = buffer; max_bytes_ = buffer_len; byte_offset_ = 0; diff --git a/cpp/src/arrow/util/rle_encoding_internal.h b/cpp/src/arrow/util/rle_encoding_internal.h index b1e109a0621f..d1b17cd6a036 100644 --- a/cpp/src/arrow/util/rle_encoding_internal.h +++ b/cpp/src/arrow/util/rle_encoding_internal.h @@ -207,6 +207,48 @@ class RleDecoder { "This class makes assumptions about integer endianness and padding"); }; +/// Decoder class for Bit packing encoded data. +template +class BitPackedDecoder { + public: + /// The type in which the data should be decoded. + using value_type = T; + /// The type of run that can be decoded. + using run_type = BitPackedRun; + using values_count_type = run_type::values_count_type; + using bit_size_type = run_type::bit_size_type; + + BitPackedDecoder() noexcept = default; + + explicit BitPackedDecoder(run_type const& run) noexcept; + + void Reset(run_type const& run) noexcept; + + /// Return the number of values that can be advanced. + [[nodiscard]] constexpr values_count_type Remaining() const; + + /// Return the size in bit in which each encoded value is written. + [[nodiscard]] constexpr bit_size_type ValueBitWidth() const; + + /// Try to advance by as many values as provided. + /// Return the number of values skipped. + [[nodiscard]] values_count_type Advance(values_count_type batch_size); + + /// Get the next value and return false if there are no more. + [[nodiscard]] bool Get(value_type* out_value); + + /// Get a batch of values return the number of decoded elements. 
+ [[nodiscard]] values_count_type GetBatch(value_type* out, values_count_type batch_size); + + private: + ::arrow::bit_util::BitReader bit_reader_ = {}; + bit_size_type value_bit_width_ = 0; + values_count_type remaining_count_ = 0; + + static_assert(std::is_integral_v, + "This class makes assumptions about integer endianness and padding"); +}; + /// Decoder class for RLE encoded data. template class RleBitPackedDecoder { @@ -859,6 +901,64 @@ auto RleDecoder::GetBatch(value_type* out, values_count_type batch_size) return to_read; } +/********************** + * BitPackedDecoder * + **********************/ + +template +BitPackedDecoder::BitPackedDecoder(run_type const& run) noexcept { + Reset(run); +} + +template +void BitPackedDecoder::Reset(run_type const& run) noexcept { + value_bit_width_ = run.ValuesBitWidth(); + remaining_count_ = run.ValuesCount(); + ARROW_DCHECK_GE(value_bit_width_, 0); + ARROW_DCHECK_LE(value_bit_width_, 64); + bit_reader_.Reset(run.RawDataPtr(), run.RawDataSize()); +} + +template +auto constexpr BitPackedDecoder::Remaining() const -> values_count_type { + return remaining_count_; +} + +template +auto constexpr BitPackedDecoder::ValueBitWidth() const -> bit_size_type { + return value_bit_width_; +} + +template +auto BitPackedDecoder::Advance(values_count_type batch_size) -> values_count_type { + auto const steps = std::min(batch_size, remaining_count_); + if (bit_reader_.Advance(steps * value_bit_width_)) { + remaining_count_ -= steps; + return steps; + } + return 0; +} + +template +bool BitPackedDecoder::Get(value_type* out_value) { + return GetBatch(out_value, 1) == 1; +} + +template +auto BitPackedDecoder::GetBatch(value_type* out, values_count_type batch_size) + -> values_count_type { + if (remaining_count_ == 0) { + return 0; + } + + auto const to_read = std::min(remaining_count_, batch_size); + auto const actual_read = bit_reader_.GetBatch(value_bit_width_, out, to_read); + // There should not be any reason why the actual read would be 
different + // but this is error resistant. + remaining_count_ -= actual_read; + return actual_read; +} + /**************** * RleEncoder * ****************/ From 8369eb6972051743d72cd2cf3d85b8dd601e638d Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Tue, 5 Aug 2025 17:58:06 +0200 Subject: [PATCH 09/56] Implement runs --- cpp/src/arrow/util/rle_encoding_internal.h | 61 ++++++++++++++++++++++ 1 file changed, 61 insertions(+) diff --git a/cpp/src/arrow/util/rle_encoding_internal.h b/cpp/src/arrow/util/rle_encoding_internal.h index d1b17cd6a036..672d8ef78d59 100644 --- a/cpp/src/arrow/util/rle_encoding_internal.h +++ b/cpp/src/arrow/util/rle_encoding_internal.h @@ -842,6 +842,67 @@ bool RleBitPackedDecoder::NextCounts() { return true; } +/************ + * RleRun * + ************/ + +inline RleRun::RleRun(raw_data_const_pointer data, values_count_type values_count, + bit_size_type value_bit_width) noexcept + : values_count_(values_count), value_bit_width_(value_bit_width) { + ARROW_DCHECK_GE(value_bit_width, 0); + ARROW_DCHECK_GE(values_count, 0); + std::copy(data, data + RawDataSize(), data_.begin()); +} + +constexpr auto RleRun::ValuesCount() const noexcept -> values_count_type { + return values_count_; +} + +constexpr auto RleRun::ValuesBitWidth() const noexcept -> bit_size_type { + return value_bit_width_; +} + +constexpr auto RleRun::RawDataPtr() const noexcept -> raw_data_const_pointer { + return data_.data(); +} + +constexpr auto RleRun::RawDataSize() const noexcept -> raw_data_size_type { + auto out = bit_util::BytesForBits(value_bit_width_); + ARROW_DCHECK_LE(out, std::numeric_limits::max()); + return static_cast(out); +}; + +/****************** + * BitPackedRun * + ******************/ + +constexpr BitPackedRun::BitPackedRun(raw_data_const_pointer data, + values_count_type values_count, + bit_size_type value_bit_width) noexcept + : data_(data), values_count_(values_count), value_bit_width_(value_bit_width) { + ARROW_CHECK_GE(value_bit_width_, 0); + 
ARROW_CHECK_GE(values_count_, 0); +} + +constexpr auto BitPackedRun::ValuesCount() const noexcept -> values_count_type { + return values_count_; +} + +constexpr auto BitPackedRun::ValuesBitWidth() const noexcept -> bit_size_type { + return value_bit_width_; +} + +constexpr auto BitPackedRun::RawDataPtr() const noexcept -> raw_data_const_pointer { + return data_; +} + +constexpr auto BitPackedRun::RawDataSize() const noexcept -> raw_data_size_type { + auto out = bit_util::BytesForBits(static_cast(value_bit_width_) * + static_cast(values_count_)); + ARROW_CHECK_LE(out, std::numeric_limits::max()); + return static_cast(out); +} + /**************** * RleDecoder * ****************/ From 97647ac4de0cb004ab47788376f038b5ec474d3a Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Wed, 6 Aug 2025 10:29:42 +0200 Subject: [PATCH 10/56] Add RleBitPackedParser --- cpp/src/arrow/util/rle_encoding_internal.h | 150 +++++++++++++++++++++ 1 file changed, 150 insertions(+) diff --git a/cpp/src/arrow/util/rle_encoding_internal.h b/cpp/src/arrow/util/rle_encoding_internal.h index 672d8ef78d59..e6aed2606f74 100644 --- a/cpp/src/arrow/util/rle_encoding_internal.h +++ b/cpp/src/arrow/util/rle_encoding_internal.h @@ -23,7 +23,9 @@ #include #include #include +#include #include +#include #include "arrow/util/bit_block_counter.h" #include "arrow/util/bit_run_reader.h" @@ -167,6 +169,55 @@ class BitPackedRun { bit_size_type value_bit_width_ = 0; }; +/// A parser that emits either a ``BitPackedRun`` or a ``RleRun``. +class RleBitPackedParser { + public: + using byte = uint8_t; + using raw_data_const_pointer = const byte*; + /// By Parquet thrift definition the page size can be written into an int32_t. 
+ using raw_data_size_type = int32_t; + /// The type to represent a size in bits + using bit_size_type = int32_t; + /// The different types of runs emitted by the parser + using dynamic_run_type = std::variant; + + constexpr RleBitPackedParser() noexcept = default; + + constexpr RleBitPackedParser(raw_data_const_pointer data, raw_data_size_type data_size, + bit_size_type value_bit_width) noexcept; + + constexpr void Reset(raw_data_const_pointer data, raw_data_size_type data_size, + bit_size_type value_bit_width_) noexcept; + + /// Get the current run with a small parsing cost without advancing the iteration. + [[nodiscard]] std::optional Peek() const; + + /// Move to the next run. + [[nodiscard]] bool Advance(); + + /// Advance and return the current run. + [[nodiscard]] std::optional Next(); + + /// Whether there is still runs to iterate over. + /// + /// WARN: Due to lack of proper error handling, iteration with Next and Peek could + /// return not data while the parser is not exhausted. + /// This is how one can check for errors. + [[nodiscard]] bool Exhausted() const; + + private: + /// The pointer to the beginning of the run + raw_data_const_pointer data_ = nullptr; + /// Size in bytes of the run. + raw_data_size_type data_size_ = 0; + /// The size in bit of a packed value in the run + bit_size_type value_bit_width_ = 0; + + /// Like Peek but also return the number of bytes to advance after. + [[nodiscard]] std::pair, raw_data_size_type> PeekCount() + const; +}; + /// Decoder class for RLE encoded data. 
template class RleDecoder { @@ -903,6 +954,105 @@ constexpr auto BitPackedRun::RawDataSize() const noexcept -> raw_data_size_type return static_cast(out); } +/************************ + * RleBitPackedParser * + ************************/ + +constexpr RleBitPackedParser::RleBitPackedParser(raw_data_const_pointer data, + raw_data_size_type size, + bit_size_type value_bit_width) noexcept { + Reset(data, size, value_bit_width); +} + +constexpr void RleBitPackedParser::Reset(raw_data_const_pointer data, + raw_data_size_type data_size, + bit_size_type value_bit_width) noexcept { + data_ = data; + data_size_ = data_size; + value_bit_width_ = value_bit_width; +} + +inline auto RleBitPackedParser::Peek() const -> std::optional { + auto [out, count] = PeekCount(); + return out; +} + +inline auto RleBitPackedParser::Next() -> std::optional { + auto [out, count] = PeekCount(); + data_ += count; + data_size_ -= count; + return out; +} + +inline bool RleBitPackedParser::Advance() { return Next().has_value(); } + +inline bool RleBitPackedParser::Exhausted() const { return data_size_ == 0; } + +namespace internal { +// The maximal unsigned size that a variable can fit. 
+template +constexpr auto max_size_for_v = + static_cast>(std::numeric_limits::max()); + +} // namespace internal + +inline auto RleBitPackedParser::PeekCount() const + -> std::pair, raw_data_size_type> { + if (ARROW_PREDICT_FALSE(Exhausted())) { + return {}; + } + + constexpr auto kMaxSize = bit_util::MaxLEB128ByteLenFor; + uint32_t run_len_type = 0; + auto const header_bytes = bit_util::ParseLeadingLEB128(data_, kMaxSize, &run_len_type); + + if (header_bytes == 0) { + // Malfomrmed LEB128 data + return {}; + } + + bool const is_bit_packed = run_len_type & 1; + uint32_t const count = run_len_type >> 1; + if (is_bit_packed) { + using values_count_type = BitPackedRun::values_count_type; + constexpr auto kMaxCount = + bit_util::CeilDiv(internal::max_size_for_v, 8); + if (ARROW_PREDICT_FALSE(count == 0 || count > kMaxCount)) { + /// Illegal number of encoded values + return {}; + } + + auto const values_count = static_cast(count * 8); + ARROW_DCHECK_LT(count, internal::max_size_for_v); + // Count Already divided by 8 + auto const bytes_read = + header_bytes + static_cast(count) * value_bit_width_; + + return { + {BitPackedRun(data_ + header_bytes, values_count, value_bit_width_)}, + bytes_read, + }; + } + + using values_count_type = RleRun::values_count_type; + if (ARROW_PREDICT_FALSE( + count == 0 || + count > static_cast(std::numeric_limits::max()))) { + /// Illegal number of encoded values + return {}; + } + + auto const values_count = static_cast(count); + auto const value_bytes = bit_util::BytesForBits(value_bit_width_); + ARROW_DCHECK_LT(value_bytes, internal::max_size_for_v); + auto const bytes_read = header_bytes + static_cast(value_bytes); + + return { + {RleRun(data_ + header_bytes, values_count, value_bit_width_)}, + bytes_read, + }; +} + /**************** * RleDecoder * ****************/ From a4bc5932f2e952ac5f5bfa0ac2787902e07ad574 Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Thu, 7 Aug 2025 09:57:56 +0200 Subject: [PATCH 11/56] Rename RleEncoder > 
RleBitPackedEncoder --- cpp/src/arrow/util/rle_encoding_internal.h | 18 ++++---- cpp/src/arrow/util/rle_encoding_test.cc | 52 +++++++++++----------- cpp/src/parquet/column_writer.cc | 8 ++-- cpp/src/parquet/column_writer.h | 4 +- cpp/src/parquet/encoder.cc | 11 ++--- 5 files changed, 47 insertions(+), 46 deletions(-) diff --git a/cpp/src/arrow/util/rle_encoding_internal.h b/cpp/src/arrow/util/rle_encoding_internal.h index e6aed2606f74..feb3b9660ec3 100644 --- a/cpp/src/arrow/util/rle_encoding_internal.h +++ b/cpp/src/arrow/util/rle_encoding_internal.h @@ -386,7 +386,7 @@ class RleBitPackedDecoder { /// This class does so by buffering 8 values at a time. If they are not all the same /// they are added to the literal run. If they are the same, they are added to the /// repeated run. When we switch modes, the previous run is flushed out. -class RleEncoder { +class RleBitPackedEncoder { public: /// buffer/buffer_len: preallocated output buffer. /// bit_width: max number of bits for value. @@ -394,7 +394,7 @@ class RleEncoder { /// when values should be encoded as repeated runs. Currently this is derived /// based on the bit_width, which can determine a storage optimal choice. /// TODO: allow 0 bit_width (and have dict encoder use it) - RleEncoder(uint8_t* buffer, int buffer_len, int bit_width) + RleBitPackedEncoder(uint8_t* buffer, int buffer_len, int bit_width) : bit_width_(bit_width), bit_writer_(buffer, buffer_len) { ARROW_DCHECK_GE(bit_width_, 0); ARROW_DCHECK_LE(bit_width_, 64); @@ -1176,7 +1176,7 @@ auto BitPackedDecoder::GetBatch(value_type* out, values_count_type batch_size /// This function buffers input values 8 at a time. After seeing all 8 values, /// it decides whether they should be encoded as a literal or repeated run. 
-inline bool RleEncoder::Put(uint64_t value) { +inline bool RleBitPackedEncoder::Put(uint64_t value) { ARROW_DCHECK(bit_width_ == 64 || value < (1ULL << bit_width_)); if (ARROW_PREDICT_FALSE(buffer_full_)) return false; @@ -1207,7 +1207,7 @@ inline bool RleEncoder::Put(uint64_t value) { return true; } -inline void RleEncoder::FlushLiteralRun(bool update_indicator_byte) { +inline void RleBitPackedEncoder::FlushLiteralRun(bool update_indicator_byte) { if (literal_indicator_byte_ == NULL) { // The literal indicator byte has not been reserved yet, get one now. literal_indicator_byte_ = bit_writer_.GetNextBytePtr(); @@ -1237,7 +1237,7 @@ inline void RleEncoder::FlushLiteralRun(bool update_indicator_byte) { } } -inline void RleEncoder::FlushRepeatedRun() { +inline void RleBitPackedEncoder::FlushRepeatedRun() { ARROW_DCHECK_GT(repeat_count_, 0); bool result = true; // The lsb of 0 indicates this is a repeated run @@ -1253,7 +1253,7 @@ inline void RleEncoder::FlushRepeatedRun() { /// Flush the values that have been buffered. At this point we decide whether /// we need to switch between the run types or continue the current one. -inline void RleEncoder::FlushBufferedValues(bool done) { +inline void RleBitPackedEncoder::FlushBufferedValues(bool done) { if (repeat_count_ >= 8) { // Clear the buffered values. They are part of the repeated run now and we // don't want to flush them out as literals. 
@@ -1283,7 +1283,7 @@ inline void RleEncoder::FlushBufferedValues(bool done) { repeat_count_ = 0; } -inline int RleEncoder::Flush() { +inline int RleBitPackedEncoder::Flush() { if (literal_count_ > 0 || repeat_count_ > 0 || num_buffered_values_ > 0) { bool all_repeat = literal_count_ == 0 && (repeat_count_ == num_buffered_values_ || num_buffered_values_ == 0); @@ -1310,14 +1310,14 @@ inline int RleEncoder::Flush() { return bit_writer_.bytes_written(); } -inline void RleEncoder::CheckBufferFull() { +inline void RleBitPackedEncoder::CheckBufferFull() { int bytes_written = bit_writer_.bytes_written(); if (bytes_written + max_run_byte_size_ > bit_writer_.buffer_len()) { buffer_full_ = true; } } -inline void RleEncoder::Clear() { +inline void RleBitPackedEncoder::Clear() { buffer_full_ = false; current_value_ = 0; repeat_count_ = 0; diff --git a/cpp/src/arrow/util/rle_encoding_test.cc b/cpp/src/arrow/util/rle_encoding_test.cc index ce9e2bd4fbb1..ddb9f68720fc 100644 --- a/cpp/src/arrow/util/rle_encoding_test.cc +++ b/cpp/src/arrow/util/rle_encoding_test.cc @@ -211,8 +211,8 @@ TEST(BitUtil, RoundTripIntValues) { // expected_encoding != NULL, also validates that the encoded buffer is // exactly 'expected_encoding'. // if expected_len is not -1, it will validate the encoded size is correct. 
-void ValidateRle(const std::vector& values, int bit_width, - uint8_t* expected_encoding, int expected_len) { +void ValidateRleBitPacked(const std::vector& values, int bit_width, + uint8_t* expected_encoding, int expected_len) { const int len = 64 * 1024; #ifdef __EMSCRIPTEN__ // don't make this on the stack as it is @@ -224,7 +224,7 @@ void ValidateRle(const std::vector& values, int bit_width, #endif EXPECT_LE(expected_len, len); - RleEncoder encoder(buffer, len, bit_width); + RleBitPackedEncoder encoder(buffer, len, bit_width); for (size_t i = 0; i < values.size(); ++i) { bool result = encoder.Put(values[i]); EXPECT_TRUE(result); @@ -271,7 +271,7 @@ bool CheckRoundTrip(const std::vector& values, int bit_width) { #else uint8_t buffer[len]; #endif - RleEncoder encoder(buffer, len, bit_width); + RleBitPackedEncoder encoder(buffer, len, bit_width); for (size_t i = 0; i < values.size(); ++i) { bool result = encoder.Put(values[i]); if (!result) { @@ -308,7 +308,7 @@ bool CheckRoundTrip(const std::vector& values, int bit_width) { return true; } -TEST(Rle, SpecificSequences) { +TEST(RleBitPacked, SpecificSequences) { const int len = 1024; uint8_t expected_buffer[len]; std::vector values; @@ -328,12 +328,12 @@ TEST(Rle, SpecificSequences) { expected_buffer[2] = (50 << 1); expected_buffer[3] = 1; for (int width = 1; width <= 8; ++width) { - ValidateRle(values, width, expected_buffer, 4); + ValidateRleBitPacked(values, width, expected_buffer, 4); } for (int width = 9; width <= MAX_WIDTH; ++width) { - ValidateRle(values, width, nullptr, - 2 * (1 + static_cast(bit_util::CeilDiv(width, 8)))); + ValidateRleBitPacked(values, width, nullptr, + 2 * (1 + static_cast(bit_util::CeilDiv(width, 8)))); } // Test 100 0's and 1's alternating @@ -349,11 +349,11 @@ TEST(Rle, SpecificSequences) { expected_buffer[100 / 8 + 1] = 0x0A /* 0b00001010 */; // num_groups and expected_buffer only valid for bit width = 1 - ValidateRle(values, 1, expected_buffer, 1 + num_groups); + 
ValidateRleBitPacked(values, 1, expected_buffer, 1 + num_groups); for (int width = 2; width <= MAX_WIDTH; ++width) { int num_values = static_cast(bit_util::CeilDiv(100, 8)) * 8; - ValidateRle(values, width, nullptr, - 1 + static_cast(bit_util::CeilDiv(width * num_values, 8))); + ValidateRleBitPacked(values, width, nullptr, + 1 + static_cast(bit_util::CeilDiv(width * num_values, 8))); } // Test 16-bit values to confirm encoded values are stored in little endian @@ -371,7 +371,7 @@ TEST(Rle, SpecificSequences) { expected_buffer[4] = 0x55; expected_buffer[5] = 0xaa; - ValidateRle(values, 16, expected_buffer, 6); + ValidateRleBitPacked(values, 16, expected_buffer, 6); // Test 32-bit values to confirm encoded values are stored in little endian values.resize(28); @@ -392,7 +392,7 @@ TEST(Rle, SpecificSequences) { expected_buffer[8] = 0xaa; expected_buffer[9] = 0x5a; - ValidateRle(values, 32, expected_buffer, 10); + ValidateRleBitPacked(values, 32, expected_buffer, 10); } // ValidateRle on 'num_vals' values with width 'bit_width'. If 'value' != -1, that value @@ -403,10 +403,10 @@ void TestRleValues(int bit_width, int num_vals, int value = -1) { for (int v = 0; v < num_vals; ++v) { values.push_back((value != -1) ? 
value : static_cast(v % mod)); } - ValidateRle(values, bit_width, NULL, -1); + ValidateRleBitPacked(values, bit_width, NULL, -1); } -TEST(Rle, TestValues) { +TEST(RleBitPacked, TestValues) { for (int width = 1; width <= MAX_WIDTH; ++width) { TestRleValues(width, 1); TestRleValues(width, 1024); @@ -415,7 +415,7 @@ TEST(Rle, TestValues) { } } -TEST(Rle, BitWidthZeroRepeated) { +TEST(RleBitPacked, BitWidthZeroRepeated) { uint8_t buffer[1]; const int num_values = 15; buffer[0] = num_values << 1; // repeated indicator byte @@ -429,7 +429,7 @@ TEST(Rle, BitWidthZeroRepeated) { EXPECT_FALSE(decoder.Get(&val)); } -TEST(Rle, BitWidthZeroLiteral) { +TEST(RleBitPacked, BitWidthZeroLiteral) { uint8_t buffer[1]; const int num_groups = 4; buffer[0] = num_groups << 1 | 1; // literal indicator byte @@ -450,13 +450,13 @@ TEST(BitRle, Flush) { std::vector values; for (int i = 0; i < 16; ++i) values.push_back(1); values.push_back(0); - ValidateRle(values, 1, NULL, -1); + ValidateRleBitPacked(values, 1, NULL, -1); values.push_back(1); - ValidateRle(values, 1, NULL, -1); + ValidateRleBitPacked(values, 1, NULL, -1); values.push_back(1); - ValidateRle(values, 1, NULL, -1); + ValidateRleBitPacked(values, 1, NULL, -1); values.push_back(1); - ValidateRle(values, 1, NULL, -1); + ValidateRleBitPacked(values, 1, NULL, -1); } // Test some random sequences. 
@@ -515,17 +515,17 @@ TEST(BitRle, RepeatedPattern) { } } - ValidateRle(values, 1, NULL, -1); + ValidateRleBitPacked(values, 1, NULL, -1); } TEST(BitRle, Overflow) { for (int bit_width = 1; bit_width < 32; bit_width += 3) { - int len = RleEncoder::MinBufferSize(bit_width); + int len = RleBitPackedEncoder::MinBufferSize(bit_width); std::vector buffer(len); int num_added = 0; bool parity = true; - RleEncoder encoder(buffer.data(), len, bit_width); + RleBitPackedEncoder encoder(buffer.data(), len, bit_width); // Insert alternating true/false until there is no space left while (true) { bool result = encoder.Put(parity); @@ -559,12 +559,12 @@ void CheckRoundTripSpaced(const Array& data, int bit_width) { using T = typename Type::c_type; int num_values = static_cast(data.length()); - int buffer_size = RleEncoder::MaxBufferSize(bit_width, num_values); + int buffer_size = RleBitPackedEncoder::MaxBufferSize(bit_width, num_values); const T* values = static_cast(data).raw_values(); std::vector buffer(buffer_size); - RleEncoder encoder(buffer.data(), buffer_size, bit_width); + RleBitPackedEncoder encoder(buffer.data(), buffer_size, bit_width); for (int i = 0; i < num_values; ++i) { if (data.IsValid(i)) { if (!encoder.Put(static_cast(values[i]))) { diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc index f35f84f002bf..1f3d64f6228c 100644 --- a/cpp/src/parquet/column_writer.cc +++ b/cpp/src/parquet/column_writer.cc @@ -70,7 +70,7 @@ using arrow::bit_util::BitWriter; using arrow::internal::checked_cast; using arrow::internal::checked_pointer_cast; using arrow::util::Float16; -using arrow::util::RleEncoder; +using arrow::util::RleBitPackedEncoder; namespace bit_util = arrow::bit_util; @@ -168,7 +168,7 @@ void LevelEncoder::Init(Encoding::type encoding, int16_t max_level, encoding_ = encoding; switch (encoding) { case Encoding::RLE: { - rle_encoder_ = std::make_unique(data, data_size, bit_width_); + rle_encoder_ = std::make_unique(data, data_size, 
bit_width_); break; } case Encoding::BIT_PACKED: { @@ -190,8 +190,8 @@ int LevelEncoder::MaxBufferSize(Encoding::type encoding, int16_t max_level, case Encoding::RLE: { // TODO: Due to the way we currently check if the buffer is full enough, // we need to have MinBufferSize as head room. - num_bytes = RleEncoder::MaxBufferSize(bit_width, num_buffered_values) + - RleEncoder::MinBufferSize(bit_width); + num_bytes = RleBitPackedEncoder::MaxBufferSize(bit_width, num_buffered_values) + + RleBitPackedEncoder::MinBufferSize(bit_width); break; } case Encoding::BIT_PACKED: { diff --git a/cpp/src/parquet/column_writer.h b/cpp/src/parquet/column_writer.h index bd329d61053f..2a046a0ca5d5 100644 --- a/cpp/src/parquet/column_writer.h +++ b/cpp/src/parquet/column_writer.h @@ -36,7 +36,7 @@ class BitWriter; } // namespace bit_util namespace util { -class RleEncoder; +class RleBitPackedEncoder; class CodecOptions; } // namespace util @@ -80,7 +80,7 @@ class PARQUET_EXPORT LevelEncoder { int bit_width_; int rle_length_; Encoding::type encoding_; - std::unique_ptr<::arrow::util::RleEncoder> rle_encoder_; + std::unique_ptr<::arrow::util::RleBitPackedEncoder> rle_encoder_; std::unique_ptr<::arrow::bit_util::BitWriter> bit_packed_encoder_; }; diff --git a/cpp/src/parquet/encoder.cc b/cpp/src/parquet/encoder.cc index 112b810a8f9d..831ddbddab13 100644 --- a/cpp/src/parquet/encoder.cc +++ b/cpp/src/parquet/encoder.cc @@ -438,8 +438,8 @@ int RlePreserveBufferSize(int num_values, int bit_width) { // is called, we have to reserve an extra "RleEncoder::MinBufferSize" // bytes. These extra bytes won't be used but not reserving them // would cause the encoder to fail. 
- return ::arrow::util::RleEncoder::MaxBufferSize(bit_width, num_values) + - ::arrow::util::RleEncoder::MinBufferSize(bit_width); + return ::arrow::util::RleBitPackedEncoder::MaxBufferSize(bit_width, num_values) + + ::arrow::util::RleBitPackedEncoder::MinBufferSize(bit_width); } /// See the dictionary encoding section of @@ -476,7 +476,7 @@ class DictEncoderImpl : public EncoderImpl, virtual public DictEncoder { ++buffer; --buffer_len; - ::arrow::util::RleEncoder encoder(buffer, buffer_len, bit_width()); + ::arrow::util::RleBitPackedEncoder encoder(buffer, buffer_len, bit_width()); for (int32_t index : buffered_indices_) { if (ARROW_PREDICT_FALSE(!encoder.Put(index))) return -1; @@ -1717,8 +1717,9 @@ std::shared_ptr RleBooleanEncoder::FlushValues() { int rle_buffer_size_max = MaxRleBufferSize(); std::shared_ptr buffer = AllocateBuffer(this->pool_, rle_buffer_size_max + kRleLengthInBytes); - ::arrow::util::RleEncoder encoder(buffer->mutable_data() + kRleLengthInBytes, - rle_buffer_size_max, /*bit_width*/ kBitWidth); + ::arrow::util::RleBitPackedEncoder encoder(buffer->mutable_data() + kRleLengthInBytes, + rle_buffer_size_max, + /*bit_width*/ kBitWidth); for (bool value : buffered_append_values_) { encoder.Put(value ? 1 : 0); From 991b59cf198e4c64019d5494b8d3c9dd478f8c23 Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Thu, 7 Aug 2025 11:00:20 +0200 Subject: [PATCH 12/56] Add Runs tests --- cpp/src/arrow/util/rle_encoding_test.cc | 67 +++++++++++++++++++++++++ 1 file changed, 67 insertions(+) diff --git a/cpp/src/arrow/util/rle_encoding_test.cc b/cpp/src/arrow/util/rle_encoding_test.cc index ddb9f68720fc..7867a2f2fed4 100644 --- a/cpp/src/arrow/util/rle_encoding_test.cc +++ b/cpp/src/arrow/util/rle_encoding_test.cc @@ -207,6 +207,73 @@ TEST(BitUtil, RoundTripIntValues) { } } +/// A Rle run is a simple class owning some data and a repetition count. +/// It does not know how to read such data. 
+TEST(Rle, RleRun) { + const std::array value = {21, 2, 0, 0}; + + RleRun::values_count_type value_count = 12; + + // 12 times the value 21 fitting over 5 bits + auto const run_5 = RleRun(value.data(), value_count, /* value_bit_width= */ 5); + EXPECT_EQ(run_5.ValuesCount(), value_count); + EXPECT_EQ(run_5.ValuesBitWidth(), 5); + EXPECT_EQ(run_5.RawDataSize(), 1); // 5 bits fit in one byte + EXPECT_EQ(*run_5.RawDataPtr(), 21); + + // 12 times the value 21 fitting over 8 bits + auto const run_8 = RleRun(value.data(), value_count, /* value_bit_width= */ 8); + EXPECT_EQ(run_8.ValuesCount(), value_count); + EXPECT_EQ(run_8.ValuesBitWidth(), 8); + EXPECT_EQ(run_8.RawDataSize(), 1); // 8 bits fit in 1 byte + EXPECT_EQ(*run_8.RawDataPtr(), 21); + + // 12 times the value {21, 2} fitting over 10 bits + auto const run_10 = RleRun(value.data(), value_count, /* value_bit_width= */ 10); + + EXPECT_EQ(run_10.ValuesCount(), value_count); + EXPECT_EQ(run_10.ValuesBitWidth(), 10); + EXPECT_EQ(run_10.RawDataSize(), 2); // 10 bits fit in 2 bytes + EXPECT_EQ(*(run_10.RawDataPtr() + 0), 21); + EXPECT_EQ(*(run_10.RawDataPtr() + 1), 2); + + // 12 times the value {21, 2} fitting over 32 bits + auto const run_32 = RleRun(value.data(), value_count, /* value_bit_width= */ 32); + EXPECT_EQ(run_32.ValuesCount(), value_count); + EXPECT_EQ(run_32.ValuesBitWidth(), 32); + EXPECT_EQ(run_32.RawDataSize(), 4); // 32 bits fit in 4 bytes + EXPECT_EQ(*(run_32.RawDataPtr() + 0), 21); + EXPECT_EQ(*(run_32.RawDataPtr() + 1), 2); + EXPECT_EQ(*(run_32.RawDataPtr() + 2), 0); + EXPECT_EQ(*(run_32.RawDataPtr() + 3), 0); +} + +/// A BitPacked run is a simple class owning some data and its size. +/// It does not know how to read such data.
+TEST(BitPacked, BitPackedRun) { + const std::array value = {0b10101010, 0, 0, 0b1111111}; + + /// 16 values of 1 bit for a total of 16 bits + BitPackedRun::values_count_type value_count_1 = 16; + auto const run_1 = BitPackedRun(value.data(), value_count_1, /* value_bit_width= */ 1); + EXPECT_EQ(run_1.ValuesCount(), value_count_1); + EXPECT_EQ(run_1.ValuesBitWidth(), 1); + EXPECT_EQ(run_1.RawDataSize(), 2); // 16 bits fit in 2 bytes + for (BitPackedRun::raw_data_size_type i = 0; i < run_1.RawDataSize(); ++i) { + EXPECT_EQ(*(run_1.RawDataPtr() + i), value[i]); + } + + /// 8 values of 3 bits for a total of 24 bits + BitPackedRun::values_count_type value_count_3 = 8; + auto const run_3 = BitPackedRun(value.data(), value_count_3, /* value_bit_width= */ 3); + EXPECT_EQ(run_3.ValuesCount(), value_count_3); + EXPECT_EQ(run_3.ValuesBitWidth(), 3); + EXPECT_EQ(run_3.RawDataSize(), 3); // 24 bits fit in 3 bytes + for (BitPackedRun::raw_data_size_type i = 0; i < run_3.RawDataSize(); ++i) { + EXPECT_EQ(*(run_3.RawDataPtr() + i), value[i]); + } +} + // Validates encoding of values by encoding and decoding them. If // expected_encoding != NULL, also validates that the encoded buffer is // exactly 'expected_encoding'. 
From c35db07ecab38700c8b515e18232c9c3893e44c1 Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Thu, 7 Aug 2025 11:52:47 +0200 Subject: [PATCH 13/56] Add RleDecoder test --- cpp/src/arrow/util/rle_encoding_test.cc | 60 +++++++++++++++++++++++++ 1 file changed, 60 insertions(+) diff --git a/cpp/src/arrow/util/rle_encoding_test.cc b/cpp/src/arrow/util/rle_encoding_test.cc index 7867a2f2fed4..3f39f0332bb5 100644 --- a/cpp/src/arrow/util/rle_encoding_test.cc +++ b/cpp/src/arrow/util/rle_encoding_test.cc @@ -274,6 +274,66 @@ TEST(BitPacked, BitPackedRun) { } } +template +void TestRleDecoder(std::vector bytes, + RleRun::values_count_type value_count, + RleRun::bit_size_type bit_width) { + // Pre-requisite for this test + EXPECT_GT(value_count, 6); + + // Compute value associated with bytes encoded as little endian + T value = 0; + for (std::size_t i = 0; i < bytes.size(); ++i) { + value += static_cast(bytes.at(i)) << (8 * i); + } + + auto const run = RleRun(bytes.data(), value_count, bit_width); + + auto decoder = RleDecoder(run); + std::vector vals = {0, 0}; + + EXPECT_EQ(decoder.Remaining(), value_count); + + typename decltype(decoder)::values_count_type read = 0; + EXPECT_EQ(decoder.Get(vals.data()), 1); + read += 1; + EXPECT_EQ(vals.at(0), value); + EXPECT_EQ(decoder.Remaining(), value_count - read); + + EXPECT_EQ(decoder.Advance(3), 3); + read += 3; + EXPECT_EQ(decoder.Remaining(), value_count - read); + + vals = {0, 0}; + EXPECT_EQ(decoder.GetBatch(vals.data(), 2), vals.size()); + EXPECT_EQ(vals.at(0), value); + EXPECT_EQ(vals.at(1), value); + read += static_cast(vals.size()); + EXPECT_EQ(decoder.Remaining(), value_count - read); + + // Exhaust iteration + EXPECT_EQ(decoder.Advance(value_count - read), value_count - read); + EXPECT_EQ(decoder.Remaining(), 0); + EXPECT_EQ(decoder.Advance(1), 0); + vals = {0, 0}; + EXPECT_EQ(decoder.Get(vals.data()), 0); + EXPECT_EQ(vals.at(0), 0); + + // Reset the decoder + decoder.Reset(run); + EXPECT_EQ(decoder.Remaining(), 
value_count); + vals = {0, 0}; + EXPECT_EQ(decoder.GetBatch(vals.data(), 2), vals.size()); + EXPECT_EQ(vals.at(0), value); + EXPECT_EQ(vals.at(1), value); +} + +TEST(Rle, RleDecoder) { + TestRleDecoder({21, 0, 0}, /* value_count= */ 21, /* bit_width= */ 5); + TestRleDecoder({1, 0}, /* value_count= */ 13, /* bit_width= */ 1); + TestRleDecoder({21, 2, 0, 1}, /* value_count= */ 20, /* bit_width= */ 30); +} + // Validates encoding of values by encoding and decoding them. If // expected_encoding != NULL, also validates that the encoded buffer is // exactly 'expected_encoding'. From 5123cb1712ce56f0543bc8e7892e99bcdcdb9e73 Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Thu, 7 Aug 2025 15:32:11 +0200 Subject: [PATCH 14/56] Add BitPackedDecoder test --- cpp/src/arrow/util/rle_encoding_test.cc | 74 +++++++++++++++++++++++++ 1 file changed, 74 insertions(+) diff --git a/cpp/src/arrow/util/rle_encoding_test.cc b/cpp/src/arrow/util/rle_encoding_test.cc index 3f39f0332bb5..4c2c688dadfd 100644 --- a/cpp/src/arrow/util/rle_encoding_test.cc +++ b/cpp/src/arrow/util/rle_encoding_test.cc @@ -334,6 +334,80 @@ TEST(Rle, RleDecoder) { TestRleDecoder({21, 2, 0, 1}, /* value_count= */ 20, /* bit_width= */ 30); } +template +void TestBitPackedDecoder(std::vector bytes, + BitPackedRun::values_count_type value_count, + BitPackedRun::bit_size_type bit_width, + std::vector expected) { + // Pre-requisite for this test + EXPECT_GT(value_count, 6); + + auto const run = BitPackedRun(bytes.data(), value_count, bit_width); + + auto decoder = BitPackedDecoder(run); + std::vector vals = {0, 0}; + + EXPECT_EQ(decoder.Remaining(), value_count); + + typename decltype(decoder)::values_count_type read = 0; + EXPECT_EQ(decoder.Get(vals.data()), 1); + EXPECT_EQ(vals.at(0), expected.at(0 + read)); + read += 1; + EXPECT_EQ(decoder.Remaining(), value_count - read); + + EXPECT_EQ(decoder.Advance(3), 3); + read += 3; + EXPECT_EQ(decoder.Remaining(), value_count - read); + + vals = {0, 0}; + 
EXPECT_EQ(decoder.GetBatch(vals.data(), 2), vals.size()); + EXPECT_EQ(vals.at(0), expected.at(0 + read)); + EXPECT_EQ(vals.at(1), expected.at(1 + read)); + read += static_cast(vals.size()); + EXPECT_EQ(decoder.Remaining(), value_count - read); + + // Exhaust iteration + EXPECT_EQ(decoder.Advance(value_count - read), value_count - read); + EXPECT_EQ(decoder.Remaining(), 0); + EXPECT_EQ(decoder.Advance(1), 0); + vals = {0, 0}; + EXPECT_EQ(decoder.Get(vals.data()), 0); + EXPECT_EQ(vals.at(0), 0); + + // Reset the decoder + decoder.Reset(run); + read = 0; + EXPECT_EQ(decoder.Remaining(), value_count); + vals = {0, 0}; + EXPECT_EQ(decoder.GetBatch(vals.data(), 2), vals.size()); + EXPECT_EQ(vals.at(0), expected.at(0 + read)); + EXPECT_EQ(vals.at(1), expected.at(1 + read)); +} + +TEST(BitPacked, BitPackedDecoder) { + /// See parquet encoding for bytes layout + TestBitPackedDecoder( + /* bytes= */ {0x88, 0xc6, 0xfa}, + /* values_count= */ 8, + /* bit_width= */ 3, + /* expected= */ {0, 1, 2, 3, 4, 5, 6, 7}); + TestBitPackedDecoder( + /* bytes= */ {0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7}, + /* values_count= */ 8, + /* bit_width= */ 8, + /* expected= */ {0, 1, 2, 3, 4, 5, 6, 7}); + TestBitPackedDecoder( + /* bytes= */ {0x47, 0xc, 0x10, 0x35}, + /* values_count= */ 8, + /* bit_width= */ 4, + /* expected= */ {7, 4, 12, 0, 0, 1, 5, 3}); + TestBitPackedDecoder( + /* bytes= */ {0xe8, 0x7, 0x20, 0xc0, 0x0, 0x4, 0x14, 0x60, 0xc0, 0x1}, + /* values_count= */ 8, + /* bit_width= */ 10, + /* expected= */ {1000, 1, 2, 3, 4, 5, 6, 7}); +} + // Validates encoding of values by encoding and decoding them. If // expected_encoding != NULL, also validates that the encoded buffer is // exactly 'expected_encoding'. 
From f832d827fced11b9ec03983f5bca535920963aaa Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Thu, 7 Aug 2025 17:50:27 +0200 Subject: [PATCH 15/56] Add RleBitPackedParser test --- cpp/src/arrow/util/rle_encoding_test.cc | 97 +++++++++++++++++++++++++ 1 file changed, 97 insertions(+) diff --git a/cpp/src/arrow/util/rle_encoding_test.cc b/cpp/src/arrow/util/rle_encoding_test.cc index 4c2c688dadfd..f2dd79f28e5e 100644 --- a/cpp/src/arrow/util/rle_encoding_test.cc +++ b/cpp/src/arrow/util/rle_encoding_test.cc @@ -408,6 +408,103 @@ TEST(BitPacked, BitPackedDecoder) { /* expected= */ {1000, 1, 2, 3, 4, 5, 6, 7}); } +template +void TestRleBitPackedParser(std::vector bytes, + RleBitPackedParser::bit_size_type bit_width, + std::vector expected) { + auto parser = RleBitPackedParser( + bytes.data(), static_cast(bytes.size()), + bit_width); + EXPECT_FALSE(parser.Exhausted()); + + // Peek returns the same data + auto run1 = parser.Peek(); + EXPECT_TRUE(run1.has_value()); + auto run2 = parser.Peek(); + EXPECT_TRUE(run2.has_value()); + auto ptr1 = std::visit([](auto const& r) { return r.RawDataPtr(); }, run1.value()); + auto size1 = std::visit([](auto const& r) { return r.RawDataSize(); }, run1.value()); + auto ptr2 = std::visit([](auto const& r) { return r.RawDataPtr(); }, run2.value()); + auto size2 = std::visit([](auto const& r) { return r.RawDataSize(); }, run2.value()); + EXPECT_TRUE(std::equal(ptr1, ptr1 + size1, ptr2, ptr2 + size2)); + EXPECT_FALSE(parser.Exhausted()); + + // Try to decode all data of all runs in the decoded vector + decltype(expected) decoded = {}; + auto rle_decoder = RleDecoder(); + auto bit_packed_decoder = BitPackedDecoder(); + // Iterate over all runs + while (auto run = parser.Next()) { + EXPECT_TRUE(run.has_value()); + + if (std::holds_alternative(run.value())) { + rle_decoder.Reset(std::get(run.value())); + + auto const n_decoded = decoded.size(); + auto const n_to_decode = rle_decoder.Remaining(); + decoded.resize(n_decoded + n_to_decode); +
EXPECT_EQ(rle_decoder.GetBatch(decoded.data() + n_decoded, n_to_decode), + n_to_decode); + EXPECT_EQ(rle_decoder.Remaining(), 0); + } else { + bit_packed_decoder.Reset(std::get(run.value())); + + auto const n_decoded = decoded.size(); + auto const n_to_decode = bit_packed_decoder.Remaining(); + decoded.resize(n_decoded + n_to_decode); + EXPECT_EQ(bit_packed_decoder.GetBatch(decoded.data() + n_decoded, n_to_decode), + n_to_decode); + EXPECT_EQ(bit_packed_decoder.Remaining(), 0); + } + } + + EXPECT_TRUE(parser.Exhausted()); + EXPECT_EQ(decoded.size(), expected.size()); + EXPECT_EQ(decoded, expected); +} + +TEST(RleBitPacked, RleBitPackedParser) { + TestRleBitPackedParser( + /* bytes= */ + {/* LEB128 for 8 values bit packed marker */ 0x3, + /* Bitpacked run */ 0x88, 0xc6, 0xfa}, + /* bit_width= */ 3, + /* expected= */ {0, 1, 2, 3, 4, 5, 6, 7}); + + { + std::vector expected = {0, 1, 2, 3, 4, 5, 6, 7}; + expected.resize(expected.size() + 200, 5); + TestRleBitPackedParser( + /* bytes= */ + {/* LEB128 for 8 values bit packed marker */ 0x3, + /* Bitpacked run */ 0x88, 0xc6, 0xfa, + /* LEB128 for 200 RLE marker */ 0x90, 0x3, + /* Value 5 padded to a byte */ 0x5}, + /* bit_width= */ 3, + /* expected= */ expected); + } + + { + std::vector expected = {0, 0, 0, 0, 1, 1, 1, 1}; + expected.resize(expected.size() + 200, 1); + expected.resize(expected.size() + 10, 3); + std::array run2 = {1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2}; + expected.insert(expected.end(), run2.begin(), run2.end()); + TestRleBitPackedParser( + /* bytes= */ + {/* LEB128 for 8 values bit packed marker */ 0x3, + /* Bitpacked run */ 0x0, 0x55, + /* LEB128 for 200 RLE marker */ 0x90, 0x3, + /* Value 1 padded to a byte */ 0x1, + /* LEB128 for 10 RLE marker */ 0x14, + /* Value 3 padded to a byte */ 0x3, + /* LEB128 for 16 values bit packed marker */ 0x5, + /* Bitpacked run */ 0x99, 0x99, 0x99, 0x99}, + /* bit_width= */ 2, + /* expected= */ expected); + } +} + // Validates encoding of values by
encoding and decoding them. If // expected_encoding != NULL, also validates that the encoded buffer is // exactly 'expected_encoding'. From 1f87da5066e1d355572de774e6107d30e03a943b Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Thu, 14 Aug 2025 16:56:44 +0200 Subject: [PATCH 16/56] Strengthen RleBitPacked tests --- cpp/src/arrow/util/rle_encoding_test.cc | 247 ++++++++++++++++++++---- 1 file changed, 212 insertions(+), 35 deletions(-) diff --git a/cpp/src/arrow/util/rle_encoding_test.cc b/cpp/src/arrow/util/rle_encoding_test.cc index f2dd79f28e5e..7fc568426854 100644 --- a/cpp/src/arrow/util/rle_encoding_test.cc +++ b/cpp/src/arrow/util/rle_encoding_test.cc @@ -25,7 +25,9 @@ #include #include "arrow/array.h" +#include "arrow/array/concatenate.h" #include "arrow/buffer.h" +#include "arrow/scalar.h" #include "arrow/testing/random.h" #include "arrow/type.h" #include "arrow/util/bit_stream_utils_internal.h" @@ -851,46 +853,149 @@ TEST(BitRle, Overflow) { } } -template -void CheckRoundTripSpaced(const Array& data, int bit_width) { +/// Check RleBitPacked encoding/decoding round trip. +/// +/// \tparam kSpaced If set to false, treat Nulls in the input array as regular data. +/// \tparam kParts The number of parts in which the data will be decoded. +/// For numbers greater than one, this ensures that the decoder's intermediate state +/// is valid.
+template +void CheckRoundTrip(const Array& data, int bit_width) { using ArrayType = typename TypeTraits::ArrayType; using T = typename Type::c_type; - int num_values = static_cast(data.length()); - int buffer_size = RleBitPackedEncoder::MaxBufferSize(bit_width, num_values); + int const data_size = static_cast(data.length()); + int const data_values_count = + static_cast(data.length() - kSpaced * data.null_count()); + int const buffer_size = RleBitPackedEncoder::MaxBufferSize(bit_width, data_size); + ASSERT_GE(kParts, 1); + ASSERT_LE(kParts, data_size); - const T* values = static_cast(data).raw_values(); + const T* data_values = static_cast(data).raw_values(); + // Encode the data into ``buffer`` using the encoder. std::vector buffer(buffer_size); RleBitPackedEncoder encoder(buffer.data(), buffer_size, bit_width); - for (int i = 0; i < num_values; ++i) { - if (data.IsValid(i)) { - if (!encoder.Put(static_cast(values[i]))) { - FAIL() << "Encoding failed"; - } + int32_t encoded_values_size = 0; + for (int i = 0; i < data_size; ++i) { + // Depending on kSpaced we treat nulls as regular values. 
+ if (data.IsValid(i) || !kSpaced) { + bool success = encoder.Put(static_cast(data_values[i])); + ASSERT_TRUE(success) << "Encoding failed in pos " << i; + ++encoded_values_size; } } - int encoded_size = encoder.Flush(); + int encoded_byte_size = encoder.Flush(); + ASSERT_EQ(encoded_values_size, data_values_count) + << "All values input were not encoded successfully by the encoder"; + + // On to verify batch read + RleBitPackedDecoder decoder(buffer.data(), encoded_byte_size, bit_width); + std::vector values_read(data_size); + + // We will read the data in kParts calls to make sure intermediate states are valid + int32_t actual_read_count = 0; + int32_t requested_read_count = 0; + while (requested_read_count < data_size) { + auto const remaining = data_size - requested_read_count; + auto to_read = data_size / kParts; + if (remaining / to_read == 1) { + to_read = remaining; + } - // Verify batch read - RleBitPackedDecoder decoder(buffer.data(), encoded_size, bit_width); - std::vector values_read(num_values); + auto* out = values_read.data() + requested_read_count; - if (num_values != decoder.GetBatchSpaced( - num_values, static_cast(data.null_count()), - data.null_bitmap_data(), data.offset(), values_read.data())) { - FAIL(); + auto read = 0; + if constexpr (kSpaced) { + // We need to slice the input array to get the proper null count and bitmap + auto data_remaining = data.Slice(requested_read_count, to_read); + read = decoder.GetBatchSpaced( + to_read, static_cast(data_remaining->null_count()), + data_remaining->null_bitmap_data(), data_remaining->offset(), out); + } else { + read = decoder.GetBatch(out, to_read); + } + ASSERT_EQ(read, to_read) << "Decoder did not read as many values as requested"; + + actual_read_count += read; + requested_read_count += to_read; + } + EXPECT_EQ(requested_read_count, data_size) << "This test logic is wrong"; + EXPECT_EQ(actual_read_count, data_size) << "Total number of values read is off"; + + // Verify the round trip:
encoded-decoded values must equal the original one + for (int64_t i = 0; i < data_size; ++i) { + if (data.IsValid(i) || !kSpaced) { + EXPECT_EQ(values_read[i], data_values[i]) + << "Encoded then decoded value at position " << i << " (" << values_read[i] + << ") differs from original value (" << data_values[i] << ")"; + } } +} + +template +struct DataTestRleBitPackedRandomPart { + using value_type = T; + + value_type max; + int32_t size; + double null_probability; +}; + +template +struct DataTestRleBitPackedRepeatPart { + using value_type = T; + + value_type value; + int32_t size; +}; + +template +struct DataTestRleBitPackedNullPart { + using value_type = T; + + int32_t size; +}; - for (int64_t i = 0; i < num_values; ++i) { - if (data.IsValid(i)) { - if (values_read[i] != values[i]) { - FAIL() << "Index " << i << " read " << values_read[i] << " but should be " - << values[i]; +template +struct DataTestRleBitPacked { + using value_type = T; + using ArrowType = typename arrow::CTypeTraits::ArrowType; + using RandomPart = DataTestRleBitPackedRandomPart; + using RepeatPart = DataTestRleBitPackedRepeatPart; + using NullPart = DataTestRleBitPackedNullPart; + + std::vector> parts; + int32_t bit_width; + + std::shared_ptr<::arrow::Array> MakeArray() const { + uint32_t kSeed = 1337; + ::arrow::random::RandomArrayGenerator rand(kSeed); + + std::vector> arrays = {}; + + for (auto const& dyn_part : parts) { + if (auto* part = std::get_if(&dyn_part)) { + auto arr = rand.Numeric(part->size, /* min= */ value_type(0), + part->max, part->null_probability); + arrays.push_back(std::move(arr)); + + } else if (auto* part = std::get_if(&dyn_part)) { + auto scalar = ::arrow::MakeScalar(part->value); + arrays.push_back(::arrow::MakeArrayFromScalar(*scalar, part->size).ValueOrDie()); + + } else if (auto* part = std::get_if(&dyn_part)) { + using Traits = arrow::TypeTraits; + auto null_scalar = ::arrow::MakeNullScalar(Traits::type_singleton()); + arrays.push_back( + 
::arrow::MakeArrayFromScalar(*null_scalar, part->size).ValueOrDie()); } } + ARROW_DCHECK_EQ(parts.size(), arrays.size()); + + return ::arrow::Concatenate(arrays).ValueOrDie(); } -} +}; template struct GetBatchSpacedTestCase { @@ -900,20 +1005,92 @@ struct GetBatchSpacedTestCase { int bit_width; }; -TEST(RleDecoder, GetBatchSpaced) { - uint32_t kSeed = 1337; - ::arrow::random::RandomArrayGenerator rand(kSeed); - - std::vector> int32_cases{ - {1, 100000, 0.01, 1}, {1, 100000, 0.1, 1}, {1, 100000, 0.5, 1}, - {4, 100000, 0.05, 3}, {100, 100000, 0.05, 7}, +template +void DoTestGetBatchSpacedRoundtrip() { + using Data = DataTestRleBitPacked; + using ArrowType = typename Data::ArrowType; + using RandomPart = typename Data::RandomPart; + using NullPart = typename Data::NullPart; + using RepeatPart = typename Data::RepeatPart; + + std::vector test_cases = { + { + {RandomPart{/* max=*/1, /* size=*/400, /* null_proba= */ 0.1}}, + /* bit_width= */ 1, + }, + { + { + RandomPart{/* max=*/7, /* size=*/10037, /* null_proba= */ 0.1}, + NullPart{/* size= */ 1153}, + RandomPart{/* max=*/7, /* size=*/800, /* null_proba= */ 0.5}, + }, + /* bit_width= */ 3, + }, + { + { + NullPart{/* size= */ 80}, + RandomPart{/* max=*/7, /* size=*/800, /* null_proba= */ 0.01}, + NullPart{/* size= */ 1023}, + }, + /* bit_width= */ 3, + }, + { + {RepeatPart{/* value=*/13, /* size=*/100000}}, + /* bit_width= */ 10, + }, + { + { + NullPart{/* size= */ 1024}, + RepeatPart{/* value=*/10000, /* size=*/100000}, + NullPart{/* size= */ 77}, + }, + /* bit_width= */ 23, + }, + { + { + RepeatPart{/* value=*/13, /* size=*/100000}, + NullPart{/* size= */ 1153}, + RepeatPart{/* value=*/72, /* size=*/100799}, + }, + /* bit_width= */ 10, + }, + { + { + RandomPart{/* max=*/1, /* size=*/1013, /* null_proba= */ 0.01}, + NullPart{/* size=*/8}, + RepeatPart{1, /* size= */ 256}, + NullPart{/* size=*/128}, + RepeatPart{0, /* size= */ 256}, + NullPart{/* size=*/15}, + RandomPart{/* max=*/1, /* size=*/8 * 1024, /* null_proba= */ 
0.01}, + }, + /* bit_width= */ 1, + }, }; - for (auto case_ : int32_cases) { - auto arr = rand.Int32(case_.size, /*min=*/0, case_.max_value, case_.null_probability); - CheckRoundTripSpaced(*arr, case_.bit_width); - CheckRoundTripSpaced(*arr->Slice(1), case_.bit_width); + + for (auto case_ : test_cases) { + if (static_cast(case_.bit_width) > sizeof(T)) { + continue; + } + + auto array = case_.MakeArray(); + CheckRoundTrip(*array, case_.bit_width); + CheckRoundTrip(*array, case_.bit_width); + CheckRoundTrip(*array, case_.bit_width); + CheckRoundTrip(*array, case_.bit_width); + CheckRoundTrip(*array->Slice(1), case_.bit_width); } } +TEST(RleBitPacked, GetBatchSpacedRoundtripUint16) { + DoTestGetBatchSpacedRoundtrip(); +} +TEST(RleBitPacked, GetBatchSpacedRoundtripInt32) { + DoTestGetBatchSpacedRoundtrip(); +} +TEST(RleBitPacked, GetBatchSpacedRoundtripUint64) { + DoTestGetBatchSpacedRoundtrip(); +} + } // namespace util } // namespace arrow From db2f11edb6f2bc544decfa224aa5400251b413be Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Mon, 11 Aug 2025 15:15:23 +0200 Subject: [PATCH 17/56] Remove BitBlockCounter from RleBitPackedDecoder Benchmarks show it is not helpful and even prevents some optimizations --- cpp/src/arrow/util/rle_encoding_internal.h | 54 +--------------------- 1 file changed, 2 insertions(+), 52 deletions(-) diff --git a/cpp/src/arrow/util/rle_encoding_internal.h b/cpp/src/arrow/util/rle_encoding_internal.h index feb3b9660ec3..6dfd819e629e 100644 --- a/cpp/src/arrow/util/rle_encoding_internal.h +++ b/cpp/src/arrow/util/rle_encoding_internal.h @@ -27,7 +27,6 @@ #include #include -#include "arrow/util/bit_block_counter.h" #include "arrow/util/bit_run_reader.h" #include "arrow/util/bit_stream_utils_internal.h" #include "arrow/util/bit_util.h" @@ -695,33 +694,8 @@ inline int RleBitPackedDecoder::GetBatchSpaced(int batch_size, int null_count } PlainRleConverter converter; - arrow::internal::BitBlockCounter block_counter(valid_bits, valid_bits_offset, - 
batch_size); - int total_processed = 0; - int processed = 0; - arrow::internal::BitBlockCount block; - - do { - block = block_counter.NextFourWords(); - if (block.length == 0) { - break; - } - if (block.AllSet()) { - processed = GetBatch(out, block.length); - } else if (block.NoneSet()) { - converter.FillZero(out, out + block.length); - processed = block.length; - } else { - processed = GetSpaced>( - converter, block.length, block.length - block.popcount, valid_bits, - valid_bits_offset, out); - } - total_processed += processed; - out += block.length; - valid_bits_offset += block.length; - } while (processed == block.length); - return total_processed; + return GetSpaced(converter, batch_size, null_count, valid_bits, valid_bits_offset, out); } static inline bool IndexInRange(int32_t idx, int32_t dictionary_length) { @@ -832,35 +806,11 @@ inline int RleBitPackedDecoder::GetBatchWithDictSpaced( if (null_count == 0) { return GetBatchWithDict(dictionary, dictionary_length, out, batch_size); } - arrow::internal::BitBlockCounter block_counter(valid_bits, valid_bits_offset, - batch_size); DictionaryConverter converter; converter.dictionary = dictionary; converter.dictionary_length = dictionary_length; - int total_processed = 0; - int processed = 0; - arrow::internal::BitBlockCount block; - do { - block = block_counter.NextFourWords(); - if (block.length == 0) { - break; - } - if (block.AllSet()) { - processed = GetBatchWithDict(dictionary, dictionary_length, out, block.length); - } else if (block.NoneSet()) { - converter.FillZero(out, out + block.length); - processed = block.length; - } else { - processed = GetSpaced>( - converter, block.length, block.length - block.popcount, valid_bits, - valid_bits_offset, out); - } - total_processed += processed; - out += block.length; - valid_bits_offset += block.length; - } while (processed == block.length); - return total_processed; + return GetSpaced(converter, batch_size, null_count, valid_bits, valid_bits_offset, out); } template 
From e72dfd481f95b44e2be28d7622bc0416c279293e Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Wed, 13 Aug 2025 16:26:52 +0200 Subject: [PATCH 18/56] Plug new parser+decoder in RleBitPackedDecoder --- CPPLINT.cfg | 2 + cpp/src/arrow/util/bit_run_reader.h | 4 + cpp/src/arrow/util/bitmap_reader.h | 2 + cpp/src/arrow/util/rle_encoding_internal.h | 839 +++++++++++++-------- 4 files changed, 547 insertions(+), 300 deletions(-) diff --git a/CPPLINT.cfg b/CPPLINT.cfg index 2f47b4dbf57b..dd1139ac7f80 100644 --- a/CPPLINT.cfg +++ b/CPPLINT.cfg @@ -26,5 +26,7 @@ filter = -readability/alt_tokens filter = -readability/casting filter = -readability/todo filter = -runtime/references +# Let the formatter do the job for whitespaces filter = -whitespace/comments +filter = -whitespace/braces linelength = 90 diff --git a/cpp/src/arrow/util/bit_run_reader.h b/cpp/src/arrow/util/bit_run_reader.h index a2cbad5b294b..ed7be940a543 100644 --- a/cpp/src/arrow/util/bit_run_reader.h +++ b/cpp/src/arrow/util/bit_run_reader.h @@ -52,6 +52,8 @@ inline bool operator!=(const BitRun& lhs, const BitRun& rhs) { class BitRunReaderLinear { public: + BitRunReaderLinear() = default; + BitRunReaderLinear(const uint8_t* bitmap, int64_t start_offset, int64_t length) : reader_(bitmap, start_offset, length) {} @@ -74,6 +76,8 @@ class BitRunReaderLinear { /// in a bitmap. class ARROW_EXPORT BitRunReader { public: + BitRunReader() = default; + /// \brief Constructs new BitRunReader. 
/// /// \param[in] bitmap source data diff --git a/cpp/src/arrow/util/bitmap_reader.h b/cpp/src/arrow/util/bitmap_reader.h index 5526c87dbcaf..d95fd921f480 100644 --- a/cpp/src/arrow/util/bitmap_reader.h +++ b/cpp/src/arrow/util/bitmap_reader.h @@ -31,6 +31,8 @@ namespace internal { class BitmapReader { public: + BitmapReader() = default; + BitmapReader(const uint8_t* bitmap, int64_t start_offset, int64_t length) : bitmap_(bitmap), position_(0), length_(length) { current_byte_ = 0; diff --git a/cpp/src/arrow/util/rle_encoding_internal.h b/cpp/src/arrow/util/rle_encoding_internal.h index 6dfd819e629e..146fa0da13cb 100644 --- a/cpp/src/arrow/util/rle_encoding_internal.h +++ b/cpp/src/arrow/util/rle_encoding_internal.h @@ -305,30 +305,32 @@ class RleBitPackedDecoder { public: /// The type in which the data should be decoded. using value_type = T; + using byte = RleBitPackedParser::byte; + using raw_data_const_pointer = RleBitPackedParser::raw_data_const_pointer; + using raw_data_size_type = RleBitPackedParser::raw_data_size_type; + using bit_size_type = RleBitPackedParser::bit_size_type; + using dynamic_run_type = RleBitPackedParser::dynamic_run_type; + /// The type of the size of either run, between 1 and 2^31-1 as per Parquet spec + using values_count_type = int32_t; - /// Create a decoder object. buffer/buffer_len is the decoded data. - /// bit_width is the width of each value (before encoding). - RleBitPackedDecoder(const uint8_t* buffer, int buffer_len, int bit_width) - : bit_reader_(buffer, buffer_len), - bit_width_(bit_width), - current_value_(0), - repeat_count_(0), - literal_count_(0) { - ARROW_DCHECK_GE(bit_width_, 0); - ARROW_DCHECK_LE(bit_width_, 64); - } + RleBitPackedDecoder() noexcept = default; + + /// Create a decoder object. + /// + /// data and data_size are the raw bytes to decode. + /// value_bit_width is the size in bits of each encoded value. 
+ RleBitPackedDecoder(raw_data_const_pointer data, raw_data_size_type data_size, + bit_size_type value_bit_width) noexcept; - RleBitPackedDecoder() : bit_width_(-1) {} + void Reset(raw_data_const_pointer data, raw_data_size_type data_size, + bit_size_type value_bit_width_) noexcept; - void Reset(const uint8_t* buffer, int buffer_len, int bit_width) { - ARROW_DCHECK_GE(bit_width, 0); - ARROW_DCHECK_LE(bit_width, 64); - bit_reader_.Reset(buffer, buffer_len); - bit_width_ = bit_width; - current_value_ = 0; - repeat_count_ = 0; - literal_count_ = 0; - } + /// Whether there is still runs to iterate over. + /// + /// WARN: Due to lack of proper error handling, iteration with Get methods could return + /// no data while the parser is not exhausted. + /// This is how one can check for errors. + [[nodiscard]] bool Exhausted() const; /// Gets the next value. Returns false if there are no more. /// @@ -337,46 +339,65 @@ class RleBitPackedDecoder { /// input with zeros. Since the encoding does not differentiate between /// input values and padding, Get() returns true even for these padding /// values. - bool Get(value_type* val); + [[nodiscard]] bool Get(value_type* val); - /// Gets a batch of values. Returns the number of decoded elements. - int GetBatch(value_type* values, int batch_size); + /// Get a batch of values return the number of decoded elements. 
+ [[nodiscard]] values_count_type GetBatch(value_type* out, values_count_type batch_size); /// Like GetBatch but add spacing for null entries - int GetBatchSpaced(int batch_size, int null_count, const uint8_t* valid_bits, - int64_t valid_bits_offset, value_type* out); + [[nodiscard]] values_count_type GetBatchSpaced(values_count_type batch_size, + values_count_type null_count, + const byte* valid_bits, + int64_t valid_bits_offset, + value_type* out); /// Like GetBatch but the values are then decoded using the provided dictionary template - int GetBatchWithDict(const V* dictionary, int32_t dictionary_length, V* values, - int batch_size); + [[nodiscard]] values_count_type GetBatchWithDict(const V* dictionary, + int32_t dictionary_length, V* out, + values_count_type batch_size); /// Like GetBatchWithDict but add spacing for null entries /// /// Null entries will be zero-initialized in `values` to avoid leaking /// private data. template - int GetBatchWithDictSpaced(const V* dictionary, int32_t dictionary_length, V* values, - int batch_size, int null_count, const uint8_t* valid_bits, - int64_t valid_bits_offset); - - protected: - ::arrow::bit_util::BitReader bit_reader_; - /// Number of bits needed to encode the value. Must be between 0 and 64. - int bit_width_; - uint64_t current_value_; - int32_t repeat_count_; - int32_t literal_count_; + [[nodiscard]] values_count_type GetBatchWithDictSpaced( + const V* dictionary, int32_t dictionary_length, V* out, + values_count_type batch_size, values_count_type null_count, const byte* valid_bits, + int64_t valid_bits_offset); private: - /// Fills literal_count_ and repeat_count_ with next values. Returns false if there - /// are no more. - bool NextCounts(); + RleBitPackedParser parser_ = {}; + std::variant, BitPackedDecoder> decoder_ = {}; + + /// Return the number of values that are remaining in the current run. 
+ [[nodiscard]] values_count_type RunRemaining() const; + + /// Get a batch of values from the current run and return the number elements read. + [[nodiscard]] values_count_type RunGetBatch(value_type* out, + values_count_type batch_size); + + /// Return the number of values that are remaining in the current run. + [[nodiscard]] bool ParseAndResetDecoder(); + + /// Utility methods for retrieving spaced values within a single run. + template + [[nodiscard]] auto GetSpacedRun(Converter& converter, typename Converter::out_type* out, + values_count_type batch_size, + values_count_type null_count, + arrow::internal::BitRunReader& validity_reader, + arrow::internal::BitRun& validity_run) + -> std::pair; /// Utility methods for retrieving spaced values. - template - int GetSpaced(Converter converter, int batch_size, int null_count, - const uint8_t* valid_bits, int64_t valid_bits_offset, V* out); + template + [[nodiscard]] values_count_type GetSpaced(Converter converter, + typename Converter::out_type* out, + values_count_type batch_size, + const uint8_t* valid_bits, + int64_t valid_bits_offset, + values_count_type null_count); }; /// Class to incrementally build the rle data. This class does not allocate any memory. 
@@ -514,284 +535,541 @@ class RleBitPackedEncoder { uint8_t* literal_indicator_byte_; }; +/************************* + * RleBitPackedDecoder * + *************************/ + template -inline bool RleBitPackedDecoder::Get(value_type* val) { - return GetBatch(val, 1) == 1; +RleBitPackedDecoder::RleBitPackedDecoder(raw_data_const_pointer data, + raw_data_size_type data_size, + bit_size_type value_bit_width) noexcept { + Reset(data, data_size, value_bit_width); } template -inline int RleBitPackedDecoder::GetBatch(value_type* values, int batch_size) { - ARROW_DCHECK_GE(bit_width_, 0); - int values_read = 0; +void RleBitPackedDecoder::Reset(raw_data_const_pointer data, + raw_data_size_type data_size, + bit_size_type value_bit_width) noexcept { + ARROW_DCHECK_GE(value_bit_width, 0); + ARROW_DCHECK_LE(value_bit_width, 64); + parser_.Reset(data, data_size, value_bit_width); + decoder_ = {}; +} - auto* out = values; +template +auto RleBitPackedDecoder::RunRemaining() const -> values_count_type { + return std::visit([](auto const& dec) { return dec.Remaining(); }, decoder_); +} - while (values_read < batch_size) { - int remaining = batch_size - values_read; - - if (repeat_count_ > 0) { // Repeated value case. 
- int repeat_batch = std::min(remaining, repeat_count_); - std::fill(out, out + repeat_batch, static_cast(current_value_)); - - repeat_count_ -= repeat_batch; - values_read += repeat_batch; - out += repeat_batch; - } else if (literal_count_ > 0) { - int literal_batch = std::min(remaining, literal_count_); - int actual_read = bit_reader_.GetBatch(bit_width_, out, literal_batch); - if (actual_read != literal_batch) { - return values_read; - } +template +bool RleBitPackedDecoder::Exhausted() const { + return (RunRemaining() == 0) && parser_.Exhausted(); +} - literal_count_ -= literal_batch; - values_read += literal_batch; - out += literal_batch; - } else { - if (!NextCounts()) return values_read; +template +bool RleBitPackedDecoder::ParseAndResetDecoder() { + auto dyn_run = parser_.Next(); + if (!dyn_run.has_value()) { + return false; + } + + if (auto* rle_run = std::get_if(dyn_run.operator->())) { + decoder_ = {BitPackedDecoder(*rle_run)}; + return true; + } + + auto* bit_packed_run = std::get_if(dyn_run.operator->()); + ARROW_DCHECK(bit_packed_run); // Only two possibilities in the variant + decoder_ = {RleDecoder(*bit_packed_run)}; + return true; +} + +template +auto RleBitPackedDecoder::RunGetBatch(value_type* out, values_count_type batch_size) + -> values_count_type { + return std::visit([&](auto& dec) { return dec.GetBatch(out, batch_size); }, decoder_); +} + +template +bool RleBitPackedDecoder::Get(value_type* val) { + return GetBatch(val, 1) == 1; +} + +template +auto RleBitPackedDecoder::GetBatch(value_type* out, values_count_type batch_size) + -> values_count_type { + values_count_type values_read = 0; + + while (values_read < batch_size) { + // Try to get as much as possible from current run + if (auto const read = RunGetBatch(out, batch_size - values_read); read > 0) { + values_read += read; + out += read; + // Get the next run from the batch, it will be read in the next loop iteration + } else if (!ParseAndResetDecoder()) { + // If there are no more run 
this is the end + break; } } return values_read; } +namespace internal { + +/// Utility class to safely handle values and null count without too error-prone +/// verbosity. +class BatchCounter { + public: + using size_type = int32_t; + + [[nodiscard]] static constexpr BatchCounter FromBatchSizeAndNulls( + size_type batch_size, size_type null_count) { + ARROW_DCHECK_LE(null_count, batch_size); + return {batch_size - null_count, null_count}; + } + + constexpr BatchCounter(size_type values_count, size_type null_count) noexcept + : values_count_(values_count), null_count_(null_count) {} + + [[nodiscard]] constexpr size_type ValuesCount() const noexcept { return values_count_; } + + [[nodiscard]] constexpr size_type ValuesRead() const noexcept { return values_read_; } + + [[nodiscard]] constexpr size_type ValuesRemaining() const noexcept { + ARROW_DCHECK_LE(values_read_, values_count_); + return values_count_ - values_read_; + } + + constexpr void AccrueReadValues(size_type to_read) noexcept { + ARROW_DCHECK_LE(to_read, ValuesRemaining()); + values_read_ += to_read; + } + + [[nodiscard]] constexpr size_type NullCount() const noexcept { return null_count_; } + + [[nodiscard]] constexpr size_type NullRead() const noexcept { return null_read_; } + + [[nodiscard]] constexpr size_type NullRemaining() const noexcept { + ARROW_DCHECK_LE(null_read_, null_count_); + return null_count_ - null_read_; + } + + constexpr void AccrueReadNulls(size_type to_read) noexcept { + ARROW_DCHECK_LE(to_read, NullRemaining()); + null_read_ += to_read; + } + + [[nodiscard]] constexpr size_type TotalRemaining() const noexcept { + return ValuesRemaining() + NullRemaining(); + } + + [[nodiscard]] constexpr size_type TotalRead() const noexcept { + return values_read_ + null_read_; + } + + [[nodiscard]] constexpr bool IsFullyNull() const noexcept { + return ValuesRemaining() == 0; + } + + [[nodiscard]] constexpr bool IsDone() const noexcept { return TotalRemaining() == 0; } + + private: + size_type 
values_count_ = 0; + size_type values_read_ = 0; + size_type null_count_ = 0; + size_type null_read_ = 0; +}; + +// The maximal unsigned size that a variable can fit. template -template -inline int RleBitPackedDecoder::GetSpaced(Converter converter, int batch_size, - int null_count, const uint8_t* valid_bits, - int64_t valid_bits_offset, V* out) { - if (ARROW_PREDICT_FALSE(null_count == batch_size)) { - converter.FillZero(out, out + batch_size); - return batch_size; +constexpr auto max_size_for_v = + static_cast>(std::numeric_limits::max()); + +template +auto GetSpacedRle(Converter& converter, typename Converter::out_type* out, + values_count_type batch_size, values_count_type null_count, + arrow::internal::BitRunReader& validity_reader, + arrow::internal::BitRun& validity_run, RleDecoder& decoder) + -> std::pair { + ARROW_DCHECK_GT(batch_size, 0); + // The equality case is handled in the main loop in GetSpaced + ARROW_DCHECK_LT(null_count, batch_size); + + auto batch = BatchCounter::FromBatchSizeAndNulls(batch_size, null_count); + + values_count_type const values_available = decoder.Remaining(); + ARROW_DCHECK_GT(values_available, 0); + auto values_remaining_run = [&]() { + auto out = values_available - batch.ValuesRead(); + ARROW_DCHECK_GE(out, 0); + return out; + }; + + // Consume as much as possible from the repeated run. + // We only need to count the number of nulls and non-nulls because we can fill in the + // same value for nulls and non-nulls. + // This proves to be a big efficiency win. 
+ while (values_remaining_run() > 0 && !batch.IsDone()) { + ARROW_DCHECK_GE(validity_run.length, 0); + ARROW_DCHECK_LT(validity_run.length, max_size_for_v); + ARROW_DCHECK_LE(validity_run.length, batch.TotalRemaining()); + auto const& validity_run_size = static_cast(validity_run.length); + + if (validity_run.set) { + // We may end the current RLE run in the middle of the validity run + auto update_size = std::min(validity_run_size, values_remaining_run()); + batch.AccrueReadValues(update_size); + validity_run.length -= update_size; + } else { + // We can consume all nulls here because it does not matter if we consume on this + // RLE run, or an a next encoded run. The value filled does not matter. + auto update_size = std::min(validity_run_size, batch.NullRemaining()); + batch.AccrueReadNulls(update_size); + validity_run.length -= update_size; + } + + if (validity_run.length == 0) { + validity_run = validity_reader.NextRun(); + } } - ARROW_DCHECK_GE(bit_width_, 0); - int values_read = 0; - int values_remaining = batch_size - null_count; + value_type const value = decoder.Value(); + if (ARROW_PREDICT_FALSE(!converter.InputIsValid(value))) { + return {batch.ValuesRead(), batch.NullRead()}; + } + converter.WriteRepeated(out, out + batch.TotalRead(), value); + auto const actual_values_read = decoder.Advance(batch.ValuesRead()); + // We always cropped the number of values_read by the remaining values in the run. + // What's more the RLE decoder should not encounter any errors. + ARROW_DCHECK_EQ(actual_values_read, batch.ValuesRead()); - // Assume no bits to start. - arrow::internal::BitRunReader bit_reader(valid_bits, valid_bits_offset, - /*length=*/batch_size); - arrow::internal::BitRun valid_run = bit_reader.NextRun(); - while (values_read < batch_size) { - if (ARROW_PREDICT_FALSE(valid_run.length == 0)) { - valid_run = bit_reader.NextRun(); + return {batch.ValuesRead(), batch.NullRead()}; +} + +template +[[nodiscard]] constexpr T min(T x, Ts... 
ys) { + ((x = std::min(x, ys)), ...); + return x; +} + +static_assert(min(5) == 5); +static_assert(min(5, 4, -1) == -1); +static_assert(min(5, 41) == 5); + +template +auto GetSpacedBitPacked(Converter& converter, typename Converter::out_type* out, + values_count_type batch_size, values_count_type null_count, + arrow::internal::BitRunReader& validity_reader, + arrow::internal::BitRun& validity_run, + BitPackedDecoder& decoder) + -> std::pair { + ARROW_DCHECK_GT(batch_size, 0); + // The equality case is handled in the main loop in GetSpaced + ARROW_DCHECK_LT(null_count, batch_size); + + auto batch = BatchCounter::FromBatchSizeAndNulls(batch_size, null_count); + + values_count_type const values_available = decoder.Remaining(); + ARROW_DCHECK_GT(values_available, 0); + auto run_values_remaining = [&]() { + auto out = values_available - batch.ValuesRead(); + ARROW_DCHECK_GE(out, 0); + return out; + }; + + while (run_values_remaining() > 0 && batch.ValuesRemaining() > 0) { + // TODO should this size be tune depending on sizeof(value_size)? cpu cache size? + // Pull a batch of values from the bit packed encoded data and store it in a local + // buffer to benefit from unpacking intrinsics and data locality. 
+ static constexpr values_count_type kBufferCapacity = 1024; + std::array buffer = {}; + + values_count_type buffer_start = 0; + values_count_type buffer_end = 0; + auto buffer_size = [&]() { + auto out = buffer_end - buffer_start; + ARROW_DCHECK_GE(out, 0); + return out; + }; + + // buffer_start is 0 at this point so size is end + buffer_end = min(run_values_remaining(), batch.ValuesRemaining(), kBufferCapacity); + buffer_end = decoder.GetBatch(buffer.data(), buffer_size()); + ARROW_DCHECK_LE(buffer_size(), kBufferCapacity); + + if (ARROW_PREDICT_FALSE(!converter.InputIsValid(buffer.data(), buffer_size()))) { + return {batch.ValuesRead(), batch.NullRead()}; } - ARROW_DCHECK_GT(batch_size, 0); - ARROW_DCHECK_GT(valid_run.length, 0); + // Copy chunks of valid values into the output, while adjusting spacing for null + // values. + while (buffer_size() > 0) { + ARROW_DCHECK_GE(validity_run.length, 0); + ARROW_DCHECK_LT(validity_run.length, max_size_for_v); + ARROW_DCHECK_LE(validity_run.length, batch.TotalRemaining()); + auto const validity_run_length = + static_cast(validity_run.length); + + // Copy as much as possible from the buffer into the output while not exceeding + // validity run + if (validity_run.set) { + auto const update_size = std::min(validity_run_length, buffer_size()); + converter.WriteRange(out, buffer.data() + buffer_start, update_size); + buffer_start += update_size; + batch.AccrueReadValues(update_size); + out += update_size; + validity_run.length -= update_size; + // Simply write zeros in the output + } else { + auto const update_size = std::min(validity_run_length, batch.NullRemaining()); + converter.WriteZero(out, out + update_size); + batch.AccrueReadNulls(update_size); + out += update_size; + validity_run.length -= update_size; + } - if (valid_run.set) { - if ((repeat_count_ == 0) && (literal_count_ == 0)) { - if (!NextCounts()) return values_read; - ARROW_DCHECK((repeat_count_ > 0) ^ (literal_count_ > 0)); + if (validity_run.length == 0) { + 
validity_run = validity_reader.NextRun(); } + } + + ARROW_DCHECK_EQ(buffer_size(), 0); + } + + ARROW_DCHECK_EQ(values_available - decoder.Remaining(), batch.ValuesRead()); + ARROW_DCHECK_LE(batch.TotalRead(), batch_size); + ARROW_DCHECK_LE(batch.NullRead(), batch.NullCount()); + + return {batch.ValuesRead(), batch.NullRead()}; +} +} // namespace internal + +template +template +[[nodiscard]] auto RleBitPackedDecoder::GetSpacedRun( + Converter& converter, typename Converter::out_type* out, values_count_type batch_size, + values_count_type null_count, arrow::internal::BitRunReader& validity_reader, + arrow::internal::BitRun& validity_run) + -> std::pair { + if (auto* rle_decoder = std::get_if>(&decoder_)) { + if (rle_decoder->Remaining() > 0) { + return internal::GetSpacedRle(converter, out, batch_size, null_count, + validity_reader, validity_run, *rle_decoder); + } + } else { + auto* bit_packed_decoder = std::get_if>(&decoder_); + ARROW_DCHECK(bit_packed_decoder); // Only two possibilities in the variant + if (bit_packed_decoder->Remaining() > 0) { + return internal::GetSpacedBitPacked(converter, out, batch_size, null_count, + validity_reader, validity_run, + *bit_packed_decoder); + } + } + + return {0, 0}; +} - if (repeat_count_ > 0) { - int repeat_batch = 0; - // Consume the entire repeat counts incrementing repeat_batch to - // be the total of nulls + values consumed, we only need to - // get the total count because we can fill in the same value for - // nulls and non-nulls. This proves to be a big efficiency win. - while (repeat_count_ > 0 && (values_read + repeat_batch) < batch_size) { - ARROW_DCHECK_GT(valid_run.length, 0); - if (valid_run.set) { - int update_size = std::min(static_cast(valid_run.length), repeat_count_); - repeat_count_ -= update_size; - repeat_batch += update_size; - valid_run.length -= update_size; - values_remaining -= update_size; - } else { - // We can consume all nulls here because we would do so on - // the next loop anyways. 
- repeat_batch += static_cast(valid_run.length); - valid_run.length = 0; - } - if (valid_run.length == 0) { - valid_run = bit_reader.NextRun(); - } - } - value_type current_value = static_cast(current_value_); - if (ARROW_PREDICT_FALSE(!converter.IsValid(current_value))) { - return values_read; - } - converter.Fill(out, out + repeat_batch, current_value); - out += repeat_batch; - values_read += repeat_batch; - } else if (literal_count_ > 0) { - int literal_batch = std::min(values_remaining, literal_count_); - ARROW_DCHECK_GT(literal_batch, 0); - - // Decode the literals - constexpr int kBufferSize = 1024; - value_type indices[kBufferSize]; - literal_batch = std::min(literal_batch, kBufferSize); - int actual_read = bit_reader_.GetBatch(bit_width_, indices, literal_batch); - if (ARROW_PREDICT_FALSE(actual_read != literal_batch)) { - return values_read; - } - if (!converter.IsValid(indices, /*length=*/actual_read)) { - return values_read; - } - int skipped = 0; - int literals_read = 0; - while (literals_read < literal_batch) { - if (valid_run.set) { - int update_size = std::min(literal_batch - literals_read, - static_cast(valid_run.length)); - converter.Copy(out, indices + literals_read, update_size); - literals_read += update_size; - out += update_size; - valid_run.length -= update_size; - } else { - converter.FillZero(out, out + valid_run.length); - out += valid_run.length; - skipped += static_cast(valid_run.length); - valid_run.length = 0; - } - if (valid_run.length == 0) { - valid_run = bit_reader.NextRun(); - } - } - literal_count_ -= literal_batch; - values_remaining -= literal_batch; - values_read += literal_batch + skipped; +template +template +auto RleBitPackedDecoder::GetSpaced( + Converter converter, typename Converter::out_type* out, values_count_type batch_size, + const byte* validity_bits, int64_t validity_bits_offset, values_count_type null_count) + -> values_count_type { + ARROW_DCHECK_GT(batch_size, 0); + + auto batch = 
internal::BatchCounter::FromBatchSizeAndNulls(batch_size, null_count); + + if (ARROW_PREDICT_FALSE(batch.IsFullyNull())) { + converter.WriteZero(out, out + batch.NullRemaining()); + return batch.NullRemaining(); + } + + arrow::internal::BitRunReader validity_reader(validity_bits, validity_bits_offset, + /*length=*/batch.TotalRemaining()); + arrow::internal::BitRun validity_run = validity_reader.NextRun(); + + while (batch.TotalRead() < batch_size) { + auto const [run_values_read, run_null_read] = + GetSpacedRun(converter, out, batch.TotalRemaining(), batch.NullRemaining(), + validity_reader, validity_run); + + batch.AccrueReadNulls(run_null_read); + batch.AccrueReadValues(run_values_read); + auto const run_total_read = run_values_read + run_null_read; + out += run_total_read; + + // There may be remaining null if they are not greedily filled by either decoder calls + if (ARROW_PREDICT_FALSE(batch.IsFullyNull())) { + ARROW_DCHECK(validity_run.length == 0 || !validity_run.set); + ARROW_DCHECK_GE(validity_run.length, batch.NullRemaining()); + converter.WriteZero(out, out + batch.NullRemaining()); + + // Not necessary since the loop is over but good for sanity check + out += batch.NullRemaining(); + batch.AccrueReadNulls(batch.NullRemaining()); + + break; + } + + // Get the next run from the batch, it will be read in the next loop iteration. + // There could be no values read if it is only null remaining in the batch. 
+ if (RunRemaining() == 0) { + auto success = ParseAndResetDecoder(); + if (!success) { + // If there are no more run this is the end + break; } - } else { - converter.FillZero(out, out + valid_run.length); - out += valid_run.length; - values_read += static_cast(valid_run.length); - valid_run.length = 0; } } - ARROW_DCHECK_EQ(valid_run.length, 0); - ARROW_DCHECK_EQ(values_remaining, 0); - return values_read; + + ARROW_DCHECK(batch.IsDone() || Exhausted()); + // batch.Done() => batch.NullRemaining() == 0 + ARROW_DCHECK(!batch.IsDone() || (batch.NullRemaining() == 0)); + return batch.TotalRead(); } +namespace internal { + // Converter for GetSpaced that handles runs that get returned // directly as output. template -struct PlainRleConverter { - T kZero = {}; - inline bool IsValid(const T& values) const { return true; } - inline bool IsValid(const T* values, int32_t length) const { return true; } - inline void Fill(T* begin, T* end, const T& run_value) const { +struct NoOpConverter { + using in_type = T; + using out_type = T; + using size_type = int32_t; + + [[nodiscard]] static constexpr bool InputIsValid(const in_type& values) { return true; } + + [[nodiscard]] static constexpr bool InputIsValid(const in_type* values, + size_type length) { + return true; + } + + static void WriteRepeated(out_type* begin, out_type* end, in_type run_value) { std::fill(begin, end, run_value); } - inline void FillZero(T* begin, T* end) { std::fill(begin, end, kZero); } - inline void Copy(T* out, const T* values, int length) const { - std::memcpy(out, values, length * sizeof(T)); + + static void WriteZero(out_type* begin, out_type* end) { + std::fill(begin, end, out_type{}); + } + + static void WriteRange(out_type* out, const in_type* values, size_type length) { + std::memcpy(out, values, length * sizeof(out_type)); } }; +} // namespace internal + template -inline int RleBitPackedDecoder::GetBatchSpaced(int batch_size, int null_count, - const uint8_t* valid_bits, - int64_t 
valid_bits_offset, - value_type* out) { +auto RleBitPackedDecoder::GetBatchSpaced(values_count_type batch_size, + values_count_type null_count, + const byte* valid_bits, + int64_t valid_bits_offset, value_type* out) + -> values_count_type { if (null_count == 0) { return GetBatch(out, batch_size); } - PlainRleConverter converter; + internal::NoOpConverter converter; - return GetSpaced(converter, batch_size, null_count, valid_bits, valid_bits_offset, out); + return GetSpaced(converter, out, batch_size, valid_bits, valid_bits_offset, null_count); } -static inline bool IndexInRange(int32_t idx, int32_t dictionary_length) { - return idx >= 0 && idx < dictionary_length; +namespace internal { + +template +bool IndexInRange(I idx, int32_t dictionary_length) { + ARROW_DCHECK_GT(dictionary_length, 0); + using T = std::common_type_t; + return idx >= 0 && static_cast(idx) < static_cast(dictionary_length); } // Converter for GetSpaced that handles runs of returned dictionary // indices. -template +template struct DictionaryConverter { - T kZero = {}; - const T* dictionary; - int32_t dictionary_length; - - inline bool IsValid(int32_t value) { return IndexInRange(value, dictionary_length); } - - inline bool IsValid(const int32_t* values, int32_t length) const { - using IndexType = int32_t; - IndexType min_index = std::numeric_limits::max(); - IndexType max_index = std::numeric_limits::min(); - for (int x = 0; x < length; x++) { - min_index = std::min(values[x], min_index); - max_index = std::max(values[x], max_index); + using out_type = V; + using in_type = I; + using size_type = int32_t; + + const out_type* dictionary; + size_type dictionary_length; + + [[nodiscard]] bool InputIsValid(in_type idx) const { + return IndexInRange(idx, dictionary_length); + } + + [[nodiscard]] bool InputIsValid(const in_type* indices, size_type length) const { + in_type min_index = std::numeric_limits::max(); + in_type max_index = std::numeric_limits::min(); + for (size_type x = 0; x < length; x++) 
{ + min_index = std::min(indices[x], min_index); + max_index = std::max(indices[x], max_index); } return IndexInRange(min_index, dictionary_length) && IndexInRange(max_index, dictionary_length); } - inline void Fill(T* begin, T* end, const int32_t& run_value) const { + + void WriteRepeated(out_type* begin, out_type* end, in_type run_value) const { std::fill(begin, end, dictionary[run_value]); } - inline void FillZero(T* begin, T* end) { std::fill(begin, end, kZero); } - inline void Copy(T* out, const int32_t* values, int length) const { - for (int x = 0; x < length; x++) { + static void WriteZero(out_type* begin, out_type* end) { + std::fill(begin, end, out_type{}); + } + + void WriteRange(out_type* out, const in_type* values, size_type length) const { + for (size_type x = 0; x < length; x++) { out[x] = dictionary[values[x]]; } } }; +} // namespace internal + template template -inline int RleBitPackedDecoder::GetBatchWithDict(const V* dictionary, - int32_t dictionary_length, V* values, - int batch_size) { - // Per https://github.com/apache/parquet-format/blob/master/Encodings.md, - // the maximum dictionary index width in Parquet is 32 bits. - using IndexType = value_type; - DictionaryConverter converter; - converter.dictionary = dictionary; - converter.dictionary_length = dictionary_length; - - ARROW_DCHECK_GE(bit_width_, 0); - int values_read = 0; +auto RleBitPackedDecoder::GetBatchWithDict(const V* dictionary, + int32_t dictionary_length, V* out, + values_count_type batch_size) + -> values_count_type { + if (ARROW_PREDICT_FALSE(batch_size <= 0)) { + return 0; + } - auto* out = values; + internal::DictionaryConverter converter{dictionary, dictionary_length}; - while (values_read < batch_size) { - int remaining = batch_size - values_read; + // BitRun is a lightweight class, we set it to a full run without nulls. + // In this way, the BitRunReader will never be called, and this code should not suffer + // from calling a method intended for spaced output. 
+ arrow::internal::BitRunReader validity_reader{}; // Dummy, must not be used + arrow::internal::BitRun validity_run = {batch_size, /* set=*/true}; - if (repeat_count_ > 0) { - auto idx = static_cast(current_value_); - if (ARROW_PREDICT_FALSE(!IndexInRange(idx, dictionary_length))) { - return values_read; - } - V val = dictionary[idx]; + values_count_type values_read = 0; + auto batch_values_remaining = [&]() { + ARROW_DCHECK_LE(values_read, batch_size); + return batch_size - values_read; + }; - int repeat_batch = std::min(remaining, repeat_count_); - std::fill(out, out + repeat_batch, val); + while (values_read < batch_size) { + auto const [run_values_read, run_null_read] = + GetSpacedRun(converter, out, batch_values_remaining(), /* null_count= */ 0, + validity_reader, validity_run); - /* Upkeep counters */ - repeat_count_ -= repeat_batch; - values_read += repeat_batch; - out += repeat_batch; - } else if (literal_count_ > 0) { - constexpr int kBufferSize = 1024; - IndexType indices[kBufferSize]; + ARROW_DCHECK_EQ(run_null_read, 0); - int literal_batch = std::min(remaining, literal_count_); - literal_batch = std::min(literal_batch, kBufferSize); + values_read += run_values_read; + out += run_values_read; - int actual_read = bit_reader_.GetBatch(bit_width_, indices, literal_batch); - if (ARROW_PREDICT_FALSE(actual_read != literal_batch)) { - return values_read; + // Get the next run from the batch, it will be read in the next loop iteration + if (ARROW_PREDICT_TRUE(RunRemaining() == 0)) { + auto success = ParseAndResetDecoder(); + if (ARROW_PREDICT_FALSE(!success)) { + // If there are no more run this is the end + break; } - if (ARROW_PREDICT_FALSE(!converter.IsValid(indices, /*length=*/literal_batch))) { - return values_read; - } - converter.Copy(out, indices, literal_batch); - - /* Upkeep counters */ - literal_count_ -= literal_batch; - values_read += literal_batch; - out += literal_batch; - } else { - if (!NextCounts()) return values_read; } } @@ -800,47 +1078,16 
@@ inline int RleBitPackedDecoder::GetBatchWithDict(const V* dictionary, template template -inline int RleBitPackedDecoder::GetBatchWithDictSpaced( - const V* dictionary, int32_t dictionary_length, V* out, int batch_size, - int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset) { +auto RleBitPackedDecoder::GetBatchWithDictSpaced( + const V* dictionary, int32_t dictionary_length, V* out, values_count_type batch_size, + values_count_type null_count, const uint8_t* valid_bits, int64_t valid_bits_offset) + -> values_count_type { if (null_count == 0) { return GetBatchWithDict(dictionary, dictionary_length, out, batch_size); } - DictionaryConverter converter; - converter.dictionary = dictionary; - converter.dictionary_length = dictionary_length; + internal::DictionaryConverter converter{dictionary, dictionary_length}; - return GetSpaced(converter, batch_size, null_count, valid_bits, valid_bits_offset, out); -} - -template -bool RleBitPackedDecoder::NextCounts() { - // Read the next run's indicator int, it could be a literal or repeated run. - // The int is encoded as a vlq-encoded value. 
- uint32_t indicator_value = 0; - if (!bit_reader_.GetVlqInt(&indicator_value)) return false; - - // lsb indicates if it is a literal run or repeated run - bool is_literal = indicator_value & 1; - uint32_t count = indicator_value >> 1; - if (is_literal) { - if (ARROW_PREDICT_FALSE(count == 0 || count > static_cast(INT32_MAX) / 8)) { - return false; - } - literal_count_ = count * 8; - } else { - if (ARROW_PREDICT_FALSE(count == 0 || count > static_cast(INT32_MAX))) { - return false; - } - repeat_count_ = count; - T value = {}; - if (!bit_reader_.GetAligned( - static_cast(::arrow::bit_util::CeilDiv(bit_width_, 8)), &value)) { - return false; - } - current_value_ = static_cast(value); - } - return true; + return GetSpaced(converter, out, batch_size, valid_bits, valid_bits_offset, null_count); } /************ @@ -938,14 +1185,6 @@ inline bool RleBitPackedParser::Advance() { return Next().has_value(); } inline bool RleBitPackedParser::Exhausted() const { return data_size_ == 0; } -namespace internal { -// The maximal unsigned size that a variable can fit. -template -constexpr auto max_size_for_v = - static_cast>(std::numeric_limits::max()); - -} // namespace internal - inline auto RleBitPackedParser::PeekCount() const -> std::pair, raw_data_size_type> { if (ARROW_PREDICT_FALSE(Exhausted())) { @@ -1120,9 +1359,9 @@ auto BitPackedDecoder::GetBatch(value_type* out, values_count_type batch_size return actual_read; } -/**************** - * RleEncoder * - ****************/ +/************************* + * RleBitPackedEncoder * + *************************/ /// This function buffers input values 8 at a time. After seeing all 8 values, /// it decides whether they should be encoded as a literal or repeated run. 
From fa829bc856fd3994ecf3945109db12a9a315a99b Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Tue, 19 Aug 2025 09:40:06 +0200 Subject: [PATCH 19/56] Dict RleEncoding tests --- cpp/src/arrow/util/rle_encoding_test.cc | 79 +++++++++++++++++++------ 1 file changed, 60 insertions(+), 19 deletions(-) diff --git a/cpp/src/arrow/util/rle_encoding_test.cc b/cpp/src/arrow/util/rle_encoding_test.cc index 7fc568426854..8a7544269b29 100644 --- a/cpp/src/arrow/util/rle_encoding_test.cc +++ b/cpp/src/arrow/util/rle_encoding_test.cc @@ -860,9 +860,10 @@ TEST(BitRle, Overflow) { /// For number greater than one, this ensure that the decoder intermediary state /// is valid. template -void CheckRoundTrip(const Array& data, int bit_width) { +void CheckRoundTrip(const Array& data, int bit_width, + std::shared_ptr dict = {}) { using ArrayType = typename TypeTraits::ArrayType; - using T = typename Type::c_type; + using value_type = typename Type::c_type; int const data_size = static_cast(data.length()); int const data_values_count = @@ -871,7 +872,7 @@ void CheckRoundTrip(const Array& data, int bit_width) { ASSERT_GE(kParts, 1); ASSERT_LE(kParts, data_size); - const T* data_values = static_cast(data).raw_values(); + const value_type* data_values = static_cast(data).raw_values(); // Encode the data into ``buffer`` using the encoder. 
std::vector buffer(buffer_size); @@ -890,8 +891,15 @@ void CheckRoundTrip(const Array& data, int bit_width) { << "All values input were not encoded successfully by the encoder"; // On to verify batch read - RleBitPackedDecoder decoder(buffer.data(), encoded_byte_size, bit_width); - std::vector values_read(data_size); + RleBitPackedDecoder decoder(buffer.data(), encoded_byte_size, bit_width); + // We will only use one of them depending on whether this is a dictonnary tests + std::vector dict_read; + std::vector values_read; + if (dict) { + dict_read.resize(data_size); + } else { + values_read.resize(data_size); + } // We will read the data in kParts calls to make sure intermediate states are valid int32_t actual_read_count = 0; @@ -904,16 +912,30 @@ void CheckRoundTrip(const Array& data, int bit_width) { } auto* out = values_read.data() + requested_read_count; + auto* dict_out = dict_read.data() + requested_read_count; auto read = 0; if constexpr (kSpaced) { // We need to slice the input array get the proper null count and bitmap auto data_remaining = data.Slice(requested_read_count, to_read); - read = decoder.GetBatchSpaced( - to_read, static_cast(data_remaining->null_count()), - data_remaining->null_bitmap_data(), data_remaining->offset(), out); + + if (dict) { + read = decoder.GetBatchWithDictSpaced( + dict->raw_values(), static_cast(dict->length()), dict_out, to_read, + static_cast(data_remaining->null_count()), + data_remaining->null_bitmap_data(), data_remaining->offset()); + } else { + read = decoder.GetBatchSpaced( + to_read, static_cast(data_remaining->null_count()), + data_remaining->null_bitmap_data(), data_remaining->offset(), out); + } } else { - read = decoder.GetBatch(out, to_read); + if (dict) { + read = decoder.GetBatchWithDict( + dict->raw_values(), static_cast(dict->length()), dict_out, to_read); + } else { + read = decoder.GetBatch(out, to_read); + } } ASSERT_EQ(read, to_read) << "Decoder did not read as many values as requested"; @@ -926,9 
+948,16 @@ void CheckRoundTrip(const Array& data, int bit_width) { // Verify the round trip: encoded-decoded values must equal the original one for (int64_t i = 0; i < data_size; ++i) { if (data.IsValid(i) || !kSpaced) { - EXPECT_EQ(values_read[i], data_values[i]) - << "Encoded then decoded value at position " << i << " (" << values_read[i] - << ") differs from original value (" << data_values[i] << ")"; + if (dict) { + EXPECT_EQ(dict_read.at(i), dict->Value(data_values[i])) + << "Encoded then decoded and mapped value at position " << i << " (" + << values_read[i] << ") differs from original value (" << data_values[i] + << " mapped to " << dict->Value(data_values[i]) << ")"; + } else { + EXPECT_EQ(values_read.at(i), data_values[i]) + << "Encoded then decoded value at position " << i << " (" << values_read.at(i) + << ") differs from original value (" << data_values[i] << ")"; + } } } } @@ -968,10 +997,8 @@ struct DataTestRleBitPacked { std::vector> parts; int32_t bit_width; - std::shared_ptr<::arrow::Array> MakeArray() const { - uint32_t kSeed = 1337; - ::arrow::random::RandomArrayGenerator rand(kSeed); - + std::shared_ptr<::arrow::Array> MakeArray( + ::arrow::random::RandomArrayGenerator& rand) const { std::vector> arrays = {}; for (auto const& dyn_part : parts) { @@ -1029,10 +1056,10 @@ void DoTestGetBatchSpacedRoundtrip() { { { NullPart{/* size= */ 80}, - RandomPart{/* max=*/7, /* size=*/800, /* null_proba= */ 0.01}, + RandomPart{/* max=*/1023, /* size=*/800, /* null_proba= */ 0.01}, NullPart{/* size= */ 1023}, }, - /* bit_width= */ 3, + /* bit_width= */ 11, }, { {RepeatPart{/* value=*/13, /* size=*/100000}}, @@ -1068,17 +1095,31 @@ void DoTestGetBatchSpacedRoundtrip() { }, }; + ::arrow::random::RandomArrayGenerator rand(/* seed= */ 12); + // FRAGILE: Large enough so that it can be indexed by any value in all cases + auto dict = std::static_pointer_cast(rand.Float32(20000, -1.0, 1.0)); + for (auto case_ : test_cases) { if (static_cast(case_.bit_width) > 
sizeof(T)) { continue; } - auto array = case_.MakeArray(); + auto array = case_.MakeArray(rand); + + // Tests for GetBatch CheckRoundTrip(*array, case_.bit_width); CheckRoundTrip(*array, case_.bit_width); + + // Tests for GetBatchSpaced CheckRoundTrip(*array, case_.bit_width); CheckRoundTrip(*array, case_.bit_width); CheckRoundTrip(*array->Slice(1), case_.bit_width); + + // Cannot test GetBatchWithDict with this method since unknown null values + + // Tests for GetBatchWithDictSpaced + CheckRoundTrip(*array, case_.bit_width, dict); + CheckRoundTrip(*array, case_.bit_width, dict); } } From ce4232d096beca50bc9f70c9294e807b3753284a Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Wed, 20 Aug 2025 18:05:07 +0200 Subject: [PATCH 20/56] Perf: Fewer variant calls --- cpp/src/arrow/util/rle_encoding_internal.h | 114 ++++++++++----------- 1 file changed, 54 insertions(+), 60 deletions(-) diff --git a/cpp/src/arrow/util/rle_encoding_internal.h b/cpp/src/arrow/util/rle_encoding_internal.h index 146fa0da13cb..ac4fb637136e 100644 --- a/cpp/src/arrow/util/rle_encoding_internal.h +++ b/cpp/src/arrow/util/rle_encoding_internal.h @@ -381,15 +381,6 @@ class RleBitPackedDecoder { /// Return the number of values that are remaining in the current run. [[nodiscard]] bool ParseAndResetDecoder(); - /// Utility methods for retrieving spaced values within a single run. - template - [[nodiscard]] auto GetSpacedRun(Converter& converter, typename Converter::out_type* out, - values_count_type batch_size, - values_count_type null_count, - arrow::internal::BitRunReader& validity_reader, - arrow::internal::BitRun& validity_run) - -> std::pair; - /// Utility methods for retrieving spaced values. 
template [[nodiscard]] values_count_type GetSpaced(Converter converter, @@ -845,31 +836,6 @@ auto GetSpacedBitPacked(Converter& converter, typename Converter::out_type* out, } } // namespace internal -template -template -[[nodiscard]] auto RleBitPackedDecoder::GetSpacedRun( - Converter& converter, typename Converter::out_type* out, values_count_type batch_size, - values_count_type null_count, arrow::internal::BitRunReader& validity_reader, - arrow::internal::BitRun& validity_run) - -> std::pair { - if (auto* rle_decoder = std::get_if>(&decoder_)) { - if (rle_decoder->Remaining() > 0) { - return internal::GetSpacedRle(converter, out, batch_size, null_count, - validity_reader, validity_run, *rle_decoder); - } - } else { - auto* bit_packed_decoder = std::get_if>(&decoder_); - ARROW_DCHECK(bit_packed_decoder); // Only two possibilities in the variant - if (bit_packed_decoder->Remaining() > 0) { - return internal::GetSpacedBitPacked(converter, out, batch_size, null_count, - validity_reader, validity_run, - *bit_packed_decoder); - } - } - - return {0, 0}; -} - template template auto RleBitPackedDecoder::GetSpaced( @@ -890,9 +856,33 @@ auto RleBitPackedDecoder::GetSpaced( arrow::internal::BitRun validity_run = validity_reader.NextRun(); while (batch.TotalRead() < batch_size) { - auto const [run_values_read, run_null_read] = - GetSpacedRun(converter, out, batch.TotalRemaining(), batch.NullRemaining(), - validity_reader, validity_run); + values_count_type run_values_read = 0; + values_count_type run_null_read = 0; + + if (auto* rle_decoder = std::get_if>(&decoder_)) { + if (rle_decoder->Remaining() > 0) { + std::tie(run_values_read, run_null_read) = internal::GetSpacedRle( + converter, out, batch.TotalRemaining(), batch.NullRemaining(), + validity_reader, validity_run, *rle_decoder); + } else if (!ParseAndResetDecoder()) { + // We try to get the next run from the batch, it will be read in the next loop. + // Otherwise there may be remaining null not greedily filled. 
+ break; + } + + } else { + auto* bit_packed_decoder = std::get_if>(&decoder_); + ARROW_DCHECK(bit_packed_decoder); // Only two possibilities in the variant + if (bit_packed_decoder->Remaining() > 0) { + std::tie(run_values_read, run_null_read) = internal::GetSpacedBitPacked( + converter, out, batch.TotalRemaining(), batch.NullRemaining(), + validity_reader, validity_run, *bit_packed_decoder); + } else if (!ParseAndResetDecoder()) { + // We try to get the next run from the batch, it will be read in the next loop. + // Otherwise there may be remaining null not greedily filled. + break; + } + } batch.AccrueReadNulls(run_null_read); batch.AccrueReadValues(run_values_read); @@ -911,16 +901,6 @@ auto RleBitPackedDecoder::GetSpaced( break; } - - // Get the next run from the batch, it will be read in the next loop iteration. - // There could be no values read if it is only null remaining in the batch. - if (RunRemaining() == 0) { - auto success = ParseAndResetDecoder(); - if (!success) { - // If there are no more run this is the end - break; - } - } } ARROW_DCHECK(batch.IsDone() || Exhausted()); @@ -1054,23 +1034,37 @@ auto RleBitPackedDecoder::GetBatchWithDict(const V* dictionary, }; while (values_read < batch_size) { - auto const [run_values_read, run_null_read] = - GetSpacedRun(converter, out, batch_values_remaining(), /* null_count= */ 0, - validity_reader, validity_run); - - ARROW_DCHECK_EQ(run_null_read, 0); - - values_read += run_values_read; - out += run_values_read; + values_count_type run_values_read = 0; + values_count_type run_null_read = 0; + + if (auto* rle_decoder = std::get_if>(&decoder_)) { + if (rle_decoder->Remaining() > 0) { + std::tie(run_values_read, run_null_read) = internal::GetSpacedRle( + converter, out, batch_values_remaining(), /* null_count= */ 0, + validity_reader, validity_run, *rle_decoder); + } else if (!ParseAndResetDecoder()) { + // We try to get the next run from the batch, it will be read in the next loop. 
+ // Otherwise there may be remaining null not greedily filled. + break; + } - // Get the next run from the batch, it will be read in the next loop iteration - if (ARROW_PREDICT_TRUE(RunRemaining() == 0)) { - auto success = ParseAndResetDecoder(); - if (ARROW_PREDICT_FALSE(!success)) { - // If there are no more run this is the end + } else { + auto* bit_packed_decoder = std::get_if>(&decoder_); + ARROW_DCHECK(bit_packed_decoder); // Only two possibilities in the variant + if (bit_packed_decoder->Remaining() > 0) { + std::tie(run_values_read, run_null_read) = internal::GetSpacedBitPacked( + converter, out, batch_values_remaining(), /* null_count= */ 0, + validity_reader, validity_run, *bit_packed_decoder); + } else if (!ParseAndResetDecoder()) { + // We try to get the next run from the batch, it will be read in the next loop. + // Otherwise there may be remaining null not greedily filled. break; } } + + ARROW_DCHECK_EQ(run_null_read, 0); + values_read += run_values_read; + out += run_values_read; } return values_read; From 585c3a7a6109be2dc5c7470072f08a2d17b62af4 Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Thu, 21 Aug 2025 12:21:07 +0200 Subject: [PATCH 21/56] Perf: template bit run readers --- cpp/src/arrow/util/rle_encoding_internal.h | 32 ++++++++++++++-------- 1 file changed, 21 insertions(+), 11 deletions(-) diff --git a/cpp/src/arrow/util/rle_encoding_internal.h b/cpp/src/arrow/util/rle_encoding_internal.h index ac4fb637136e..dea1539b6a4b 100644 --- a/cpp/src/arrow/util/rle_encoding_internal.h +++ b/cpp/src/arrow/util/rle_encoding_internal.h @@ -677,11 +677,12 @@ template constexpr auto max_size_for_v = static_cast>(std::numeric_limits::max()); -template +template auto GetSpacedRle(Converter& converter, typename Converter::out_type* out, values_count_type batch_size, values_count_type null_count, - arrow::internal::BitRunReader& validity_reader, - arrow::internal::BitRun& validity_run, RleDecoder& decoder) + BitRunReader&& validity_reader, BitRun&& 
validity_run, + RleDecoder& decoder) -> std::pair { ARROW_DCHECK_GT(batch_size, 0); // The equality case is handled in the main loop in GetSpaced @@ -748,11 +749,11 @@ static_assert(min(5) == 5); static_assert(min(5, 4, -1) == -1); static_assert(min(5, 41) == 5); -template +template auto GetSpacedBitPacked(Converter& converter, typename Converter::out_type* out, values_count_type batch_size, values_count_type null_count, - arrow::internal::BitRunReader& validity_reader, - arrow::internal::BitRun& validity_run, + BitRunReader&& validity_reader, BitRun&& validity_run, BitPackedDecoder& decoder) -> std::pair { ARROW_DCHECK_GT(batch_size, 0); @@ -1007,6 +1008,17 @@ struct DictionaryConverter { } }; +/// Dummy imitation of BitRun that is all set. +struct AllSetBitRun { + static constexpr bool set = true; + int64_t length = 0; +}; + +/// Dummy imitation of BitRunReader that should never be called. +struct UnreachableBitRunReader { + constexpr static AllSetBitRun NextRun() { return {}; } +}; + } // namespace internal template @@ -1021,11 +1033,9 @@ auto RleBitPackedDecoder::GetBatchWithDict(const V* dictionary, internal::DictionaryConverter converter{dictionary, dictionary_length}; - // BitRun is a lightweight class, we set it to a full run without nulls. - // In this way, the BitRunReader will never be called, and this code should not suffer - // from calling a method intended for spaced output. - arrow::internal::BitRunReader validity_reader{}; // Dummy, must not be used - arrow::internal::BitRun validity_run = {batch_size, /* set=*/true}; + // Make lightweight BitRun class to reuse previous methods. 
+ constexpr internal::UnreachableBitRunReader validity_reader{}; + internal::AllSetBitRun validity_run = {batch_size}; values_count_type values_read = 0; auto batch_values_remaining = [&]() { From 37d0c7b4c425a5950af37244c8e9dfcf218f4738 Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Thu, 21 Aug 2025 16:37:32 +0200 Subject: [PATCH 22/56] Perf: separate impl for identity converter --- cpp/src/arrow/util/rle_encoding_internal.h | 95 +++++++++++++++++++++- 1 file changed, 91 insertions(+), 4 deletions(-) diff --git a/cpp/src/arrow/util/rle_encoding_internal.h b/cpp/src/arrow/util/rle_encoding_internal.h index dea1539b6a4b..d16390ce154a 100644 --- a/cpp/src/arrow/util/rle_encoding_internal.h +++ b/cpp/src/arrow/util/rle_encoding_internal.h @@ -751,10 +751,74 @@ static_assert(min(5, 41) == 5); template -auto GetSpacedBitPacked(Converter& converter, typename Converter::out_type* out, - values_count_type batch_size, values_count_type null_count, - BitRunReader&& validity_reader, BitRun&& validity_run, - BitPackedDecoder& decoder) +auto GetSpacedBitPackedIdentity(Converter& converter, typename Converter::out_type* out, + values_count_type batch_size, + values_count_type null_count, + BitRunReader&& validity_reader, BitRun&& validity_run, + BitPackedDecoder& decoder) + -> std::pair { + ARROW_DCHECK_GT(batch_size, 0); + // The equality case is handled in the main loop in GetSpaced + ARROW_DCHECK_LT(null_count, batch_size); + + auto batch = BatchCounter::FromBatchSizeAndNulls(batch_size, null_count); + + values_count_type const values_available = decoder.Remaining(); + ARROW_DCHECK_GT(values_available, 0); + auto run_values_remaining = [&]() { + auto out = values_available - batch.ValuesRead(); + ARROW_DCHECK_GE(out, 0); + return out; + }; + + while (run_values_remaining() > 0 && batch.ValuesRemaining() > 0) { + ARROW_DCHECK_GE(validity_run.length, 0); + ARROW_DCHECK_LT(validity_run.length, max_size_for_v); + ARROW_DCHECK_LE(validity_run.length, batch.TotalRemaining()); + 
auto const validity_run_length = static_cast(validity_run.length); + + // Copy as much as possible from the buffer into the output while not exceeding + // validity run + if (validity_run.set) { + auto const requested_read = std::min(validity_run_length, run_values_remaining()); + // Since this is identity, we can write directly to the output + auto const actual_read = decoder.GetBatch(out, requested_read); + + if (ARROW_PREDICT_FALSE(!converter.InputIsValid(out, actual_read))) { + return {batch.ValuesRead(), batch.NullRead()}; + } + + batch.AccrueReadValues(actual_read); + out += actual_read; + validity_run.length -= actual_read; + + // Simply write zeros in the output + } else { + auto const update_size = std::min(validity_run_length, batch.NullRemaining()); + converter.WriteZero(out, out + update_size); + batch.AccrueReadNulls(update_size); + out += update_size; + validity_run.length -= update_size; + } + + if (validity_run.length == 0) { + validity_run = validity_reader.NextRun(); + } + } + + ARROW_DCHECK_EQ(values_available - decoder.Remaining(), batch.ValuesRead()); + ARROW_DCHECK_LE(batch.TotalRead(), batch_size); + ARROW_DCHECK_LE(batch.NullRead(), batch.NullCount()); + + return {batch.ValuesRead(), batch.NullRead()}; +} + +template +auto GetSpacedBitPackedDefault(Converter& converter, typename Converter::out_type* out, + values_count_type batch_size, values_count_type null_count, + BitRunReader&& validity_reader, BitRun&& validity_run, + BitPackedDecoder& decoder) -> std::pair { ARROW_DCHECK_GT(batch_size, 0); // The equality case is handled in the main loop in GetSpaced @@ -835,6 +899,25 @@ auto GetSpacedBitPacked(Converter& converter, typename Converter::out_type* out, return {batch.ValuesRead(), batch.NullRead()}; } + +template +auto GetSpacedBitPacked(Converter& converter, typename Converter::out_type* out, + values_count_type batch_size, values_count_type null_count, + BitRunReader&& validity_reader, BitRun&& validity_run, + BitPackedDecoder& decoder) 
+ -> std::pair { + if constexpr (Converter::kIsIdentity) { + // An optimization + return GetSpacedBitPackedIdentity(converter, out, batch_size, null_count, + std::forward(validity_reader), + std::forward(validity_run), decoder); + } else { + return GetSpacedBitPackedDefault(converter, out, batch_size, null_count, + std::forward(validity_reader), + std::forward(validity_run), decoder); + } +} } // namespace internal template @@ -920,6 +1003,8 @@ struct NoOpConverter { using out_type = T; using size_type = int32_t; + static constexpr bool kIsIdentity = true; + [[nodiscard]] static constexpr bool InputIsValid(const in_type& values) { return true; } [[nodiscard]] static constexpr bool InputIsValid(const in_type* values, @@ -974,6 +1059,8 @@ struct DictionaryConverter { using in_type = I; using size_type = int32_t; + static constexpr bool kIsIdentity = false; + const out_type* dictionary; size_type dictionary_length; From b4806b49960d77f79677d73075c1df95707805c8 Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Fri, 22 Aug 2025 10:16:12 +0200 Subject: [PATCH 23/56] Perf: GetBatch fewer variant calls --- cpp/src/arrow/util/rle_encoding_internal.h | 33 +++++++++++++++------- 1 file changed, 23 insertions(+), 10 deletions(-) diff --git a/cpp/src/arrow/util/rle_encoding_internal.h b/cpp/src/arrow/util/rle_encoding_internal.h index d16390ce154a..9c3e6e7a5482 100644 --- a/cpp/src/arrow/util/rle_encoding_internal.h +++ b/cpp/src/arrow/util/rle_encoding_internal.h @@ -592,14 +592,27 @@ auto RleBitPackedDecoder::GetBatch(value_type* out, values_count_type batch_s values_count_type values_read = 0; while (values_read < batch_size) { - // Try to get as much as possible from current run - if (auto const read = RunGetBatch(out, batch_size - values_read); read > 0) { - values_read += read; - out += read; - // Get the next run from the batch, it will be read in the next loop iteration - } else if (!ParseAndResetDecoder()) { - // If there are no more run this is the end - break; + if 
(auto* rle_decoder = std::get_if>(&decoder_)) { + if (rle_decoder->Remaining() > 0) { + // Try to get as much as possible from current run + auto const read = rle_decoder->GetBatch(out, batch_size - values_read); + values_read += read; + out += read; + } else if (!ParseAndResetDecoder()) { + // We try to get the next run from the batch, it will be read in the next loop. + break; + } + } else { + auto* bit_packed_decoder = std::get_if>(&decoder_); + ARROW_DCHECK(bit_packed_decoder); // Only two possibilities in the variant + if (bit_packed_decoder->Remaining() > 0) { + auto const read = bit_packed_decoder->GetBatch(out, batch_size - values_read); + values_read += read; + out += read; + } else if (!ParseAndResetDecoder()) { + // We try to get the next run from the batch, it will be read in the next loop. + break; + } } } @@ -1382,7 +1395,7 @@ constexpr bool RleDecoder::Get(value_type* out_value) { template auto RleDecoder::GetBatch(value_type* out, values_count_type batch_size) -> values_count_type { - if (remaining_count_ == 0) { + if (ARROW_PREDICT_FALSE(remaining_count_ == 0)) { return 0; } @@ -1438,7 +1451,7 @@ bool BitPackedDecoder::Get(value_type* out_value) { template auto BitPackedDecoder::GetBatch(value_type* out, values_count_type batch_size) -> values_count_type { - if (remaining_count_ == 0) { + if (ARROW_PREDICT_FALSE(remaining_count_ == 0)) { return 0; } From b9abef3be0fe0f06581f9e4343125524673a015b Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Fri, 22 Aug 2025 12:13:09 +0200 Subject: [PATCH 24/56] Perf: predict no bad data --- cpp/src/arrow/util/rle_encoding_internal.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/util/rle_encoding_internal.h b/cpp/src/arrow/util/rle_encoding_internal.h index 9c3e6e7a5482..0f221ed55156 100644 --- a/cpp/src/arrow/util/rle_encoding_internal.h +++ b/cpp/src/arrow/util/rle_encoding_internal.h @@ -1299,7 +1299,7 @@ inline auto RleBitPackedParser::PeekCount() const uint32_t run_len_type = 
0; auto const header_bytes = bit_util::ParseLeadingLEB128(data_, kMaxSize, &run_len_type); - if (header_bytes == 0) { + if (ARROW_PREDICT_FALSE(header_bytes == 0)) { // Malfomrmed LEB128 data return {}; } From dc3c16dd6f3db6ebe9a1b3387334a636184a778e Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Fri, 22 Aug 2025 15:47:27 +0200 Subject: [PATCH 25/56] Perf: predict new bit run in RleSpaced --- cpp/src/arrow/util/rle_encoding_internal.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/util/rle_encoding_internal.h b/cpp/src/arrow/util/rle_encoding_internal.h index 0f221ed55156..008e99bf81f4 100644 --- a/cpp/src/arrow/util/rle_encoding_internal.h +++ b/cpp/src/arrow/util/rle_encoding_internal.h @@ -734,7 +734,7 @@ auto GetSpacedRle(Converter& converter, typename Converter::out_type* out, validity_run.length -= update_size; } - if (validity_run.length == 0) { + if (ARROW_PREDICT_TRUE(validity_run.length == 0)) { validity_run = validity_reader.NextRun(); } } From 5b93a7e6ac9db3d5dd8a5a9564b2b2552492a176 Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Fri, 22 Aug 2025 15:50:45 +0200 Subject: [PATCH 26/56] Fix RleSpaced --- cpp/src/arrow/util/rle_encoding_internal.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/util/rle_encoding_internal.h b/cpp/src/arrow/util/rle_encoding_internal.h index 008e99bf81f4..5780656eba35 100644 --- a/cpp/src/arrow/util/rle_encoding_internal.h +++ b/cpp/src/arrow/util/rle_encoding_internal.h @@ -741,7 +741,7 @@ auto GetSpacedRle(Converter& converter, typename Converter::out_type* out, value_type const value = decoder.Value(); if (ARROW_PREDICT_FALSE(!converter.InputIsValid(value))) { - return {batch.ValuesRead(), batch.NullRead()}; + return {0, 0}; } converter.WriteRepeated(out, out + batch.TotalRead(), value); auto const actual_values_read = decoder.Advance(batch.ValuesRead()); From fab7fdfe8bbfc3b425b06609dec8b3825eefba3f Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Fri, 22 
Aug 2025 15:16:17 +0200 Subject: [PATCH 27/56] Refactor parsing with handler --- cpp/src/arrow/util/rle_encoding_internal.h | 375 ++++++++++++++------- 1 file changed, 261 insertions(+), 114 deletions(-) diff --git a/cpp/src/arrow/util/rle_encoding_internal.h b/cpp/src/arrow/util/rle_encoding_internal.h index 5780656eba35..7cde0cff5da2 100644 --- a/cpp/src/arrow/util/rle_encoding_internal.h +++ b/cpp/src/arrow/util/rle_encoding_internal.h @@ -204,6 +204,31 @@ class RleBitPackedParser { /// This is how one can check for errors. [[nodiscard]] bool Exhausted() const; + /// Enum to return from an ``Parse`` handler. + /// + /// Since a callback has no way to know when to stop, the handler must return + /// a value indicating to the ``Parse`` function whether to stop or continue. + enum class ControlFlow { + Continue, + Break, + }; + + /// A callback approach to parsing. + /// + /// This approach is used to reduce the number of dynamic lookups involved with using a + /// variant. + /// + /// The handler must be of the form + /// ```cpp` + /// struct Handler { + /// ControlFlow OnBitPackedRun(BitPackedRun run); + /// + /// ControlFlow OnRleRun(RleRun run); + /// }; + /// ``` + template + void Parse(Handler&& handler); + private: /// The pointer to the beginning of the run raw_data_const_pointer data_ = nullptr; @@ -212,9 +237,10 @@ class RleBitPackedParser { /// The size in bit of a packed value in the run bit_size_type value_bit_width_ = 0; - /// Like Peek but also return the number of bytes to advance after. - [[nodiscard]] std::pair, raw_data_size_type> PeekCount() - const; + /// Run the handler on the run read and return the number of values read. + /// Does not advance the parser. + template + std::pair PeekImpl(Handler&&) const; }; /// Decoder class for RLE encoded data. @@ -586,36 +612,83 @@ bool RleBitPackedDecoder::Get(value_type* val) { return GetBatch(val, 1) == 1; } +namespace internal { + +/// A ``Parse`` handler that calls a single lambda. 
+/// +/// This lambda would typically take the input run as ``auto run`` (i.e. the lambda is +/// templated) and deduce other types from it. +template +struct LambdaHandler { + Lambda handlder_; + + auto OnBitPackedRun(BitPackedRun run) { return handlder_(std::move(run)); } + + auto OnRleRun(RleRun run) { return handlder_(std::move(run)); } +}; + +template +LambdaHandler(Lambda) -> LambdaHandler; + +template +struct decoder_for; + +template +struct decoder_for { + using type = BitPackedDecoder; +}; + +template +struct decoder_for { + using type = RleDecoder; +}; + +template +using decoder_for_t = typename decoder_for::type; + +} // namespace internal + template auto RleBitPackedDecoder::GetBatch(value_type* out, values_count_type batch_size) -> values_count_type { + using ControlFlow = RleBitPackedParser::ControlFlow; + values_count_type values_read = 0; - while (values_read < batch_size) { - if (auto* rle_decoder = std::get_if>(&decoder_)) { - if (rle_decoder->Remaining() > 0) { - // Try to get as much as possible from current run - auto const read = rle_decoder->GetBatch(out, batch_size - values_read); - values_read += read; - out += read; - } else if (!ParseAndResetDecoder()) { - // We try to get the next run from the batch, it will be read in the next loop. - break; - } - } else { - auto* bit_packed_decoder = std::get_if>(&decoder_); - ARROW_DCHECK(bit_packed_decoder); // Only two possibilities in the variant - if (bit_packed_decoder->Remaining() > 0) { - auto const read = bit_packed_decoder->GetBatch(out, batch_size - values_read); - values_read += read; - out += read; - } else if (!ParseAndResetDecoder()) { - // We try to get the next run from the batch, it will be read in the next loop. - break; - } + // Remaining from a previous call that would have left some unread data from a run. 
+ if (ARROW_PREDICT_FALSE(RunRemaining() > 0)) { + auto const read = RunGetBatch(out, batch_size); + values_read += read; + out += read; + + // Either we fulfilled all the batch to be read or we finished remaining run. + if (ARROW_PREDICT_FALSE(values_read == batch_size)) { + return values_read; } + ARROW_DCHECK(RunRemaining() == 0); } + auto handler = internal::LambdaHandler{ + [&](auto run) { + ARROW_DCHECK_LT(values_read, batch_size); + internal::decoder_for_t decoder(run); + auto const read = decoder.GetBatch(out, batch_size - values_read); + ARROW_DCHECK_LE(read, batch_size - values_read); + values_read += read; + out += read; + + // Stop reading and store remaining decoder + if (ARROW_PREDICT_FALSE(values_read == batch_size || read == 0)) { + decoder_ = std::move(decoder); + return ControlFlow::Break; + } + + return ControlFlow::Continue; + }, + }; + + parser_.Parse(handler); + return values_read; } @@ -690,9 +763,10 @@ template constexpr auto max_size_for_v = static_cast>(std::numeric_limits::max()); +/// Overload for GetSpaced for a single run in a RleDecoder template -auto GetSpacedRle(Converter& converter, typename Converter::out_type* out, +auto RunGetSpaced(Converter& converter, typename Converter::out_type* out, values_count_type batch_size, values_count_type null_count, BitRunReader&& validity_reader, BitRun&& validity_run, RleDecoder& decoder) @@ -913,12 +987,13 @@ auto GetSpacedBitPackedDefault(Converter& converter, typename Converter::out_typ return {batch.ValuesRead(), batch.NullRead()}; } +/// Overload for GetSpaced for a single run in a BitPackedDecoder template -auto GetSpacedBitPacked(Converter& converter, typename Converter::out_type* out, - values_count_type batch_size, values_count_type null_count, - BitRunReader&& validity_reader, BitRun&& validity_run, - BitPackedDecoder& decoder) +auto RunGetSpaced(Converter& converter, typename Converter::out_type* out, + values_count_type batch_size, values_count_type null_count, + BitRunReader&& 
validity_reader, BitRun&& validity_run, + BitPackedDecoder& decoder) -> std::pair { if constexpr (Converter::kIsIdentity) { // An optimization @@ -931,6 +1006,24 @@ auto GetSpacedBitPacked(Converter& converter, typename Converter::out_type* out, std::forward(validity_run), decoder); } } + +/// Overload for GetSpaced for a single run in a decoder variant +template +auto RunGetSpaced( + Converter& converter, typename Converter::out_type* out, values_count_type batch_size, + values_count_type null_count, BitRunReader&& validity_reader, BitRun&& validity_run, + std::variant, BitPackedDecoder>& decoder) + -> std::pair { + return std::visit( + [&](auto& dec) { + ARROW_DCHECK_GT(dec.Remaining(), 0); + return RunGetSpaced(converter, out, batch_size, null_count, validity_reader, + validity_run, dec); + }, + decoder); +} + } // namespace internal template @@ -939,6 +1032,8 @@ auto RleBitPackedDecoder::GetSpaced( Converter converter, typename Converter::out_type* out, values_count_type batch_size, const byte* validity_bits, int64_t validity_bits_offset, values_count_type null_count) -> values_count_type { + using ControlFlow = RleBitPackedParser::ControlFlow; + ARROW_DCHECK_GT(batch_size, 0); auto batch = internal::BatchCounter::FromBatchSizeAndNulls(batch_size, null_count); @@ -952,54 +1047,65 @@ auto RleBitPackedDecoder::GetSpaced( /*length=*/batch.TotalRemaining()); arrow::internal::BitRun validity_run = validity_reader.NextRun(); - while (batch.TotalRead() < batch_size) { - values_count_type run_values_read = 0; - values_count_type run_null_read = 0; - - if (auto* rle_decoder = std::get_if>(&decoder_)) { - if (rle_decoder->Remaining() > 0) { - std::tie(run_values_read, run_null_read) = internal::GetSpacedRle( - converter, out, batch.TotalRemaining(), batch.NullRemaining(), - validity_reader, validity_run, *rle_decoder); - } else if (!ParseAndResetDecoder()) { - // We try to get the next run from the batch, it will be read in the next loop. 
- // Otherwise there may be remaining null not greedily filled. - break; - } - - } else { - auto* bit_packed_decoder = std::get_if>(&decoder_); - ARROW_DCHECK(bit_packed_decoder); // Only two possibilities in the variant - if (bit_packed_decoder->Remaining() > 0) { - std::tie(run_values_read, run_null_read) = internal::GetSpacedBitPacked( - converter, out, batch.TotalRemaining(), batch.NullRemaining(), - validity_reader, validity_run, *bit_packed_decoder); - } else if (!ParseAndResetDecoder()) { - // We try to get the next run from the batch, it will be read in the next loop. - // Otherwise there may be remaining null not greedily filled. - break; - } - } - - batch.AccrueReadNulls(run_null_read); - batch.AccrueReadValues(run_values_read); - auto const run_total_read = run_values_read + run_null_read; - out += run_total_read; - - // There may be remaining null if they are not greedily filled by either decoder calls - if (ARROW_PREDICT_FALSE(batch.IsFullyNull())) { + auto const check_and_handle_fully_null_remaining = [&]() { + if (batch.IsFullyNull()) { ARROW_DCHECK(validity_run.length == 0 || !validity_run.set); ARROW_DCHECK_GE(validity_run.length, batch.NullRemaining()); - converter.WriteZero(out, out + batch.NullRemaining()); - // Not necessary since the loop is over but good for sanity check + converter.WriteZero(out, out + batch.NullRemaining()); out += batch.NullRemaining(); batch.AccrueReadNulls(batch.NullRemaining()); + } + }; - break; + // Remaining from a previous call that would have left some unread data from a run. 
+ if (ARROW_PREDICT_FALSE(RunRemaining() > 0)) { + auto const [values_read, null_read] = + RunGetSpaced(converter, out, batch.TotalRemaining(), batch.NullRemaining(), + validity_reader, validity_run, decoder_); + + batch.AccrueReadNulls(null_read); + batch.AccrueReadValues(values_read); + out += values_read + null_read; + + // Either we fulfilled all the batch values to be read + if (ARROW_PREDICT_FALSE(batch.ValuesRemaining() == 0)) { + // There may be remaining null if they are not greedily filled + check_and_handle_fully_null_remaining(); + return batch.TotalRead(); } + + /// We finished the remaining run + ARROW_DCHECK(RunRemaining() == 0); } + auto handler = internal::LambdaHandler{ + [&](auto run) { + internal::decoder_for_t decoder(run); + + auto const [values_read, null_read] = internal::RunGetSpaced( + converter, out, batch.TotalRemaining(), batch.NullRemaining(), + validity_reader, validity_run, decoder); + + batch.AccrueReadNulls(null_read); + batch.AccrueReadValues(values_read); + out += values_read + null_read; + + // Stop reading and store remaining decoder + if (ARROW_PREDICT_FALSE(values_read == 0 || batch.ValuesRemaining() == 0)) { + decoder_ = std::move(decoder); + return ControlFlow::Break; + } + + return ControlFlow::Continue; + }, + }; + + parser_.Parse(handler); + + // There may be remaining null if they are not greedily filled by either decoder calls + check_and_handle_fully_null_remaining(); + ARROW_DCHECK(batch.IsDone() || Exhausted()); // batch.Done() => batch.NullRemaining() == 0 ARROW_DCHECK(!batch.IsDone() || (batch.NullRemaining() == 0)); @@ -1127,6 +1233,8 @@ auto RleBitPackedDecoder::GetBatchWithDict(const V* dictionary, int32_t dictionary_length, V* out, values_count_type batch_size) -> values_count_type { + using ControlFlow = RleBitPackedParser::ControlFlow; + if (ARROW_PREDICT_FALSE(batch_size <= 0)) { return 0; } @@ -1143,40 +1251,49 @@ auto RleBitPackedDecoder::GetBatchWithDict(const V* dictionary, return batch_size - 
values_read; }; - while (values_read < batch_size) { - values_count_type run_values_read = 0; - values_count_type run_null_read = 0; - - if (auto* rle_decoder = std::get_if>(&decoder_)) { - if (rle_decoder->Remaining() > 0) { - std::tie(run_values_read, run_null_read) = internal::GetSpacedRle( - converter, out, batch_values_remaining(), /* null_count= */ 0, - validity_reader, validity_run, *rle_decoder); - } else if (!ParseAndResetDecoder()) { - // We try to get the next run from the batch, it will be read in the next loop. - // Otherwise there may be remaining null not greedily filled. - break; - } - - } else { - auto* bit_packed_decoder = std::get_if>(&decoder_); - ARROW_DCHECK(bit_packed_decoder); // Only two possibilities in the variant - if (bit_packed_decoder->Remaining() > 0) { - std::tie(run_values_read, run_null_read) = internal::GetSpacedBitPacked( - converter, out, batch_values_remaining(), /* null_count= */ 0, - validity_reader, validity_run, *bit_packed_decoder); - } else if (!ParseAndResetDecoder()) { - // We try to get the next run from the batch, it will be read in the next loop. - // Otherwise there may be remaining null not greedily filled. 
- break; - } - } + if (ARROW_PREDICT_FALSE(RunRemaining() > 0)) { + auto const [run_values_read, run_null_read] = + RunGetSpaced(converter, out, batch_size, /* null_count= */ 0, validity_reader, + validity_run, decoder_); ARROW_DCHECK_EQ(run_null_read, 0); values_read += run_values_read; out += run_values_read; + + // Either we fulfilled all the batch values to be read + if (ARROW_PREDICT_FALSE(values_read >= batch_size)) { + // There may be remaining null if they are not greedily filled + return values_read; + } + + /// We finished the remaining run + ARROW_DCHECK(RunRemaining() == 0); } + auto handler = internal::LambdaHandler{ + [&](auto run) { + internal::decoder_for_t decoder(run); + + auto const [run_values_read, run_null_read] = internal::RunGetSpaced( + converter, out, batch_values_remaining(), /* null_count= */ 0, + validity_reader, validity_run, decoder); + + ARROW_DCHECK_EQ(run_null_read, 0); + values_read += run_values_read; + out += run_values_read; + + // Stop reading and store remaining decoder + if (ARROW_PREDICT_FALSE(run_values_read == 0 || values_read == batch_size)) { + decoder_ = std::move(decoder); + return ControlFlow::Break; + } + + return ControlFlow::Continue; + }, + }; + + parser_.Parse(handler); + return values_read; } @@ -1274,14 +1391,33 @@ constexpr void RleBitPackedParser::Reset(raw_data_const_pointer data, } inline auto RleBitPackedParser::Peek() const -> std::optional { - auto [out, count] = PeekCount(); + if (ARROW_PREDICT_FALSE(Exhausted())) { + return {}; + } + + auto out = std::optional{}; + auto handler = internal::LambdaHandler{[&](auto run) { + out = run; + return ControlFlow::Break; + }}; + PeekImpl(handler); return out; } inline auto RleBitPackedParser::Next() -> std::optional { - auto [out, count] = PeekCount(); - data_ += count; - data_size_ -= count; + if (ARROW_PREDICT_FALSE(Exhausted())) { + return {}; + } + + auto out = std::optional{}; + auto handler = internal::LambdaHandler{[&](auto run) { + out = run; + return 
ControlFlow::Break; + }}; + PeekImpl(handler); + auto [read, control] = PeekImpl(handler); + data_ += read; + data_size_ -= read; return out; } @@ -1289,11 +1425,10 @@ inline bool RleBitPackedParser::Advance() { return Next().has_value(); } inline bool RleBitPackedParser::Exhausted() const { return data_size_ == 0; } -inline auto RleBitPackedParser::PeekCount() const - -> std::pair, raw_data_size_type> { - if (ARROW_PREDICT_FALSE(Exhausted())) { - return {}; - } +template +auto RleBitPackedParser::PeekImpl(Handler&& handler) const + -> std::pair { + ARROW_DCHECK(!Exhausted()); constexpr auto kMaxSize = bit_util::MaxLEB128ByteLenFor; uint32_t run_len_type = 0; @@ -1321,10 +1456,10 @@ inline auto RleBitPackedParser::PeekCount() const auto const bytes_read = header_bytes + static_cast(count) * value_bit_width_; - return { - {BitPackedRun(data_ + header_bytes, values_count, value_bit_width_)}, - bytes_read, - }; + auto control = handler.OnBitPackedRun( + BitPackedRun(data_ + header_bytes, values_count, value_bit_width_)); + + return {bytes_read, control}; } using values_count_type = RleRun::values_count_type; @@ -1340,10 +1475,22 @@ inline auto RleBitPackedParser::PeekCount() const ARROW_DCHECK_LT(value_bytes, internal::max_size_for_v); auto const bytes_read = header_bytes + static_cast(value_bytes); - return { - {RleRun(data_ + header_bytes, values_count, value_bit_width_)}, - bytes_read, - }; + auto control = + handler.OnRleRun(RleRun(data_ + header_bytes, values_count, value_bit_width_)); + + return {bytes_read, control}; +} + +template +void RleBitPackedParser::Parse(Handler&& handler) { + while (!Exhausted()) { + auto [read, control] = PeekImpl(handler); + data_ += read; + data_size_ -= read; + if (ARROW_PREDICT_FALSE(control == ControlFlow::Break)) { + break; + } + } } /**************** From 407de76b5b702826ae730bc81a256552af5226a6 Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Mon, 25 Aug 2025 18:31:01 +0200 Subject: [PATCH 28/56] Perf: predict false in LEB128 
--- cpp/src/arrow/util/bit_util.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/util/bit_util.h b/cpp/src/arrow/util/bit_util.h index bf63f740f58e..8c6ade97f539 100644 --- a/cpp/src/arrow/util/bit_util.h +++ b/cpp/src/arrow/util/bit_util.h @@ -435,7 +435,7 @@ constexpr int32_t ParseLeadingLEB128(uint8_t const* data, int32_t max_data_size, // Read as many bytes as the could be for the give output for (int32_t i = 0; i < MaxLEB128ByteLenFor; i++) { // We have not finished reading a valid LEB128, yet we run out of data - if (i >= max_data_size) { + if (ARROW_PREDICT_FALSE(i >= max_data_size)) { return 0; } From e641ca00ffb5864dc359b6f0768d605fdd47858d Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Mon, 25 Aug 2025 18:38:15 +0200 Subject: [PATCH 29/56] Remove BitPackedIdentity --- cpp/src/arrow/util/rle_encoding_internal.h | 92 +--------------------- 1 file changed, 4 insertions(+), 88 deletions(-) diff --git a/cpp/src/arrow/util/rle_encoding_internal.h b/cpp/src/arrow/util/rle_encoding_internal.h index 7cde0cff5da2..499c662eac84 100644 --- a/cpp/src/arrow/util/rle_encoding_internal.h +++ b/cpp/src/arrow/util/rle_encoding_internal.h @@ -838,74 +838,10 @@ static_assert(min(5, 41) == 5); template -auto GetSpacedBitPackedIdentity(Converter& converter, typename Converter::out_type* out, - values_count_type batch_size, - values_count_type null_count, - BitRunReader&& validity_reader, BitRun&& validity_run, - BitPackedDecoder& decoder) - -> std::pair { - ARROW_DCHECK_GT(batch_size, 0); - // The equality case is handled in the main loop in GetSpaced - ARROW_DCHECK_LT(null_count, batch_size); - - auto batch = BatchCounter::FromBatchSizeAndNulls(batch_size, null_count); - - values_count_type const values_available = decoder.Remaining(); - ARROW_DCHECK_GT(values_available, 0); - auto run_values_remaining = [&]() { - auto out = values_available - batch.ValuesRead(); - ARROW_DCHECK_GE(out, 0); - return out; - }; - - while (run_values_remaining() 
> 0 && batch.ValuesRemaining() > 0) { - ARROW_DCHECK_GE(validity_run.length, 0); - ARROW_DCHECK_LT(validity_run.length, max_size_for_v); - ARROW_DCHECK_LE(validity_run.length, batch.TotalRemaining()); - auto const validity_run_length = static_cast(validity_run.length); - - // Copy as much as possible from the buffer into the output while not exceeding - // validity run - if (validity_run.set) { - auto const requested_read = std::min(validity_run_length, run_values_remaining()); - // Since this is identity, we can write directly to the output - auto const actual_read = decoder.GetBatch(out, requested_read); - - if (ARROW_PREDICT_FALSE(!converter.InputIsValid(out, actual_read))) { - return {batch.ValuesRead(), batch.NullRead()}; - } - - batch.AccrueReadValues(actual_read); - out += actual_read; - validity_run.length -= actual_read; - - // Simply write zeros in the output - } else { - auto const update_size = std::min(validity_run_length, batch.NullRemaining()); - converter.WriteZero(out, out + update_size); - batch.AccrueReadNulls(update_size); - out += update_size; - validity_run.length -= update_size; - } - - if (validity_run.length == 0) { - validity_run = validity_reader.NextRun(); - } - } - - ARROW_DCHECK_EQ(values_available - decoder.Remaining(), batch.ValuesRead()); - ARROW_DCHECK_LE(batch.TotalRead(), batch_size); - ARROW_DCHECK_LE(batch.NullRead(), batch.NullCount()); - - return {batch.ValuesRead(), batch.NullRead()}; -} - -template -auto GetSpacedBitPackedDefault(Converter& converter, typename Converter::out_type* out, - values_count_type batch_size, values_count_type null_count, - BitRunReader&& validity_reader, BitRun&& validity_run, - BitPackedDecoder& decoder) +auto RunGetSpaced(Converter& converter, typename Converter::out_type* out, + values_count_type batch_size, values_count_type null_count, + BitRunReader&& validity_reader, BitRun&& validity_run, + BitPackedDecoder& decoder) -> std::pair { ARROW_DCHECK_GT(batch_size, 0); // The equality case is 
handled in the main loop in GetSpaced @@ -987,26 +923,6 @@ auto GetSpacedBitPackedDefault(Converter& converter, typename Converter::out_typ return {batch.ValuesRead(), batch.NullRead()}; } -/// Overload for GetSpaced for a single run in a BitPackedDecoder -template -auto RunGetSpaced(Converter& converter, typename Converter::out_type* out, - values_count_type batch_size, values_count_type null_count, - BitRunReader&& validity_reader, BitRun&& validity_run, - BitPackedDecoder& decoder) - -> std::pair { - if constexpr (Converter::kIsIdentity) { - // An optimization - return GetSpacedBitPackedIdentity(converter, out, batch_size, null_count, - std::forward(validity_reader), - std::forward(validity_run), decoder); - } else { - return GetSpacedBitPackedDefault(converter, out, batch_size, null_count, - std::forward(validity_reader), - std::forward(validity_run), decoder); - } -} - /// Overload for GetSpaced for a single run in a decoder variant template From 93415fbc2a7d06700fddec751238fa88c8ecc8c9 Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Tue, 26 Aug 2025 10:53:44 +0200 Subject: [PATCH 30/56] Check overflow in LEB128 reading --- cpp/src/arrow/util/bit_util.h | 12 ++++++++++-- cpp/src/arrow/util/bit_util_test.cc | 3 +++ 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/util/bit_util.h b/cpp/src/arrow/util/bit_util.h index 8c6ade97f539..119771c35f61 100644 --- a/cpp/src/arrow/util/bit_util.h +++ b/cpp/src/arrow/util/bit_util.h @@ -426,6 +426,7 @@ constexpr int32_t WriteLEB128(Int value, uint8_t* out, int32_t max_out_size) { template constexpr int32_t ParseLeadingLEB128(uint8_t const* data, int32_t max_data_size, Int* out) { + constexpr auto kMaxBytes = static_cast(MaxLEB128ByteLenFor); constexpr uint8_t kLow7Mask = 0x7F; constexpr uint8_t kContinuationBit = 0x80; @@ -433,7 +434,7 @@ constexpr int32_t ParseLeadingLEB128(uint8_t const* data, int32_t max_data_size, Int value = 0; // Read as many bytes as the could be for the give output - for 
(int32_t i = 0; i < MaxLEB128ByteLenFor; i++) { + for (int32_t i = 0; i < kMaxBytes; i++) { // We have not finished reading a valid LEB128, yet we run out of data if (ARROW_PREDICT_FALSE(i >= max_data_size)) { return 0; @@ -441,7 +442,14 @@ constexpr int32_t ParseLeadingLEB128(uint8_t const* data, int32_t max_data_size, // Read the byte and set its 7 LSB to in the final value uint8_t const byte = data[i]; - value |= static_cast(byte & kLow7Mask) << (7 * i); + auto const byte7 = static_cast(byte & kLow7Mask); + Int const shifted_byte = byte7 << (7 * i); + value |= shifted_byte; + + // If we reach the last byte, there is a risk of overflowing the result + if (ARROW_PREDICT_FALSE((i == kMaxBytes - 1) && (shifted_byte >> (7 * i) != byte7))) { + return 0; + } // Check for lack of continuation flag in MSB if ((byte & kContinuationBit) == 0) { diff --git a/cpp/src/arrow/util/bit_util_test.cc b/cpp/src/arrow/util/bit_util_test.cc index 9790669443ea..31e82cce3af2 100644 --- a/cpp/src/arrow/util/bit_util_test.cc +++ b/cpp/src/arrow/util/bit_util_test.cc @@ -2093,6 +2093,9 @@ TEST(BitStreamUtil, LEB128) { 18446744073709551615ULL, 10); // Error case: Truncated sequence (continuation bit set but no more data) TestLEB128Decode(std::array{0x80}, 0U, 0); + // Error case: Input over the maximum number of bytes for a int32_t (5), but the + // overflow none the less (7 * 5 = 35 bits of data). 
+ TestLEB128Decode(std::array{0xFF, 0xFF, 0xFF, 0xFF, 0x7F}, int32_t{}, 0); // Error case: Oversized sequence for uint32_t (too many bytes) TestLEB128Decode(std::array{0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x01}, 0U, 0); } From c13243d89e808788a679ad63b67945e6bd6b32c6 Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Tue, 26 Aug 2025 15:42:40 +0200 Subject: [PATCH 31/56] Avoid UB in LEB128 overflow check --- cpp/src/arrow/util/bit_util.h | 41 ++++++++++++++++++++--------- cpp/src/arrow/util/bit_util_test.cc | 4 +++ 2 files changed, 33 insertions(+), 12 deletions(-) diff --git a/cpp/src/arrow/util/bit_util.h b/cpp/src/arrow/util/bit_util.h index 119771c35f61..b0fc310cb6a9 100644 --- a/cpp/src/arrow/util/bit_util.h +++ b/cpp/src/arrow/util/bit_util.h @@ -427,14 +427,20 @@ template constexpr int32_t ParseLeadingLEB128(uint8_t const* data, int32_t max_data_size, Int* out) { constexpr auto kMaxBytes = static_cast(MaxLEB128ByteLenFor); + static_assert(kMaxBytes >= 1); constexpr uint8_t kLow7Mask = 0x7F; constexpr uint8_t kContinuationBit = 0x80; + constexpr int32_t kSignBitCount = std::is_signed_v ? 
1 : 0; + // Number of bits allowed for encoding data on the last byte to avoid overflow + constexpr uint8_t kHighBitCount = (8 * sizeof(Int) - kSignBitCount) % 7; + // kHighBitCount least significant `0` bits and the rest with `1` + constexpr uint8_t kHighForbiddenMask = ~((1 << kHighBitCount) - 1); // Iteratively building the value - Int value = 0; + std::make_unsigned_t value = 0; // Read as many bytes as the could be for the give output - for (int32_t i = 0; i < kMaxBytes; i++) { + for (int32_t i = 0; i < kMaxBytes - 1; i++) { // We have not finished reading a valid LEB128, yet we run out of data if (ARROW_PREDICT_FALSE(i >= max_data_size)) { return 0; @@ -442,14 +448,7 @@ constexpr int32_t ParseLeadingLEB128(uint8_t const* data, int32_t max_data_size, // Read the byte and set its 7 LSB to in the final value uint8_t const byte = data[i]; - auto const byte7 = static_cast(byte & kLow7Mask); - Int const shifted_byte = byte7 << (7 * i); - value |= shifted_byte; - - // If we reach the last byte, there is a risk of overflowing the result - if (ARROW_PREDICT_FALSE((i == kMaxBytes - 1) && (shifted_byte >> (7 * i) != byte7))) { - return 0; - } + value |= static_cast(byte & kLow7Mask) << (7 * i); // Check for lack of continuation flag in MSB if ((byte & kContinuationBit) == 0) { @@ -458,8 +457,26 @@ constexpr int32_t ParseLeadingLEB128(uint8_t const* data, int32_t max_data_size, } } - // There is still data - return 0; + // Process the last index avoiding overflowing + constexpr int32_t last = kMaxBytes - 1; + + // We have not finished reading a valid LEB128, yet we run out of data + if (ARROW_PREDICT_FALSE(last >= max_data_size)) { + return 0; + } + + uint8_t const byte = data[last]; + + // Need to check if there are bits that would overflow the output. + // Also checks that there is no continuation. 
+ if (ARROW_PREDICT_FALSE((byte & kHighForbiddenMask) != 0)) { + return 0; + } + + // No longer need to mask since we ensured + value |= static_cast(byte) << (7 * last); + *out = value; + return last + 1; } } // namespace bit_util } // namespace arrow diff --git a/cpp/src/arrow/util/bit_util_test.cc b/cpp/src/arrow/util/bit_util_test.cc index 31e82cce3af2..a82f5b05e35b 100644 --- a/cpp/src/arrow/util/bit_util_test.cc +++ b/cpp/src/arrow/util/bit_util_test.cc @@ -2091,6 +2091,10 @@ TEST(BitStreamUtil, LEB128) { TestLEB128Decode( std::array{0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x01}, 18446744073709551615ULL, 10); + // int32_t with maximum size (31 bits of 1) + TestLEB128Decode(std::array{0xFF, 0xFF, 0xFF, 0xFF, 0x7}, + std::numeric_limits::max(), 5); + // Error case: Truncated sequence (continuation bit set but no more data) TestLEB128Decode(std::array{0x80}, 0U, 0); // Error case: Input over the maximum number of bytes for a int32_t (5), but the From bbf6ffb0122f68083645f8fc200554eb12984b1a Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Tue, 26 Aug 2025 16:38:41 +0200 Subject: [PATCH 32/56] Fix UB in test --- cpp/src/arrow/util/rle_encoding_test.cc | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/cpp/src/arrow/util/rle_encoding_test.cc b/cpp/src/arrow/util/rle_encoding_test.cc index 8a7544269b29..3f6fbfa313c3 100644 --- a/cpp/src/arrow/util/rle_encoding_test.cc +++ b/cpp/src/arrow/util/rle_encoding_test.cc @@ -911,29 +911,30 @@ void CheckRoundTrip(const Array& data, int bit_width, to_read = remaining; } - auto* out = values_read.data() + requested_read_count; - auto* dict_out = dict_read.data() + requested_read_count; - auto read = 0; if constexpr (kSpaced) { // We need to slice the input array get the proper null count and bitmap auto data_remaining = data.Slice(requested_read_count, to_read); if (dict) { + auto* out = dict_read.data() + requested_read_count; read = decoder.GetBatchWithDictSpaced( - dict->raw_values(), 
static_cast(dict->length()), dict_out, to_read, + dict->raw_values(), static_cast(dict->length()), out, to_read, static_cast(data_remaining->null_count()), data_remaining->null_bitmap_data(), data_remaining->offset()); } else { + auto* out = values_read.data() + requested_read_count; read = decoder.GetBatchSpaced( to_read, static_cast(data_remaining->null_count()), data_remaining->null_bitmap_data(), data_remaining->offset(), out); } } else { if (dict) { + auto* out = dict_read.data() + requested_read_count; read = decoder.GetBatchWithDict( - dict->raw_values(), static_cast(dict->length()), dict_out, to_read); + dict->raw_values(), static_cast(dict->length()), out, to_read); } else { + auto* out = values_read.data() + requested_read_count; read = decoder.GetBatch(out, to_read); } } From b4a37ce79a0c39fdc58a6d7a562f169809287b38 Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Tue, 26 Aug 2025 17:42:52 +0200 Subject: [PATCH 33/56] Fix UB --- cpp/src/arrow/util/rle_encoding_internal.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/cpp/src/arrow/util/rle_encoding_internal.h b/cpp/src/arrow/util/rle_encoding_internal.h index 499c662eac84..987ac05d05fb 100644 --- a/cpp/src/arrow/util/rle_encoding_internal.h +++ b/cpp/src/arrow/util/rle_encoding_internal.h @@ -1426,11 +1426,12 @@ void RleDecoder::Reset(run_type const& run) noexcept { // If we memcpy + FromLittleEndian, we have potential undefined behavior // if the bool value isn't 0 or 1. value_ = *run.RawDataPtr() & 1; + } else { + // Memcopy is required to avoid undefined behavior. + value_ = {}; + std::memcpy(&value_, run.RawDataPtr(), run.RawDataSize()); + value_ = ::arrow::bit_util::FromLittleEndian(value_); } - // Memcopy is required to avoid undefined behavior. 
- std::memset(&value_, 0, sizeof(value_type)); - std::memcpy(&value_, run.RawDataPtr(), run.RawDataSize()); - value_ = ::arrow::bit_util::FromLittleEndian(value_); } template From 9027ca84863703c33ed89cc1cc4f263b6b8547eb Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Tue, 9 Sep 2025 18:08:52 +0200 Subject: [PATCH 34/56] First pass addressing review comments --- .../arrow/util/bit_stream_utils_internal.h | 12 +-- cpp/src/arrow/util/bit_util.h | 18 ++-- cpp/src/arrow/util/bit_util_test.cc | 101 ++++++++++-------- cpp/src/arrow/util/rle_encoding_internal.h | 2 +- 4 files changed, 72 insertions(+), 61 deletions(-) diff --git a/cpp/src/arrow/util/bit_stream_utils_internal.h b/cpp/src/arrow/util/bit_stream_utils_internal.h index 3f8577944d7f..dc8be0c0ae69 100644 --- a/cpp/src/arrow/util/bit_stream_utils_internal.h +++ b/cpp/src/arrow/util/bit_stream_utils_internal.h @@ -190,10 +190,10 @@ class BitReader { } /// Maximum byte length of a vlq encoded int - static constexpr int kMaxVlqByteLengthForInt32 = MaxLEB128ByteLenFor; + static constexpr int kMaxVlqByteLengthForInt32 = kMaxLEB128ByteLenFor; /// Maximum byte length of a vlq encoded int64 - static constexpr int kMaxVlqByteLengthForInt64 = MaxLEB128ByteLenFor; + static constexpr int kMaxVlqByteLengthForInt64 = kMaxLEB128ByteLenFor; private: const uint8_t* buffer_; @@ -452,18 +452,18 @@ inline bool BitWriter::PutVlqInt(uint32_t v) { inline bool BitReader::GetVlqInt(uint32_t* v) { // The data that we will pass to the LEB128 parser // In all case, we read an byte-aligned value, skipping remaining bits - uint8_t const* data = NULLPTR; + const uint8_t* data = NULLPTR; int max_size = 0; // Number of bytes left in the buffered values, not including the current // byte (i.e., there may be an additional fraction of a byte). 
- int const bytes_left_in_cache = + const int bytes_left_in_cache = sizeof(buffered_values_) - static_cast(bit_util::BytesForBits(bit_offset_)); // If there are clearly enough bytes left we can try to parse from the cache if (bytes_left_in_cache >= kMaxVlqByteLengthForInt32) { max_size = bytes_left_in_cache; - data = reinterpret_cast(&buffered_values_) + + data = reinterpret_cast(&buffered_values_) + bit_util::BytesForBits(bit_offset_); // Otherwise, we try straight from buffer (ignoring few bytes that may be cached) } else { @@ -496,7 +496,7 @@ inline bool BitReader::GetZigZagVlqInt(int32_t* v) { } inline bool BitWriter::PutVlqInt(uint64_t v) { - constexpr auto kMaxBytes = bit_util::MaxLEB128ByteLenFor; + constexpr auto kMaxBytes = bit_util::kMaxLEB128ByteLenFor; uint8_t leb128[kMaxBytes] = {}; auto const bytes_written = bit_util::WriteLEB128(v, leb128, kMaxBytes); diff --git a/cpp/src/arrow/util/bit_util.h b/cpp/src/arrow/util/bit_util.h index b0fc310cb6a9..c0f501c433f0 100644 --- a/cpp/src/arrow/util/bit_util.h +++ b/cpp/src/arrow/util/bit_util.h @@ -368,11 +368,11 @@ void PackBits(const uint32_t* values, uint8_t* out) { constexpr int64_t MaxLEB128ByteLen(int64_t n_bits) { return CeilDiv(n_bits, 7); } template -constexpr int64_t MaxLEB128ByteLenFor = MaxLEB128ByteLen(sizeof(Int) * 8); +constexpr int64_t kMaxLEB128ByteLenFor = MaxLEB128ByteLen(sizeof(Int) * 8); /// Write a integer as LEB128 /// -/// Write the input value as LEB128 into the outptu buffer and return the number of bytes +/// Write the input value as LEB128 into the outptut buffer and return the number of bytes /// written. /// If the output buffer size is insufficient, return 0 but the output may have been /// written to. 
@@ -385,9 +385,9 @@ constexpr int32_t WriteLEB128(Int value, uint8_t* out, int32_t max_out_size) { constexpr Int kHigh7Mask = ~kLow7Mask; constexpr uint8_t kContinuationBit = 0x80; - auto const out_first = out; + const auto out_first = out; - // Write as many bytes as the could be for the given input + // Write as many bytes as we could be for the given input while ((value & kHigh7Mask) != Int(0)) { // We do not have enough room to write the LEB128 if (out - out_first >= max_out_size) { @@ -424,9 +424,9 @@ constexpr int32_t WriteLEB128(Int value, uint8_t* out, int32_t max_out_size) { /// \see https://en.wikipedia.org/wiki/LEB128 /// \see MaxLEB128ByteLenFor template -constexpr int32_t ParseLeadingLEB128(uint8_t const* data, int32_t max_data_size, +constexpr int32_t ParseLeadingLEB128(const uint8_t* data, int32_t max_data_size, Int* out) { - constexpr auto kMaxBytes = static_cast(MaxLEB128ByteLenFor); + constexpr auto kMaxBytes = static_cast(kMaxLEB128ByteLenFor); static_assert(kMaxBytes >= 1); constexpr uint8_t kLow7Mask = 0x7F; constexpr uint8_t kContinuationBit = 0x80; @@ -439,7 +439,7 @@ constexpr int32_t ParseLeadingLEB128(uint8_t const* data, int32_t max_data_size, // Iteratively building the value std::make_unsigned_t value = 0; - // Read as many bytes as the could be for the give output + // Read as many bytes as we could be for the given output. 
for (int32_t i = 0; i < kMaxBytes - 1; i++) { // We have not finished reading a valid LEB128, yet we run out of data if (ARROW_PREDICT_FALSE(i >= max_data_size)) { @@ -447,7 +447,7 @@ constexpr int32_t ParseLeadingLEB128(uint8_t const* data, int32_t max_data_size, } // Read the byte and set its 7 LSB to in the final value - uint8_t const byte = data[i]; + const uint8_t byte = data[i]; value |= static_cast(byte & kLow7Mask) << (7 * i); // Check for lack of continuation flag in MSB @@ -465,7 +465,7 @@ constexpr int32_t ParseLeadingLEB128(uint8_t const* data, int32_t max_data_size, return 0; } - uint8_t const byte = data[last]; + const uint8_t byte = data[last]; // Need to check if there are bits that would overflow the output. // Also checks that there is no continuation. diff --git a/cpp/src/arrow/util/bit_util_test.cc b/cpp/src/arrow/util/bit_util_test.cc index a82f5b05e35b..afa9614e6d7f 100644 --- a/cpp/src/arrow/util/bit_util_test.cc +++ b/cpp/src/arrow/util/bit_util_test.cc @@ -1999,66 +1999,75 @@ TEST(BitUtil, RoundUpToPowerOf2) { /// Test the maximum number of bytes needed to write a LEB128 of a give size. 
TEST(BitStreamUtil, MaxLEB128ByteLenFor) { - EXPECT_EQ(bit_util::MaxLEB128ByteLenFor, 3); - EXPECT_EQ(bit_util::MaxLEB128ByteLenFor, 5); - EXPECT_EQ(bit_util::MaxLEB128ByteLenFor, 10); + EXPECT_EQ(bit_util::kMaxLEB128ByteLenFor, 3); + EXPECT_EQ(bit_util::kMaxLEB128ByteLenFor, 5); + EXPECT_EQ(bit_util::kMaxLEB128ByteLenFor, 10); } /// Utility function to test LEB128 encoding with known input value and expected byte /// array -template -void TestLEB128Encode(Int input_value, std::array const& expected_data, - int32_t expected_bytes_written) { - std::array buffer{}; +template +void TestLEB128Encode(Int input_value, std::vector const& expected_data, + std::size_t buffer_size) { + std::vector buffer(buffer_size); auto bytes_written = bit_util::WriteLEB128(input_value, buffer.data(), static_cast(buffer.size())); - EXPECT_EQ(bytes_written, expected_bytes_written); + + EXPECT_EQ(bytes_written, expected_data.size()); + // Encoded data + for (std::size_t i = 0; i < expected_data.size(); ++i) { + EXPECT_EQ(buffer.at(i), expected_data.at(i)); + } + + // When the value is successfully encoded, the remaining of the buffer is untouched if (bytes_written > 0) { - EXPECT_EQ(buffer, expected_data); + for (std::size_t i = bytes_written; i < buffer.size(); ++i) { + EXPECT_EQ(buffer.at(i), 0); + } } } /// Test encoding to known LEB128 byte sequences TEST(WriteLEB128Test, KnownArrayValues) { // Single byte value 0 - TestLEB128Encode(0U, std::array{0x00}, 1); + TestLEB128Encode(0U, std::vector{0x00}, 1); // Single byte value 127 - TestLEB128Encode(127U, std::array{0x7F}, 1); + TestLEB128Encode(127U, std::vector{0x7F}, 1); // Two byte value 128 - TestLEB128Encode(128U, std::array{0x80, 0x01}, 2); + TestLEB128Encode(128U, std::vector{0x80, 0x01}, 2); // Two byte value 300 - TestLEB128Encode(300U, std::array{0xAC, 0x02}, 2); + TestLEB128Encode(300U, std::vector{0xAC, 0x02}, 2); // Three byte value 16384 - TestLEB128Encode(16384U, std::array{0x80, 0x80, 0x01}, 3); + TestLEB128Encode(16384U, 
std::vector{0x80, 0x80, 0x01}, 3); // Four byte value 268435455 - TestLEB128Encode(268435455U, std::array{0xFF, 0xFF, 0xFF, 0x7F}, 4); + TestLEB128Encode(268435455U, std::vector{0xFF, 0xFF, 0xFF, 0x7F}, 4); // Five byte uint32_t max value - TestLEB128Encode(4294967295U, std::array{0xFF, 0xFF, 0xFF, 0xFF, 0x0F}, 5); + TestLEB128Encode(4294967295U, std::vector{0xFF, 0xFF, 0xFF, 0xFF, 0x0F}, 5); // uint64_t value requiring 10 bytes TestLEB128Encode( 18446744073709551615ULL, - std::array{0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x01}, + std::vector{0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x01}, 10); - // Edge case: Exact buffer size match - TestLEB128Encode(16384U, std::array{0x80, 0x80, 0x01}, 3); + // Three byte value 16384, encoded in larger buffer + TestLEB128Encode(16384U, std::vector{0x80, 0x80, 0x01}, 10); // Various single byte values - TestLEB128Encode(1U, std::array{0x01}, 1); - TestLEB128Encode(63U, std::array{0x3F}, 1); - TestLEB128Encode(64U, std::array{0x40}, 1); + TestLEB128Encode(1U, std::vector{0x01}, 1); + TestLEB128Encode(63U, std::vector{0x3F}, 1); + TestLEB128Encode(64U, std::vector{0x40}, 1); // Two byte boundary values - TestLEB128Encode(129U, std::array{0x81, 0x01}, 2); - TestLEB128Encode(16383U, std::array{0xFF, 0x7F}, 2); + TestLEB128Encode(129U, std::vector{0x81, 0x01}, 2); + TestLEB128Encode(16383U, std::vector{0xFF, 0x7F}, 2); // Error case: Buffer too small for value 128 (needs 2 bytes but only 1 provided) - TestLEB128Encode(128U, std::array{}, 0); + TestLEB128Encode(128U, std::vector{}, 1); // Error case: Buffer too small for uint32_t max (needs 5 bytes but only 4 provided) - TestLEB128Encode(4294967295U, std::array{}, 0); + TestLEB128Encode(4294967295U, std::vector{}, 4); // Error case: Zero buffer size - TestLEB128Encode(52U, std::array{}, 0); + TestLEB128Encode(52U, std::vector{}, 0); } /// Utility function to test LEB128 decoding with known byte array and expected result -template -void 
TestLEB128Decode(std::array const& data, Int expected_value, +template +void TestLEB128Decode(std::vector const& data, Int expected_value, int32_t expected_bytes_read) { Int result = 0; auto bytes_read = bit_util::ParseLeadingLEB128( @@ -2072,43 +2081,45 @@ void TestLEB128Decode(std::array const& data, Int expected_value, /// Test decoding from known LEB128 byte sequences TEST(BitStreamUtil, LEB128) { // Single byte value 0 - TestLEB128Decode(std::array{0x00}, 0U, 1); + TestLEB128Decode(std::vector{0x00}, 0U, 1); // Single byte value 127 - TestLEB128Decode(std::array{0x7F}, 127U, 1); + TestLEB128Decode(std::vector{0x7F}, 127U, 1); // Two byte value 128 - TestLEB128Decode(std::array{0x80, 0x01}, 128U, 2); + TestLEB128Decode(std::vector{0x80, 0x01}, 128U, 2); // Two byte value 300 - TestLEB128Decode(std::array{0xAC, 0x02}, 300U, 2); + TestLEB128Decode(std::vector{0xAC, 0x02}, 300U, 2); // Three byte value 16384 - TestLEB128Decode(std::array{0x80, 0x80, 0x01}, 16384U, 3); + TestLEB128Decode(std::vector{0x80, 0x80, 0x01}, 16384U, 3); // Three byte value 16384, with remaining data - TestLEB128Decode(std::array{0x80, 0x80, 0x01, 0x80, 0x00}, 16384U, 3); + TestLEB128Decode(std::vector{0x80, 0x80, 0x01, 0x80, 0x00}, 16384U, 3); // Four byte value 268435455 - TestLEB128Decode(std::array{0xFF, 0xFF, 0xFF, 0x7F}, 268435455U, 4); + TestLEB128Decode(std::vector{0xFF, 0xFF, 0xFF, 0x7F}, 268435455U, 4); // Five byte uint32_t max value - TestLEB128Decode(std::array{0xFF, 0xFF, 0xFF, 0xFF, 0x0F}, 4294967295U, 5); + TestLEB128Decode(std::vector{0xFF, 0xFF, 0xFF, 0xFF, 0x0F}, + std::numeric_limits::max(), 5); // uint64_t value requiring 10 bytes TestLEB128Decode( - std::array{0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x01}, + std::vector{0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x01}, 18446744073709551615ULL, 10); // int32_t with maximum size (31 bits of 1) - TestLEB128Decode(std::array{0xFF, 0xFF, 0xFF, 0xFF, 0x7}, + TestLEB128Decode(std::vector{0xFF, 0xFF, 
0xFF, 0xFF, 0x7}, std::numeric_limits::max(), 5); // Error case: Truncated sequence (continuation bit set but no more data) - TestLEB128Decode(std::array{0x80}, 0U, 0); - // Error case: Input over the maximum number of bytes for a int32_t (5), but the - // overflow none the less (7 * 5 = 35 bits of data). - TestLEB128Decode(std::array{0xFF, 0xFF, 0xFF, 0xFF, 0x7F}, int32_t{}, 0); + TestLEB128Decode(std::vector{0x80}, 0U, 0); + // Error case: Input has exactly the maximum number of bytes for a int32_t (5), + // but the decoded value overflows nonetheless (7 * 5 = 35 bits of data). + TestLEB128Decode(std::vector{0xFF, 0xFF, 0xFF, 0xFF, 0x7F}, int32_t{}, 0); // Error case: Oversized sequence for uint32_t (too many bytes) - TestLEB128Decode(std::array{0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x01}, 0U, 0); + TestLEB128Decode(std::vector{0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x01}, 0U, 0); } static void TestZigZag(int32_t v, std::array buffer_expect) { uint8_t buffer[bit_util::BitReader::kMaxVlqByteLengthForInt32] = {}; bit_util::BitWriter writer(buffer, sizeof(buffer)); writer.PutZigZagVlqInt(v); - // WARN reader buffer input on creation so it must be created after the data is written + // WARNING: The reader reads and caches the input when created, so it must be created + // after the data is written in the buffer. 
bit_util::BitReader reader(buffer, sizeof(buffer)); EXPECT_THAT(buffer, testing::ElementsAreArray(buffer_expect)); int32_t result; diff --git a/cpp/src/arrow/util/rle_encoding_internal.h b/cpp/src/arrow/util/rle_encoding_internal.h index 987ac05d05fb..a9dd19c2e3f5 100644 --- a/cpp/src/arrow/util/rle_encoding_internal.h +++ b/cpp/src/arrow/util/rle_encoding_internal.h @@ -1346,7 +1346,7 @@ auto RleBitPackedParser::PeekImpl(Handler&& handler) const -> std::pair { ARROW_DCHECK(!Exhausted()); - constexpr auto kMaxSize = bit_util::MaxLEB128ByteLenFor; + constexpr auto kMaxSize = bit_util::kMaxLEB128ByteLenFor; uint32_t run_len_type = 0; auto const header_bytes = bit_util::ParseLeadingLEB128(data_, kMaxSize, &run_len_type); From 1db18ded42c35124551f2f3bf482e59919693d50 Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Wed, 10 Sep 2025 11:46:15 +0200 Subject: [PATCH 35/56] Second pass addressing review comments --- .../arrow/util/bit_stream_utils_internal.h | 8 +- cpp/src/arrow/util/rle_encoding_internal.h | 264 ++++++++---------- cpp/src/arrow/util/rle_encoding_test.cc | 26 +- 3 files changed, 139 insertions(+), 159 deletions(-) diff --git a/cpp/src/arrow/util/bit_stream_utils_internal.h b/cpp/src/arrow/util/bit_stream_utils_internal.h index dc8be0c0ae69..674f051f1a00 100644 --- a/cpp/src/arrow/util/bit_stream_utils_internal.h +++ b/cpp/src/arrow/util/bit_stream_utils_internal.h @@ -471,14 +471,14 @@ inline bool BitReader::GetVlqInt(uint32_t* v) { data = buffer_ + (max_bytes_ - max_size); } - auto const read = bit_util::ParseLeadingLEB128(data, max_size, v); - if (ARROW_PREDICT_FALSE(read == 0)) { + const auto bytes_read = bit_util::ParseLeadingLEB128(data, max_size, v); + if (ARROW_PREDICT_FALSE(bytes_read == 0)) { // Corrupt LEB128 return false; } - // Advance for the bytes we have read + the bit we skipped - return Advance((8 * read) + (bit_offset_ % 8)); + // Advance for the bytes we have read + the bits we skipped + return Advance((8 * bytes_read) + (bit_offset_ % 
8)); } inline bool BitWriter::PutZigZagVlqInt(int32_t v) { diff --git a/cpp/src/arrow/util/rle_encoding_internal.h b/cpp/src/arrow/util/rle_encoding_internal.h index a9dd19c2e3f5..b63abd21f060 100644 --- a/cpp/src/arrow/util/rle_encoding_internal.h +++ b/cpp/src/arrow/util/rle_encoding_internal.h @@ -32,8 +32,7 @@ #include "arrow/util/bit_util.h" #include "arrow/util/macros.h" -namespace arrow { -namespace util { +namespace arrow::util { /// Utility classes to do run length encoding (RLE) for fixed bit width values. If runs /// are sufficiently long, RLE is used, otherwise, the values are just bit-packed @@ -85,18 +84,24 @@ namespace util { /// (total 26 bytes, 1 byte overhead) // +template +class RleRunDecoder; + class RleRun { public: - using byte = uint8_t; /// Enough space to store a 64bit value - using raw_data_storage = std::array; - using raw_data_const_pointer = const byte*; + using raw_data_storage = std::array; + using raw_data_const_pointer = const uint8_t*; using raw_data_size_type = int32_t; /// The type of the size of either run, between 1 and 2^31-1 as per Parquet spec using values_count_type = int32_t; /// The type to represent a size in bits using bit_size_type = int32_t; + /// The decoder class used to decode a single run in the given type. + template + using DecoderType = RleRunDecoder; + constexpr RleRun() noexcept = default; constexpr RleRun(RleRun const&) noexcept = default; constexpr RleRun(RleRun&&) noexcept = default; @@ -128,10 +133,12 @@ class RleRun { bit_size_type value_bit_width_ = 0; }; +template +class BitPackedRunDecoder; + class BitPackedRun { public: - using byte = uint8_t; - using raw_data_const_pointer = const byte*; + using raw_data_const_pointer = const uint8_t*; /// According to the Parquet thrift definition the page size can be written into an /// int32_t. 
using raw_data_size_type = int32_t; @@ -140,6 +147,10 @@ class BitPackedRun { /// The type to represent a size in bits using bit_size_type = int32_t; + /// The decoder class used to decode a single run in the given type. + template + using DecoderType = BitPackedRunDecoder; + constexpr BitPackedRun() noexcept = default; constexpr BitPackedRun(BitPackedRun const&) noexcept = default; constexpr BitPackedRun(BitPackedRun&&) noexcept = default; @@ -171,8 +182,7 @@ class BitPackedRun { /// A parser that emits either a ``BitPackedRun`` or a ``RleRun``. class RleBitPackedParser { public: - using byte = uint8_t; - using raw_data_const_pointer = const byte*; + using raw_data_const_pointer = const uint8_t*; /// By Parquet thrift definition the page size can be written into an int32_t. using raw_data_size_type = int32_t; /// The type to represent a size in bits @@ -199,8 +209,8 @@ class RleBitPackedParser { /// Whether there is still runs to iterate over. /// - /// WARN: Due to lack of proper error handling, iteration with Next and Peek could - /// return not data while the parser is not exhausted. + /// WARN: Due to simplistic error handling, iteration with Next and Peek could + /// fail to return data while the parser is not exhausted. /// This is how one can check for errors. [[nodiscard]] bool Exhausted() const; @@ -219,7 +229,7 @@ class RleBitPackedParser { /// variant. /// /// The handler must be of the form - /// ```cpp` + /// ```cpp /// struct Handler { /// ControlFlow OnBitPackedRun(BitPackedRun run); /// @@ -243,9 +253,9 @@ class RleBitPackedParser { std::pair PeekImpl(Handler&&) const; }; -/// Decoder class for RLE encoded data. +/// Decoder class for a single run of RLE encoded data. template -class RleDecoder { +class RleRunDecoder { public: /// The type in which the data should be decoded. 
using value_type = T; @@ -253,9 +263,9 @@ class RleDecoder { using run_type = RleRun; using values_count_type = run_type::values_count_type; - constexpr RleDecoder() noexcept = default; + constexpr RleRunDecoder() noexcept = default; - explicit RleDecoder(run_type const& run) noexcept; + explicit RleRunDecoder(run_type const& run) noexcept; void Reset(run_type const& run) noexcept; @@ -283,9 +293,9 @@ class RleDecoder { "This class makes assumptions about integer endianness and padding"); }; -/// Decoder class for Bit packing encoded data. +/// Decoder class for single run of bit-packed encoded data. template -class BitPackedDecoder { +class BitPackedRunDecoder { public: /// The type in which the data should be decoded. using value_type = T; @@ -294,9 +304,9 @@ class BitPackedDecoder { using values_count_type = run_type::values_count_type; using bit_size_type = run_type::bit_size_type; - BitPackedDecoder() noexcept = default; + BitPackedRunDecoder() noexcept = default; - explicit BitPackedDecoder(run_type const& run) noexcept; + explicit BitPackedRunDecoder(run_type const& run) noexcept; void Reset(run_type const& run) noexcept; @@ -325,13 +335,12 @@ class BitPackedDecoder { "This class makes assumptions about integer endianness and padding"); }; -/// Decoder class for RLE encoded data. +/// Decoder class for Parquet RLE bit-packed data. template class RleBitPackedDecoder { public: /// The type in which the data should be decoded. 
using value_type = T; - using byte = RleBitPackedParser::byte; using raw_data_const_pointer = RleBitPackedParser::raw_data_const_pointer; using raw_data_size_type = RleBitPackedParser::raw_data_size_type; using bit_size_type = RleBitPackedParser::bit_size_type; @@ -373,7 +382,7 @@ class RleBitPackedDecoder { /// Like GetBatch but add spacing for null entries [[nodiscard]] values_count_type GetBatchSpaced(values_count_type batch_size, values_count_type null_count, - const byte* valid_bits, + const uint8_t* valid_bits, int64_t valid_bits_offset, value_type* out); @@ -390,12 +399,12 @@ class RleBitPackedDecoder { template [[nodiscard]] values_count_type GetBatchWithDictSpaced( const V* dictionary, int32_t dictionary_length, V* out, - values_count_type batch_size, values_count_type null_count, const byte* valid_bits, - int64_t valid_bits_offset); + values_count_type batch_size, values_count_type null_count, + const uint8_t* valid_bits, int64_t valid_bits_offset); private: RleBitPackedParser parser_ = {}; - std::variant, BitPackedDecoder> decoder_ = {}; + std::variant, BitPackedRunDecoder> decoder_ = {}; /// Return the number of values that are remaining in the current run. [[nodiscard]] values_count_type RunRemaining() const; @@ -404,8 +413,9 @@ class RleBitPackedDecoder { [[nodiscard]] values_count_type RunGetBatch(value_type* out, values_count_type batch_size); - /// Return the number of values that are remaining in the current run. - [[nodiscard]] bool ParseAndResetDecoder(); + /// Call the parser with a single callable for all event types. + template + void ParseWithCallable(Callable&& func); /// Utility methods for retrieving spaced values. 
template @@ -584,21 +594,15 @@ bool RleBitPackedDecoder::Exhausted() const { } template -bool RleBitPackedDecoder::ParseAndResetDecoder() { - auto dyn_run = parser_.Next(); - if (!dyn_run.has_value()) { - return false; - } - - if (auto* rle_run = std::get_if(dyn_run.operator->())) { - decoder_ = {BitPackedDecoder(*rle_run)}; - return true; - } - - auto* bit_packed_run = std::get_if(dyn_run.operator->()); - ARROW_DCHECK(bit_packed_run); // Only two possibilities in the variant - decoder_ = {RleDecoder(*bit_packed_run)}; - return true; +template +void RleBitPackedDecoder::ParseWithCallable(Callable&& func) { + struct { + Callable func; + auto OnBitPackedRun(BitPackedRun run) { return func(std::move(run)); } + auto OnRleRun(RleRun run) { return func(std::move(run)); } + } handler{std::move(func)}; + + parser_.Parse(std::move(handler)); } template @@ -620,32 +624,16 @@ namespace internal { /// templated) and deduce other types from it. template struct LambdaHandler { - Lambda handlder_; + Lambda handler_; - auto OnBitPackedRun(BitPackedRun run) { return handlder_(std::move(run)); } + auto OnBitPackedRun(BitPackedRun run) { return handler_(std::move(run)); } - auto OnRleRun(RleRun run) { return handlder_(std::move(run)); } + auto OnRleRun(RleRun run) { return handler_(std::move(run)); } }; template LambdaHandler(Lambda) -> LambdaHandler; -template -struct decoder_for; - -template -struct decoder_for { - using type = BitPackedDecoder; -}; - -template -struct decoder_for { - using type = RleDecoder; -}; - -template -using decoder_for_t = typename decoder_for::type; - } // namespace internal template @@ -668,26 +656,24 @@ auto RleBitPackedDecoder::GetBatch(value_type* out, values_count_type batch_s ARROW_DCHECK(RunRemaining() == 0); } - auto handler = internal::LambdaHandler{ - [&](auto run) { - ARROW_DCHECK_LT(values_read, batch_size); - internal::decoder_for_t decoder(run); - auto const read = decoder.GetBatch(out, batch_size - values_read); - ARROW_DCHECK_LE(read, 
batch_size - values_read); - values_read += read; - out += read; - - // Stop reading and store remaining decoder - if (ARROW_PREDICT_FALSE(values_read == batch_size || read == 0)) { - decoder_ = std::move(decoder); - return ControlFlow::Break; - } - - return ControlFlow::Continue; - }, - }; + ParseWithCallable([&](auto run) { + using RunDecoder = typename decltype(run)::template DecoderType; + + ARROW_DCHECK_LT(values_read, batch_size); + RunDecoder decoder(run); + auto const read = decoder.GetBatch(out, batch_size - values_read); + ARROW_DCHECK_LE(read, batch_size - values_read); + values_read += read; + out += read; + + // Stop reading and store remaining decoder + if (ARROW_PREDICT_FALSE(values_read == batch_size || read == 0)) { + decoder_ = std::move(decoder); + return ControlFlow::Break; + } - parser_.Parse(handler); + return ControlFlow::Continue; + }); return values_read; } @@ -769,7 +755,7 @@ template & decoder) + RleRunDecoder& decoder) -> std::pair { ARROW_DCHECK_GT(batch_size, 0); // The equality case is handled in the main loop in GetSpaced @@ -841,7 +827,7 @@ template & decoder) + BitPackedRunDecoder& decoder) -> std::pair { ARROW_DCHECK_GT(batch_size, 0); // The equality case is handled in the main loop in GetSpaced @@ -929,7 +915,7 @@ template , BitPackedDecoder>& decoder) + std::variant, BitPackedRunDecoder>& decoder) -> std::pair { return std::visit( [&](auto& dec) { @@ -944,9 +930,12 @@ auto RunGetSpaced( template template -auto RleBitPackedDecoder::GetSpaced( - Converter converter, typename Converter::out_type* out, values_count_type batch_size, - const byte* validity_bits, int64_t validity_bits_offset, values_count_type null_count) +auto RleBitPackedDecoder::GetSpaced(Converter converter, + typename Converter::out_type* out, + values_count_type batch_size, + const uint8_t* validity_bits, + int64_t validity_bits_offset, + values_count_type null_count) -> values_count_type { using ControlFlow = RleBitPackedParser::ControlFlow; @@ -995,29 +984,27 
@@ auto RleBitPackedDecoder::GetSpaced( ARROW_DCHECK(RunRemaining() == 0); } - auto handler = internal::LambdaHandler{ - [&](auto run) { - internal::decoder_for_t decoder(run); + ParseWithCallable([&](auto run) { + using RunDecoder = typename decltype(run)::template DecoderType; - auto const [values_read, null_read] = internal::RunGetSpaced( - converter, out, batch.TotalRemaining(), batch.NullRemaining(), - validity_reader, validity_run, decoder); + RunDecoder decoder(run); - batch.AccrueReadNulls(null_read); - batch.AccrueReadValues(values_read); - out += values_read + null_read; + const auto [values_read, null_read] = internal::RunGetSpaced( + converter, out, batch.TotalRemaining(), batch.NullRemaining(), validity_reader, + validity_run, decoder); - // Stop reading and store remaining decoder - if (ARROW_PREDICT_FALSE(values_read == 0 || batch.ValuesRemaining() == 0)) { - decoder_ = std::move(decoder); - return ControlFlow::Break; - } + batch.AccrueReadNulls(null_read); + batch.AccrueReadValues(values_read); + out += values_read + null_read; - return ControlFlow::Continue; - }, - }; + // Stop reading and store remaining decoder + if (ARROW_PREDICT_FALSE(values_read == 0 || batch.ValuesRemaining() == 0)) { + decoder_ = std::move(decoder); + return ControlFlow::Break; + } - parser_.Parse(handler); + return ControlFlow::Continue; + }); // There may be remaining null if they are not greedily filled by either decoder calls check_and_handle_fully_null_remaining(); @@ -1065,7 +1052,7 @@ struct NoOpConverter { template auto RleBitPackedDecoder::GetBatchSpaced(values_count_type batch_size, values_count_type null_count, - const byte* valid_bits, + const uint8_t* valid_bits, int64_t valid_bits_offset, value_type* out) -> values_count_type { if (null_count == 0) { @@ -1186,29 +1173,27 @@ auto RleBitPackedDecoder::GetBatchWithDict(const V* dictionary, ARROW_DCHECK(RunRemaining() == 0); } - auto handler = internal::LambdaHandler{ - [&](auto run) { - internal::decoder_for_t 
decoder(run); + ParseWithCallable([&](auto run) { + using RunDecoder = typename decltype(run)::template DecoderType; - auto const [run_values_read, run_null_read] = internal::RunGetSpaced( - converter, out, batch_values_remaining(), /* null_count= */ 0, - validity_reader, validity_run, decoder); + RunDecoder decoder(run); - ARROW_DCHECK_EQ(run_null_read, 0); - values_read += run_values_read; - out += run_values_read; + auto const [run_values_read, run_null_read] = internal::RunGetSpaced( + converter, out, batch_values_remaining(), /* null_count= */ 0, validity_reader, + validity_run, decoder); - // Stop reading and store remaining decoder - if (ARROW_PREDICT_FALSE(run_values_read == 0 || values_read == batch_size)) { - decoder_ = std::move(decoder); - return ControlFlow::Break; - } + ARROW_DCHECK_EQ(run_null_read, 0); + values_read += run_values_read; + out += run_values_read; - return ControlFlow::Continue; - }, - }; + // Stop reading and store remaining decoder + if (ARROW_PREDICT_FALSE(run_values_read == 0 || values_read == batch_size)) { + decoder_ = std::move(decoder); + return ControlFlow::Break; + } - parser_.Parse(handler); + return ControlFlow::Continue; + }); return values_read; } @@ -1414,12 +1399,12 @@ void RleBitPackedParser::Parse(Handler&& handler) { ****************/ template -RleDecoder::RleDecoder(run_type const& run) noexcept { +RleRunDecoder::RleRunDecoder(run_type const& run) noexcept { Reset(run); } template -void RleDecoder::Reset(run_type const& run) noexcept { +void RleRunDecoder::Reset(run_type const& run) noexcept { remaining_count_ = run.ValuesCount(); if constexpr (std::is_same_v) { // ARROW-18031: just check the LSB of the next byte and move on. 
@@ -1435,29 +1420,29 @@ void RleDecoder::Reset(run_type const& run) noexcept { } template -auto RleDecoder::Remaining() const -> values_count_type { +auto RleRunDecoder::Remaining() const -> values_count_type { return remaining_count_; } template -auto constexpr RleDecoder::Value() const -> value_type { +auto constexpr RleRunDecoder::Value() const -> value_type { return value_; } template -auto RleDecoder::Advance(values_count_type batch_size) -> values_count_type { +auto RleRunDecoder::Advance(values_count_type batch_size) -> values_count_type { auto const steps = std::min(batch_size, remaining_count_); remaining_count_ -= steps; return steps; } template -constexpr bool RleDecoder::Get(value_type* out_value) { +constexpr bool RleRunDecoder::Get(value_type* out_value) { return GetBatch(out_value, 1) == 1; } template -auto RleDecoder::GetBatch(value_type* out, values_count_type batch_size) +auto RleRunDecoder::GetBatch(value_type* out, values_count_type batch_size) -> values_count_type { if (ARROW_PREDICT_FALSE(remaining_count_ == 0)) { return 0; @@ -1474,12 +1459,12 @@ auto RleDecoder::GetBatch(value_type* out, values_count_type batch_size) **********************/ template -BitPackedDecoder::BitPackedDecoder(run_type const& run) noexcept { +BitPackedRunDecoder::BitPackedRunDecoder(run_type const& run) noexcept { Reset(run); } template -void BitPackedDecoder::Reset(run_type const& run) noexcept { +void BitPackedRunDecoder::Reset(run_type const& run) noexcept { value_bit_width_ = run.ValuesBitWidth(); remaining_count_ = run.ValuesCount(); ARROW_DCHECK_GE(value_bit_width_, 0); @@ -1488,17 +1473,17 @@ void BitPackedDecoder::Reset(run_type const& run) noexcept { } template -auto constexpr BitPackedDecoder::Remaining() const -> values_count_type { +auto constexpr BitPackedRunDecoder::Remaining() const -> values_count_type { return remaining_count_; } template -auto constexpr BitPackedDecoder::ValueBitWidth() const -> bit_size_type { +auto constexpr 
BitPackedRunDecoder::ValueBitWidth() const -> bit_size_type { return value_bit_width_; } template -auto BitPackedDecoder::Advance(values_count_type batch_size) -> values_count_type { +auto BitPackedRunDecoder::Advance(values_count_type batch_size) -> values_count_type { auto const steps = std::min(batch_size, remaining_count_); if (bit_reader_.Advance(steps * value_bit_width_)) { remaining_count_ -= steps; @@ -1508,12 +1493,12 @@ auto BitPackedDecoder::Advance(values_count_type batch_size) -> values_count_ } template -bool BitPackedDecoder::Get(value_type* out_value) { +bool BitPackedRunDecoder::Get(value_type* out_value) { return GetBatch(out_value, 1) == 1; } template -auto BitPackedDecoder::GetBatch(value_type* out, values_count_type batch_size) +auto BitPackedRunDecoder::GetBatch(value_type* out, values_count_type batch_size) -> values_count_type { if (ARROW_PREDICT_FALSE(remaining_count_ == 0)) { return 0; @@ -1684,5 +1669,4 @@ inline void RleBitPackedEncoder::Clear() { bit_writer_.Clear(); } -} // namespace util -} // namespace arrow +} // namespace arrow::util diff --git a/cpp/src/arrow/util/rle_encoding_test.cc b/cpp/src/arrow/util/rle_encoding_test.cc index 3f6fbfa313c3..344462a1bbf8 100644 --- a/cpp/src/arrow/util/rle_encoding_test.cc +++ b/cpp/src/arrow/util/rle_encoding_test.cc @@ -26,7 +26,6 @@ #include "arrow/array.h" #include "arrow/array/concatenate.h" -#include "arrow/buffer.h" #include "arrow/scalar.h" #include "arrow/testing/random.h" #include "arrow/type.h" @@ -35,8 +34,7 @@ #include "arrow/util/io_util.h" #include "arrow/util/rle_encoding_internal.h" -namespace arrow { -namespace util { +namespace arrow::util { const int MAX_WIDTH = 32; @@ -212,7 +210,7 @@ TEST(BitUtil, RoundTripIntValues) { /// A Rle run is a simple class owning some data and a repetition count. /// It does not know how to read such data. 
TEST(Rle, RleRun) { - const std::array value = {21, 2, 0, 0}; + const std::array value = {21, 2, 0, 0}; RleRun::values_count_type value_count = 12; @@ -253,7 +251,7 @@ TEST(Rle, RleRun) { /// A BitPacked run is a simple class owning some data and its size. /// It does not know how to read such data. TEST(BitPacked, BitPackedRun) { - const std::array value = {0b10101010, 0, 0, 0b1111111}; + const std::array value = {0b10101010, 0, 0, 0b1111111}; /// 16 values of 1 bit for a total of 16 bits BitPackedRun::values_count_type value_count_1 = 16; @@ -277,8 +275,7 @@ TEST(BitPacked, BitPackedRun) { } template -void TestRleDecoder(std::vector bytes, - RleRun::values_count_type value_count, +void TestRleDecoder(std::vector bytes, RleRun::values_count_type value_count, RleRun::bit_size_type bit_width) { // Pre-requisite for this test EXPECT_GT(value_count, 6); @@ -291,7 +288,7 @@ void TestRleDecoder(std::vector bytes, auto const run = RleRun(bytes.data(), value_count, bit_width); - auto decoder = RleDecoder(run); + auto decoder = RleRunDecoder(run); std::vector vals = {0, 0}; EXPECT_EQ(decoder.Remaining(), value_count); @@ -337,7 +334,7 @@ TEST(Rle, RleDecoder) { } template -void TestBitPackedDecoder(std::vector bytes, +void TestBitPackedDecoder(std::vector bytes, BitPackedRun::values_count_type value_count, BitPackedRun::bit_size_type bit_width, std::vector expected) { @@ -346,7 +343,7 @@ void TestBitPackedDecoder(std::vector bytes, auto const run = BitPackedRun(bytes.data(), value_count, bit_width); - auto decoder = BitPackedDecoder(run); + auto decoder = BitPackedRunDecoder(run); std::vector vals = {0, 0}; EXPECT_EQ(decoder.Remaining(), value_count); @@ -411,7 +408,7 @@ TEST(BitPacked, BitPackedDecoder) { } template -void TestRleBitPackedParser(std::vector bytes, +void TestRleBitPackedParser(std::vector bytes, RleBitPackedParser::bit_size_type bit_width, std::vector expected) { auto parser = RleBitPackedParser( @@ -433,8 +430,8 @@ void TestRleBitPackedParser(std::vector 
bytes, // Try to decode all data of all runs in the decoded vector decltype(expected) decoded = {}; - auto rle_decoder = RleDecoder(); - auto bit_packed_decoder = BitPackedDecoder(); + auto rle_decoder = RleRunDecoder(); + auto bit_packed_decoder = BitPackedRunDecoder(); // Iterate over all runs while (auto run = parser.Next()) { EXPECT_TRUE(run.has_value()); @@ -1134,5 +1131,4 @@ TEST(RleBitPacked, GetBatchSpacedRoundtripUint64) { DoTestGetBatchSpacedRoundtrip(); } -} // namespace util -} // namespace arrow +} // namespace arrow::util From 2b80bc615fee514f2853ad268bce829de282c797 Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Wed, 10 Sep 2025 11:47:15 +0200 Subject: [PATCH 36/56] Test callback parsing --- cpp/src/arrow/util/rle_encoding_test.cc | 48 +++++++++++++++---------- 1 file changed, 30 insertions(+), 18 deletions(-) diff --git a/cpp/src/arrow/util/rle_encoding_test.cc b/cpp/src/arrow/util/rle_encoding_test.cc index 344462a1bbf8..2e3ef46be2f7 100644 --- a/cpp/src/arrow/util/rle_encoding_test.cc +++ b/cpp/src/arrow/util/rle_encoding_test.cc @@ -432,30 +432,42 @@ void TestRleBitPackedParser(std::vector bytes, decltype(expected) decoded = {}; auto rle_decoder = RleRunDecoder(); auto bit_packed_decoder = BitPackedRunDecoder(); - // Iterate over all runs - while (auto run = parser.Next()) { - EXPECT_TRUE(run.has_value()); - if (std::holds_alternative(run.value())) { - rle_decoder.Reset(std::get(run.value())); + struct { + decltype(rle_decoder)* rle_decoder_ptr_; + decltype(bit_packed_decoder)* bit_packed_decoder_ptr_; + decltype(decoded)* decoded_ptr_; + + auto OnRleRun(RleRun run) { + rle_decoder_ptr_->Reset(run); - auto const n_decoded = decoded.size(); - auto const n_to_decode = rle_decoder.Remaining(); - decoded.resize(n_decoded + n_to_decode); - EXPECT_EQ(rle_decoder.GetBatch(decoded.data() + n_decoded, n_to_decode), + auto const n_decoded = decoded_ptr_->size(); + auto const n_to_decode = rle_decoder_ptr_->Remaining(); + decoded_ptr_->resize(n_decoded + 
n_to_decode); + EXPECT_EQ(rle_decoder_ptr_->GetBatch(decoded_ptr_->data() + n_decoded, n_to_decode), n_to_decode); - EXPECT_EQ(rle_decoder.Remaining(), 0); - } else { - bit_packed_decoder.Reset(std::get(run.value())); + EXPECT_EQ(rle_decoder_ptr_->Remaining(), 0); + + return RleBitPackedParser::ControlFlow::Continue; + } - auto const n_decoded = decoded.size(); - auto const n_to_decode = bit_packed_decoder.Remaining(); - decoded.resize(n_decoded + n_to_decode); - EXPECT_EQ(bit_packed_decoder.GetBatch(decoded.data() + n_decoded, n_to_decode), + auto OnBitPackedRun(BitPackedRun run) { + bit_packed_decoder_ptr_->Reset(run); + + auto const n_decoded = decoded_ptr_->size(); + auto const n_to_decode = bit_packed_decoder_ptr_->Remaining(); + decoded_ptr_->resize(n_decoded + n_to_decode); + EXPECT_EQ(bit_packed_decoder_ptr_->GetBatch(decoded_ptr_->data() + n_decoded, + n_to_decode), n_to_decode); - EXPECT_EQ(bit_packed_decoder.Remaining(), 0); + EXPECT_EQ(bit_packed_decoder_ptr_->Remaining(), 0); + + return RleBitPackedParser::ControlFlow::Continue; } - } + } handler{&rle_decoder, &bit_packed_decoder, &decoded}; + + // Iterate over all runs + parser.Parse(handler); EXPECT_TRUE(parser.Exhausted()); EXPECT_EQ(decoded.size(), expected.size()); From d3ee424cf1f8ef481576777a4b6dac9495b53018 Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Wed, 10 Sep 2025 11:59:57 +0200 Subject: [PATCH 37/56] Remove parser iteration API --- cpp/src/arrow/util/rle_encoding_internal.h | 63 ---------------------- cpp/src/arrow/util/rle_encoding_test.cc | 12 ----- 2 files changed, 75 deletions(-) diff --git a/cpp/src/arrow/util/rle_encoding_internal.h b/cpp/src/arrow/util/rle_encoding_internal.h index b63abd21f060..302773dd12b9 100644 --- a/cpp/src/arrow/util/rle_encoding_internal.h +++ b/cpp/src/arrow/util/rle_encoding_internal.h @@ -23,7 +23,6 @@ #include #include #include -#include #include #include @@ -198,15 +197,6 @@ class RleBitPackedParser { constexpr void Reset(raw_data_const_pointer data, 
raw_data_size_type data_size, bit_size_type value_bit_width_) noexcept; - /// Get the current run with a small parsing cost without advancing the iteration. - [[nodiscard]] std::optional Peek() const; - - /// Move to the next run. - [[nodiscard]] bool Advance(); - - /// Advance and return the current run. - [[nodiscard]] std::optional Next(); - /// Whether there is still runs to iterate over. /// /// WARN: Due to simplistic error handling, iteration with Next and Peek could @@ -616,26 +606,6 @@ bool RleBitPackedDecoder::Get(value_type* val) { return GetBatch(val, 1) == 1; } -namespace internal { - -/// A ``Parse`` handler that calls a single lambda. -/// -/// This lambda would typically take the input run as ``auto run`` (i.e. the lambda is -/// templated) and deduce other types from it. -template -struct LambdaHandler { - Lambda handler_; - - auto OnBitPackedRun(BitPackedRun run) { return handler_(std::move(run)); } - - auto OnRleRun(RleRun run) { return handler_(std::move(run)); } -}; - -template -LambdaHandler(Lambda) -> LambdaHandler; - -} // namespace internal - template auto RleBitPackedDecoder::GetBatch(value_type* out, values_count_type batch_size) -> values_count_type { @@ -1291,39 +1261,6 @@ constexpr void RleBitPackedParser::Reset(raw_data_const_pointer data, value_bit_width_ = value_bit_width; } -inline auto RleBitPackedParser::Peek() const -> std::optional { - if (ARROW_PREDICT_FALSE(Exhausted())) { - return {}; - } - - auto out = std::optional{}; - auto handler = internal::LambdaHandler{[&](auto run) { - out = run; - return ControlFlow::Break; - }}; - PeekImpl(handler); - return out; -} - -inline auto RleBitPackedParser::Next() -> std::optional { - if (ARROW_PREDICT_FALSE(Exhausted())) { - return {}; - } - - auto out = std::optional{}; - auto handler = internal::LambdaHandler{[&](auto run) { - out = run; - return ControlFlow::Break; - }}; - PeekImpl(handler); - auto [read, control] = PeekImpl(handler); - data_ += read; - data_size_ -= read; - return 
out; -} - -inline bool RleBitPackedParser::Advance() { return Next().has_value(); } - inline bool RleBitPackedParser::Exhausted() const { return data_size_ == 0; } template diff --git a/cpp/src/arrow/util/rle_encoding_test.cc b/cpp/src/arrow/util/rle_encoding_test.cc index 2e3ef46be2f7..7aa5b4d04d1b 100644 --- a/cpp/src/arrow/util/rle_encoding_test.cc +++ b/cpp/src/arrow/util/rle_encoding_test.cc @@ -416,18 +416,6 @@ void TestRleBitPackedParser(std::vector bytes, bit_width); EXPECT_FALSE(parser.Exhausted()); - // Peek return the same data - auto run1 = parser.Peek(); - EXPECT_TRUE(run1.has_value()); - auto run2 = parser.Peek(); - EXPECT_TRUE(run2.has_value()); - auto ptr1 = std::visit([](auto const& r) { return r.RawDataPtr(); }, run1.value()); - auto size1 = std::visit([](auto const& r) { return r.RawDataSize(); }, run1.value()); - auto ptr2 = std::visit([](auto const& r) { return r.RawDataPtr(); }, run2.value()); - auto size2 = std::visit([](auto const& r) { return r.RawDataSize(); }, run2.value()); - EXPECT_TRUE(std::equal(ptr1, ptr1 + size1, ptr2, ptr2 + size2)); - EXPECT_FALSE(parser.Exhausted()); - // Try to decode all data of all runs in the decoded vector decltype(expected) decoded = {}; auto rle_decoder = RleRunDecoder(); From 365dd8802d133e14bda0117ceb6c34f1d0c53253 Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Wed, 10 Sep 2025 14:30:56 +0200 Subject: [PATCH 38/56] Add missing plug of LEB128 in BitReader/BitWriter --- .../arrow/util/bit_stream_utils_internal.h | 149 +++++++----------- cpp/src/arrow/util/bit_util_test.cc | 8 +- cpp/src/arrow/util/rle_encoding_internal.h | 2 +- 3 files changed, 61 insertions(+), 98 deletions(-) diff --git a/cpp/src/arrow/util/bit_stream_utils_internal.h b/cpp/src/arrow/util/bit_stream_utils_internal.h index 674f051f1a00..1673a1c8d20c 100644 --- a/cpp/src/arrow/util/bit_stream_utils_internal.h +++ b/cpp/src/arrow/util/bit_stream_utils_internal.h @@ -22,6 +22,7 @@ #include #include #include +#include #include 
"arrow/util/bit_util.h" #include "arrow/util/bpacking_internal.h" @@ -30,8 +31,7 @@ #include "arrow/util/macros.h" #include "arrow/util/ubsan.h" -namespace arrow { -namespace bit_util { +namespace arrow::bit_util { /// Utility class to write bit/byte streams. This class can write data to either be /// bit packed or byte aligned (and a single stream that has a mix of both). @@ -73,19 +73,14 @@ class BitWriter { /// room. The value is written byte aligned. /// For more details on vlq: /// en.wikipedia.org/wiki/Variable-length_quantity - bool PutVlqInt(uint32_t v); + template + bool PutVlqInt(Int v); - // Writes an int zigzag encoded. - bool PutZigZagVlqInt(int32_t v); - - /// Write a Vlq encoded int64 to the buffer. Returns false if there was not enough - /// room. The value is written byte aligned. - /// For more details on vlq: - /// en.wikipedia.org/wiki/Variable-length_quantity - bool PutVlqInt(uint64_t v); - - // Writes an int64 zigzag encoded. - bool PutZigZagVlqInt(int64_t v); + /// Writes a zigzag encoded signed integer. + /// Zigzag encoding is used to encode possibly negative numbers by alternating positive + /// and negative ones. + template + bool PutZigZagVlqInt(Int v); /// Get a pointer to the next aligned byte and advance the underlying buffer /// by num_bytes. @@ -169,18 +164,14 @@ class BitReader { /// Reads a vlq encoded int from the stream. The encoded int must start at /// the beginning of a byte. Return false if there were not enough bytes in /// the buffer. - bool GetVlqInt(uint32_t* v); + template + bool GetVlqInt(Int* v); - // Reads a zigzag encoded int `into` v. - bool GetZigZagVlqInt(int32_t* v); - - /// Reads a vlq encoded int64 from the stream. The encoded int must start at - /// the beginning of a byte. Return false if there were not enough bytes in - /// the buffer. - bool GetVlqInt(uint64_t* v); - - // Reads a zigzag encoded int64 `into` v. 
- bool GetZigZagVlqInt(int64_t* v); + /// Reads a zigzag encoded integer into a signed integer output v. + /// Zigzag encoding is used to decode possibly negative numbers by alternating positive + /// and negative ones. + template + bool GetZigZagVlqInt(Int* v); /// Returns the number of bytes left in the stream, not including the current /// byte (i.e., there may be an additional fraction of a byte). @@ -189,12 +180,6 @@ class BitReader { (byte_offset_ + static_cast(bit_util::BytesForBits(bit_offset_))); } - /// Maximum byte length of a vlq encoded int - static constexpr int kMaxVlqByteLengthForInt32 = kMaxLEB128ByteLenFor; - - /// Maximum byte length of a vlq encoded int64 - static constexpr int kMaxVlqByteLengthForInt64 = kMaxLEB128ByteLenFor; - private: const uint8_t* buffer_; int max_bytes_; @@ -439,17 +424,31 @@ inline bool BitReader::Advance(int64_t num_bits) { return true; } -inline bool BitWriter::PutVlqInt(uint32_t v) { - bool result = true; - while ((v & 0xFFFFFF80UL) != 0UL) { - result &= PutAligned(static_cast((v & 0x7F) | 0x80), 1); - v >>= 7; +template +inline bool BitWriter::PutVlqInt(Int v) { + static_assert(std::is_integral_v); + + constexpr auto kBufferSize = kMaxLEB128ByteLenFor; + + uint8_t buffer[kBufferSize] = {}; + const auto bytes_written = WriteLEB128(v, buffer, kBufferSize); + ARROW_DCHECK_LE(bytes_written, kBufferSize); + ARROW_DCHECK_GT(bytes_written, 0); // Cannot fail since we gave max space + + for (int i = 0; i < bytes_written; ++i) { + const bool success = PutAligned(buffer[i], 1); + if (ARROW_PREDICT_FALSE(!success)) { + return false; + } } - result &= PutAligned(static_cast(v & 0x7F), 1); - return result; + + return true; } -inline bool BitReader::GetVlqInt(uint32_t* v) { +template +inline bool BitReader::GetVlqInt(Int* v) { + static_assert(std::is_integral_v); + // The data that we will pass to the LEB128 parser // In all case, we read an byte-aligned value, skipping remaining bits const uint8_t* data = NULLPTR; @@ -461,7 +460,7 
@@ inline bool BitReader::GetVlqInt(uint32_t* v) { sizeof(buffered_values_) - static_cast(bit_util::BytesForBits(bit_offset_)); // If there are clearly enough bytes left we can try to parse from the cache - if (bytes_left_in_cache >= kMaxVlqByteLengthForInt32) { + if (bytes_left_in_cache >= kMaxLEB128ByteLenFor) { max_size = bytes_left_in_cache; data = reinterpret_cast(&buffered_values_) + bit_util::BytesForBits(bit_offset_); @@ -481,66 +480,28 @@ inline bool BitReader::GetVlqInt(uint32_t* v) { return Advance((8 * bytes_read) + (bit_offset_ % 8)); } -inline bool BitWriter::PutZigZagVlqInt(int32_t v) { - uint32_t u_v = ::arrow::util::SafeCopy(v); - u_v = (u_v << 1) ^ static_cast(v >> 31); - return PutVlqInt(u_v); -} +template +inline bool BitWriter::PutZigZagVlqInt(Int v) { + static_assert(std::is_integral_v); + static_assert(std::is_signed_v); + using UInt = std::make_unsigned_t; + constexpr auto kBitSize = 8 * sizeof(Int); -inline bool BitReader::GetZigZagVlqInt(int32_t* v) { - uint32_t u; - if (!GetVlqInt(&u)) return false; - u = (u >> 1) ^ (~(u & 1) + 1); - *v = ::arrow::util::SafeCopy(u); - return true; -} - -inline bool BitWriter::PutVlqInt(uint64_t v) { - constexpr auto kMaxBytes = bit_util::kMaxLEB128ByteLenFor; - - uint8_t leb128[kMaxBytes] = {}; - auto const bytes_written = bit_util::WriteLEB128(v, leb128, kMaxBytes); - ARROW_DCHECK_NE(bytes_written, 0); - - if (auto* out = GetNextBytePtr(bytes_written)) { - std::memcpy(out, leb128, bytes_written); - return true; - } - return false; -} - -inline bool BitReader::GetVlqInt(uint64_t* v) { - uint64_t tmp = 0; - - for (int i = 0; i < kMaxVlqByteLengthForInt64; i++) { - uint8_t byte = 0; - if (ARROW_PREDICT_FALSE(!GetAligned(1, &byte))) { - return false; - } - tmp |= static_cast(byte & 0x7F) << (7 * i); - - if ((byte & 0x80) == 0) { - *v = tmp; - return true; - } - } - - return false; -} - -inline bool BitWriter::PutZigZagVlqInt(int64_t v) { - uint64_t u_v = ::arrow::util::SafeCopy(v); - u_v = (u_v << 1) ^ 
static_cast(v >> 63); + UInt u_v = ::arrow::util::SafeCopy(v); + u_v = (u_v << 1) ^ static_cast(v >> (kBitSize - 1)); return PutVlqInt(u_v); } -inline bool BitReader::GetZigZagVlqInt(int64_t* v) { - uint64_t u; +template +inline bool BitReader::GetZigZagVlqInt(Int* v) { + static_assert(std::is_integral_v); + static_assert(std::is_signed_v); + + std::make_unsigned_t u; if (!GetVlqInt(&u)) return false; u = (u >> 1) ^ (~(u & 1) + 1); - *v = ::arrow::util::SafeCopy(u); + *v = ::arrow::util::SafeCopy(u); return true; } -} // namespace bit_util -} // namespace arrow +} // namespace arrow::bit_util diff --git a/cpp/src/arrow/util/bit_util_test.cc b/cpp/src/arrow/util/bit_util_test.cc index afa9614e6d7f..0dc629118789 100644 --- a/cpp/src/arrow/util/bit_util_test.cc +++ b/cpp/src/arrow/util/bit_util_test.cc @@ -2115,7 +2115,7 @@ TEST(BitStreamUtil, LEB128) { } static void TestZigZag(int32_t v, std::array buffer_expect) { - uint8_t buffer[bit_util::BitReader::kMaxVlqByteLengthForInt32] = {}; + uint8_t buffer[bit_util::kMaxLEB128ByteLenFor] = {}; bit_util::BitWriter writer(buffer, sizeof(buffer)); writer.PutZigZagVlqInt(v); // WARNING: The reader reads and caches the input when created, so it must be created @@ -2139,10 +2139,12 @@ TEST(BitStreamUtil, ZigZag) { } static void TestZigZag64(int64_t v, std::array buffer_expect) { - uint8_t buffer[bit_util::BitReader::kMaxVlqByteLengthForInt64] = {}; + uint8_t buffer[bit_util::kMaxLEB128ByteLenFor] = {}; bit_util::BitWriter writer(buffer, sizeof(buffer)); - bit_util::BitReader reader(buffer, sizeof(buffer)); writer.PutZigZagVlqInt(v); + // WARNING: The reader reads and caches the input when created, so it must be created + // after the data is written in the buffer. 
+ bit_util::BitReader reader(buffer, sizeof(buffer)); EXPECT_THAT(buffer, testing::ElementsAreArray(buffer_expect)); int64_t result = 0; EXPECT_TRUE(reader.GetZigZagVlqInt(&result)); diff --git a/cpp/src/arrow/util/rle_encoding_internal.h b/cpp/src/arrow/util/rle_encoding_internal.h index 302773dd12b9..d08e28302f16 100644 --- a/cpp/src/arrow/util/rle_encoding_internal.h +++ b/cpp/src/arrow/util/rle_encoding_internal.h @@ -449,7 +449,7 @@ class RleBitPackedEncoder { MAX_VALUES_PER_LITERAL_RUN * bit_width)); /// Up to kMaxVlqByteLength indicator and a single 'bit_width' value. int max_repeated_run_size = - ::arrow::bit_util::BitReader::kMaxVlqByteLengthForInt32 + + bit_util::kMaxLEB128ByteLenFor + static_cast(::arrow::bit_util::BytesForBits(bit_width)); return std::max(max_literal_run_size, max_repeated_run_size); } From 2edec8670954acaae26c31d993fb52839c1a4b43 Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Wed, 10 Sep 2025 15:37:57 +0200 Subject: [PATCH 39/56] Better handle signed/negative values in LEB128 --- cpp/src/arrow/util/bit_util.h | 8 +++ cpp/src/arrow/util/bit_util_test.cc | 90 +++++++++++++++++------------ 2 files changed, 60 insertions(+), 38 deletions(-) diff --git a/cpp/src/arrow/util/bit_util.h b/cpp/src/arrow/util/bit_util.h index c0f501c433f0..e19d2f80bb2a 100644 --- a/cpp/src/arrow/util/bit_util.h +++ b/cpp/src/arrow/util/bit_util.h @@ -376,6 +376,7 @@ constexpr int64_t kMaxLEB128ByteLenFor = MaxLEB128ByteLen(sizeof(Int) * 8); /// written. /// If the output buffer size is insufficient, return 0 but the output may have been /// written to. +/// The input value can be a signed integer, but must be non negative. 
/// /// \see https://en.wikipedia.org/wiki/LEB128 /// \see MaxLEB128ByteLenFor @@ -385,6 +386,13 @@ constexpr int32_t WriteLEB128(Int value, uint8_t* out, int32_t max_out_size) { constexpr Int kHigh7Mask = ~kLow7Mask; constexpr uint8_t kContinuationBit = 0x80; + // This encoding does not work for negative values + if constexpr (std::is_signed_v) { + if (ARROW_PREDICT_FALSE(value < 0)) { + return 0; + } + } + const auto out_first = out; // Write as many bytes as we could be for the given input diff --git a/cpp/src/arrow/util/bit_util_test.cc b/cpp/src/arrow/util/bit_util_test.cc index 0dc629118789..d975a7059d34 100644 --- a/cpp/src/arrow/util/bit_util_test.cc +++ b/cpp/src/arrow/util/bit_util_test.cc @@ -2030,39 +2030,55 @@ void TestLEB128Encode(Int input_value, std::vector const& expected_data /// Test encoding to known LEB128 byte sequences TEST(WriteLEB128Test, KnownArrayValues) { // Single byte value 0 - TestLEB128Encode(0U, std::vector{0x00}, 1); + TestLEB128Encode(0U, {0x00}, 1); // Single byte value 127 - TestLEB128Encode(127U, std::vector{0x7F}, 1); + TestLEB128Encode(127U, {0x7F}, 1); // Two byte value 128 - TestLEB128Encode(128U, std::vector{0x80, 0x01}, 2); + TestLEB128Encode(128U, {0x80, 0x01}, 2); + // Two byte value 128 as signed type + TestLEB128Encode(128, {0x80, 0x01}, 2); // Two byte value 300 - TestLEB128Encode(300U, std::vector{0xAC, 0x02}, 2); + TestLEB128Encode(300U, {0xAC, 0x02}, 2); // Three byte value 16384 - TestLEB128Encode(16384U, std::vector{0x80, 0x80, 0x01}, 3); + TestLEB128Encode(16384U, {0x80, 0x80, 0x01}, 3); // Four byte value 268435455 - TestLEB128Encode(268435455U, std::vector{0xFF, 0xFF, 0xFF, 0x7F}, 4); + TestLEB128Encode(268435455U, {0xFF, 0xFF, 0xFF, 0x7F}, 4); + // Two bytes uint8_t max value + TestLEB128Encode(std::numeric_limits::max(), {0xFF, 0x01}, 2); + // One bytes int8_t max value + TestLEB128Encode(std::numeric_limits::max(), {0x7F}, 2); + // Three bytes uint16_t max value + 
TestLEB128Encode(std::numeric_limits::max(), {0xFF, 0xFF, 0x03}, 3); + // Three bytes int16_t max value + TestLEB128Encode(std::numeric_limits::max(), {0xFF, 0xFF, 0x01}, 3); // Five byte uint32_t max value - TestLEB128Encode(4294967295U, std::vector{0xFF, 0xFF, 0xFF, 0xFF, 0x0F}, 5); - // uint64_t value requiring 10 bytes - TestLEB128Encode( - 18446744073709551615ULL, - std::vector{0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x01}, - 10); + TestLEB128Encode(std::numeric_limits::max(), {0xFF, 0xFF, 0xFF, 0xFF, 0x0F}, + 5); + // Five byte int32_t max value + TestLEB128Encode(std::numeric_limits::max(), {0xFF, 0xFF, 0xFF, 0xFF, 0x7}, 5); + // Ten bytes uint64_t max value + TestLEB128Encode(std::numeric_limits::max(), + {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x01}, 10); + // Nine bytes int64_t max value + TestLEB128Encode(std::numeric_limits::max(), + {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x7F}, 10); // Three byte value 16384, encoded in larger buffer - TestLEB128Encode(16384U, std::vector{0x80, 0x80, 0x01}, 10); + TestLEB128Encode(16384U, {0x80, 0x80, 0x01}, 10); // Various single byte values - TestLEB128Encode(1U, std::vector{0x01}, 1); - TestLEB128Encode(63U, std::vector{0x3F}, 1); - TestLEB128Encode(64U, std::vector{0x40}, 1); + TestLEB128Encode(1U, {0x01}, 1); + TestLEB128Encode(63U, {0x3F}, 1); + TestLEB128Encode(64U, {0x40}, 1); // Two byte boundary values - TestLEB128Encode(129U, std::vector{0x81, 0x01}, 2); - TestLEB128Encode(16383U, std::vector{0xFF, 0x7F}, 2); + TestLEB128Encode(129U, {0x81, 0x01}, 2); + TestLEB128Encode(16383U, {0xFF, 0x7F}, 2); // Error case: Buffer too small for value 128 (needs 2 bytes but only 1 provided) - TestLEB128Encode(128U, std::vector{}, 1); + TestLEB128Encode(128U, {}, 1); // Error case: Buffer too small for uint32_t max (needs 5 bytes but only 4 provided) - TestLEB128Encode(4294967295U, std::vector{}, 4); + TestLEB128Encode(4294967295U, {}, 4); // Error case: Zero buffer size - 
TestLEB128Encode(52U, std::vector{}, 0); + TestLEB128Encode(52U, {}, 0); + // Error case: Negative value + TestLEB128Encode(-3, {}, 1); } /// Utility function to test LEB128 decoding with known byte array and expected result @@ -2081,37 +2097,35 @@ void TestLEB128Decode(std::vector const& data, Int expected_value, /// Test decoding from known LEB128 byte sequences TEST(BitStreamUtil, LEB128) { // Single byte value 0 - TestLEB128Decode(std::vector{0x00}, 0U, 1); + TestLEB128Decode({0x00}, 0U, 1); // Single byte value 127 - TestLEB128Decode(std::vector{0x7F}, 127U, 1); + TestLEB128Decode({0x7F}, 127U, 1); // Two byte value 128 - TestLEB128Decode(std::vector{0x80, 0x01}, 128U, 2); + TestLEB128Decode({0x80, 0x01}, 128U, 2); // Two byte value 300 - TestLEB128Decode(std::vector{0xAC, 0x02}, 300U, 2); + TestLEB128Decode({0xAC, 0x02}, 300U, 2); // Three byte value 16384 - TestLEB128Decode(std::vector{0x80, 0x80, 0x01}, 16384U, 3); + TestLEB128Decode({0x80, 0x80, 0x01}, 16384U, 3); // Three byte value 16384, with remaining data - TestLEB128Decode(std::vector{0x80, 0x80, 0x01, 0x80, 0x00}, 16384U, 3); + TestLEB128Decode({0x80, 0x80, 0x01, 0x80, 0x00}, 16384U, 3); // Four byte value 268435455 - TestLEB128Decode(std::vector{0xFF, 0xFF, 0xFF, 0x7F}, 268435455U, 4); + TestLEB128Decode({0xFF, 0xFF, 0xFF, 0x7F}, 268435455U, 4); // Five byte uint32_t max value - TestLEB128Decode(std::vector{0xFF, 0xFF, 0xFF, 0xFF, 0x0F}, - std::numeric_limits::max(), 5); + TestLEB128Decode({0xFF, 0xFF, 0xFF, 0xFF, 0x0F}, std::numeric_limits::max(), + 5); // uint64_t value requiring 10 bytes - TestLEB128Decode( - std::vector{0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x01}, - 18446744073709551615ULL, 10); + TestLEB128Decode({0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x01}, + 18446744073709551615ULL, 10); // int32_t with maximum size (31 bits of 1) - TestLEB128Decode(std::vector{0xFF, 0xFF, 0xFF, 0xFF, 0x7}, - std::numeric_limits::max(), 5); + TestLEB128Decode({0xFF, 0xFF, 
0xFF, 0xFF, 0x7}, std::numeric_limits::max(), 5); // Error case: Truncated sequence (continuation bit set but no more data) - TestLEB128Decode(std::vector{0x80}, 0U, 0); + TestLEB128Decode({0x80}, 0U, 0); // Error case: Input has exactly the maximum number of bytes for a int32_t (5), // but the decoded value overflows nonetheless (7 * 5 = 35 bits of data). - TestLEB128Decode(std::vector{0xFF, 0xFF, 0xFF, 0xFF, 0x7F}, int32_t{}, 0); + TestLEB128Decode({0xFF, 0xFF, 0xFF, 0xFF, 0x7F}, int32_t{}, 0); // Error case: Oversized sequence for uint32_t (too many bytes) - TestLEB128Decode(std::vector{0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x01}, 0U, 0); + TestLEB128Decode({0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x01}, 0U, 0); } static void TestZigZag(int32_t v, std::array buffer_expect) { From 36cb673d0146b75805299c94fe62457fdb28a0be Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Wed, 10 Sep 2025 16:20:47 +0200 Subject: [PATCH 40/56] Aggressive LEB128 test cases --- cpp/src/arrow/util/bit_util_test.cc | 153 ++++++++++++++++++---------- 1 file changed, 100 insertions(+), 53 deletions(-) diff --git a/cpp/src/arrow/util/bit_util_test.cc b/cpp/src/arrow/util/bit_util_test.cc index d975a7059d34..9fdb925e09e0 100644 --- a/cpp/src/arrow/util/bit_util_test.cc +++ b/cpp/src/arrow/util/bit_util_test.cc @@ -1998,7 +1998,7 @@ TEST(BitUtil, RoundUpToPowerOf2) { #undef S64 /// Test the maximum number of bytes needed to write a LEB128 of a give size. -TEST(BitStreamUtil, MaxLEB128ByteLenFor) { +TEST(LEB128, MaxLEB128ByteLenFor) { EXPECT_EQ(bit_util::kMaxLEB128ByteLenFor, 3); EXPECT_EQ(bit_util::kMaxLEB128ByteLenFor, 5); EXPECT_EQ(bit_util::kMaxLEB128ByteLenFor, 10); @@ -2027,47 +2027,15 @@ void TestLEB128Encode(Int input_value, std::vector const& expected_data } } -/// Test encoding to known LEB128 byte sequences -TEST(WriteLEB128Test, KnownArrayValues) { +/// Test encoding to known LEB128 byte sequences with edge cases parameters. +/// \see LEB128.KnownSuccessfulValues for other known values tested. 
+TEST(LEB128, WriteEdgeCases) { // Single byte value 0 TestLEB128Encode(0U, {0x00}, 1); // Single byte value 127 TestLEB128Encode(127U, {0x7F}, 1); - // Two byte value 128 - TestLEB128Encode(128U, {0x80, 0x01}, 2); - // Two byte value 128 as signed type - TestLEB128Encode(128, {0x80, 0x01}, 2); - // Two byte value 300 - TestLEB128Encode(300U, {0xAC, 0x02}, 2); - // Three byte value 16384 - TestLEB128Encode(16384U, {0x80, 0x80, 0x01}, 3); - // Four byte value 268435455 - TestLEB128Encode(268435455U, {0xFF, 0xFF, 0xFF, 0x7F}, 4); - // Two bytes uint8_t max value - TestLEB128Encode(std::numeric_limits::max(), {0xFF, 0x01}, 2); - // One bytes int8_t max value - TestLEB128Encode(std::numeric_limits::max(), {0x7F}, 2); - // Three bytes uint16_t max value - TestLEB128Encode(std::numeric_limits::max(), {0xFF, 0xFF, 0x03}, 3); - // Three bytes int16_t max value - TestLEB128Encode(std::numeric_limits::max(), {0xFF, 0xFF, 0x01}, 3); - // Five byte uint32_t max value - TestLEB128Encode(std::numeric_limits::max(), {0xFF, 0xFF, 0xFF, 0xFF, 0x0F}, - 5); - // Five byte int32_t max value - TestLEB128Encode(std::numeric_limits::max(), {0xFF, 0xFF, 0xFF, 0xFF, 0x7}, 5); - // Ten bytes uint64_t max value - TestLEB128Encode(std::numeric_limits::max(), - {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x01}, 10); - // Nine bytes int64_t max value - TestLEB128Encode(std::numeric_limits::max(), - {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x7F}, 10); // Three byte value 16384, encoded in larger buffer TestLEB128Encode(16384U, {0x80, 0x80, 0x01}, 10); - // Various single byte values - TestLEB128Encode(1U, {0x01}, 1); - TestLEB128Encode(63U, {0x3F}, 1); - TestLEB128Encode(64U, {0x40}, 1); // Two byte boundary values TestLEB128Encode(129U, {0x81, 0x01}, 2); TestLEB128Encode(16383U, {0xFF, 0x7F}, 2); @@ -2094,31 +2062,25 @@ void TestLEB128Decode(std::vector const& data, Int expected_value, } } -/// Test decoding from known LEB128 byte sequences -TEST(BitStreamUtil, LEB128) { 
+template +void TestLEB128Decode(std::vector const& data, Int expected_value, + std::size_t expected_bytes_read) { + ASSERT_LE(expected_bytes_read, std::numeric_limits::max()); + return TestLEB128Decode(data, expected_value, + static_cast(expected_bytes_read)); +} + +/// Test decoding from known LEB128 byte sequences with edge case parameters. +/// \see LEB128.KnownSuccessfulValues for other known values tested. +TEST(LEB128, ReadEdgeCases) { // Single byte value 0 TestLEB128Decode({0x00}, 0U, 1); // Single byte value 127 TestLEB128Decode({0x7F}, 127U, 1); - // Two byte value 128 - TestLEB128Decode({0x80, 0x01}, 128U, 2); - // Two byte value 300 - TestLEB128Decode({0xAC, 0x02}, 300U, 2); - // Three byte value 16384 - TestLEB128Decode({0x80, 0x80, 0x01}, 16384U, 3); // Three byte value 16384, with remaining data TestLEB128Decode({0x80, 0x80, 0x01, 0x80, 0x00}, 16384U, 3); // Four byte value 268435455 TestLEB128Decode({0xFF, 0xFF, 0xFF, 0x7F}, 268435455U, 4); - // Five byte uint32_t max value - TestLEB128Decode({0xFF, 0xFF, 0xFF, 0xFF, 0x0F}, std::numeric_limits::max(), - 5); - // uint64_t value requiring 10 bytes - TestLEB128Decode({0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x01}, - 18446744073709551615ULL, 10); - // int32_t with maximum size (31 bits of 1) - TestLEB128Decode({0xFF, 0xFF, 0xFF, 0xFF, 0x7}, std::numeric_limits::max(), 5); - // Error case: Truncated sequence (continuation bit set but no more data) TestLEB128Decode({0x80}, 0U, 0); // Error case: Input has exactly the maximum number of bytes for a int32_t (5), @@ -2128,6 +2090,91 @@ TEST(BitStreamUtil, LEB128) { TestLEB128Decode({0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x01}, 0U, 0); } +struct KnownLEB128Encoding { + uint64_t value; + std::vector bytes; +}; + +static const std::vector KnownLEB128EncodingValues{ + {0, {0x00}}, + {1, {0x01}}, + {63, {0x3F}}, + {64, {0x40}}, + {127U, {0x7F}}, + {128, {0x80, 0x01}}, + {300, {0xAC, 0x02}}, + {16384, {0x80, 0x80, 0x01}}, + {268435455, {0xFF, 0xFF, 0xFF, 
0x7F}}, + {static_cast(std::numeric_limits::max()), {0xFF, 0x01}}, + {static_cast(std::numeric_limits::max()), {0x7F}}, + {static_cast(std::numeric_limits::max()), {0xFF, 0xFF, 0x03}}, + {static_cast(std::numeric_limits::max()), {0xFF, 0xFF, 0x01}}, + {static_cast(std::numeric_limits::max()), + {0xFF, 0xFF, 0xFF, 0xFF, 0x0F}}, + {static_cast(std::numeric_limits::max()), + {0xFF, 0xFF, 0xFF, 0xFF, 0x7}}, + {static_cast(std::numeric_limits::max()), + {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x01}}, + {static_cast(std::numeric_limits::max()), + {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x7F}}, +}; + +/// Test encoding and decoding to known LEB128 byte sequences with all possible +/// integer sizes and signess. +TEST(LEB128, KnownSuccessfulValues) { + for (const auto& data : KnownLEB128EncodingValues) { + SCOPED_TRACE("Testing value " + std::to_string(data.value)); + + // 8 bits + if (data.value <= static_cast(std::numeric_limits::max())) { + const auto val = static_cast(data.value); + TestLEB128Encode(val, data.bytes, data.bytes.size()); + TestLEB128Decode(data.bytes, val, data.bytes.size()); + } + if (data.value <= static_cast(std::numeric_limits::max())) { + const auto val = static_cast(data.value); + TestLEB128Encode(val, data.bytes, data.bytes.size()); + TestLEB128Decode(data.bytes, val, data.bytes.size()); + } + + // 16 bits + if (data.value <= static_cast(std::numeric_limits::max())) { + const auto val = static_cast(data.value); + TestLEB128Encode(val, data.bytes, data.bytes.size()); + TestLEB128Decode(data.bytes, val, data.bytes.size()); + } + if (data.value <= static_cast(std::numeric_limits::max())) { + const auto val = static_cast(data.value); + TestLEB128Encode(val, data.bytes, data.bytes.size()); + TestLEB128Decode(data.bytes, val, data.bytes.size()); + } + + // 32 bits + if (data.value <= static_cast(std::numeric_limits::max())) { + const auto val = static_cast(data.value); + TestLEB128Encode(val, data.bytes, data.bytes.size()); + 
TestLEB128Decode(data.bytes, val, data.bytes.size()); + } + if (data.value <= static_cast(std::numeric_limits::max())) { + const auto val = static_cast(data.value); + TestLEB128Encode(val, data.bytes, data.bytes.size()); + TestLEB128Decode(data.bytes, val, data.bytes.size()); + } + + // 64 bits + if (data.value <= static_cast(std::numeric_limits::max())) { + const auto val = static_cast(data.value); + TestLEB128Encode(val, data.bytes, data.bytes.size()); + TestLEB128Decode(data.bytes, val, data.bytes.size()); + } + if (data.value <= static_cast(std::numeric_limits::max())) { + const auto val = static_cast(data.value); + TestLEB128Encode(val, data.bytes, data.bytes.size()); + TestLEB128Decode(data.bytes, val, data.bytes.size()); + } + } +} + static void TestZigZag(int32_t v, std::array buffer_expect) { uint8_t buffer[bit_util::kMaxLEB128ByteLenFor] = {}; bit_util::BitWriter writer(buffer, sizeof(buffer)); From fb69856c001270b3375b5a839bf875f5e1f927d1 Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Wed, 10 Sep 2025 17:18:24 +0200 Subject: [PATCH 41/56] Use pointer for non-const ref --- cpp/src/arrow/util/rle_encoding_internal.h | 154 +++++++++++---------- 1 file changed, 80 insertions(+), 74 deletions(-) diff --git a/cpp/src/arrow/util/rle_encoding_internal.h b/cpp/src/arrow/util/rle_encoding_internal.h index d08e28302f16..d2d8bb8329b6 100644 --- a/cpp/src/arrow/util/rle_encoding_internal.h +++ b/cpp/src/arrow/util/rle_encoding_internal.h @@ -280,7 +280,7 @@ class RleRunDecoder { values_count_type remaining_count_ = 0; static_assert(std::is_integral_v, - "This class makes assumptions about integer endianness and padding"); + "This class is meant to decode positive integers"); }; /// Decoder class for single run of bit-packed encoded data. 
@@ -322,7 +322,7 @@ class BitPackedRunDecoder { values_count_type remaining_count_ = 0; static_assert(std::is_integral_v, - "This class makes assumptions about integer endianness and padding"); + "This class is meant to decode positive integers"); }; /// Decoder class for Parquet RLE bit-packed data. @@ -714,26 +714,32 @@ class BatchCounter { size_type null_read_ = 0; }; -// The maximal unsigned size that a variable can fit. +/// The maximal unsigned size that a variable can fit. template constexpr auto max_size_for_v = static_cast>(std::numeric_limits::max()); +template +struct GetSpacedResult { + Int values_read; + Int null_read; +}; + /// Overload for GetSpaced for a single run in a RleDecoder template -auto RunGetSpaced(Converter& converter, typename Converter::out_type* out, +auto RunGetSpaced(Converter* converter, typename Converter::out_type* out, values_count_type batch_size, values_count_type null_count, - BitRunReader&& validity_reader, BitRun&& validity_run, - RleRunDecoder& decoder) - -> std::pair { + BitRunReader* validity_reader, BitRun* validity_run, + RleRunDecoder* decoder) + -> GetSpacedResult { ARROW_DCHECK_GT(batch_size, 0); // The equality case is handled in the main loop in GetSpaced ARROW_DCHECK_LT(null_count, batch_size); auto batch = BatchCounter::FromBatchSizeAndNulls(batch_size, null_count); - values_count_type const values_available = decoder.Remaining(); + values_count_type const values_available = decoder->Remaining(); ARROW_DCHECK_GT(values_available, 0); auto values_remaining_run = [&]() { auto out = values_available - batch.ValuesRead(); @@ -746,40 +752,40 @@ auto RunGetSpaced(Converter& converter, typename Converter::out_type* out, // same value for nulls and non-nulls. // This proves to be a big efficiency win. 
while (values_remaining_run() > 0 && !batch.IsDone()) { - ARROW_DCHECK_GE(validity_run.length, 0); - ARROW_DCHECK_LT(validity_run.length, max_size_for_v); - ARROW_DCHECK_LE(validity_run.length, batch.TotalRemaining()); - auto const& validity_run_size = static_cast(validity_run.length); + ARROW_DCHECK_GE(validity_run->length, 0); + ARROW_DCHECK_LT(validity_run->length, max_size_for_v); + ARROW_DCHECK_LE(validity_run->length, batch.TotalRemaining()); + auto const& validity_run_size = static_cast(validity_run->length); - if (validity_run.set) { + if (validity_run->set) { // We may end the current RLE run in the middle of the validity run auto update_size = std::min(validity_run_size, values_remaining_run()); batch.AccrueReadValues(update_size); - validity_run.length -= update_size; + validity_run->length -= update_size; } else { // We can consume all nulls here because it does not matter if we consume on this // RLE run, or an a next encoded run. The value filled does not matter. auto update_size = std::min(validity_run_size, batch.NullRemaining()); batch.AccrueReadNulls(update_size); - validity_run.length -= update_size; + validity_run->length -= update_size; } - if (ARROW_PREDICT_TRUE(validity_run.length == 0)) { - validity_run = validity_reader.NextRun(); + if (ARROW_PREDICT_TRUE(validity_run->length == 0)) { + *validity_run = validity_reader->NextRun(); } } - value_type const value = decoder.Value(); - if (ARROW_PREDICT_FALSE(!converter.InputIsValid(value))) { + value_type const value = decoder->Value(); + if (ARROW_PREDICT_FALSE(!converter->InputIsValid(value))) { return {0, 0}; } - converter.WriteRepeated(out, out + batch.TotalRead(), value); - auto const actual_values_read = decoder.Advance(batch.ValuesRead()); + converter->WriteRepeated(out, out + batch.TotalRead(), value); + auto const actual_values_read = decoder->Advance(batch.ValuesRead()); // We always cropped the number of values_read by the remaining values in the run. 
// What's more the RLE decoder should not encounter any errors. ARROW_DCHECK_EQ(actual_values_read, batch.ValuesRead()); - return {batch.ValuesRead(), batch.NullRead()}; + return {/* .values_read= */ batch.ValuesRead(), /* .null_read= */ batch.NullRead()}; } template @@ -794,18 +800,18 @@ static_assert(min(5, 41) == 5); template -auto RunGetSpaced(Converter& converter, typename Converter::out_type* out, +auto RunGetSpaced(Converter* converter, typename Converter::out_type* out, values_count_type batch_size, values_count_type null_count, - BitRunReader&& validity_reader, BitRun&& validity_run, - BitPackedRunDecoder& decoder) - -> std::pair { + BitRunReader* validity_reader, BitRun* validity_run, + BitPackedRunDecoder* decoder) + -> GetSpacedResult { ARROW_DCHECK_GT(batch_size, 0); // The equality case is handled in the main loop in GetSpaced ARROW_DCHECK_LT(null_count, batch_size); auto batch = BatchCounter::FromBatchSizeAndNulls(batch_size, null_count); - values_count_type const values_available = decoder.Remaining(); + values_count_type const values_available = decoder->Remaining(); ARROW_DCHECK_GT(values_available, 0); auto run_values_remaining = [&]() { auto out = values_available - batch.ValuesRead(); @@ -830,70 +836,70 @@ auto RunGetSpaced(Converter& converter, typename Converter::out_type* out, // buffer_start is 0 at this point so size is end buffer_end = min(run_values_remaining(), batch.ValuesRemaining(), kBufferCapacity); - buffer_end = decoder.GetBatch(buffer.data(), buffer_size()); + buffer_end = decoder->GetBatch(buffer.data(), buffer_size()); ARROW_DCHECK_LE(buffer_size(), kBufferCapacity); - if (ARROW_PREDICT_FALSE(!converter.InputIsValid(buffer.data(), buffer_size()))) { + if (ARROW_PREDICT_FALSE(!converter->InputIsValid(buffer.data(), buffer_size()))) { return {batch.ValuesRead(), batch.NullRead()}; } // Copy chunks of valid values into the output, while adjusting spacing for null // values. 
while (buffer_size() > 0) { - ARROW_DCHECK_GE(validity_run.length, 0); - ARROW_DCHECK_LT(validity_run.length, max_size_for_v); - ARROW_DCHECK_LE(validity_run.length, batch.TotalRemaining()); + ARROW_DCHECK_GE(validity_run->length, 0); + ARROW_DCHECK_LT(validity_run->length, max_size_for_v); + ARROW_DCHECK_LE(validity_run->length, batch.TotalRemaining()); auto const validity_run_length = - static_cast(validity_run.length); + static_cast(validity_run->length); // Copy as much as possible from the buffer into the output while not exceeding // validity run - if (validity_run.set) { + if (validity_run->set) { auto const update_size = std::min(validity_run_length, buffer_size()); - converter.WriteRange(out, buffer.data() + buffer_start, update_size); + converter->WriteRange(out, buffer.data() + buffer_start, update_size); buffer_start += update_size; batch.AccrueReadValues(update_size); out += update_size; - validity_run.length -= update_size; + validity_run->length -= update_size; // Simply write zeros in the output } else { auto const update_size = std::min(validity_run_length, batch.NullRemaining()); - converter.WriteZero(out, out + update_size); + converter->WriteZero(out, out + update_size); batch.AccrueReadNulls(update_size); out += update_size; - validity_run.length -= update_size; + validity_run->length -= update_size; } - if (validity_run.length == 0) { - validity_run = validity_reader.NextRun(); + if (validity_run->length == 0) { + *validity_run = validity_reader->NextRun(); } } ARROW_DCHECK_EQ(buffer_size(), 0); } - ARROW_DCHECK_EQ(values_available - decoder.Remaining(), batch.ValuesRead()); + ARROW_DCHECK_EQ(values_available - decoder->Remaining(), batch.ValuesRead()); ARROW_DCHECK_LE(batch.TotalRead(), batch_size); ARROW_DCHECK_LE(batch.NullRead(), batch.NullCount()); - return {batch.ValuesRead(), batch.NullRead()}; + return {/* .values_read= */ batch.ValuesRead(), /* .null_read= */ batch.NullRead()}; } /// Overload for GetSpaced for a single run in a 
decoder variant template auto RunGetSpaced( - Converter& converter, typename Converter::out_type* out, values_count_type batch_size, - values_count_type null_count, BitRunReader&& validity_reader, BitRun&& validity_run, - std::variant, BitPackedRunDecoder>& decoder) - -> std::pair { + Converter* converter, typename Converter::out_type* out, values_count_type batch_size, + values_count_type null_count, BitRunReader* validity_reader, BitRun* validity_run, + std::variant, BitPackedRunDecoder>* decoder) + -> GetSpacedResult { return std::visit( [&](auto& dec) { ARROW_DCHECK_GT(dec.Remaining(), 0); return RunGetSpaced(converter, out, batch_size, null_count, validity_reader, - validity_run, dec); + validity_run, &dec); }, - decoder); + *decoder); } } // namespace internal @@ -935,13 +941,13 @@ auto RleBitPackedDecoder::GetSpaced(Converter converter, // Remaining from a previous call that would have left some unread data from a run. if (ARROW_PREDICT_FALSE(RunRemaining() > 0)) { - auto const [values_read, null_read] = - RunGetSpaced(converter, out, batch.TotalRemaining(), batch.NullRemaining(), - validity_reader, validity_run, decoder_); + const auto read = internal::RunGetSpaced(&converter, out, batch.TotalRemaining(), + batch.NullRemaining(), &validity_reader, + &validity_run, &decoder_); - batch.AccrueReadNulls(null_read); - batch.AccrueReadValues(values_read); - out += values_read + null_read; + batch.AccrueReadNulls(read.null_read); + batch.AccrueReadValues(read.values_read); + out += read.values_read + read.null_read; // Either we fulfilled all the batch values to be read if (ARROW_PREDICT_FALSE(batch.ValuesRemaining() == 0)) { @@ -959,16 +965,16 @@ auto RleBitPackedDecoder::GetSpaced(Converter converter, RunDecoder decoder(run); - const auto [values_read, null_read] = internal::RunGetSpaced( - converter, out, batch.TotalRemaining(), batch.NullRemaining(), validity_reader, - validity_run, decoder); + const auto read = internal::RunGetSpaced(&converter, out, 
batch.TotalRemaining(), + batch.NullRemaining(), &validity_reader, + &validity_run, &decoder); - batch.AccrueReadNulls(null_read); - batch.AccrueReadValues(values_read); - out += values_read + null_read; + batch.AccrueReadNulls(read.null_read); + batch.AccrueReadValues(read.values_read); + out += read.values_read + read.null_read; // Stop reading and store remaining decoder - if (ARROW_PREDICT_FALSE(values_read == 0 || batch.ValuesRemaining() == 0)) { + if (ARROW_PREDICT_FALSE(read.values_read == 0 || batch.ValuesRemaining() == 0)) { decoder_ = std::move(decoder); return ControlFlow::Break; } @@ -1125,13 +1131,13 @@ auto RleBitPackedDecoder::GetBatchWithDict(const V* dictionary, }; if (ARROW_PREDICT_FALSE(RunRemaining() > 0)) { - auto const [run_values_read, run_null_read] = - RunGetSpaced(converter, out, batch_size, /* null_count= */ 0, validity_reader, - validity_run, decoder_); + auto const read = + internal::RunGetSpaced(&converter, out, batch_size, /* null_count= */ 0, + &validity_reader, &validity_run, &decoder_); - ARROW_DCHECK_EQ(run_null_read, 0); - values_read += run_values_read; - out += run_values_read; + ARROW_DCHECK_EQ(read.null_read, 0); + values_read += read.values_read; + out += read.values_read; // Either we fulfilled all the batch values to be read if (ARROW_PREDICT_FALSE(values_read >= batch_size)) { @@ -1148,16 +1154,16 @@ auto RleBitPackedDecoder::GetBatchWithDict(const V* dictionary, RunDecoder decoder(run); - auto const [run_values_read, run_null_read] = internal::RunGetSpaced( - converter, out, batch_values_remaining(), /* null_count= */ 0, validity_reader, - validity_run, decoder); + auto const read = internal::RunGetSpaced(&converter, out, batch_values_remaining(), + /* null_count= */ 0, &validity_reader, + &validity_run, &decoder); - ARROW_DCHECK_EQ(run_null_read, 0); - values_read += run_values_read; - out += run_values_read; + ARROW_DCHECK_EQ(read.null_read, 0); + values_read += read.values_read; + out += read.values_read; // Stop 
reading and store remaining decoder - if (ARROW_PREDICT_FALSE(run_values_read == 0 || values_read == batch_size)) { + if (ARROW_PREDICT_FALSE(read.values_read == 0 || values_read == batch_size)) { decoder_ = std::move(decoder); return ControlFlow::Break; } From eb384fe2ba6a21f898ad0e9a76570ceb5e97e960 Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Wed, 10 Sep 2025 17:22:24 +0200 Subject: [PATCH 42/56] West const --- cpp/src/arrow/util/rle_encoding_internal.h | 44 +++++++++++----------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/cpp/src/arrow/util/rle_encoding_internal.h b/cpp/src/arrow/util/rle_encoding_internal.h index d2d8bb8329b6..c128dbce21bf 100644 --- a/cpp/src/arrow/util/rle_encoding_internal.h +++ b/cpp/src/arrow/util/rle_encoding_internal.h @@ -575,7 +575,7 @@ void RleBitPackedDecoder::Reset(raw_data_const_pointer data, template auto RleBitPackedDecoder::RunRemaining() const -> values_count_type { - return std::visit([](auto const& dec) { return dec.Remaining(); }, decoder_); + return std::visit([](const auto& dec) { return dec.Remaining(); }, decoder_); } template @@ -615,7 +615,7 @@ auto RleBitPackedDecoder::GetBatch(value_type* out, values_count_type batch_s // Remaining from a previous call that would have left some unread data from a run. 
if (ARROW_PREDICT_FALSE(RunRemaining() > 0)) { - auto const read = RunGetBatch(out, batch_size); + const auto read = RunGetBatch(out, batch_size); values_read += read; out += read; @@ -631,7 +631,7 @@ auto RleBitPackedDecoder::GetBatch(value_type* out, values_count_type batch_s ARROW_DCHECK_LT(values_read, batch_size); RunDecoder decoder(run); - auto const read = decoder.GetBatch(out, batch_size - values_read); + const auto read = decoder.GetBatch(out, batch_size - values_read); ARROW_DCHECK_LE(read, batch_size - values_read); values_read += read; out += read; @@ -755,7 +755,7 @@ auto RunGetSpaced(Converter* converter, typename Converter::out_type* out, ARROW_DCHECK_GE(validity_run->length, 0); ARROW_DCHECK_LT(validity_run->length, max_size_for_v); ARROW_DCHECK_LE(validity_run->length, batch.TotalRemaining()); - auto const& validity_run_size = static_cast(validity_run->length); + const auto& validity_run_size = static_cast(validity_run->length); if (validity_run->set) { // We may end the current RLE run in the middle of the validity run @@ -780,7 +780,7 @@ auto RunGetSpaced(Converter* converter, typename Converter::out_type* out, return {0, 0}; } converter->WriteRepeated(out, out + batch.TotalRead(), value); - auto const actual_values_read = decoder->Advance(batch.ValuesRead()); + const auto actual_values_read = decoder->Advance(batch.ValuesRead()); // We always cropped the number of values_read by the remaining values in the run. // What's more the RLE decoder should not encounter any errors. 
ARROW_DCHECK_EQ(actual_values_read, batch.ValuesRead()); @@ -849,13 +849,13 @@ auto RunGetSpaced(Converter* converter, typename Converter::out_type* out, ARROW_DCHECK_GE(validity_run->length, 0); ARROW_DCHECK_LT(validity_run->length, max_size_for_v); ARROW_DCHECK_LE(validity_run->length, batch.TotalRemaining()); - auto const validity_run_length = + const auto validity_run_length = static_cast(validity_run->length); // Copy as much as possible from the buffer into the output while not exceeding // validity run if (validity_run->set) { - auto const update_size = std::min(validity_run_length, buffer_size()); + const auto update_size = std::min(validity_run_length, buffer_size()); converter->WriteRange(out, buffer.data() + buffer_start, update_size); buffer_start += update_size; batch.AccrueReadValues(update_size); @@ -863,7 +863,7 @@ auto RunGetSpaced(Converter* converter, typename Converter::out_type* out, validity_run->length -= update_size; // Simply write zeros in the output } else { - auto const update_size = std::min(validity_run_length, batch.NullRemaining()); + const auto update_size = std::min(validity_run_length, batch.NullRemaining()); converter->WriteZero(out, out + update_size); batch.AccrueReadNulls(update_size); out += update_size; @@ -928,7 +928,7 @@ auto RleBitPackedDecoder::GetSpaced(Converter converter, /*length=*/batch.TotalRemaining()); arrow::internal::BitRun validity_run = validity_reader.NextRun(); - auto const check_and_handle_fully_null_remaining = [&]() { + const auto check_and_handle_fully_null_remaining = [&]() { if (batch.IsFullyNull()) { ARROW_DCHECK(validity_run.length == 0 || !validity_run.set); ARROW_DCHECK_GE(validity_run.length, batch.NullRemaining()); @@ -1131,7 +1131,7 @@ auto RleBitPackedDecoder::GetBatchWithDict(const V* dictionary, }; if (ARROW_PREDICT_FALSE(RunRemaining() > 0)) { - auto const read = + const auto read = internal::RunGetSpaced(&converter, out, batch_size, /* null_count= */ 0, &validity_reader, &validity_run, 
&decoder_); @@ -1154,7 +1154,7 @@ auto RleBitPackedDecoder::GetBatchWithDict(const V* dictionary, RunDecoder decoder(run); - auto const read = internal::RunGetSpaced(&converter, out, batch_values_remaining(), + const auto read = internal::RunGetSpaced(&converter, out, batch_values_remaining(), /* null_count= */ 0, &validity_reader, &validity_run, &decoder); @@ -1276,7 +1276,7 @@ auto RleBitPackedParser::PeekImpl(Handler&& handler) const constexpr auto kMaxSize = bit_util::kMaxLEB128ByteLenFor; uint32_t run_len_type = 0; - auto const header_bytes = bit_util::ParseLeadingLEB128(data_, kMaxSize, &run_len_type); + const auto header_bytes = bit_util::ParseLeadingLEB128(data_, kMaxSize, &run_len_type); if (ARROW_PREDICT_FALSE(header_bytes == 0)) { // Malfomrmed LEB128 data @@ -1294,10 +1294,10 @@ auto RleBitPackedParser::PeekImpl(Handler&& handler) const return {}; } - auto const values_count = static_cast(count * 8); + const auto values_count = static_cast(count * 8); ARROW_DCHECK_LT(count, internal::max_size_for_v); // Count Already divided by 8 - auto const bytes_read = + const auto bytes_read = header_bytes + static_cast(count) * value_bit_width_; auto control = handler.OnBitPackedRun( @@ -1314,10 +1314,10 @@ auto RleBitPackedParser::PeekImpl(Handler&& handler) const return {}; } - auto const values_count = static_cast(count); - auto const value_bytes = bit_util::BytesForBits(value_bit_width_); + const auto values_count = static_cast(count); + const auto value_bytes = bit_util::BytesForBits(value_bit_width_); ARROW_DCHECK_LT(value_bytes, internal::max_size_for_v); - auto const bytes_read = header_bytes + static_cast(value_bytes); + const auto bytes_read = header_bytes + static_cast(value_bytes); auto control = handler.OnRleRun(RleRun(data_ + header_bytes, values_count, value_bit_width_)); @@ -1374,7 +1374,7 @@ auto constexpr RleRunDecoder::Value() const -> value_type { template auto RleRunDecoder::Advance(values_count_type batch_size) -> values_count_type { - auto 
const steps = std::min(batch_size, remaining_count_); + const auto steps = std::min(batch_size, remaining_count_); remaining_count_ -= steps; return steps; } @@ -1391,7 +1391,7 @@ auto RleRunDecoder::GetBatch(value_type* out, values_count_type batch_size) return 0; } - auto const to_read = std::min(remaining_count_, batch_size); + const auto to_read = std::min(remaining_count_, batch_size); std::fill(out, out + to_read, value_); remaining_count_ -= to_read; return to_read; @@ -1427,7 +1427,7 @@ auto constexpr BitPackedRunDecoder::ValueBitWidth() const -> bit_size_type { template auto BitPackedRunDecoder::Advance(values_count_type batch_size) -> values_count_type { - auto const steps = std::min(batch_size, remaining_count_); + const auto steps = std::min(batch_size, remaining_count_); if (bit_reader_.Advance(steps * value_bit_width_)) { remaining_count_ -= steps; return steps; @@ -1447,8 +1447,8 @@ auto BitPackedRunDecoder::GetBatch(value_type* out, values_count_type batch_s return 0; } - auto const to_read = std::min(remaining_count_, batch_size); - auto const actual_read = bit_reader_.GetBatch(value_bit_width_, out, to_read); + const auto to_read = std::min(remaining_count_, batch_size); + const auto actual_read = bit_reader_.GetBatch(value_bit_width_, out, to_read); // There should not be any reason why the actual read would be different // but this is error resistant. 
remaining_count_ -= actual_read; From 12a87fd345f881d271318936778bef2de649a6b7 Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Wed, 10 Sep 2025 17:38:38 +0200 Subject: [PATCH 43/56] Comment slashes --- cpp/src/arrow/util/rle_encoding_internal.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/cpp/src/arrow/util/rle_encoding_internal.h b/cpp/src/arrow/util/rle_encoding_internal.h index c128dbce21bf..8ddea16fab74 100644 --- a/cpp/src/arrow/util/rle_encoding_internal.h +++ b/cpp/src/arrow/util/rle_encoding_internal.h @@ -444,10 +444,10 @@ class RleBitPackedEncoder { /// This is the maximum length of a single run for 'bit_width'. /// It is not valid to pass a buffer less than this length. static int MinBufferSize(int bit_width) { - /// 1 indicator byte and MAX_VALUES_PER_LITERAL_RUN 'bit_width' values. + // 1 indicator byte and MAX_VALUES_PER_LITERAL_RUN 'bit_width' values. int max_literal_run_size = 1 + static_cast(::arrow::bit_util::BytesForBits( MAX_VALUES_PER_LITERAL_RUN * bit_width)); - /// Up to kMaxVlqByteLength indicator and a single 'bit_width' value. + // Up to kMaxVlqByteLength indicator and a single 'bit_width' value. 
int max_repeated_run_size = bit_util::kMaxLEB128ByteLenFor + static_cast(::arrow::bit_util::BytesForBits(bit_width)); @@ -956,7 +956,7 @@ auto RleBitPackedDecoder::GetSpaced(Converter converter, return batch.TotalRead(); } - /// We finished the remaining run + // We finished the remaining run ARROW_DCHECK(RunRemaining() == 0); } @@ -1145,7 +1145,7 @@ auto RleBitPackedDecoder::GetBatchWithDict(const V* dictionary, return values_read; } - /// We finished the remaining run + // We finished the remaining run ARROW_DCHECK(RunRemaining() == 0); } @@ -1290,7 +1290,7 @@ auto RleBitPackedParser::PeekImpl(Handler&& handler) const constexpr auto kMaxCount = bit_util::CeilDiv(internal::max_size_for_v, 8); if (ARROW_PREDICT_FALSE(count == 0 || count > kMaxCount)) { - /// Illegal number of encoded values + // Illegal number of encoded values return {}; } @@ -1310,7 +1310,7 @@ auto RleBitPackedParser::PeekImpl(Handler&& handler) const if (ARROW_PREDICT_FALSE( count == 0 || count > static_cast(std::numeric_limits::max()))) { - /// Illegal number of encoded values + // Illegal number of encoded values return {}; } From a2722748c46d229fc1725b70f10f28d1ae8b0e2f Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Wed, 10 Sep 2025 19:28:35 +0200 Subject: [PATCH 44/56] Third pass addressing review comments --- cpp/src/arrow/util/rle_encoding_internal.h | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) diff --git a/cpp/src/arrow/util/rle_encoding_internal.h b/cpp/src/arrow/util/rle_encoding_internal.h index 8ddea16fab74..a57716464d35 100644 --- a/cpp/src/arrow/util/rle_encoding_internal.h +++ b/cpp/src/arrow/util/rle_encoding_internal.h @@ -788,16 +788,6 @@ auto RunGetSpaced(Converter* converter, typename Converter::out_type* out, return {/* .values_read= */ batch.ValuesRead(), /* .null_read= */ batch.NullRead()}; } -template -[[nodiscard]] constexpr T min(T x, Ts... 
ys) { - ((x = std::min(x, ys)), ...); - return x; -} - -static_assert(min(5) == 5); -static_assert(min(5, 4, -1) == -1); -static_assert(min(5, 41) == 5); - template auto RunGetSpaced(Converter* converter, typename Converter::out_type* out, @@ -835,7 +825,8 @@ auto RunGetSpaced(Converter* converter, typename Converter::out_type* out, }; // buffer_start is 0 at this point so size is end - buffer_end = min(run_values_remaining(), batch.ValuesRemaining(), kBufferCapacity); + buffer_end = std::min(std::min(run_values_remaining(), batch.ValuesRemaining()), + kBufferCapacity); buffer_end = decoder->GetBatch(buffer.data(), buffer_size()); ARROW_DCHECK_LE(buffer_size(), kBufferCapacity); @@ -986,8 +977,6 @@ auto RleBitPackedDecoder::GetSpaced(Converter converter, check_and_handle_fully_null_remaining(); ARROW_DCHECK(batch.IsDone() || Exhausted()); - // batch.Done() => batch.NullRemaining() == 0 - ARROW_DCHECK(!batch.IsDone() || (batch.NullRemaining() == 0)); return batch.TotalRead(); } @@ -1067,6 +1056,8 @@ struct DictionaryConverter { } [[nodiscard]] bool InputIsValid(const in_type* indices, size_type length) const { + ARROW_DCHECK(length > 0); + in_type min_index = std::numeric_limits::max(); in_type max_index = std::numeric_limits::min(); for (size_type x = 0; x < length; x++) { From ab7d212f38660579a3acb4a7757127ece75b3df7 Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Mon, 15 Sep 2025 11:59:18 +0200 Subject: [PATCH 45/56] Fix infinite loop on invalid input --- cpp/src/arrow/util/rle_encoding_internal.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/util/rle_encoding_internal.h b/cpp/src/arrow/util/rle_encoding_internal.h index a57716464d35..7b460ffab825 100644 --- a/cpp/src/arrow/util/rle_encoding_internal.h +++ b/cpp/src/arrow/util/rle_encoding_internal.h @@ -1282,7 +1282,7 @@ auto RleBitPackedParser::PeekImpl(Handler&& handler) const bit_util::CeilDiv(internal::max_size_for_v, 8); if (ARROW_PREDICT_FALSE(count == 0 || count > 
kMaxCount)) { // Illegal number of encoded values - return {}; + return {0, ControlFlow::Break}; } const auto values_count = static_cast(count * 8); @@ -1302,7 +1302,7 @@ auto RleBitPackedParser::PeekImpl(Handler&& handler) const count == 0 || count > static_cast(std::numeric_limits::max()))) { // Illegal number of encoded values - return {}; + return {0, ControlFlow::Break}; } const auto values_count = static_cast(count); From 6ed409e34ae9e144dad18c37d6cd699b5bf9de91 Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Mon, 15 Sep 2025 15:46:09 +0200 Subject: [PATCH 46/56] Fourth pass addressing review comments --- cpp/src/arrow/util/bit_util.h | 4 +- cpp/src/arrow/util/rle_encoding_test.cc | 143 ++++++++++++++---------- 2 files changed, 88 insertions(+), 59 deletions(-) diff --git a/cpp/src/arrow/util/bit_util.h b/cpp/src/arrow/util/bit_util.h index e19d2f80bb2a..8d4811ede79c 100644 --- a/cpp/src/arrow/util/bit_util.h +++ b/cpp/src/arrow/util/bit_util.h @@ -398,7 +398,7 @@ constexpr int32_t WriteLEB128(Int value, uint8_t* out, int32_t max_out_size) { // Write as many bytes as we could be for the given input while ((value & kHigh7Mask) != Int(0)) { // We do not have enough room to write the LEB128 - if (out - out_first >= max_out_size) { + if (ARROW_PREDICT_FALSE(out - out_first >= max_out_size)) { return 0; } @@ -410,7 +410,7 @@ constexpr int32_t WriteLEB128(Int value, uint8_t* out, int32_t max_out_size) { } // We do not have enough room to write the LEB128 - if (out - out_first >= max_out_size) { + if (ARROW_PREDICT_FALSE(out - out_first >= max_out_size)) { return 0; } diff --git a/cpp/src/arrow/util/rle_encoding_test.cc b/cpp/src/arrow/util/rle_encoding_test.cc index 7aa5b4d04d1b..a4427ca4b35d 100644 --- a/cpp/src/arrow/util/rle_encoding_test.cc +++ b/cpp/src/arrow/util/rle_encoding_test.cc @@ -26,7 +26,9 @@ #include "arrow/array.h" #include "arrow/array/concatenate.h" +#include "arrow/array/util.h" #include "arrow/scalar.h" +#include "arrow/testing/gtest_util.h" 
#include "arrow/testing/random.h" #include "arrow/type.h" #include "arrow/util/bit_stream_utils_internal.h" @@ -253,7 +255,7 @@ TEST(Rle, RleRun) { TEST(BitPacked, BitPackedRun) { const std::array value = {0b10101010, 0, 0, 0b1111111}; - /// 16 values of 1 bit for a total of 16 bits + // 16 values of 1 bit for a total of 16 bits BitPackedRun::values_count_type value_count_1 = 16; auto const run_1 = BitPackedRun(value.data(), value_count_1, /* value_bit_width= */ 1); EXPECT_EQ(run_1.ValuesCount(), value_count_1); @@ -263,7 +265,7 @@ TEST(BitPacked, BitPackedRun) { EXPECT_EQ(*(run_1.RawDataPtr() + i), value[i]); } - /// 8 values of 3 bits for a total of 24 bits + // 8 values of 3 bits for a total of 24 bits BitPackedRun::values_count_type value_count_3 = 8; auto const run_3 = BitPackedRun(value.data(), value_count_3, /* value_bit_width= */ 3); EXPECT_EQ(run_3.ValuesCount(), value_count_3); @@ -276,16 +278,10 @@ TEST(BitPacked, BitPackedRun) { template void TestRleDecoder(std::vector bytes, RleRun::values_count_type value_count, - RleRun::bit_size_type bit_width) { + RleRun::bit_size_type bit_width, T expected_value) { // Pre-requisite for this test EXPECT_GT(value_count, 6); - // Compute value associated with bytes encoded as little endian - T value = 0; - for (std::size_t i = 0; i < bytes.size(); ++i) { - value += static_cast(bytes.at(i)) << (8 * i); - } - auto const run = RleRun(bytes.data(), value_count, bit_width); auto decoder = RleRunDecoder(run); @@ -296,7 +292,7 @@ void TestRleDecoder(std::vector bytes, RleRun::values_count_type value_ typename decltype(decoder)::values_count_type read = 0; EXPECT_EQ(decoder.Get(vals.data()), 1); read += 1; - EXPECT_EQ(vals.at(0), value); + EXPECT_EQ(vals.at(0), expected_value); EXPECT_EQ(decoder.Remaining(), value_count - read); EXPECT_EQ(decoder.Advance(3), 3); @@ -305,8 +301,8 @@ void TestRleDecoder(std::vector bytes, RleRun::values_count_type value_ vals = {0, 0}; EXPECT_EQ(decoder.GetBatch(vals.data(), 2), vals.size()); 
- EXPECT_EQ(vals.at(0), value); - EXPECT_EQ(vals.at(1), value); + EXPECT_EQ(vals.at(0), expected_value); + EXPECT_EQ(vals.at(1), expected_value); read += static_cast(vals.size()); EXPECT_EQ(decoder.Remaining(), value_count - read); @@ -323,14 +319,21 @@ void TestRleDecoder(std::vector bytes, RleRun::values_count_type value_ EXPECT_EQ(decoder.Remaining(), value_count); vals = {0, 0}; EXPECT_EQ(decoder.GetBatch(vals.data(), 2), vals.size()); - EXPECT_EQ(vals.at(0), value); - EXPECT_EQ(vals.at(1), value); + EXPECT_EQ(vals.at(0), expected_value); + EXPECT_EQ(vals.at(1), expected_value); } TEST(Rle, RleDecoder) { - TestRleDecoder({21, 0, 0}, /* value_count= */ 21, /* bit_width= */ 5); - TestRleDecoder({1, 0}, /* value_count= */ 13, /* bit_width= */ 1); - TestRleDecoder({21, 2, 0, 1}, /* value_count= */ 20, /* bit_width= */ 30); + TestRleDecoder({21, 0, 0}, /* value_count= */ 21, /* bit_width= */ 5, + /* expected_value= */ 21); + TestRleDecoder({1, 0}, /* value_count= */ 13, /* bit_width= */ 1, + /* expected_value= */ 1); + TestRleDecoder({21, 0, 0}, /* value_count= */ 21, /* bit_width= */ 5, + /* expected_value= */ 21); + TestRleDecoder({21, 0, 0}, /* value_count= */ 21, /* bit_width= */ 5, + /* expected_value= */ 21); + TestRleDecoder({21, 2, 0, 1}, /* value_count= */ 20, /* bit_width= */ 30, + /* expected_value= */ 16777749); } template @@ -384,7 +387,7 @@ void TestBitPackedDecoder(std::vector bytes, } TEST(BitPacked, BitPackedDecoder) { - /// See parquet encoding for bytes layout + // See parquet encoding for bytes layout TestBitPackedDecoder( /* bytes= */ {0x88, 0xc6, 0xfa}, /* values_count= */ 8, @@ -852,32 +855,32 @@ TEST(BitRle, Overflow) { /// Check RleBitPacked encoding/decoding round trip. /// -/// \tparam kSpaced If set to false, treat Nulls in the input array as regular data. -/// \tparam kParts The number of parts in which the data will be decoded. +/// \param spaced If set to false, treat Nulls in the input array as regular data. 
+/// \param parts The number of parts in which the data will be decoded. /// For number greater than one, this ensure that the decoder intermediary state /// is valid. -template -void CheckRoundTrip(const Array& data, int bit_width, +template +void CheckRoundTrip(const Array& data, int bit_width, bool spaced, int32_t parts, std::shared_ptr dict = {}) { using ArrayType = typename TypeTraits::ArrayType; using value_type = typename Type::c_type; int const data_size = static_cast(data.length()); int const data_values_count = - static_cast(data.length() - kSpaced * data.null_count()); + static_cast(data.length() - spaced * data.null_count()); int const buffer_size = RleBitPackedEncoder::MaxBufferSize(bit_width, data_size); - ASSERT_GE(kParts, 1); - ASSERT_LE(kParts, data_size); + ASSERT_GE(parts, 1); + ASSERT_LE(parts, data_size); const value_type* data_values = static_cast(data).raw_values(); - // Encode the data into ``buffer`` using the encoder. + // Encode the data into `buffer` using the encoder. std::vector buffer(buffer_size); RleBitPackedEncoder encoder(buffer.data(), buffer_size, bit_width); int32_t encoded_values_size = 0; for (int i = 0; i < data_size; ++i) { - // Depending on kSpaced we treat nulls as regular values. - if (data.IsValid(i) || !kSpaced) { + // Depending on `spaced` we treat nulls as regular values. 
+ if (data.IsValid(i) || !spaced) { bool success = encoder.Put(static_cast(data_values[i])); ASSERT_TRUE(success) << "Encoding failed in pos " << i; ++encoded_values_size; @@ -887,7 +890,7 @@ void CheckRoundTrip(const Array& data, int bit_width, ASSERT_EQ(encoded_values_size, data_values_count) << "All values input were not encoded successfully by the encoder"; - // On to verify batch read + // Now we verify batch read RleBitPackedDecoder decoder(buffer.data(), encoded_byte_size, bit_width); // We will only use one of them depending on whether this is a dictonnary tests std::vector dict_read; @@ -898,18 +901,18 @@ void CheckRoundTrip(const Array& data, int bit_width, values_read.resize(data_size); } - // We will read the data in kParts calls to make sure intermediate states are valid + // We will read the data in `parts` calls to make sure intermediate states are valid int32_t actual_read_count = 0; int32_t requested_read_count = 0; while (requested_read_count < data_size) { auto const remaining = data_size - requested_read_count; - auto to_read = data_size / kParts; + auto to_read = data_size / parts; if (remaining / to_read == 1) { to_read = remaining; } auto read = 0; - if constexpr (kSpaced) { + if (spaced) { // We need to slice the input array get the proper null count and bitmap auto data_remaining = data.Slice(requested_read_count, to_read); @@ -945,7 +948,7 @@ void CheckRoundTrip(const Array& data, int bit_width, // Verify the round trip: encoded-decoded values must equal the original one for (int64_t i = 0; i < data_size; ++i) { - if (data.IsValid(i) || !kSpaced) { + if (data.IsValid(i) || !spaced) { if (dict) { EXPECT_EQ(dict_read.at(i), dict->Value(data_values[i])) << "Encoded then decoded and mapped value at position " << i << " (" @@ -975,6 +978,7 @@ struct DataTestRleBitPackedRepeatPart { value_type value; int32_t size; + double null_probability; }; template @@ -991,12 +995,15 @@ struct DataTestRleBitPacked { using RandomPart = 
DataTestRleBitPackedRandomPart; using RepeatPart = DataTestRleBitPackedRepeatPart; using NullPart = DataTestRleBitPackedNullPart; + using AnyPart = std::variant; - std::vector> parts; + std::vector parts; int32_t bit_width; std::shared_ptr<::arrow::Array> MakeArray( ::arrow::random::RandomArrayGenerator& rand) const { + using Traits = arrow::TypeTraits; + std::vector> arrays = {}; for (auto const& dyn_part : parts) { @@ -1006,14 +1013,15 @@ struct DataTestRleBitPacked { arrays.push_back(std::move(arr)); } else if (auto* part = std::get_if(&dyn_part)) { - auto scalar = ::arrow::MakeScalar(part->value); - arrays.push_back(::arrow::MakeArrayFromScalar(*scalar, part->size).ValueOrDie()); + auto arr = + rand.Numeric(part->size, /* min= */ part->value, + /* max= */ part->value, part->null_probability); + arrays.push_back(std::move(arr)); } else if (auto* part = std::get_if(&dyn_part)) { - using Traits = arrow::TypeTraits; - auto null_scalar = ::arrow::MakeNullScalar(Traits::type_singleton()); - arrays.push_back( - ::arrow::MakeArrayFromScalar(*null_scalar, part->size).ValueOrDie()); + EXPECT_OK_AND_ASSIGN( + auto arr, ::arrow::MakeArrayOfNull(Traits::type_singleton(), part->size)); + arrays.push_back(std::move(arr)); } } ARROW_DCHECK_EQ(parts.size(), arrays.size()); @@ -1045,7 +1053,7 @@ void DoTestGetBatchSpacedRoundtrip() { }, { { - RandomPart{/* max=*/7, /* size=*/10037, /* null_proba= */ 0.1}, + RandomPart{/* max=*/7, /* size=*/10037, /* null_proba= */ 0.0}, NullPart{/* size= */ 1153}, RandomPart{/* max=*/7, /* size=*/800, /* null_proba= */ 0.5}, }, @@ -1054,28 +1062,30 @@ void DoTestGetBatchSpacedRoundtrip() { { { NullPart{/* size= */ 80}, - RandomPart{/* max=*/1023, /* size=*/800, /* null_proba= */ 0.01}, + RandomPart{/* max=*/static_cast(1023), /* size=*/800, + /* null_proba= */ 0.01}, NullPart{/* size= */ 1023}, }, /* bit_width= */ 11, }, { - {RepeatPart{/* value=*/13, /* size=*/100000}}, + {RepeatPart{/* value=*/13, /* size=*/100000, /* null_proba= */ 0.01}}, /* 
bit_width= */ 10, }, { { NullPart{/* size= */ 1024}, - RepeatPart{/* value=*/10000, /* size=*/100000}, + RepeatPart{/* value=*/static_cast(10000), /* size=*/100000, + /* null_proba= */ 0.1}, NullPart{/* size= */ 77}, }, /* bit_width= */ 23, }, { { - RepeatPart{/* value=*/13, /* size=*/100000}, + RepeatPart{/* value=*/13, /* size=*/100000, /* null_proba= */ 0.0}, NullPart{/* size= */ 1153}, - RepeatPart{/* value=*/72, /* size=*/100799}, + RepeatPart{/* value=*/72, /* size=*/100799, /* null_proba= */ 0.5}, }, /* bit_width= */ 10, }, @@ -1083,9 +1093,9 @@ void DoTestGetBatchSpacedRoundtrip() { { RandomPart{/* max=*/1, /* size=*/1013, /* null_proba= */ 0.01}, NullPart{/* size=*/8}, - RepeatPart{1, /* size= */ 256}, + RepeatPart{1, /* size= */ 256, /* null_proba= */ 0.1}, NullPart{/* size=*/128}, - RepeatPart{0, /* size= */ 256}, + RepeatPart{0, /* size= */ 256, /* null_proba= */ 0.0}, NullPart{/* size=*/15}, RandomPart{/* max=*/1, /* size=*/8 * 1024, /* null_proba= */ 0.01}, }, @@ -1094,39 +1104,58 @@ void DoTestGetBatchSpacedRoundtrip() { }; ::arrow::random::RandomArrayGenerator rand(/* seed= */ 12); - // FRAGILE: Large enough so that it can be indexed by any value in all cases + // FRAGILE: we create a dictionary large enough so that any encoded value from the + // previous test cases can be used as an index in the dictionary. + // Its size must be increased accordingly if larger values are encoded in the test + // cases. auto dict = std::static_pointer_cast(rand.Float32(20000, -1.0, 1.0)); + // Number of bits available in T to write a positive integer. + constexpr int kBitsAvailable = 8 * sizeof(T) - (std::is_signed_v ? 
1 : 0); + for (auto case_ : test_cases) { - if (static_cast(case_.bit_width) > sizeof(T)) { + if (static_cast(case_.bit_width) > kBitsAvailable) { continue; } auto array = case_.MakeArray(rand); // Tests for GetBatch - CheckRoundTrip(*array, case_.bit_width); - CheckRoundTrip(*array, case_.bit_width); + CheckRoundTrip(*array, case_.bit_width, /* spaced= */ false, + /* parts= */ 1); + CheckRoundTrip(*array, case_.bit_width, /* spaced= */ false, + /* parts= */ 3); // Tests for GetBatchSpaced - CheckRoundTrip(*array, case_.bit_width); - CheckRoundTrip(*array, case_.bit_width); - CheckRoundTrip(*array->Slice(1), case_.bit_width); + CheckRoundTrip(*array, case_.bit_width, /* spaced= */ true, + /* parts= */ 1); + CheckRoundTrip(*array, case_.bit_width, /* spaced= */ true, + /* parts= */ 7); + CheckRoundTrip(*array->Slice(1), case_.bit_width, /* spaced= */ true, + /* parts= */ 1); // Cannot test GetBatchWithDict with this method since unknown null values // Tests for GetBatchWithDictSpaced - CheckRoundTrip(*array, case_.bit_width, dict); - CheckRoundTrip(*array, case_.bit_width, dict); + CheckRoundTrip(*array, case_.bit_width, /* spaced= */ true, /* parts= */ 1, + dict); + CheckRoundTrip(*array, case_.bit_width, /* spaced= */ true, /* parts= */ 5, + dict); } } +TEST(RleBitPacked, GetBatchSpacedRoundtripUint8) { + DoTestGetBatchSpacedRoundtrip(); +} TEST(RleBitPacked, GetBatchSpacedRoundtripUint16) { DoTestGetBatchSpacedRoundtrip(); } TEST(RleBitPacked, GetBatchSpacedRoundtripInt32) { DoTestGetBatchSpacedRoundtrip(); } +TEST(RleBitPacked, GetBatchSpacedRoundtripUInt32) { + DoTestGetBatchSpacedRoundtrip(); +} TEST(RleBitPacked, GetBatchSpacedRoundtripUint64) { DoTestGetBatchSpacedRoundtrip(); } From a61d8362f75b02ad28b44b61a63253317f1951d8 Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Mon, 15 Sep 2025 19:15:05 +0200 Subject: [PATCH 47/56] Apply formatter --- cpp/src/arrow/util/bit_util_test.cc | 6 ++-- cpp/src/arrow/util/rle_encoding_internal.h | 34 +++++++++++----------- 
cpp/src/arrow/util/rle_encoding_test.cc | 34 +++++++++++----------- 3 files changed, 37 insertions(+), 37 deletions(-) diff --git a/cpp/src/arrow/util/bit_util_test.cc b/cpp/src/arrow/util/bit_util_test.cc index 9fdb925e09e0..ab007d4a98ba 100644 --- a/cpp/src/arrow/util/bit_util_test.cc +++ b/cpp/src/arrow/util/bit_util_test.cc @@ -2007,7 +2007,7 @@ TEST(LEB128, MaxLEB128ByteLenFor) { /// Utility function to test LEB128 encoding with known input value and expected byte /// array template -void TestLEB128Encode(Int input_value, std::vector const& expected_data, +void TestLEB128Encode(Int input_value, const std::vector& expected_data, std::size_t buffer_size) { std::vector buffer(buffer_size); auto bytes_written = bit_util::WriteLEB128(input_value, buffer.data(), @@ -2051,7 +2051,7 @@ TEST(LEB128, WriteEdgeCases) { /// Utility function to test LEB128 decoding with known byte array and expected result template -void TestLEB128Decode(std::vector const& data, Int expected_value, +void TestLEB128Decode(const std::vector& data, Int expected_value, int32_t expected_bytes_read) { Int result = 0; auto bytes_read = bit_util::ParseLeadingLEB128( @@ -2063,7 +2063,7 @@ void TestLEB128Decode(std::vector const& data, Int expected_value, } template -void TestLEB128Decode(std::vector const& data, Int expected_value, +void TestLEB128Decode(const std::vector& data, Int expected_value, std::size_t expected_bytes_read) { ASSERT_LE(expected_bytes_read, std::numeric_limits::max()); return TestLEB128Decode(data, expected_value, diff --git a/cpp/src/arrow/util/rle_encoding_internal.h b/cpp/src/arrow/util/rle_encoding_internal.h index 7b460ffab825..2f937dfc9320 100644 --- a/cpp/src/arrow/util/rle_encoding_internal.h +++ b/cpp/src/arrow/util/rle_encoding_internal.h @@ -102,13 +102,13 @@ class RleRun { using DecoderType = RleRunDecoder; constexpr RleRun() noexcept = default; - constexpr RleRun(RleRun const&) noexcept = default; + constexpr RleRun(const RleRun&) noexcept = default; constexpr 
RleRun(RleRun&&) noexcept = default; explicit RleRun(raw_data_const_pointer data, values_count_type values_count, bit_size_type value_bit_width) noexcept; - constexpr RleRun& operator=(RleRun const&) noexcept = default; + constexpr RleRun& operator=(const RleRun&) noexcept = default; constexpr RleRun& operator=(RleRun&&) noexcept = default; /// The number of repeated values in this run. @@ -151,13 +151,13 @@ class BitPackedRun { using DecoderType = BitPackedRunDecoder; constexpr BitPackedRun() noexcept = default; - constexpr BitPackedRun(BitPackedRun const&) noexcept = default; + constexpr BitPackedRun(const BitPackedRun&) noexcept = default; constexpr BitPackedRun(BitPackedRun&&) noexcept = default; constexpr BitPackedRun(raw_data_const_pointer data, values_count_type values_count, bit_size_type value_bit_width) noexcept; - constexpr BitPackedRun& operator=(BitPackedRun const&) noexcept = default; + constexpr BitPackedRun& operator=(const BitPackedRun&) noexcept = default; constexpr BitPackedRun& operator=(BitPackedRun&&) noexcept = default; [[nodiscard]] constexpr values_count_type ValuesCount() const noexcept; @@ -255,9 +255,9 @@ class RleRunDecoder { constexpr RleRunDecoder() noexcept = default; - explicit RleRunDecoder(run_type const& run) noexcept; + explicit RleRunDecoder(const run_type& run) noexcept; - void Reset(run_type const& run) noexcept; + void Reset(const run_type& run) noexcept; /// Return the number of values that can be advanced. [[nodiscard]] values_count_type Remaining() const; @@ -296,9 +296,9 @@ class BitPackedRunDecoder { BitPackedRunDecoder() noexcept = default; - explicit BitPackedRunDecoder(run_type const& run) noexcept; + explicit BitPackedRunDecoder(const run_type& run) noexcept; - void Reset(run_type const& run) noexcept; + void Reset(const run_type& run) noexcept; /// Return the number of values that can be advanced. 
[[nodiscard]] constexpr values_count_type Remaining() const; @@ -739,7 +739,7 @@ auto RunGetSpaced(Converter* converter, typename Converter::out_type* out, auto batch = BatchCounter::FromBatchSizeAndNulls(batch_size, null_count); - values_count_type const values_available = decoder->Remaining(); + const values_count_type values_available = decoder->Remaining(); ARROW_DCHECK_GT(values_available, 0); auto values_remaining_run = [&]() { auto out = values_available - batch.ValuesRead(); @@ -775,7 +775,7 @@ auto RunGetSpaced(Converter* converter, typename Converter::out_type* out, } } - value_type const value = decoder->Value(); + const value_type value = decoder->Value(); if (ARROW_PREDICT_FALSE(!converter->InputIsValid(value))) { return {0, 0}; } @@ -801,7 +801,7 @@ auto RunGetSpaced(Converter* converter, typename Converter::out_type* out, auto batch = BatchCounter::FromBatchSizeAndNulls(batch_size, null_count); - values_count_type const values_available = decoder->Remaining(); + const values_count_type values_available = decoder->Remaining(); ARROW_DCHECK_GT(values_available, 0); auto run_values_remaining = [&]() { auto out = values_available - batch.ValuesRead(); @@ -1274,8 +1274,8 @@ auto RleBitPackedParser::PeekImpl(Handler&& handler) const return {}; } - bool const is_bit_packed = run_len_type & 1; - uint32_t const count = run_len_type >> 1; + const bool is_bit_packed = run_len_type & 1; + const uint32_t count = run_len_type >> 1; if (is_bit_packed) { using values_count_type = BitPackedRun::values_count_type; constexpr auto kMaxCount = @@ -1333,12 +1333,12 @@ void RleBitPackedParser::Parse(Handler&& handler) { ****************/ template -RleRunDecoder::RleRunDecoder(run_type const& run) noexcept { +RleRunDecoder::RleRunDecoder(const run_type& run) noexcept { Reset(run); } template -void RleRunDecoder::Reset(run_type const& run) noexcept { +void RleRunDecoder::Reset(const run_type& run) noexcept { remaining_count_ = run.ValuesCount(); if constexpr (std::is_same_v) 
{ // ARROW-18031: just check the LSB of the next byte and move on. @@ -1393,12 +1393,12 @@ auto RleRunDecoder::GetBatch(value_type* out, values_count_type batch_size) **********************/ template -BitPackedRunDecoder::BitPackedRunDecoder(run_type const& run) noexcept { +BitPackedRunDecoder::BitPackedRunDecoder(const run_type& run) noexcept { Reset(run); } template -void BitPackedRunDecoder::Reset(run_type const& run) noexcept { +void BitPackedRunDecoder::Reset(const run_type& run) noexcept { value_bit_width_ = run.ValuesBitWidth(); remaining_count_ = run.ValuesCount(); ARROW_DCHECK_GE(value_bit_width_, 0); diff --git a/cpp/src/arrow/util/rle_encoding_test.cc b/cpp/src/arrow/util/rle_encoding_test.cc index a4427ca4b35d..d456891350a7 100644 --- a/cpp/src/arrow/util/rle_encoding_test.cc +++ b/cpp/src/arrow/util/rle_encoding_test.cc @@ -217,21 +217,21 @@ TEST(Rle, RleRun) { RleRun::values_count_type value_count = 12; // 12 times the value 21 fitting over 5 bits - auto const run_5 = RleRun(value.data(), value_count, /* value_bit_width= */ 5); + const auto run_5 = RleRun(value.data(), value_count, /* value_bit_width= */ 5); EXPECT_EQ(run_5.ValuesCount(), value_count); EXPECT_EQ(run_5.ValuesBitWidth(), 5); EXPECT_EQ(run_5.RawDataSize(), 1); // 5 bits fit in one byte EXPECT_EQ(*run_5.RawDataPtr(), 21); // 12 times the value 21 fitting over 16 bits - auto const run_8 = RleRun(value.data(), value_count, /* value_bit_width= */ 8); + const auto run_8 = RleRun(value.data(), value_count, /* value_bit_width= */ 8); EXPECT_EQ(run_8.ValuesCount(), value_count); EXPECT_EQ(run_8.ValuesBitWidth(), 8); EXPECT_EQ(run_8.RawDataSize(), 1); // 8 bits fit in 1 byte EXPECT_EQ(*run_8.RawDataPtr(), 21); // 12 times the value {21, 2} fitting over 10 bits - auto const run_10 = RleRun(value.data(), value_count, /* value_bit_width= */ 10); + const auto run_10 = RleRun(value.data(), value_count, /* value_bit_width= */ 10); EXPECT_EQ(run_10.ValuesCount(), value_count); 
EXPECT_EQ(run_10.ValuesBitWidth(), 10); @@ -240,7 +240,7 @@ TEST(Rle, RleRun) { EXPECT_EQ(*(run_10.RawDataPtr() + 1), 2); // 12 times the value {21, 2} fitting over 32 bits - auto const run_32 = RleRun(value.data(), value_count, /* value_bit_width= */ 32); + const auto run_32 = RleRun(value.data(), value_count, /* value_bit_width= */ 32); EXPECT_EQ(run_32.ValuesCount(), value_count); EXPECT_EQ(run_32.ValuesBitWidth(), 32); EXPECT_EQ(run_32.RawDataSize(), 4); // 32 bits fit in 4 bytes @@ -257,7 +257,7 @@ TEST(BitPacked, BitPackedRun) { // 16 values of 1 bit for a total of 16 bits BitPackedRun::values_count_type value_count_1 = 16; - auto const run_1 = BitPackedRun(value.data(), value_count_1, /* value_bit_width= */ 1); + const auto run_1 = BitPackedRun(value.data(), value_count_1, /* value_bit_width= */ 1); EXPECT_EQ(run_1.ValuesCount(), value_count_1); EXPECT_EQ(run_1.ValuesBitWidth(), 1); EXPECT_EQ(run_1.RawDataSize(), 2); // 16 bits fit in 2 bytes @@ -267,7 +267,7 @@ TEST(BitPacked, BitPackedRun) { // 8 values of 3 bits for a total of 24 bits BitPackedRun::values_count_type value_count_3 = 8; - auto const run_3 = BitPackedRun(value.data(), value_count_3, /* value_bit_width= */ 3); + const auto run_3 = BitPackedRun(value.data(), value_count_3, /* value_bit_width= */ 3); EXPECT_EQ(run_3.ValuesCount(), value_count_3); EXPECT_EQ(run_3.ValuesBitWidth(), 3); EXPECT_EQ(run_3.RawDataSize(), 3); // 24 bits fit in 3 bytes @@ -282,7 +282,7 @@ void TestRleDecoder(std::vector bytes, RleRun::values_count_type value_ // Pre-requisite for this test EXPECT_GT(value_count, 6); - auto const run = RleRun(bytes.data(), value_count, bit_width); + const auto run = RleRun(bytes.data(), value_count, bit_width); auto decoder = RleRunDecoder(run); std::vector vals = {0, 0}; @@ -344,7 +344,7 @@ void TestBitPackedDecoder(std::vector bytes, // Pre-requisite for this test EXPECT_GT(value_count, 6); - auto const run = BitPackedRun(bytes.data(), value_count, bit_width); + const auto run = 
BitPackedRun(bytes.data(), value_count, bit_width); auto decoder = BitPackedRunDecoder(run); std::vector vals = {0, 0}; @@ -432,8 +432,8 @@ void TestRleBitPackedParser(std::vector bytes, auto OnRleRun(RleRun run) { rle_decoder_ptr_->Reset(run); - auto const n_decoded = decoded_ptr_->size(); - auto const n_to_decode = rle_decoder_ptr_->Remaining(); + const auto n_decoded = decoded_ptr_->size(); + const auto n_to_decode = rle_decoder_ptr_->Remaining(); decoded_ptr_->resize(n_decoded + n_to_decode); EXPECT_EQ(rle_decoder_ptr_->GetBatch(decoded_ptr_->data() + n_decoded, n_to_decode), n_to_decode); @@ -445,8 +445,8 @@ void TestRleBitPackedParser(std::vector bytes, auto OnBitPackedRun(BitPackedRun run) { bit_packed_decoder_ptr_->Reset(run); - auto const n_decoded = decoded_ptr_->size(); - auto const n_to_decode = bit_packed_decoder_ptr_->Remaining(); + const auto n_decoded = decoded_ptr_->size(); + const auto n_to_decode = bit_packed_decoder_ptr_->Remaining(); decoded_ptr_->resize(n_decoded + n_to_decode); EXPECT_EQ(bit_packed_decoder_ptr_->GetBatch(decoded_ptr_->data() + n_decoded, n_to_decode), @@ -865,10 +865,10 @@ void CheckRoundTrip(const Array& data, int bit_width, bool spaced, int32_t parts using ArrayType = typename TypeTraits::ArrayType; using value_type = typename Type::c_type; - int const data_size = static_cast(data.length()); - int const data_values_count = + const int data_size = static_cast(data.length()); + const int data_values_count = static_cast(data.length() - spaced * data.null_count()); - int const buffer_size = RleBitPackedEncoder::MaxBufferSize(bit_width, data_size); + const int buffer_size = RleBitPackedEncoder::MaxBufferSize(bit_width, data_size); ASSERT_GE(parts, 1); ASSERT_LE(parts, data_size); @@ -905,7 +905,7 @@ void CheckRoundTrip(const Array& data, int bit_width, bool spaced, int32_t parts int32_t actual_read_count = 0; int32_t requested_read_count = 0; while (requested_read_count < data_size) { - auto const remaining = data_size - 
requested_read_count; + const auto remaining = data_size - requested_read_count; auto to_read = data_size / parts; if (remaining / to_read == 1) { to_read = remaining; @@ -1006,7 +1006,7 @@ struct DataTestRleBitPacked { std::vector> arrays = {}; - for (auto const& dyn_part : parts) { + for (const auto& dyn_part : parts) { if (auto* part = std::get_if(&dyn_part)) { auto arr = rand.Numeric(part->size, /* min= */ value_type(0), part->max, part->null_probability); From c91638ba0d181416ef37aabf6fa5b55c31da6392 Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Wed, 17 Sep 2025 12:53:42 +0200 Subject: [PATCH 48/56] Address newer reviewer comments --- .../arrow/util/bit_stream_utils_internal.h | 12 +++- cpp/src/arrow/util/bit_util_test.cc | 30 ++++---- cpp/src/arrow/util/rle_encoding_test.cc | 71 ++++++++----------- 3 files changed, 52 insertions(+), 61 deletions(-) diff --git a/cpp/src/arrow/util/bit_stream_utils_internal.h b/cpp/src/arrow/util/bit_stream_utils_internal.h index 1673a1c8d20c..1f3b699e1ac6 100644 --- a/cpp/src/arrow/util/bit_stream_utils_internal.h +++ b/cpp/src/arrow/util/bit_stream_utils_internal.h @@ -433,7 +433,15 @@ inline bool BitWriter::PutVlqInt(Int v) { uint8_t buffer[kBufferSize] = {}; const auto bytes_written = WriteLEB128(v, buffer, kBufferSize); ARROW_DCHECK_LE(bytes_written, kBufferSize); - ARROW_DCHECK_GT(bytes_written, 0); // Cannot fail since we gave max space + if constexpr (std::is_signed_v) { + // Can fail if negative + if (ARROW_PREDICT_FALSE(!bytes_written == 0)) { + return false; + } + } else { + // Cannot fail since we gave max space + ARROW_DCHECK_GT(bytes_written, 0); + } for (int i = 0; i < bytes_written; ++i) { const bool success = PutAligned(buffer[i], 1); @@ -450,7 +458,7 @@ inline bool BitReader::GetVlqInt(Int* v) { static_assert(std::is_integral_v); // The data that we will pass to the LEB128 parser - // In all case, we read an byte-aligned value, skipping remaining bits + // In all case, we read a byte-aligned value, skipping 
remaining bits const uint8_t* data = NULLPTR; int max_size = 0; diff --git a/cpp/src/arrow/util/bit_util_test.cc b/cpp/src/arrow/util/bit_util_test.cc index ab007d4a98ba..e8cee340dedc 100644 --- a/cpp/src/arrow/util/bit_util_test.cc +++ b/cpp/src/arrow/util/bit_util_test.cc @@ -1999,9 +1999,14 @@ TEST(BitUtil, RoundUpToPowerOf2) { /// Test the maximum number of bytes needed to write a LEB128 of a give size. TEST(LEB128, MaxLEB128ByteLenFor) { + EXPECT_EQ(bit_util::kMaxLEB128ByteLenFor, 2); + EXPECT_EQ(bit_util::kMaxLEB128ByteLenFor, 2); EXPECT_EQ(bit_util::kMaxLEB128ByteLenFor, 3); + EXPECT_EQ(bit_util::kMaxLEB128ByteLenFor, 3); EXPECT_EQ(bit_util::kMaxLEB128ByteLenFor, 5); + EXPECT_EQ(bit_util::kMaxLEB128ByteLenFor, 5); EXPECT_EQ(bit_util::kMaxLEB128ByteLenFor, 10); + EXPECT_EQ(bit_util::kMaxLEB128ByteLenFor, 10); } /// Utility function to test LEB128 encoding with known input value and expected byte @@ -2037,6 +2042,7 @@ TEST(LEB128, WriteEdgeCases) { // Three byte value 16384, encoded in larger buffer TestLEB128Encode(16384U, {0x80, 0x80, 0x01}, 10); // Two byte boundary values + TestLEB128Encode(128U, {0x80, 0x01}, 2); TestLEB128Encode(129U, {0x81, 0x01}, 2); TestLEB128Encode(16383U, {0xFF, 0x7F}, 2); // Error case: Buffer too small for value 128 (needs 2 bytes but only 1 provided) @@ -2062,14 +2068,6 @@ void TestLEB128Decode(const std::vector& data, Int expected_value, } } -template -void TestLEB128Decode(const std::vector& data, Int expected_value, - std::size_t expected_bytes_read) { - ASSERT_LE(expected_bytes_read, std::numeric_limits::max()); - return TestLEB128Decode(data, expected_value, - static_cast(expected_bytes_read)); -} - /// Test decoding from known LEB128 byte sequences with edge case parameters. /// \see LEB128.KnownSuccessfulValues for other known values tested. 
TEST(LEB128, ReadEdgeCases) { @@ -2129,48 +2127,48 @@ TEST(LEB128, KnownSuccessfulValues) { if (data.value <= static_cast(std::numeric_limits::max())) { const auto val = static_cast(data.value); TestLEB128Encode(val, data.bytes, data.bytes.size()); - TestLEB128Decode(data.bytes, val, data.bytes.size()); + TestLEB128Decode(data.bytes, val, static_cast(data.bytes.size())); } if (data.value <= static_cast(std::numeric_limits::max())) { const auto val = static_cast(data.value); TestLEB128Encode(val, data.bytes, data.bytes.size()); - TestLEB128Decode(data.bytes, val, data.bytes.size()); + TestLEB128Decode(data.bytes, val, static_cast(data.bytes.size())); } // 16 bits if (data.value <= static_cast(std::numeric_limits::max())) { const auto val = static_cast(data.value); TestLEB128Encode(val, data.bytes, data.bytes.size()); - TestLEB128Decode(data.bytes, val, data.bytes.size()); + TestLEB128Decode(data.bytes, val, static_cast(data.bytes.size())); } if (data.value <= static_cast(std::numeric_limits::max())) { const auto val = static_cast(data.value); TestLEB128Encode(val, data.bytes, data.bytes.size()); - TestLEB128Decode(data.bytes, val, data.bytes.size()); + TestLEB128Decode(data.bytes, val, static_cast(data.bytes.size())); } // 32 bits if (data.value <= static_cast(std::numeric_limits::max())) { const auto val = static_cast(data.value); TestLEB128Encode(val, data.bytes, data.bytes.size()); - TestLEB128Decode(data.bytes, val, data.bytes.size()); + TestLEB128Decode(data.bytes, val, static_cast(data.bytes.size())); } if (data.value <= static_cast(std::numeric_limits::max())) { const auto val = static_cast(data.value); TestLEB128Encode(val, data.bytes, data.bytes.size()); - TestLEB128Decode(data.bytes, val, data.bytes.size()); + TestLEB128Decode(data.bytes, val, static_cast(data.bytes.size())); } // 64 bits if (data.value <= static_cast(std::numeric_limits::max())) { const auto val = static_cast(data.value); TestLEB128Encode(val, data.bytes, data.bytes.size()); - 
TestLEB128Decode(data.bytes, val, data.bytes.size()); + TestLEB128Decode(data.bytes, val, static_cast(data.bytes.size())); } if (data.value <= static_cast(std::numeric_limits::max())) { const auto val = static_cast(data.value); TestLEB128Encode(val, data.bytes, data.bytes.size()); - TestLEB128Decode(data.bytes, val, data.bytes.size()); + TestLEB128Decode(data.bytes, val, static_cast(data.bytes.size())); } } } diff --git a/cpp/src/arrow/util/rle_encoding_test.cc b/cpp/src/arrow/util/rle_encoding_test.cc index d456891350a7..f16583525a49 100644 --- a/cpp/src/arrow/util/rle_encoding_test.cc +++ b/cpp/src/arrow/util/rle_encoding_test.cc @@ -223,14 +223,14 @@ TEST(Rle, RleRun) { EXPECT_EQ(run_5.RawDataSize(), 1); // 5 bits fit in one byte EXPECT_EQ(*run_5.RawDataPtr(), 21); - // 12 times the value 21 fitting over 16 bits + // 12 times the value 21 fitting over 8 bits const auto run_8 = RleRun(value.data(), value_count, /* value_bit_width= */ 8); EXPECT_EQ(run_8.ValuesCount(), value_count); EXPECT_EQ(run_8.ValuesBitWidth(), 8); EXPECT_EQ(run_8.RawDataSize(), 1); // 8 bits fit in 1 byte EXPECT_EQ(*run_8.RawDataPtr(), 21); - // 12 times the value {21, 2} fitting over 10 bits + // 12 times the value 533 (21 + 2 * 2^8) fitting over 10 bits const auto run_10 = RleRun(value.data(), value_count, /* value_bit_width= */ 10); EXPECT_EQ(run_10.ValuesCount(), value_count); @@ -239,7 +239,7 @@ TEST(Rle, RleRun) { EXPECT_EQ(*(run_10.RawDataPtr() + 0), 21); EXPECT_EQ(*(run_10.RawDataPtr() + 1), 2); - // 12 times the value {21, 2} fitting over 32 bits + // 12 times the value 533 (21 + 2 * 2^8) fitting over 32 bits const auto run_32 = RleRun(value.data(), value_count, /* value_bit_width= */ 32); EXPECT_EQ(run_32.ValuesCount(), value_count); EXPECT_EQ(run_32.ValuesBitWidth(), 32); @@ -261,9 +261,7 @@ TEST(BitPacked, BitPackedRun) { EXPECT_EQ(run_1.ValuesCount(), value_count_1); EXPECT_EQ(run_1.ValuesBitWidth(), 1); EXPECT_EQ(run_1.RawDataSize(), 2); // 16 bits fit in 2 bytes - for 
(BitPackedRun::raw_data_size_type i = 0; i < run_1.RawDataSize(); ++i) { - EXPECT_EQ(*(run_1.RawDataPtr() + i), value[i]); - } + EXPECT_EQ(run_1.RawDataPtr(), value.data()); // 8 values of 3 bits for a total of 24 bits BitPackedRun::values_count_type value_count_3 = 8; @@ -271,9 +269,7 @@ TEST(BitPacked, BitPackedRun) { EXPECT_EQ(run_3.ValuesCount(), value_count_3); EXPECT_EQ(run_3.ValuesBitWidth(), 3); EXPECT_EQ(run_3.RawDataSize(), 3); // 24 bits fit in 3 bytes - for (BitPackedRun::raw_data_size_type i = 0; i < run_3.RawDataSize(); ++i) { - EXPECT_EQ(*(run_3.RawDataPtr() + i), value[i]); - } + EXPECT_EQ(run_3.RawDataPtr(), value.data()); } template @@ -324,13 +320,13 @@ void TestRleDecoder(std::vector bytes, RleRun::values_count_type value_ } TEST(Rle, RleDecoder) { - TestRleDecoder({21, 0, 0}, /* value_count= */ 21, /* bit_width= */ 5, + TestRleDecoder({21, 0, 0}, /* value_count= */ 23, /* bit_width= */ 5, /* expected_value= */ 21); TestRleDecoder({1, 0}, /* value_count= */ 13, /* bit_width= */ 1, /* expected_value= */ 1); - TestRleDecoder({21, 0, 0}, /* value_count= */ 21, /* bit_width= */ 5, + TestRleDecoder({21, 0, 0}, /* value_count= */ 23, /* bit_width= */ 5, /* expected_value= */ 21); - TestRleDecoder({21, 0, 0}, /* value_count= */ 21, /* bit_width= */ 5, + TestRleDecoder({21, 0, 0}, /* value_count= */ 23, /* bit_width= */ 5, /* expected_value= */ 21); TestRleDecoder({21, 2, 0, 1}, /* value_count= */ 20, /* bit_width= */ 30, /* expected_value= */ 16777749); @@ -857,7 +853,7 @@ TEST(BitRle, Overflow) { /// /// \param spaced If set to false, treat Nulls in the input array as regular data. /// \param parts The number of parts in which the data will be decoded. -/// For number greater than one, this ensure that the decoder intermediary state +/// For number greater than one, this ensure that the decoder intermediate state /// is valid. 
template void CheckRoundTrip(const Array& data, int bit_width, bool spaced, int32_t parts, @@ -892,7 +888,7 @@ void CheckRoundTrip(const Array& data, int bit_width, bool spaced, int32_t parts // Now we verify batch read RleBitPackedDecoder decoder(buffer.data(), encoded_byte_size, bit_width); - // We will only use one of them depending on whether this is a dictonnary tests + // We will only use one of them depending on whether this is a dictionary tests std::vector dict_read; std::vector values_read; if (dict) { @@ -902,49 +898,46 @@ void CheckRoundTrip(const Array& data, int bit_width, bool spaced, int32_t parts } // We will read the data in `parts` calls to make sure intermediate states are valid - int32_t actual_read_count = 0; - int32_t requested_read_count = 0; - while (requested_read_count < data_size) { - const auto remaining = data_size - requested_read_count; + int32_t total_read_count = 0; + while (total_read_count < data_size) { + const auto remaining = data_size - total_read_count; auto to_read = data_size / parts; if (remaining / to_read == 1) { to_read = remaining; } - auto read = 0; + int32_t read = 0; if (spaced) { // We need to slice the input array get the proper null count and bitmap - auto data_remaining = data.Slice(requested_read_count, to_read); + auto data_remaining = data.Slice(total_read_count, to_read); if (dict) { - auto* out = dict_read.data() + requested_read_count; + auto* out = dict_read.data() + total_read_count; read = decoder.GetBatchWithDictSpaced( dict->raw_values(), static_cast(dict->length()), out, to_read, static_cast(data_remaining->null_count()), data_remaining->null_bitmap_data(), data_remaining->offset()); } else { - auto* out = values_read.data() + requested_read_count; + auto* out = values_read.data() + total_read_count; read = decoder.GetBatchSpaced( to_read, static_cast(data_remaining->null_count()), data_remaining->null_bitmap_data(), data_remaining->offset(), out); } } else { if (dict) { - auto* out = 
dict_read.data() + requested_read_count; + auto* out = dict_read.data() + total_read_count; read = decoder.GetBatchWithDict( dict->raw_values(), static_cast(dict->length()), out, to_read); } else { - auto* out = values_read.data() + requested_read_count; + auto* out = values_read.data() + total_read_count; read = decoder.GetBatch(out, to_read); } } ASSERT_EQ(read, to_read) << "Decoder did not read as many values as requested"; - actual_read_count += read; - requested_read_count += to_read; + total_read_count += read; } - EXPECT_EQ(requested_read_count, data_size) << "This test logic is wrong"; - EXPECT_EQ(actual_read_count, data_size) << "Total number of values read is off"; + EXPECT_EQ(total_read_count, data_size) << "Total number of values read is off"; // Verify the round trip: encoded-decoded values must equal the original one for (int64_t i = 0; i < data_size; ++i) { @@ -1030,14 +1023,6 @@ struct DataTestRleBitPacked { } }; -template -struct GetBatchSpacedTestCase { - T max_value; - int64_t size; - double null_probability; - int bit_width; -}; - template void DoTestGetBatchSpacedRoundtrip() { using Data = DataTestRleBitPacked; @@ -1053,7 +1038,7 @@ void DoTestGetBatchSpacedRoundtrip() { }, { { - RandomPart{/* max=*/7, /* size=*/10037, /* null_proba= */ 0.0}, + RandomPart{/* max=*/7, /* size=*/1037, /* null_proba= */ 0.0}, NullPart{/* size= */ 1153}, RandomPart{/* max=*/7, /* size=*/800, /* null_proba= */ 0.5}, }, @@ -1069,13 +1054,13 @@ void DoTestGetBatchSpacedRoundtrip() { /* bit_width= */ 11, }, { - {RepeatPart{/* value=*/13, /* size=*/100000, /* null_proba= */ 0.01}}, + {RepeatPart{/* value=*/13, /* size=*/1024, /* null_proba= */ 0.01}}, /* bit_width= */ 10, }, { { NullPart{/* size= */ 1024}, - RepeatPart{/* value=*/static_cast(10000), /* size=*/100000, + RepeatPart{/* value=*/static_cast(10000), /* size=*/1025, /* null_proba= */ 0.1}, NullPart{/* size= */ 77}, }, @@ -1083,9 +1068,9 @@ void DoTestGetBatchSpacedRoundtrip() { }, { { - RepeatPart{/* 
value=*/13, /* size=*/100000, /* null_proba= */ 0.0}, + RepeatPart{/* value=*/13, /* size=*/1023, /* null_proba= */ 0.0}, NullPart{/* size= */ 1153}, - RepeatPart{/* value=*/72, /* size=*/100799, /* null_proba= */ 0.5}, + RepeatPart{/* value=*/72, /* size=*/1799, /* null_proba= */ 0.5}, }, /* bit_width= */ 10, }, @@ -1097,7 +1082,7 @@ void DoTestGetBatchSpacedRoundtrip() { NullPart{/* size=*/128}, RepeatPart{0, /* size= */ 256, /* null_proba= */ 0.0}, NullPart{/* size=*/15}, - RandomPart{/* max=*/1, /* size=*/8 * 1024, /* null_proba= */ 0.01}, + RandomPart{/* max=*/1, /* size=*/1024, /* null_proba= */ 0.01}, }, /* bit_width= */ 1, }, @@ -1114,7 +1099,7 @@ void DoTestGetBatchSpacedRoundtrip() { constexpr int kBitsAvailable = 8 * sizeof(T) - (std::is_signed_v ? 1 : 0); for (auto case_ : test_cases) { - if (static_cast(case_.bit_width) > kBitsAvailable) { + if (case_.bit_width > kBitsAvailable) { continue; } From 11bb10a191642ff947c451c37514bf7e7be789cc Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Wed, 17 Sep 2025 14:03:27 +0200 Subject: [PATCH 49/56] Inline small method and remove [[nodiscard]] --- cpp/src/arrow/util/rle_encoding_internal.h | 570 ++++++++------------- 1 file changed, 213 insertions(+), 357 deletions(-) diff --git a/cpp/src/arrow/util/rle_encoding_internal.h b/cpp/src/arrow/util/rle_encoding_internal.h index 2f937dfc9320..ddcb3bc42ff4 100644 --- a/cpp/src/arrow/util/rle_encoding_internal.h +++ b/cpp/src/arrow/util/rle_encoding_internal.h @@ -106,22 +106,31 @@ class RleRun { constexpr RleRun(RleRun&&) noexcept = default; explicit RleRun(raw_data_const_pointer data, values_count_type values_count, - bit_size_type value_bit_width) noexcept; + bit_size_type value_bit_width) noexcept + : values_count_(values_count), value_bit_width_(value_bit_width) { + ARROW_DCHECK_GE(value_bit_width, 0); + ARROW_DCHECK_GE(values_count, 0); + std::copy(data, data + RawDataSize(), data_.begin()); + } constexpr RleRun& operator=(const RleRun&) noexcept = default; 
constexpr RleRun& operator=(RleRun&&) noexcept = default; /// The number of repeated values in this run. - [[nodiscard]] constexpr values_count_type ValuesCount() const noexcept; + constexpr values_count_type ValuesCount() const noexcept { return values_count_; } /// The size in bits of each encoded value. - [[nodiscard]] constexpr bit_size_type ValuesBitWidth() const noexcept; + constexpr bit_size_type ValuesBitWidth() const noexcept { return value_bit_width_; } /// A pointer to the repeated value raw bytes. - [[nodiscard]] constexpr raw_data_const_pointer RawDataPtr() const noexcept; + constexpr raw_data_const_pointer RawDataPtr() const noexcept { return data_.data(); } /// The number of bytes used for the raw repeated value. - [[nodiscard]] constexpr raw_data_size_type RawDataSize() const noexcept; + constexpr raw_data_size_type RawDataSize() const noexcept { + auto out = bit_util::BytesForBits(value_bit_width_); + ARROW_DCHECK_LE(out, std::numeric_limits::max()); + return static_cast(out); + } private: /// The repeated value raw bytes stored inside the class @@ -155,19 +164,28 @@ class BitPackedRun { constexpr BitPackedRun(BitPackedRun&&) noexcept = default; constexpr BitPackedRun(raw_data_const_pointer data, values_count_type values_count, - bit_size_type value_bit_width) noexcept; + bit_size_type value_bit_width) noexcept + : data_(data), values_count_(values_count), value_bit_width_(value_bit_width) { + ARROW_CHECK_GE(value_bit_width_, 0); + ARROW_CHECK_GE(values_count_, 0); + } constexpr BitPackedRun& operator=(const BitPackedRun&) noexcept = default; constexpr BitPackedRun& operator=(BitPackedRun&&) noexcept = default; - [[nodiscard]] constexpr values_count_type ValuesCount() const noexcept; + constexpr values_count_type ValuesCount() const noexcept { return values_count_; } /// The size in bits of each encoded value. 
- [[nodiscard]] constexpr bit_size_type ValuesBitWidth() const noexcept; + constexpr bit_size_type ValuesBitWidth() const noexcept { return value_bit_width_; } - [[nodiscard]] constexpr raw_data_const_pointer RawDataPtr() const noexcept; + constexpr raw_data_const_pointer RawDataPtr() const noexcept { return data_; } - [[nodiscard]] constexpr raw_data_size_type RawDataSize() const noexcept; + constexpr raw_data_size_type RawDataSize() const noexcept { + auto out = bit_util::BytesForBits(static_cast(value_bit_width_) * + static_cast(values_count_)); + ARROW_CHECK_LE(out, std::numeric_limits::max()); + return static_cast(out); + } private: /// The pointer to the beginning of the run @@ -192,17 +210,20 @@ class RleBitPackedParser { constexpr RleBitPackedParser() noexcept = default; constexpr RleBitPackedParser(raw_data_const_pointer data, raw_data_size_type data_size, - bit_size_type value_bit_width) noexcept; + bit_size_type value_bit_width) noexcept + : data_(data), data_size_(data_size), value_bit_width_(value_bit_width) {} constexpr void Reset(raw_data_const_pointer data, raw_data_size_type data_size, - bit_size_type value_bit_width_) noexcept; + bit_size_type value_bit_width) noexcept { + *this = {data, data_size, value_bit_width}; + } /// Whether there is still runs to iterate over. /// /// WARN: Due to simplistic error handling, iteration with Next and Peek could /// fail to return data while the parser is not exhausted. /// This is how one can check for errors. - [[nodiscard]] bool Exhausted() const; + bool Exhausted() const { return data_size_ == 0; } /// Enum to return from an ``Parse`` handler. 
/// @@ -255,25 +276,56 @@ class RleRunDecoder { constexpr RleRunDecoder() noexcept = default; - explicit RleRunDecoder(const run_type& run) noexcept; + explicit RleRunDecoder(const run_type& run) noexcept { Reset(run); } - void Reset(const run_type& run) noexcept; + void Reset(const run_type& run) noexcept { + remaining_count_ = run.ValuesCount(); + if constexpr (std::is_same_v) { + // ARROW-18031: just check the LSB of the next byte and move on. + // If we memcpy + FromLittleEndian, we have potential undefined behavior + // if the bool value isn't 0 or 1. + value_ = *run.RawDataPtr() & 1; + } else { + // Memcopy is required to avoid undefined behavior. + value_ = {}; + std::memcpy(&value_, run.RawDataPtr(), run.RawDataSize()); + value_ = ::arrow::bit_util::FromLittleEndian(value_); + } + } /// Return the number of values that can be advanced. - [[nodiscard]] values_count_type Remaining() const; + values_count_type Remaining() const { return remaining_count_; } /// Return the repeated value of this decoder. - [[nodiscard]] constexpr value_type Value() const; + constexpr value_type Value() const { return value_; } /// Try to advance by as many values as provided. /// Return the number of values skipped. - [[nodiscard]] values_count_type Advance(values_count_type batch_size); + /// May advance by less than asked for. + [[nodiscard]] values_count_type Advance(values_count_type batch_size) { + const auto steps = std::min(batch_size, remaining_count_); + remaining_count_ -= steps; + return steps; + } /// Get the next value and return false if there are no more. - [[nodiscard]] constexpr bool Get(value_type* out_value); + [[nodiscard]] constexpr bool Get(value_type* out_value) { + return GetBatch(out_value, 1) == 1; + } /// Get a batch of values return the number of decoded elements. - [[nodiscard]] values_count_type GetBatch(value_type* out, values_count_type batch_size); + /// May write fewer elements to the output than requested. 
+ [[nodiscard]] values_count_type GetBatch(value_type* out, + values_count_type batch_size) { + if (ARROW_PREDICT_FALSE(remaining_count_ == 0)) { + return 0; + } + + const auto to_read = std::min(remaining_count_, batch_size); + std::fill(out, out + to_read, value_); + remaining_count_ -= to_read; + return to_read; + } private: value_type value_ = {}; @@ -296,25 +348,50 @@ class BitPackedRunDecoder { BitPackedRunDecoder() noexcept = default; - explicit BitPackedRunDecoder(const run_type& run) noexcept; + explicit BitPackedRunDecoder(const run_type& run) noexcept { Reset(run); } - void Reset(const run_type& run) noexcept; + void Reset(const run_type& run) noexcept { + value_bit_width_ = run.ValuesBitWidth(); + remaining_count_ = run.ValuesCount(); + ARROW_DCHECK_GE(value_bit_width_, 0); + ARROW_DCHECK_LE(value_bit_width_, 64); + bit_reader_.Reset(run.RawDataPtr(), run.RawDataSize()); + } /// Return the number of values that can be advanced. - [[nodiscard]] constexpr values_count_type Remaining() const; + constexpr values_count_type Remaining() const { return remaining_count_; } /// Return the size in bit in which each encoded value is written. - [[nodiscard]] constexpr bit_size_type ValueBitWidth() const; + constexpr bit_size_type ValueBitWidth() const { return value_bit_width_; } /// Try to advance by as many values as provided. - /// Return the number of values skipped. - [[nodiscard]] values_count_type Advance(values_count_type batch_size); + /// Return the number of values skipped or 0 if it fail to advance. + [[nodiscard]] values_count_type Advance(values_count_type batch_size) { + const auto steps = std::min(batch_size, remaining_count_); + if (bit_reader_.Advance(steps * value_bit_width_)) { + remaining_count_ -= steps; + return steps; + } + return 0; + } /// Get the next value and return false if there are no more. 
- [[nodiscard]] bool Get(value_type* out_value); + [[nodiscard]] bool Get(value_type* out_value) { return GetBatch(out_value, 1) == 1; } /// Get a batch of values return the number of decoded elements. - [[nodiscard]] values_count_type GetBatch(value_type* out, values_count_type batch_size); + [[nodiscard]] values_count_type GetBatch(value_type* out, + values_count_type batch_size) { + if (ARROW_PREDICT_FALSE(remaining_count_ == 0)) { + return 0; + } + + const auto to_read = std::min(remaining_count_, batch_size); + const auto actual_read = bit_reader_.GetBatch(value_bit_width_, out, to_read); + // There should not be any reason why the actual read would be different + // but this is error resistant. + remaining_count_ -= actual_read; + return actual_read; + } private: ::arrow::bit_util::BitReader bit_reader_ = {}; @@ -345,17 +422,24 @@ class RleBitPackedDecoder { /// data and data_size are the raw bytes to decode. /// value_bit_width is the size in bits of each encoded value. RleBitPackedDecoder(raw_data_const_pointer data, raw_data_size_type data_size, - bit_size_type value_bit_width) noexcept; + bit_size_type value_bit_width) noexcept { + Reset(data, data_size, value_bit_width); + } void Reset(raw_data_const_pointer data, raw_data_size_type data_size, - bit_size_type value_bit_width_) noexcept; + bit_size_type value_bit_width) noexcept { + ARROW_DCHECK_GE(value_bit_width, 0); + ARROW_DCHECK_LE(value_bit_width, 64); + parser_.Reset(data, data_size, value_bit_width); + decoder_ = {}; + } /// Whether there is still runs to iterate over. /// /// WARN: Due to lack of proper error handling, iteration with Get methods could return /// no data while the parser is not exhausted. /// This is how one can check for errors. - [[nodiscard]] bool Exhausted() const; + bool Exhausted() const { return (RunRemaining() == 0) && parser_.Exhausted(); } /// Gets the next value. Returns false if there are no more. 
/// @@ -397,11 +481,15 @@ class RleBitPackedDecoder { std::variant, BitPackedRunDecoder> decoder_ = {}; /// Return the number of values that are remaining in the current run. - [[nodiscard]] values_count_type RunRemaining() const; + values_count_type RunRemaining() const { + return std::visit([](const auto& dec) { return dec.Remaining(); }, decoder_); + } /// Get a batch of values from the current run and return the number elements read. [[nodiscard]] values_count_type RunGetBatch(value_type* out, - values_count_type batch_size); + values_count_type batch_size) { + return std::visit([&](auto& dec) { return dec.GetBatch(out, batch_size); }, decoder_); + } /// Call the parser with a single callable for all event types. template @@ -552,37 +640,90 @@ class RleBitPackedEncoder { uint8_t* literal_indicator_byte_; }; -/************************* - * RleBitPackedDecoder * - *************************/ +/************************ + * RleBitPackedParser * + ************************/ -template -RleBitPackedDecoder::RleBitPackedDecoder(raw_data_const_pointer data, - raw_data_size_type data_size, - bit_size_type value_bit_width) noexcept { - Reset(data, data_size, value_bit_width); +template +void RleBitPackedParser::Parse(Handler&& handler) { + while (!Exhausted()) { + auto [read, control] = PeekImpl(handler); + data_ += read; + data_size_ -= read; + if (ARROW_PREDICT_FALSE(control == ControlFlow::Break)) { + break; + } + } } +namespace internal { +/// The maximal unsigned size that a variable can fit. 
template -void RleBitPackedDecoder::Reset(raw_data_const_pointer data, - raw_data_size_type data_size, - bit_size_type value_bit_width) noexcept { - ARROW_DCHECK_GE(value_bit_width, 0); - ARROW_DCHECK_LE(value_bit_width, 64); - parser_.Reset(data, data_size, value_bit_width); - decoder_ = {}; -} +constexpr auto max_size_for_v = + static_cast>(std::numeric_limits::max()); -template -auto RleBitPackedDecoder::RunRemaining() const -> values_count_type { - return std::visit([](const auto& dec) { return dec.Remaining(); }, decoder_); -} +} // namespace internal -template -bool RleBitPackedDecoder::Exhausted() const { - return (RunRemaining() == 0) && parser_.Exhausted(); +template +auto RleBitPackedParser::PeekImpl(Handler&& handler) const + -> std::pair { + ARROW_DCHECK(!Exhausted()); + + constexpr auto kMaxSize = bit_util::kMaxLEB128ByteLenFor; + uint32_t run_len_type = 0; + const auto header_bytes = bit_util::ParseLeadingLEB128(data_, kMaxSize, &run_len_type); + + if (ARROW_PREDICT_FALSE(header_bytes == 0)) { + // Malfomrmed LEB128 data + return {}; + } + + const bool is_bit_packed = run_len_type & 1; + const uint32_t count = run_len_type >> 1; + if (is_bit_packed) { + using values_count_type = BitPackedRun::values_count_type; + constexpr auto kMaxCount = + bit_util::CeilDiv(internal::max_size_for_v, 8); + if (ARROW_PREDICT_FALSE(count == 0 || count > kMaxCount)) { + // Illegal number of encoded values + return {0, ControlFlow::Break}; + } + + const auto values_count = static_cast(count * 8); + ARROW_DCHECK_LT(count, internal::max_size_for_v); + // Count Already divided by 8 + const auto bytes_read = + header_bytes + static_cast(count) * value_bit_width_; + + auto control = handler.OnBitPackedRun( + BitPackedRun(data_ + header_bytes, values_count, value_bit_width_)); + + return {bytes_read, control}; + } + + using values_count_type = RleRun::values_count_type; + if (ARROW_PREDICT_FALSE( + count == 0 || + count > static_cast(std::numeric_limits::max()))) { + // 
Illegal number of encoded values + return {0, ControlFlow::Break}; + } + + const auto values_count = static_cast(count); + const auto value_bytes = bit_util::BytesForBits(value_bit_width_); + ARROW_DCHECK_LT(value_bytes, internal::max_size_for_v); + const auto bytes_read = header_bytes + static_cast(value_bytes); + + auto control = + handler.OnRleRun(RleRun(data_ + header_bytes, values_count, value_bit_width_)); + + return {bytes_read, control}; } +/************************* + * RleBitPackedDecoder * + *************************/ + template template void RleBitPackedDecoder::ParseWithCallable(Callable&& func) { @@ -595,12 +736,6 @@ void RleBitPackedDecoder::ParseWithCallable(Callable&& func) { parser_.Parse(std::move(handler)); } -template -auto RleBitPackedDecoder::RunGetBatch(value_type* out, values_count_type batch_size) - -> values_count_type { - return std::visit([&](auto& dec) { return dec.GetBatch(out, batch_size); }, decoder_); -} - template bool RleBitPackedDecoder::Get(value_type* val) { return GetBatch(val, 1) == 1; @@ -656,8 +791,8 @@ class BatchCounter { public: using size_type = int32_t; - [[nodiscard]] static constexpr BatchCounter FromBatchSizeAndNulls( - size_type batch_size, size_type null_count) { + static constexpr BatchCounter FromBatchSizeAndNulls(size_type batch_size, + size_type null_count) { ARROW_DCHECK_LE(null_count, batch_size); return {batch_size - null_count, null_count}; } @@ -665,11 +800,11 @@ class BatchCounter { constexpr BatchCounter(size_type values_count, size_type null_count) noexcept : values_count_(values_count), null_count_(null_count) {} - [[nodiscard]] constexpr size_type ValuesCount() const noexcept { return values_count_; } + constexpr size_type ValuesCount() const noexcept { return values_count_; } - [[nodiscard]] constexpr size_type ValuesRead() const noexcept { return values_read_; } + constexpr size_type ValuesRead() const noexcept { return values_read_; } - [[nodiscard]] constexpr size_type ValuesRemaining() const 
noexcept { + constexpr size_type ValuesRemaining() const noexcept { ARROW_DCHECK_LE(values_read_, values_count_); return values_count_ - values_read_; } @@ -679,11 +814,11 @@ class BatchCounter { values_read_ += to_read; } - [[nodiscard]] constexpr size_type NullCount() const noexcept { return null_count_; } + constexpr size_type NullCount() const noexcept { return null_count_; } - [[nodiscard]] constexpr size_type NullRead() const noexcept { return null_read_; } + constexpr size_type NullRead() const noexcept { return null_read_; } - [[nodiscard]] constexpr size_type NullRemaining() const noexcept { + constexpr size_type NullRemaining() const noexcept { ARROW_DCHECK_LE(null_read_, null_count_); return null_count_ - null_read_; } @@ -693,19 +828,15 @@ class BatchCounter { null_read_ += to_read; } - [[nodiscard]] constexpr size_type TotalRemaining() const noexcept { + constexpr size_type TotalRemaining() const noexcept { return ValuesRemaining() + NullRemaining(); } - [[nodiscard]] constexpr size_type TotalRead() const noexcept { - return values_read_ + null_read_; - } + constexpr size_type TotalRead() const noexcept { return values_read_ + null_read_; } - [[nodiscard]] constexpr bool IsFullyNull() const noexcept { - return ValuesRemaining() == 0; - } + constexpr bool IsFullyNull() const noexcept { return ValuesRemaining() == 0; } - [[nodiscard]] constexpr bool IsDone() const noexcept { return TotalRemaining() == 0; } + constexpr bool IsDone() const noexcept { return TotalRemaining() == 0; } private: size_type values_count_ = 0; @@ -714,11 +845,6 @@ class BatchCounter { size_type null_read_ = 0; }; -/// The maximal unsigned size that a variable can fit. 
-template -constexpr auto max_size_for_v = - static_cast>(std::numeric_limits::max()); - template struct GetSpacedResult { Int values_read; @@ -992,10 +1118,9 @@ struct NoOpConverter { static constexpr bool kIsIdentity = true; - [[nodiscard]] static constexpr bool InputIsValid(const in_type& values) { return true; } + static constexpr bool InputIsValid(const in_type& values) { return true; } - [[nodiscard]] static constexpr bool InputIsValid(const in_type* values, - size_type length) { + static constexpr bool InputIsValid(const in_type* values, size_type length) { return true; } @@ -1051,11 +1176,9 @@ struct DictionaryConverter { const out_type* dictionary; size_type dictionary_length; - [[nodiscard]] bool InputIsValid(in_type idx) const { - return IndexInRange(idx, dictionary_length); - } + bool InputIsValid(in_type idx) const { return IndexInRange(idx, dictionary_length); } - [[nodiscard]] bool InputIsValid(const in_type* indices, size_type length) const { + bool InputIsValid(const in_type* indices, size_type length) const { ARROW_DCHECK(length > 0); in_type min_index = std::numeric_limits::max(); @@ -1179,273 +1302,6 @@ auto RleBitPackedDecoder::GetBatchWithDictSpaced( return GetSpaced(converter, out, batch_size, valid_bits, valid_bits_offset, null_count); } -/************ - * RleRun * - ************/ - -inline RleRun::RleRun(raw_data_const_pointer data, values_count_type values_count, - bit_size_type value_bit_width) noexcept - : values_count_(values_count), value_bit_width_(value_bit_width) { - ARROW_DCHECK_GE(value_bit_width, 0); - ARROW_DCHECK_GE(values_count, 0); - std::copy(data, data + RawDataSize(), data_.begin()); -} - -constexpr auto RleRun::ValuesCount() const noexcept -> values_count_type { - return values_count_; -} - -constexpr auto RleRun::ValuesBitWidth() const noexcept -> bit_size_type { - return value_bit_width_; -} - -constexpr auto RleRun::RawDataPtr() const noexcept -> raw_data_const_pointer { - return data_.data(); -} - -constexpr auto 
RleRun::RawDataSize() const noexcept -> raw_data_size_type { - auto out = bit_util::BytesForBits(value_bit_width_); - ARROW_DCHECK_LE(out, std::numeric_limits::max()); - return static_cast(out); -}; - -/****************** - * BitPackedRun * - ******************/ - -constexpr BitPackedRun::BitPackedRun(raw_data_const_pointer data, - values_count_type values_count, - bit_size_type value_bit_width) noexcept - : data_(data), values_count_(values_count), value_bit_width_(value_bit_width) { - ARROW_CHECK_GE(value_bit_width_, 0); - ARROW_CHECK_GE(values_count_, 0); -} - -constexpr auto BitPackedRun::ValuesCount() const noexcept -> values_count_type { - return values_count_; -} - -constexpr auto BitPackedRun::ValuesBitWidth() const noexcept -> bit_size_type { - return value_bit_width_; -} - -constexpr auto BitPackedRun::RawDataPtr() const noexcept -> raw_data_const_pointer { - return data_; -} - -constexpr auto BitPackedRun::RawDataSize() const noexcept -> raw_data_size_type { - auto out = bit_util::BytesForBits(static_cast(value_bit_width_) * - static_cast(values_count_)); - ARROW_CHECK_LE(out, std::numeric_limits::max()); - return static_cast(out); -} - -/************************ - * RleBitPackedParser * - ************************/ - -constexpr RleBitPackedParser::RleBitPackedParser(raw_data_const_pointer data, - raw_data_size_type size, - bit_size_type value_bit_width) noexcept { - Reset(data, size, value_bit_width); -} - -constexpr void RleBitPackedParser::Reset(raw_data_const_pointer data, - raw_data_size_type data_size, - bit_size_type value_bit_width) noexcept { - data_ = data; - data_size_ = data_size; - value_bit_width_ = value_bit_width; -} - -inline bool RleBitPackedParser::Exhausted() const { return data_size_ == 0; } - -template -auto RleBitPackedParser::PeekImpl(Handler&& handler) const - -> std::pair { - ARROW_DCHECK(!Exhausted()); - - constexpr auto kMaxSize = bit_util::kMaxLEB128ByteLenFor; - uint32_t run_len_type = 0; - const auto header_bytes = 
bit_util::ParseLeadingLEB128(data_, kMaxSize, &run_len_type); - - if (ARROW_PREDICT_FALSE(header_bytes == 0)) { - // Malfomrmed LEB128 data - return {}; - } - - const bool is_bit_packed = run_len_type & 1; - const uint32_t count = run_len_type >> 1; - if (is_bit_packed) { - using values_count_type = BitPackedRun::values_count_type; - constexpr auto kMaxCount = - bit_util::CeilDiv(internal::max_size_for_v, 8); - if (ARROW_PREDICT_FALSE(count == 0 || count > kMaxCount)) { - // Illegal number of encoded values - return {0, ControlFlow::Break}; - } - - const auto values_count = static_cast(count * 8); - ARROW_DCHECK_LT(count, internal::max_size_for_v); - // Count Already divided by 8 - const auto bytes_read = - header_bytes + static_cast(count) * value_bit_width_; - - auto control = handler.OnBitPackedRun( - BitPackedRun(data_ + header_bytes, values_count, value_bit_width_)); - - return {bytes_read, control}; - } - - using values_count_type = RleRun::values_count_type; - if (ARROW_PREDICT_FALSE( - count == 0 || - count > static_cast(std::numeric_limits::max()))) { - // Illegal number of encoded values - return {0, ControlFlow::Break}; - } - - const auto values_count = static_cast(count); - const auto value_bytes = bit_util::BytesForBits(value_bit_width_); - ARROW_DCHECK_LT(value_bytes, internal::max_size_for_v); - const auto bytes_read = header_bytes + static_cast(value_bytes); - - auto control = - handler.OnRleRun(RleRun(data_ + header_bytes, values_count, value_bit_width_)); - - return {bytes_read, control}; -} - -template -void RleBitPackedParser::Parse(Handler&& handler) { - while (!Exhausted()) { - auto [read, control] = PeekImpl(handler); - data_ += read; - data_size_ -= read; - if (ARROW_PREDICT_FALSE(control == ControlFlow::Break)) { - break; - } - } -} - -/**************** - * RleDecoder * - ****************/ - -template -RleRunDecoder::RleRunDecoder(const run_type& run) noexcept { - Reset(run); -} - -template -void RleRunDecoder::Reset(const run_type& run) 
noexcept { - remaining_count_ = run.ValuesCount(); - if constexpr (std::is_same_v) { - // ARROW-18031: just check the LSB of the next byte and move on. - // If we memcpy + FromLittleEndian, we have potential undefined behavior - // if the bool value isn't 0 or 1. - value_ = *run.RawDataPtr() & 1; - } else { - // Memcopy is required to avoid undefined behavior. - value_ = {}; - std::memcpy(&value_, run.RawDataPtr(), run.RawDataSize()); - value_ = ::arrow::bit_util::FromLittleEndian(value_); - } -} - -template -auto RleRunDecoder::Remaining() const -> values_count_type { - return remaining_count_; -} - -template -auto constexpr RleRunDecoder::Value() const -> value_type { - return value_; -} - -template -auto RleRunDecoder::Advance(values_count_type batch_size) -> values_count_type { - const auto steps = std::min(batch_size, remaining_count_); - remaining_count_ -= steps; - return steps; -} - -template -constexpr bool RleRunDecoder::Get(value_type* out_value) { - return GetBatch(out_value, 1) == 1; -} - -template -auto RleRunDecoder::GetBatch(value_type* out, values_count_type batch_size) - -> values_count_type { - if (ARROW_PREDICT_FALSE(remaining_count_ == 0)) { - return 0; - } - - const auto to_read = std::min(remaining_count_, batch_size); - std::fill(out, out + to_read, value_); - remaining_count_ -= to_read; - return to_read; -} - -/********************** - * BitPackedDecoder * - **********************/ - -template -BitPackedRunDecoder::BitPackedRunDecoder(const run_type& run) noexcept { - Reset(run); -} - -template -void BitPackedRunDecoder::Reset(const run_type& run) noexcept { - value_bit_width_ = run.ValuesBitWidth(); - remaining_count_ = run.ValuesCount(); - ARROW_DCHECK_GE(value_bit_width_, 0); - ARROW_DCHECK_LE(value_bit_width_, 64); - bit_reader_.Reset(run.RawDataPtr(), run.RawDataSize()); -} - -template -auto constexpr BitPackedRunDecoder::Remaining() const -> values_count_type { - return remaining_count_; -} - -template -auto constexpr 
BitPackedRunDecoder::ValueBitWidth() const -> bit_size_type { - return value_bit_width_; -} - -template -auto BitPackedRunDecoder::Advance(values_count_type batch_size) -> values_count_type { - const auto steps = std::min(batch_size, remaining_count_); - if (bit_reader_.Advance(steps * value_bit_width_)) { - remaining_count_ -= steps; - return steps; - } - return 0; -} - -template -bool BitPackedRunDecoder::Get(value_type* out_value) { - return GetBatch(out_value, 1) == 1; -} - -template -auto BitPackedRunDecoder::GetBatch(value_type* out, values_count_type batch_size) - -> values_count_type { - if (ARROW_PREDICT_FALSE(remaining_count_ == 0)) { - return 0; - } - - const auto to_read = std::min(remaining_count_, batch_size); - const auto actual_read = bit_reader_.GetBatch(value_bit_width_, out, to_read); - // There should not be any reason why the actual read would be different - // but this is error resistant. - remaining_count_ -= actual_read; - return actual_read; -} - /************************* * RleBitPackedEncoder * *************************/ From a91a847a69dacb1acf8c45a7c383d15cb17a86bb Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Wed, 17 Sep 2025 14:26:47 +0200 Subject: [PATCH 50/56] snake_case for small const methods --- cpp/src/arrow/util/rle_encoding_internal.h | 168 ++++++++++----------- cpp/src/arrow/util/rle_encoding_test.cc | 92 +++++------ 2 files changed, 130 insertions(+), 130 deletions(-) diff --git a/cpp/src/arrow/util/rle_encoding_internal.h b/cpp/src/arrow/util/rle_encoding_internal.h index ddcb3bc42ff4..bc4533dca36f 100644 --- a/cpp/src/arrow/util/rle_encoding_internal.h +++ b/cpp/src/arrow/util/rle_encoding_internal.h @@ -110,23 +110,23 @@ class RleRun { : values_count_(values_count), value_bit_width_(value_bit_width) { ARROW_DCHECK_GE(value_bit_width, 0); ARROW_DCHECK_GE(values_count, 0); - std::copy(data, data + RawDataSize(), data_.begin()); + std::copy(data, data + raw_data_size(), data_.begin()); } constexpr RleRun& operator=(const 
RleRun&) noexcept = default; constexpr RleRun& operator=(RleRun&&) noexcept = default; /// The number of repeated values in this run. - constexpr values_count_type ValuesCount() const noexcept { return values_count_; } + constexpr values_count_type values_count() const noexcept { return values_count_; } /// The size in bits of each encoded value. - constexpr bit_size_type ValuesBitWidth() const noexcept { return value_bit_width_; } + constexpr bit_size_type values_bit_width() const noexcept { return value_bit_width_; } /// A pointer to the repeated value raw bytes. - constexpr raw_data_const_pointer RawDataPtr() const noexcept { return data_.data(); } + constexpr raw_data_const_pointer raw_data_ptr() const noexcept { return data_.data(); } /// The number of bytes used for the raw repeated value. - constexpr raw_data_size_type RawDataSize() const noexcept { + constexpr raw_data_size_type raw_data_size() const noexcept { auto out = bit_util::BytesForBits(value_bit_width_); ARROW_DCHECK_LE(out, std::numeric_limits::max()); return static_cast(out); @@ -173,14 +173,14 @@ class BitPackedRun { constexpr BitPackedRun& operator=(const BitPackedRun&) noexcept = default; constexpr BitPackedRun& operator=(BitPackedRun&&) noexcept = default; - constexpr values_count_type ValuesCount() const noexcept { return values_count_; } + constexpr values_count_type values_count() const noexcept { return values_count_; } /// The size in bits of each encoded value. 
- constexpr bit_size_type ValuesBitWidth() const noexcept { return value_bit_width_; } + constexpr bit_size_type values_bit_width() const noexcept { return value_bit_width_; } - constexpr raw_data_const_pointer RawDataPtr() const noexcept { return data_; } + constexpr raw_data_const_pointer raw_data_ptr() const noexcept { return data_; } - constexpr raw_data_size_type RawDataSize() const noexcept { + constexpr raw_data_size_type raw_data_size() const noexcept { auto out = bit_util::BytesForBits(static_cast(value_bit_width_) * static_cast(values_count_)); ARROW_CHECK_LE(out, std::numeric_limits::max()); @@ -223,7 +223,7 @@ class RleBitPackedParser { /// WARN: Due to simplistic error handling, iteration with Next and Peek could /// fail to return data while the parser is not exhausted. /// This is how one can check for errors. - bool Exhausted() const { return data_size_ == 0; } + bool exhausted() const { return data_size_ == 0; } /// Enum to return from an ``Parse`` handler. /// @@ -279,25 +279,25 @@ class RleRunDecoder { explicit RleRunDecoder(const run_type& run) noexcept { Reset(run); } void Reset(const run_type& run) noexcept { - remaining_count_ = run.ValuesCount(); + remaining_count_ = run.values_count(); if constexpr (std::is_same_v) { // ARROW-18031: just check the LSB of the next byte and move on. // If we memcpy + FromLittleEndian, we have potential undefined behavior // if the bool value isn't 0 or 1. - value_ = *run.RawDataPtr() & 1; + value_ = *run.raw_data_ptr() & 1; } else { // Memcopy is required to avoid undefined behavior. value_ = {}; - std::memcpy(&value_, run.RawDataPtr(), run.RawDataSize()); + std::memcpy(&value_, run.raw_data_ptr(), run.raw_data_size()); value_ = ::arrow::bit_util::FromLittleEndian(value_); } } /// Return the number of values that can be advanced. - values_count_type Remaining() const { return remaining_count_; } + values_count_type remaining() const { return remaining_count_; } /// Return the repeated value of this decoder. 
- constexpr value_type Value() const { return value_; } + constexpr value_type value() const { return value_; } /// Try to advance by as many values as provided. /// Return the number of values skipped. @@ -351,18 +351,18 @@ class BitPackedRunDecoder { explicit BitPackedRunDecoder(const run_type& run) noexcept { Reset(run); } void Reset(const run_type& run) noexcept { - value_bit_width_ = run.ValuesBitWidth(); - remaining_count_ = run.ValuesCount(); + value_bit_width_ = run.values_bit_width(); + remaining_count_ = run.values_count(); ARROW_DCHECK_GE(value_bit_width_, 0); ARROW_DCHECK_LE(value_bit_width_, 64); - bit_reader_.Reset(run.RawDataPtr(), run.RawDataSize()); + bit_reader_.Reset(run.raw_data_ptr(), run.raw_data_size()); } /// Return the number of values that can be advanced. - constexpr values_count_type Remaining() const { return remaining_count_; } + constexpr values_count_type remaining() const { return remaining_count_; } /// Return the size in bit in which each encoded value is written. - constexpr bit_size_type ValueBitWidth() const { return value_bit_width_; } + constexpr bit_size_type value_bit_width() const { return value_bit_width_; } /// Try to advance by as many values as provided. /// Return the number of values skipped or 0 if it fail to advance. @@ -439,7 +439,7 @@ class RleBitPackedDecoder { /// WARN: Due to lack of proper error handling, iteration with Get methods could return /// no data while the parser is not exhausted. /// This is how one can check for errors. - bool Exhausted() const { return (RunRemaining() == 0) && parser_.Exhausted(); } + bool exhausted() const { return (run_remaining() == 0) && parser_.exhausted(); } /// Gets the next value. Returns false if there are no more. /// @@ -481,8 +481,8 @@ class RleBitPackedDecoder { std::variant, BitPackedRunDecoder> decoder_ = {}; /// Return the number of values that are remaining in the current run. 
- values_count_type RunRemaining() const { - return std::visit([](const auto& dec) { return dec.Remaining(); }, decoder_); + values_count_type run_remaining() const { + return std::visit([](const auto& dec) { return dec.remaining(); }, decoder_); } /// Get a batch of values from the current run and return the number elements read. @@ -646,7 +646,7 @@ class RleBitPackedEncoder { template void RleBitPackedParser::Parse(Handler&& handler) { - while (!Exhausted()) { + while (!exhausted()) { auto [read, control] = PeekImpl(handler); data_ += read; data_size_ -= read; @@ -667,7 +667,7 @@ constexpr auto max_size_for_v = template auto RleBitPackedParser::PeekImpl(Handler&& handler) const -> std::pair { - ARROW_DCHECK(!Exhausted()); + ARROW_DCHECK(!exhausted()); constexpr auto kMaxSize = bit_util::kMaxLEB128ByteLenFor; uint32_t run_len_type = 0; @@ -749,7 +749,7 @@ auto RleBitPackedDecoder::GetBatch(value_type* out, values_count_type batch_s values_count_type values_read = 0; // Remaining from a previous call that would have left some unread data from a run. 
- if (ARROW_PREDICT_FALSE(RunRemaining() > 0)) { + if (ARROW_PREDICT_FALSE(run_remaining() > 0)) { const auto read = RunGetBatch(out, batch_size); values_read += read; out += read; @@ -758,7 +758,7 @@ auto RleBitPackedDecoder::GetBatch(value_type* out, values_count_type batch_s if (ARROW_PREDICT_FALSE(values_read == batch_size)) { return values_read; } - ARROW_DCHECK(RunRemaining() == 0); + ARROW_DCHECK(run_remaining() == 0); } ParseWithCallable([&](auto run) { @@ -800,43 +800,43 @@ class BatchCounter { constexpr BatchCounter(size_type values_count, size_type null_count) noexcept : values_count_(values_count), null_count_(null_count) {} - constexpr size_type ValuesCount() const noexcept { return values_count_; } + constexpr size_type values_count() const noexcept { return values_count_; } - constexpr size_type ValuesRead() const noexcept { return values_read_; } + constexpr size_type values_read() const noexcept { return values_read_; } - constexpr size_type ValuesRemaining() const noexcept { + constexpr size_type values_remaining() const noexcept { ARROW_DCHECK_LE(values_read_, values_count_); return values_count_ - values_read_; } constexpr void AccrueReadValues(size_type to_read) noexcept { - ARROW_DCHECK_LE(to_read, ValuesRemaining()); + ARROW_DCHECK_LE(to_read, values_remaining()); values_read_ += to_read; } - constexpr size_type NullCount() const noexcept { return null_count_; } + constexpr size_type null_count() const noexcept { return null_count_; } - constexpr size_type NullRead() const noexcept { return null_read_; } + constexpr size_type null_read() const noexcept { return null_read_; } - constexpr size_type NullRemaining() const noexcept { + constexpr size_type null_remaining() const noexcept { ARROW_DCHECK_LE(null_read_, null_count_); return null_count_ - null_read_; } constexpr void AccrueReadNulls(size_type to_read) noexcept { - ARROW_DCHECK_LE(to_read, NullRemaining()); + ARROW_DCHECK_LE(to_read, null_remaining()); null_read_ += to_read; } - 
constexpr size_type TotalRemaining() const noexcept { - return ValuesRemaining() + NullRemaining(); + constexpr size_type total_remaining() const noexcept { + return values_remaining() + null_remaining(); } - constexpr size_type TotalRead() const noexcept { return values_read_ + null_read_; } + constexpr size_type total_read() const noexcept { return values_read_ + null_read_; } - constexpr bool IsFullyNull() const noexcept { return ValuesRemaining() == 0; } + constexpr bool is_fully_null() const noexcept { return values_remaining() == 0; } - constexpr bool IsDone() const noexcept { return TotalRemaining() == 0; } + constexpr bool is_done() const noexcept { return total_remaining() == 0; } private: size_type values_count_ = 0; @@ -865,10 +865,10 @@ auto RunGetSpaced(Converter* converter, typename Converter::out_type* out, auto batch = BatchCounter::FromBatchSizeAndNulls(batch_size, null_count); - const values_count_type values_available = decoder->Remaining(); + const values_count_type values_available = decoder->remaining(); ARROW_DCHECK_GT(values_available, 0); auto values_remaining_run = [&]() { - auto out = values_available - batch.ValuesRead(); + auto out = values_available - batch.values_read(); ARROW_DCHECK_GE(out, 0); return out; }; @@ -877,10 +877,10 @@ auto RunGetSpaced(Converter* converter, typename Converter::out_type* out, // We only need to count the number of nulls and non-nulls because we can fill in the // same value for nulls and non-nulls. // This proves to be a big efficiency win. 
- while (values_remaining_run() > 0 && !batch.IsDone()) { + while (values_remaining_run() > 0 && !batch.is_done()) { ARROW_DCHECK_GE(validity_run->length, 0); ARROW_DCHECK_LT(validity_run->length, max_size_for_v); - ARROW_DCHECK_LE(validity_run->length, batch.TotalRemaining()); + ARROW_DCHECK_LE(validity_run->length, batch.total_remaining()); const auto& validity_run_size = static_cast(validity_run->length); if (validity_run->set) { @@ -891,7 +891,7 @@ auto RunGetSpaced(Converter* converter, typename Converter::out_type* out, } else { // We can consume all nulls here because it does not matter if we consume on this // RLE run, or an a next encoded run. The value filled does not matter. - auto update_size = std::min(validity_run_size, batch.NullRemaining()); + auto update_size = std::min(validity_run_size, batch.null_remaining()); batch.AccrueReadNulls(update_size); validity_run->length -= update_size; } @@ -901,17 +901,17 @@ auto RunGetSpaced(Converter* converter, typename Converter::out_type* out, } } - const value_type value = decoder->Value(); + const value_type value = decoder->value(); if (ARROW_PREDICT_FALSE(!converter->InputIsValid(value))) { return {0, 0}; } - converter->WriteRepeated(out, out + batch.TotalRead(), value); - const auto actual_values_read = decoder->Advance(batch.ValuesRead()); + converter->WriteRepeated(out, out + batch.total_read(), value); + const auto actual_values_read = decoder->Advance(batch.values_read()); // We always cropped the number of values_read by the remaining values in the run. // What's more the RLE decoder should not encounter any errors. 
- ARROW_DCHECK_EQ(actual_values_read, batch.ValuesRead()); + ARROW_DCHECK_EQ(actual_values_read, batch.values_read()); - return {/* .values_read= */ batch.ValuesRead(), /* .null_read= */ batch.NullRead()}; + return {/* .values_read= */ batch.values_read(), /* .null_read= */ batch.null_read()}; } template Remaining(); + const values_count_type values_available = decoder->remaining(); ARROW_DCHECK_GT(values_available, 0); auto run_values_remaining = [&]() { - auto out = values_available - batch.ValuesRead(); + auto out = values_available - batch.values_read(); ARROW_DCHECK_GE(out, 0); return out; }; - while (run_values_remaining() > 0 && batch.ValuesRemaining() > 0) { + while (run_values_remaining() > 0 && batch.values_remaining() > 0) { // TODO should this size be tune depending on sizeof(value_size)? cpu cache size? // Pull a batch of values from the bit packed encoded data and store it in a local // buffer to benefit from unpacking intrinsics and data locality. @@ -951,13 +951,13 @@ auto RunGetSpaced(Converter* converter, typename Converter::out_type* out, }; // buffer_start is 0 at this point so size is end - buffer_end = std::min(std::min(run_values_remaining(), batch.ValuesRemaining()), + buffer_end = std::min(std::min(run_values_remaining(), batch.values_remaining()), kBufferCapacity); buffer_end = decoder->GetBatch(buffer.data(), buffer_size()); ARROW_DCHECK_LE(buffer_size(), kBufferCapacity); if (ARROW_PREDICT_FALSE(!converter->InputIsValid(buffer.data(), buffer_size()))) { - return {batch.ValuesRead(), batch.NullRead()}; + return {batch.values_read(), batch.null_read()}; } // Copy chunks of valid values into the output, while adjusting spacing for null @@ -965,7 +965,7 @@ auto RunGetSpaced(Converter* converter, typename Converter::out_type* out, while (buffer_size() > 0) { ARROW_DCHECK_GE(validity_run->length, 0); ARROW_DCHECK_LT(validity_run->length, max_size_for_v); - ARROW_DCHECK_LE(validity_run->length, batch.TotalRemaining()); + 
ARROW_DCHECK_LE(validity_run->length, batch.total_remaining()); const auto validity_run_length = static_cast(validity_run->length); @@ -980,7 +980,7 @@ auto RunGetSpaced(Converter* converter, typename Converter::out_type* out, validity_run->length -= update_size; // Simply write zeros in the output } else { - const auto update_size = std::min(validity_run_length, batch.NullRemaining()); + const auto update_size = std::min(validity_run_length, batch.null_remaining()); converter->WriteZero(out, out + update_size); batch.AccrueReadNulls(update_size); out += update_size; @@ -995,11 +995,11 @@ auto RunGetSpaced(Converter* converter, typename Converter::out_type* out, ARROW_DCHECK_EQ(buffer_size(), 0); } - ARROW_DCHECK_EQ(values_available - decoder->Remaining(), batch.ValuesRead()); - ARROW_DCHECK_LE(batch.TotalRead(), batch_size); - ARROW_DCHECK_LE(batch.NullRead(), batch.NullCount()); + ARROW_DCHECK_EQ(values_available - decoder->remaining(), batch.values_read()); + ARROW_DCHECK_LE(batch.total_read(), batch_size); + ARROW_DCHECK_LE(batch.null_read(), batch.null_count()); - return {/* .values_read= */ batch.ValuesRead(), /* .null_read= */ batch.NullRead()}; + return {/* .values_read= */ batch.values_read(), /* .null_read= */ batch.null_read()}; } /// Overload for GetSpaced for a single run in a decoder variant @@ -1012,7 +1012,7 @@ auto RunGetSpaced( -> GetSpacedResult { return std::visit( [&](auto& dec) { - ARROW_DCHECK_GT(dec.Remaining(), 0); + ARROW_DCHECK_GT(dec.remaining(), 0); return RunGetSpaced(converter, out, batch_size, null_count, validity_reader, validity_run, &dec); }, @@ -1036,30 +1036,30 @@ auto RleBitPackedDecoder::GetSpaced(Converter converter, auto batch = internal::BatchCounter::FromBatchSizeAndNulls(batch_size, null_count); - if (ARROW_PREDICT_FALSE(batch.IsFullyNull())) { - converter.WriteZero(out, out + batch.NullRemaining()); - return batch.NullRemaining(); + if (ARROW_PREDICT_FALSE(batch.is_fully_null())) { + converter.WriteZero(out, out + 
batch.null_remaining()); + return batch.null_remaining(); } arrow::internal::BitRunReader validity_reader(validity_bits, validity_bits_offset, - /*length=*/batch.TotalRemaining()); + /*length=*/batch.total_remaining()); arrow::internal::BitRun validity_run = validity_reader.NextRun(); const auto check_and_handle_fully_null_remaining = [&]() { - if (batch.IsFullyNull()) { + if (batch.is_fully_null()) { ARROW_DCHECK(validity_run.length == 0 || !validity_run.set); - ARROW_DCHECK_GE(validity_run.length, batch.NullRemaining()); + ARROW_DCHECK_GE(validity_run.length, batch.null_remaining()); - converter.WriteZero(out, out + batch.NullRemaining()); - out += batch.NullRemaining(); - batch.AccrueReadNulls(batch.NullRemaining()); + converter.WriteZero(out, out + batch.null_remaining()); + out += batch.null_remaining(); + batch.AccrueReadNulls(batch.null_remaining()); } }; // Remaining from a previous call that would have left some unread data from a run. - if (ARROW_PREDICT_FALSE(RunRemaining() > 0)) { - const auto read = internal::RunGetSpaced(&converter, out, batch.TotalRemaining(), - batch.NullRemaining(), &validity_reader, + if (ARROW_PREDICT_FALSE(run_remaining() > 0)) { + const auto read = internal::RunGetSpaced(&converter, out, batch.total_remaining(), + batch.null_remaining(), &validity_reader, &validity_run, &decoder_); batch.AccrueReadNulls(read.null_read); @@ -1067,14 +1067,14 @@ auto RleBitPackedDecoder::GetSpaced(Converter converter, out += read.values_read + read.null_read; // Either we fulfilled all the batch values to be read - if (ARROW_PREDICT_FALSE(batch.ValuesRemaining() == 0)) { + if (ARROW_PREDICT_FALSE(batch.values_remaining() == 0)) { // There may be remaining null if they are not greedily filled check_and_handle_fully_null_remaining(); - return batch.TotalRead(); + return batch.total_read(); } // We finished the remaining run - ARROW_DCHECK(RunRemaining() == 0); + ARROW_DCHECK(run_remaining() == 0); } ParseWithCallable([&](auto run) { @@ -1082,8 
+1082,8 @@ auto RleBitPackedDecoder::GetSpaced(Converter converter, RunDecoder decoder(run); - const auto read = internal::RunGetSpaced(&converter, out, batch.TotalRemaining(), - batch.NullRemaining(), &validity_reader, + const auto read = internal::RunGetSpaced(&converter, out, batch.total_remaining(), + batch.null_remaining(), &validity_reader, &validity_run, &decoder); batch.AccrueReadNulls(read.null_read); @@ -1091,7 +1091,7 @@ auto RleBitPackedDecoder::GetSpaced(Converter converter, out += read.values_read + read.null_read; // Stop reading and store remaining decoder - if (ARROW_PREDICT_FALSE(read.values_read == 0 || batch.ValuesRemaining() == 0)) { + if (ARROW_PREDICT_FALSE(read.values_read == 0 || batch.values_remaining() == 0)) { decoder_ = std::move(decoder); return ControlFlow::Break; } @@ -1102,8 +1102,8 @@ auto RleBitPackedDecoder::GetSpaced(Converter converter, // There may be remaining null if they are not greedily filled by either decoder calls check_and_handle_fully_null_remaining(); - ARROW_DCHECK(batch.IsDone() || Exhausted()); - return batch.TotalRead(); + ARROW_DCHECK(batch.is_done() || exhausted()); + return batch.total_read(); } namespace internal { @@ -1244,7 +1244,7 @@ auto RleBitPackedDecoder::GetBatchWithDict(const V* dictionary, return batch_size - values_read; }; - if (ARROW_PREDICT_FALSE(RunRemaining() > 0)) { + if (ARROW_PREDICT_FALSE(run_remaining() > 0)) { const auto read = internal::RunGetSpaced(&converter, out, batch_size, /* null_count= */ 0, &validity_reader, &validity_run, &decoder_); @@ -1260,7 +1260,7 @@ auto RleBitPackedDecoder::GetBatchWithDict(const V* dictionary, } // We finished the remaining run - ARROW_DCHECK(RunRemaining() == 0); + ARROW_DCHECK(run_remaining() == 0); } ParseWithCallable([&](auto run) { diff --git a/cpp/src/arrow/util/rle_encoding_test.cc b/cpp/src/arrow/util/rle_encoding_test.cc index f16583525a49..b4455cf28991 100644 --- a/cpp/src/arrow/util/rle_encoding_test.cc +++ 
b/cpp/src/arrow/util/rle_encoding_test.cc @@ -218,36 +218,36 @@ TEST(Rle, RleRun) { // 12 times the value 21 fitting over 5 bits const auto run_5 = RleRun(value.data(), value_count, /* value_bit_width= */ 5); - EXPECT_EQ(run_5.ValuesCount(), value_count); - EXPECT_EQ(run_5.ValuesBitWidth(), 5); - EXPECT_EQ(run_5.RawDataSize(), 1); // 5 bits fit in one byte - EXPECT_EQ(*run_5.RawDataPtr(), 21); + EXPECT_EQ(run_5.values_count(), value_count); + EXPECT_EQ(run_5.values_bit_width(), 5); + EXPECT_EQ(run_5.raw_data_size(), 1); // 5 bits fit in one byte + EXPECT_EQ(*run_5.raw_data_ptr(), 21); // 12 times the value 21 fitting over 8 bits const auto run_8 = RleRun(value.data(), value_count, /* value_bit_width= */ 8); - EXPECT_EQ(run_8.ValuesCount(), value_count); - EXPECT_EQ(run_8.ValuesBitWidth(), 8); - EXPECT_EQ(run_8.RawDataSize(), 1); // 8 bits fit in 1 byte - EXPECT_EQ(*run_8.RawDataPtr(), 21); + EXPECT_EQ(run_8.values_count(), value_count); + EXPECT_EQ(run_8.values_bit_width(), 8); + EXPECT_EQ(run_8.raw_data_size(), 1); // 8 bits fit in 1 byte + EXPECT_EQ(*run_8.raw_data_ptr(), 21); // 12 times the value 533 (21 + 2 * 2^8) fitting over 10 bits const auto run_10 = RleRun(value.data(), value_count, /* value_bit_width= */ 10); - EXPECT_EQ(run_10.ValuesCount(), value_count); - EXPECT_EQ(run_10.ValuesBitWidth(), 10); - EXPECT_EQ(run_10.RawDataSize(), 2); // 10 bits fit in 2 bytes - EXPECT_EQ(*(run_10.RawDataPtr() + 0), 21); - EXPECT_EQ(*(run_10.RawDataPtr() + 1), 2); + EXPECT_EQ(run_10.values_count(), value_count); + EXPECT_EQ(run_10.values_bit_width(), 10); + EXPECT_EQ(run_10.raw_data_size(), 2); // 10 bits fit in 2 bytes + EXPECT_EQ(*(run_10.raw_data_ptr() + 0), 21); + EXPECT_EQ(*(run_10.raw_data_ptr() + 1), 2); // 12 times the value 533 (21 + 2 * 2^8) fitting over 32 bits const auto run_32 = RleRun(value.data(), value_count, /* value_bit_width= */ 32); - EXPECT_EQ(run_32.ValuesCount(), value_count); - EXPECT_EQ(run_32.ValuesBitWidth(), 32); - 
EXPECT_EQ(run_32.RawDataSize(), 4); // 32 bits fit in 4 bytes - EXPECT_EQ(*(run_32.RawDataPtr() + 0), 21); - EXPECT_EQ(*(run_32.RawDataPtr() + 1), 2); - EXPECT_EQ(*(run_32.RawDataPtr() + 2), 0); - EXPECT_EQ(*(run_32.RawDataPtr() + 3), 0); + EXPECT_EQ(run_32.values_count(), value_count); + EXPECT_EQ(run_32.values_bit_width(), 32); + EXPECT_EQ(run_32.raw_data_size(), 4); // 32 bits fit in 4 bytes + EXPECT_EQ(*(run_32.raw_data_ptr() + 0), 21); + EXPECT_EQ(*(run_32.raw_data_ptr() + 1), 2); + EXPECT_EQ(*(run_32.raw_data_ptr() + 2), 0); + EXPECT_EQ(*(run_32.raw_data_ptr() + 3), 0); } /// A BitPacked run is a simple class owning some data and its size. @@ -258,18 +258,18 @@ TEST(BitPacked, BitPackedRun) { // 16 values of 1 bit for a total of 16 bits BitPackedRun::values_count_type value_count_1 = 16; const auto run_1 = BitPackedRun(value.data(), value_count_1, /* value_bit_width= */ 1); - EXPECT_EQ(run_1.ValuesCount(), value_count_1); - EXPECT_EQ(run_1.ValuesBitWidth(), 1); - EXPECT_EQ(run_1.RawDataSize(), 2); // 16 bits fit in 2 bytes - EXPECT_EQ(run_1.RawDataPtr(), value.data()); + EXPECT_EQ(run_1.values_count(), value_count_1); + EXPECT_EQ(run_1.values_bit_width(), 1); + EXPECT_EQ(run_1.raw_data_size(), 2); // 16 bits fit in 2 bytes + EXPECT_EQ(run_1.raw_data_ptr(), value.data()); // 8 values of 3 bits for a total of 24 bits BitPackedRun::values_count_type value_count_3 = 8; const auto run_3 = BitPackedRun(value.data(), value_count_3, /* value_bit_width= */ 3); - EXPECT_EQ(run_3.ValuesCount(), value_count_3); - EXPECT_EQ(run_3.ValuesBitWidth(), 3); - EXPECT_EQ(run_3.RawDataSize(), 3); // 24 bits fit in 3 bytes - EXPECT_EQ(run_3.RawDataPtr(), value.data()); + EXPECT_EQ(run_3.values_count(), value_count_3); + EXPECT_EQ(run_3.values_bit_width(), 3); + EXPECT_EQ(run_3.raw_data_size(), 3); // 24 bits fit in 3 bytes + EXPECT_EQ(run_3.raw_data_ptr(), value.data()); } template @@ -283,28 +283,28 @@ void TestRleDecoder(std::vector bytes, RleRun::values_count_type value_ auto 
decoder = RleRunDecoder(run); std::vector vals = {0, 0}; - EXPECT_EQ(decoder.Remaining(), value_count); + EXPECT_EQ(decoder.remaining(), value_count); typename decltype(decoder)::values_count_type read = 0; EXPECT_EQ(decoder.Get(vals.data()), 1); read += 1; EXPECT_EQ(vals.at(0), expected_value); - EXPECT_EQ(decoder.Remaining(), value_count - read); + EXPECT_EQ(decoder.remaining(), value_count - read); EXPECT_EQ(decoder.Advance(3), 3); read += 3; - EXPECT_EQ(decoder.Remaining(), value_count - read); + EXPECT_EQ(decoder.remaining(), value_count - read); vals = {0, 0}; EXPECT_EQ(decoder.GetBatch(vals.data(), 2), vals.size()); EXPECT_EQ(vals.at(0), expected_value); EXPECT_EQ(vals.at(1), expected_value); read += static_cast(vals.size()); - EXPECT_EQ(decoder.Remaining(), value_count - read); + EXPECT_EQ(decoder.remaining(), value_count - read); // Exhaust iteration EXPECT_EQ(decoder.Advance(value_count - read), value_count - read); - EXPECT_EQ(decoder.Remaining(), 0); + EXPECT_EQ(decoder.remaining(), 0); EXPECT_EQ(decoder.Advance(1), 0); vals = {0, 0}; EXPECT_EQ(decoder.Get(vals.data()), 0); @@ -312,7 +312,7 @@ void TestRleDecoder(std::vector bytes, RleRun::values_count_type value_ // Reset the decoder decoder.Reset(run); - EXPECT_EQ(decoder.Remaining(), value_count); + EXPECT_EQ(decoder.remaining(), value_count); vals = {0, 0}; EXPECT_EQ(decoder.GetBatch(vals.data(), 2), vals.size()); EXPECT_EQ(vals.at(0), expected_value); @@ -345,28 +345,28 @@ void TestBitPackedDecoder(std::vector bytes, auto decoder = BitPackedRunDecoder(run); std::vector vals = {0, 0}; - EXPECT_EQ(decoder.Remaining(), value_count); + EXPECT_EQ(decoder.remaining(), value_count); typename decltype(decoder)::values_count_type read = 0; EXPECT_EQ(decoder.Get(vals.data()), 1); EXPECT_EQ(vals.at(0), expected.at(0 + read)); read += 1; - EXPECT_EQ(decoder.Remaining(), value_count - read); + EXPECT_EQ(decoder.remaining(), value_count - read); EXPECT_EQ(decoder.Advance(3), 3); read += 3; - 
EXPECT_EQ(decoder.Remaining(), value_count - read); + EXPECT_EQ(decoder.remaining(), value_count - read); vals = {0, 0}; EXPECT_EQ(decoder.GetBatch(vals.data(), 2), vals.size()); EXPECT_EQ(vals.at(0), expected.at(0 + read)); EXPECT_EQ(vals.at(1), expected.at(1 + read)); read += static_cast(vals.size()); - EXPECT_EQ(decoder.Remaining(), value_count - read); + EXPECT_EQ(decoder.remaining(), value_count - read); // Exhaust iteration EXPECT_EQ(decoder.Advance(value_count - read), value_count - read); - EXPECT_EQ(decoder.Remaining(), 0); + EXPECT_EQ(decoder.remaining(), 0); EXPECT_EQ(decoder.Advance(1), 0); vals = {0, 0}; EXPECT_EQ(decoder.Get(vals.data()), 0); @@ -375,7 +375,7 @@ void TestBitPackedDecoder(std::vector bytes, // Reset the decoder decoder.Reset(run); read = 0; - EXPECT_EQ(decoder.Remaining(), value_count); + EXPECT_EQ(decoder.remaining(), value_count); vals = {0, 0}; EXPECT_EQ(decoder.GetBatch(vals.data(), 2), vals.size()); EXPECT_EQ(vals.at(0), expected.at(0 + read)); @@ -413,7 +413,7 @@ void TestRleBitPackedParser(std::vector bytes, auto parser = RleBitPackedParser( bytes.data(), static_cast(bytes.size()), bit_width); - EXPECT_FALSE(parser.Exhausted()); + EXPECT_FALSE(parser.exhausted()); // Try to decode all data of all runs in the decoded vector decltype(expected) decoded = {}; @@ -429,11 +429,11 @@ void TestRleBitPackedParser(std::vector bytes, rle_decoder_ptr_->Reset(run); const auto n_decoded = decoded_ptr_->size(); - const auto n_to_decode = rle_decoder_ptr_->Remaining(); + const auto n_to_decode = rle_decoder_ptr_->remaining(); decoded_ptr_->resize(n_decoded + n_to_decode); EXPECT_EQ(rle_decoder_ptr_->GetBatch(decoded_ptr_->data() + n_decoded, n_to_decode), n_to_decode); - EXPECT_EQ(rle_decoder_ptr_->Remaining(), 0); + EXPECT_EQ(rle_decoder_ptr_->remaining(), 0); return RleBitPackedParser::ControlFlow::Continue; } @@ -442,12 +442,12 @@ void TestRleBitPackedParser(std::vector bytes, bit_packed_decoder_ptr_->Reset(run); const auto n_decoded = 
decoded_ptr_->size(); - const auto n_to_decode = bit_packed_decoder_ptr_->Remaining(); + const auto n_to_decode = bit_packed_decoder_ptr_->remaining(); decoded_ptr_->resize(n_decoded + n_to_decode); EXPECT_EQ(bit_packed_decoder_ptr_->GetBatch(decoded_ptr_->data() + n_decoded, n_to_decode), n_to_decode); - EXPECT_EQ(bit_packed_decoder_ptr_->Remaining(), 0); + EXPECT_EQ(bit_packed_decoder_ptr_->remaining(), 0); return RleBitPackedParser::ControlFlow::Continue; } @@ -456,7 +456,7 @@ void TestRleBitPackedParser(std::vector bytes, // Iterate over all runs parser.Parse(handler); - EXPECT_TRUE(parser.Exhausted()); + EXPECT_TRUE(parser.exhausted()); EXPECT_EQ(decoded.size(), expected.size()); EXPECT_EQ(decoded, expected); } From 2221741f34cdce4ec19d28f10fc8808165cb2559 Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Wed, 17 Sep 2025 14:28:01 +0200 Subject: [PATCH 51/56] Rule of zero --- cpp/src/arrow/util/rle_encoding_internal.h | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/cpp/src/arrow/util/rle_encoding_internal.h b/cpp/src/arrow/util/rle_encoding_internal.h index bc4533dca36f..1e84f80188af 100644 --- a/cpp/src/arrow/util/rle_encoding_internal.h +++ b/cpp/src/arrow/util/rle_encoding_internal.h @@ -102,8 +102,6 @@ class RleRun { using DecoderType = RleRunDecoder; constexpr RleRun() noexcept = default; - constexpr RleRun(const RleRun&) noexcept = default; - constexpr RleRun(RleRun&&) noexcept = default; explicit RleRun(raw_data_const_pointer data, values_count_type values_count, bit_size_type value_bit_width) noexcept @@ -113,9 +111,6 @@ class RleRun { std::copy(data, data + raw_data_size(), data_.begin()); } - constexpr RleRun& operator=(const RleRun&) noexcept = default; - constexpr RleRun& operator=(RleRun&&) noexcept = default; - /// The number of repeated values in this run. 
constexpr values_count_type values_count() const noexcept { return values_count_; } @@ -160,8 +155,6 @@ class BitPackedRun { using DecoderType = BitPackedRunDecoder; constexpr BitPackedRun() noexcept = default; - constexpr BitPackedRun(const BitPackedRun&) noexcept = default; - constexpr BitPackedRun(BitPackedRun&&) noexcept = default; constexpr BitPackedRun(raw_data_const_pointer data, values_count_type values_count, bit_size_type value_bit_width) noexcept @@ -170,9 +163,6 @@ class BitPackedRun { ARROW_CHECK_GE(values_count_, 0); } - constexpr BitPackedRun& operator=(const BitPackedRun&) noexcept = default; - constexpr BitPackedRun& operator=(BitPackedRun&&) noexcept = default; - constexpr values_count_type values_count() const noexcept { return values_count_; } /// The size in bits of each encoded value. From d520ebdb13430a9ec8316301b0d3888dee9265e7 Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Wed, 17 Sep 2025 15:05:23 +0200 Subject: [PATCH 52/56] Simplify type aliases --- cpp/src/arrow/util/rle_encoding_internal.h | 288 +++++++++------------ cpp/src/arrow/util/rle_encoding_test.cc | 32 +-- 2 files changed, 135 insertions(+), 185 deletions(-) diff --git a/cpp/src/arrow/util/rle_encoding_internal.h b/cpp/src/arrow/util/rle_encoding_internal.h index 1e84f80188af..f657466147fb 100644 --- a/cpp/src/arrow/util/rle_encoding_internal.h +++ b/cpp/src/arrow/util/rle_encoding_internal.h @@ -81,30 +81,27 @@ namespace arrow::util { /// 200 ints = 25 groups of 8 /// <25 bytes of values, bitpacked> /// (total 26 bytes, 1 byte overhead) -// + +/// The type for an encoded Rle of BitPacked run size, between 1 and 2^31-1 as per Parquet +/// spec. +/// This is also pragmatically used for other integer used in the Rle and BitPacked runs +/// and decoder to avoid conversions. +/// It can therefore be referred to as a "typical" size for Rle and BitPacked logic. 
+using rle_size_t = int32_t; template class RleRunDecoder; class RleRun { public: - /// Enough space to store a 64bit value - using raw_data_storage = std::array; - using raw_data_const_pointer = const uint8_t*; - using raw_data_size_type = int32_t; - /// The type of the size of either run, between 1 and 2^31-1 as per Parquet spec - using values_count_type = int32_t; - /// The type to represent a size in bits - using bit_size_type = int32_t; - /// The decoder class used to decode a single run in the given type. template using DecoderType = RleRunDecoder; constexpr RleRun() noexcept = default; - explicit RleRun(raw_data_const_pointer data, values_count_type values_count, - bit_size_type value_bit_width) noexcept + explicit RleRun(const uint8_t* data, rle_size_t values_count, + rle_size_t value_bit_width) noexcept : values_count_(values_count), value_bit_width_(value_bit_width) { ARROW_DCHECK_GE(value_bit_width, 0); ARROW_DCHECK_GE(values_count, 0); @@ -112,28 +109,29 @@ class RleRun { } /// The number of repeated values in this run. - constexpr values_count_type values_count() const noexcept { return values_count_; } + constexpr rle_size_t values_count() const noexcept { return values_count_; } /// The size in bits of each encoded value. - constexpr bit_size_type values_bit_width() const noexcept { return value_bit_width_; } + constexpr rle_size_t values_bit_width() const noexcept { return value_bit_width_; } /// A pointer to the repeated value raw bytes. - constexpr raw_data_const_pointer raw_data_ptr() const noexcept { return data_.data(); } + constexpr const uint8_t* raw_data_ptr() const noexcept { return data_.data(); } /// The number of bytes used for the raw repeated value. 
- constexpr raw_data_size_type raw_data_size() const noexcept { + constexpr rle_size_t raw_data_size() const noexcept { auto out = bit_util::BytesForBits(value_bit_width_); - ARROW_DCHECK_LE(out, std::numeric_limits::max()); - return static_cast(out); + ARROW_DCHECK_LE(out, std::numeric_limits::max()); + return static_cast(out); } private: - /// The repeated value raw bytes stored inside the class - raw_data_storage data_ = {}; - /// The number of time the value is repeated - values_count_type values_count_ = 0; - /// The size in bit of a packed value in the run - bit_size_type value_bit_width_ = 0; + /// The repeated value raw bytes stored inside the class with enough space to store + /// up to a 64 bit value. + std::array data_ = {}; + /// The number of time the value is repeated. + rle_size_t values_count_ = 0; + /// The size in bit of a packed value in the run. + rle_size_t value_bit_width_ = 0; }; template @@ -141,70 +139,56 @@ class BitPackedRunDecoder; class BitPackedRun { public: - using raw_data_const_pointer = const uint8_t*; - /// According to the Parquet thrift definition the page size can be written into an - /// int32_t. - using raw_data_size_type = int32_t; - /// The type of the size of either run, between 1 and 2^31-1 as per Parquet spec - using values_count_type = int32_t; - /// The type to represent a size in bits - using bit_size_type = int32_t; - /// The decoder class used to decode a single run in the given type. 
template using DecoderType = BitPackedRunDecoder; constexpr BitPackedRun() noexcept = default; - constexpr BitPackedRun(raw_data_const_pointer data, values_count_type values_count, - bit_size_type value_bit_width) noexcept + constexpr BitPackedRun(const uint8_t* data, rle_size_t values_count, + rle_size_t value_bit_width) noexcept : data_(data), values_count_(values_count), value_bit_width_(value_bit_width) { ARROW_CHECK_GE(value_bit_width_, 0); ARROW_CHECK_GE(values_count_, 0); } - constexpr values_count_type values_count() const noexcept { return values_count_; } + constexpr rle_size_t values_count() const noexcept { return values_count_; } /// The size in bits of each encoded value. - constexpr bit_size_type values_bit_width() const noexcept { return value_bit_width_; } + constexpr rle_size_t values_bit_width() const noexcept { return value_bit_width_; } - constexpr raw_data_const_pointer raw_data_ptr() const noexcept { return data_; } + constexpr const uint8_t* raw_data_ptr() const noexcept { return data_; } - constexpr raw_data_size_type raw_data_size() const noexcept { + constexpr rle_size_t raw_data_size() const noexcept { auto out = bit_util::BytesForBits(static_cast(value_bit_width_) * static_cast(values_count_)); - ARROW_CHECK_LE(out, std::numeric_limits::max()); - return static_cast(out); + ARROW_CHECK_LE(out, std::numeric_limits::max()); + return static_cast(out); } private: /// The pointer to the beginning of the run - raw_data_const_pointer data_ = nullptr; + const uint8_t* data_ = nullptr; /// Number of values in this run. - raw_data_size_type values_count_ = 0; + rle_size_t values_count_ = 0; /// The size in bit of a packed value in the run - bit_size_type value_bit_width_ = 0; + rle_size_t value_bit_width_ = 0; }; /// A parser that emits either a ``BitPackedRun`` or a ``RleRun``. class RleBitPackedParser { public: - using raw_data_const_pointer = const uint8_t*; - /// By Parquet thrift definition the page size can be written into an int32_t. 
- using raw_data_size_type = int32_t; - /// The type to represent a size in bits - using bit_size_type = int32_t; /// The different types of runs emitted by the parser using dynamic_run_type = std::variant; constexpr RleBitPackedParser() noexcept = default; - constexpr RleBitPackedParser(raw_data_const_pointer data, raw_data_size_type data_size, - bit_size_type value_bit_width) noexcept + constexpr RleBitPackedParser(const uint8_t* data, rle_size_t data_size, + rle_size_t value_bit_width) noexcept : data_(data), data_size_(data_size), value_bit_width_(value_bit_width) {} - constexpr void Reset(raw_data_const_pointer data, raw_data_size_type data_size, - bit_size_type value_bit_width) noexcept { + constexpr void Reset(const uint8_t* data, rle_size_t data_size, + rle_size_t value_bit_width) noexcept { *this = {data, data_size, value_bit_width}; } @@ -242,16 +226,16 @@ class RleBitPackedParser { private: /// The pointer to the beginning of the run - raw_data_const_pointer data_ = nullptr; + const uint8_t* data_ = nullptr; /// Size in bytes of the run. - raw_data_size_type data_size_ = 0; + rle_size_t data_size_ = 0; /// The size in bit of a packed value in the run - bit_size_type value_bit_width_ = 0; + rle_size_t value_bit_width_ = 0; /// Run the handler on the run read and return the number of values read. /// Does not advance the parser. template - std::pair PeekImpl(Handler&&) const; + std::pair PeekImpl(Handler&&) const; }; /// Decoder class for a single run of RLE encoded data. @@ -261,14 +245,13 @@ class RleRunDecoder { /// The type in which the data should be decoded. using value_type = T; /// The type of run that can be decoded. 
- using run_type = RleRun; - using values_count_type = run_type::values_count_type; + using RunType = RleRun; constexpr RleRunDecoder() noexcept = default; - explicit RleRunDecoder(const run_type& run) noexcept { Reset(run); } + explicit RleRunDecoder(const RunType& run) noexcept { Reset(run); } - void Reset(const run_type& run) noexcept { + void Reset(const RunType& run) noexcept { remaining_count_ = run.values_count(); if constexpr (std::is_same_v) { // ARROW-18031: just check the LSB of the next byte and move on. @@ -284,7 +267,7 @@ class RleRunDecoder { } /// Return the number of values that can be advanced. - values_count_type remaining() const { return remaining_count_; } + rle_size_t remaining() const { return remaining_count_; } /// Return the repeated value of this decoder. constexpr value_type value() const { return value_; } @@ -292,7 +275,7 @@ class RleRunDecoder { /// Try to advance by as many values as provided. /// Return the number of values skipped. /// May advance by less than asked for. - [[nodiscard]] values_count_type Advance(values_count_type batch_size) { + [[nodiscard]] rle_size_t Advance(rle_size_t batch_size) { const auto steps = std::min(batch_size, remaining_count_); remaining_count_ -= steps; return steps; @@ -305,8 +288,7 @@ class RleRunDecoder { /// Get a batch of values return the number of decoded elements. /// May write fewer elements to the output than requested. - [[nodiscard]] values_count_type GetBatch(value_type* out, - values_count_type batch_size) { + [[nodiscard]] rle_size_t GetBatch(value_type* out, rle_size_t batch_size) { if (ARROW_PREDICT_FALSE(remaining_count_ == 0)) { return 0; } @@ -319,7 +301,7 @@ class RleRunDecoder { private: value_type value_ = {}; - values_count_type remaining_count_ = 0; + rle_size_t remaining_count_ = 0; static_assert(std::is_integral_v, "This class is meant to decode positive integers"); @@ -332,15 +314,13 @@ class BitPackedRunDecoder { /// The type in which the data should be decoded. 
using value_type = T; /// The type of run that can be decoded. - using run_type = BitPackedRun; - using values_count_type = run_type::values_count_type; - using bit_size_type = run_type::bit_size_type; + using RunType = BitPackedRun; BitPackedRunDecoder() noexcept = default; - explicit BitPackedRunDecoder(const run_type& run) noexcept { Reset(run); } + explicit BitPackedRunDecoder(const RunType& run) noexcept { Reset(run); } - void Reset(const run_type& run) noexcept { + void Reset(const RunType& run) noexcept { value_bit_width_ = run.values_bit_width(); remaining_count_ = run.values_count(); ARROW_DCHECK_GE(value_bit_width_, 0); @@ -349,14 +329,14 @@ class BitPackedRunDecoder { } /// Return the number of values that can be advanced. - constexpr values_count_type remaining() const { return remaining_count_; } + constexpr rle_size_t remaining() const { return remaining_count_; } /// Return the size in bit in which each encoded value is written. - constexpr bit_size_type value_bit_width() const { return value_bit_width_; } + constexpr rle_size_t value_bit_width() const { return value_bit_width_; } /// Try to advance by as many values as provided. /// Return the number of values skipped or 0 if it fail to advance. - [[nodiscard]] values_count_type Advance(values_count_type batch_size) { + [[nodiscard]] rle_size_t Advance(rle_size_t batch_size) { const auto steps = std::min(batch_size, remaining_count_); if (bit_reader_.Advance(steps * value_bit_width_)) { remaining_count_ -= steps; @@ -369,8 +349,7 @@ class BitPackedRunDecoder { [[nodiscard]] bool Get(value_type* out_value) { return GetBatch(out_value, 1) == 1; } /// Get a batch of values return the number of decoded elements. 
- [[nodiscard]] values_count_type GetBatch(value_type* out, - values_count_type batch_size) { + [[nodiscard]] rle_size_t GetBatch(value_type* out, rle_size_t batch_size) { if (ARROW_PREDICT_FALSE(remaining_count_ == 0)) { return 0; } @@ -385,8 +364,8 @@ class BitPackedRunDecoder { private: ::arrow::bit_util::BitReader bit_reader_ = {}; - bit_size_type value_bit_width_ = 0; - values_count_type remaining_count_ = 0; + rle_size_t value_bit_width_ = 0; + rle_size_t remaining_count_ = 0; static_assert(std::is_integral_v, "This class is meant to decode positive integers"); @@ -398,12 +377,7 @@ class RleBitPackedDecoder { public: /// The type in which the data should be decoded. using value_type = T; - using raw_data_const_pointer = RleBitPackedParser::raw_data_const_pointer; - using raw_data_size_type = RleBitPackedParser::raw_data_size_type; - using bit_size_type = RleBitPackedParser::bit_size_type; - using dynamic_run_type = RleBitPackedParser::dynamic_run_type; - /// The type of the size of either run, between 1 and 2^31-1 as per Parquet spec - using values_count_type = int32_t; + using DynamicRun = RleBitPackedParser::dynamic_run_type; RleBitPackedDecoder() noexcept = default; @@ -411,13 +385,13 @@ class RleBitPackedDecoder { /// /// data and data_size are the raw bytes to decode. /// value_bit_width is the size in bits of each encoded value. 
- RleBitPackedDecoder(raw_data_const_pointer data, raw_data_size_type data_size, - bit_size_type value_bit_width) noexcept { + RleBitPackedDecoder(const uint8_t* data, rle_size_t data_size, + rle_size_t value_bit_width) noexcept { Reset(data, data_size, value_bit_width); } - void Reset(raw_data_const_pointer data, raw_data_size_type data_size, - bit_size_type value_bit_width) noexcept { + void Reset(const uint8_t* data, rle_size_t data_size, + rle_size_t value_bit_width) noexcept { ARROW_DCHECK_GE(value_bit_width, 0); ARROW_DCHECK_LE(value_bit_width, 64); parser_.Reset(data, data_size, value_bit_width); @@ -441,43 +415,39 @@ class RleBitPackedDecoder { [[nodiscard]] bool Get(value_type* val); /// Get a batch of values return the number of decoded elements. - [[nodiscard]] values_count_type GetBatch(value_type* out, values_count_type batch_size); + [[nodiscard]] rle_size_t GetBatch(value_type* out, rle_size_t batch_size); /// Like GetBatch but add spacing for null entries - [[nodiscard]] values_count_type GetBatchSpaced(values_count_type batch_size, - values_count_type null_count, - const uint8_t* valid_bits, - int64_t valid_bits_offset, - value_type* out); + [[nodiscard]] rle_size_t GetBatchSpaced(rle_size_t batch_size, rle_size_t null_count, + const uint8_t* valid_bits, + int64_t valid_bits_offset, value_type* out); /// Like GetBatch but the values are then decoded using the provided dictionary template - [[nodiscard]] values_count_type GetBatchWithDict(const V* dictionary, - int32_t dictionary_length, V* out, - values_count_type batch_size); + [[nodiscard]] rle_size_t GetBatchWithDict(const V* dictionary, + int32_t dictionary_length, V* out, + rle_size_t batch_size); /// Like GetBatchWithDict but add spacing for null entries /// /// Null entries will be zero-initialized in `values` to avoid leaking /// private data. 
template - [[nodiscard]] values_count_type GetBatchWithDictSpaced( - const V* dictionary, int32_t dictionary_length, V* out, - values_count_type batch_size, values_count_type null_count, - const uint8_t* valid_bits, int64_t valid_bits_offset); + [[nodiscard]] rle_size_t GetBatchWithDictSpaced( + const V* dictionary, int32_t dictionary_length, V* out, rle_size_t batch_size, + rle_size_t null_count, const uint8_t* valid_bits, int64_t valid_bits_offset); private: RleBitPackedParser parser_ = {}; std::variant, BitPackedRunDecoder> decoder_ = {}; /// Return the number of values that are remaining in the current run. - values_count_type run_remaining() const { + rle_size_t run_remaining() const { return std::visit([](const auto& dec) { return dec.remaining(); }, decoder_); } /// Get a batch of values from the current run and return the number elements read. - [[nodiscard]] values_count_type RunGetBatch(value_type* out, - values_count_type batch_size) { + [[nodiscard]] rle_size_t RunGetBatch(value_type* out, rle_size_t batch_size) { return std::visit([&](auto& dec) { return dec.GetBatch(out, batch_size); }, decoder_); } @@ -487,12 +457,10 @@ class RleBitPackedDecoder { /// Utility methods for retrieving spaced values. template - [[nodiscard]] values_count_type GetSpaced(Converter converter, - typename Converter::out_type* out, - values_count_type batch_size, - const uint8_t* valid_bits, - int64_t valid_bits_offset, - values_count_type null_count); + [[nodiscard]] rle_size_t GetSpaced(Converter converter, + typename Converter::out_type* out, + rle_size_t batch_size, const uint8_t* valid_bits, + int64_t valid_bits_offset, rle_size_t null_count); }; /// Class to incrementally build the rle data. This class does not allocate any memory. 
@@ -656,7 +624,7 @@ constexpr auto max_size_for_v = template auto RleBitPackedParser::PeekImpl(Handler&& handler) const - -> std::pair { + -> std::pair { ARROW_DCHECK(!exhausted()); constexpr auto kMaxSize = bit_util::kMaxLEB128ByteLenFor; @@ -671,19 +639,17 @@ auto RleBitPackedParser::PeekImpl(Handler&& handler) const const bool is_bit_packed = run_len_type & 1; const uint32_t count = run_len_type >> 1; if (is_bit_packed) { - using values_count_type = BitPackedRun::values_count_type; - constexpr auto kMaxCount = - bit_util::CeilDiv(internal::max_size_for_v, 8); + constexpr auto kMaxCount = bit_util::CeilDiv(internal::max_size_for_v, 8); if (ARROW_PREDICT_FALSE(count == 0 || count > kMaxCount)) { // Illegal number of encoded values return {0, ControlFlow::Break}; } - const auto values_count = static_cast(count * 8); - ARROW_DCHECK_LT(count, internal::max_size_for_v); + const auto values_count = static_cast(count * 8); + ARROW_DCHECK_LT(count, internal::max_size_for_v); // Count Already divided by 8 const auto bytes_read = - header_bytes + static_cast(count) * value_bit_width_; + header_bytes + static_cast(count) * value_bit_width_; auto control = handler.OnBitPackedRun( BitPackedRun(data_ + header_bytes, values_count, value_bit_width_)); @@ -691,18 +657,17 @@ auto RleBitPackedParser::PeekImpl(Handler&& handler) const return {bytes_read, control}; } - using values_count_type = RleRun::values_count_type; if (ARROW_PREDICT_FALSE( count == 0 || - count > static_cast(std::numeric_limits::max()))) { + count > static_cast(std::numeric_limits::max()))) { // Illegal number of encoded values return {0, ControlFlow::Break}; } - const auto values_count = static_cast(count); + const auto values_count = static_cast(count); const auto value_bytes = bit_util::BytesForBits(value_bit_width_); - ARROW_DCHECK_LT(value_bytes, internal::max_size_for_v); - const auto bytes_read = header_bytes + static_cast(value_bytes); + ARROW_DCHECK_LT(value_bytes, internal::max_size_for_v); + const 
auto bytes_read = header_bytes + static_cast(value_bytes); auto control = handler.OnRleRun(RleRun(data_ + header_bytes, values_count, value_bit_width_)); @@ -732,11 +697,11 @@ bool RleBitPackedDecoder::Get(value_type* val) { } template -auto RleBitPackedDecoder::GetBatch(value_type* out, values_count_type batch_size) - -> values_count_type { +auto RleBitPackedDecoder::GetBatch(value_type* out, rle_size_t batch_size) + -> rle_size_t { using ControlFlow = RleBitPackedParser::ControlFlow; - values_count_type values_read = 0; + rle_size_t values_read = 0; // Remaining from a previous call that would have left some unread data from a run. if (ARROW_PREDICT_FALSE(run_remaining() > 0)) { @@ -779,7 +744,7 @@ namespace internal { /// verbosity. class BatchCounter { public: - using size_type = int32_t; + using size_type = rle_size_t; static constexpr BatchCounter FromBatchSizeAndNulls(size_type batch_size, size_type null_count) { @@ -842,20 +807,18 @@ struct GetSpacedResult { }; /// Overload for GetSpaced for a single run in a RleDecoder -template +template auto RunGetSpaced(Converter* converter, typename Converter::out_type* out, - values_count_type batch_size, values_count_type null_count, + rle_size_t batch_size, rle_size_t null_count, BitRunReader* validity_reader, BitRun* validity_run, - RleRunDecoder* decoder) - -> GetSpacedResult { + RleRunDecoder* decoder) -> GetSpacedResult { ARROW_DCHECK_GT(batch_size, 0); // The equality case is handled in the main loop in GetSpaced ARROW_DCHECK_LT(null_count, batch_size); auto batch = BatchCounter::FromBatchSizeAndNulls(batch_size, null_count); - const values_count_type values_available = decoder->remaining(); + const rle_size_t values_available = decoder->remaining(); ARROW_DCHECK_GT(values_available, 0); auto values_remaining_run = [&]() { auto out = values_available - batch.values_read(); @@ -869,9 +832,9 @@ auto RunGetSpaced(Converter* converter, typename Converter::out_type* out, // This proves to be a big efficiency win. 
while (values_remaining_run() > 0 && !batch.is_done()) { ARROW_DCHECK_GE(validity_run->length, 0); - ARROW_DCHECK_LT(validity_run->length, max_size_for_v); + ARROW_DCHECK_LT(validity_run->length, max_size_for_v); ARROW_DCHECK_LE(validity_run->length, batch.total_remaining()); - const auto& validity_run_size = static_cast(validity_run->length); + const auto& validity_run_size = static_cast(validity_run->length); if (validity_run->set) { // We may end the current RLE run in the middle of the validity run @@ -904,20 +867,19 @@ auto RunGetSpaced(Converter* converter, typename Converter::out_type* out, return {/* .values_read= */ batch.values_read(), /* .null_read= */ batch.null_read()}; } -template +template auto RunGetSpaced(Converter* converter, typename Converter::out_type* out, - values_count_type batch_size, values_count_type null_count, + rle_size_t batch_size, rle_size_t null_count, BitRunReader* validity_reader, BitRun* validity_run, BitPackedRunDecoder* decoder) - -> GetSpacedResult { + -> GetSpacedResult { ARROW_DCHECK_GT(batch_size, 0); // The equality case is handled in the main loop in GetSpaced ARROW_DCHECK_LT(null_count, batch_size); auto batch = BatchCounter::FromBatchSizeAndNulls(batch_size, null_count); - const values_count_type values_available = decoder->remaining(); + const rle_size_t values_available = decoder->remaining(); ARROW_DCHECK_GT(values_available, 0); auto run_values_remaining = [&]() { auto out = values_available - batch.values_read(); @@ -926,14 +888,13 @@ auto RunGetSpaced(Converter* converter, typename Converter::out_type* out, }; while (run_values_remaining() > 0 && batch.values_remaining() > 0) { - // TODO should this size be tune depending on sizeof(value_size)? cpu cache size? // Pull a batch of values from the bit packed encoded data and store it in a local // buffer to benefit from unpacking intrinsics and data locality. 
- static constexpr values_count_type kBufferCapacity = 1024; + static constexpr rle_size_t kBufferCapacity = 1024 / sizeof(value_type); std::array buffer = {}; - values_count_type buffer_start = 0; - values_count_type buffer_end = 0; + rle_size_t buffer_start = 0; + rle_size_t buffer_end = 0; auto buffer_size = [&]() { auto out = buffer_end - buffer_start; ARROW_DCHECK_GE(out, 0); @@ -954,10 +915,9 @@ auto RunGetSpaced(Converter* converter, typename Converter::out_type* out, // values. while (buffer_size() > 0) { ARROW_DCHECK_GE(validity_run->length, 0); - ARROW_DCHECK_LT(validity_run->length, max_size_for_v); + ARROW_DCHECK_LT(validity_run->length, max_size_for_v); ARROW_DCHECK_LE(validity_run->length, batch.total_remaining()); - const auto validity_run_length = - static_cast(validity_run->length); + const auto validity_run_length = static_cast(validity_run->length); // Copy as much as possible from the buffer into the output while not exceeding // validity run @@ -993,13 +953,12 @@ auto RunGetSpaced(Converter* converter, typename Converter::out_type* out, } /// Overload for GetSpaced for a single run in a decoder variant -template +template auto RunGetSpaced( - Converter* converter, typename Converter::out_type* out, values_count_type batch_size, - values_count_type null_count, BitRunReader* validity_reader, BitRun* validity_run, + Converter* converter, typename Converter::out_type* out, rle_size_t batch_size, + rle_size_t null_count, BitRunReader* validity_reader, BitRun* validity_run, std::variant, BitPackedRunDecoder>* decoder) - -> GetSpacedResult { + -> GetSpacedResult { return std::visit( [&](auto& dec) { ARROW_DCHECK_GT(dec.remaining(), 0); @@ -1015,11 +974,10 @@ template template auto RleBitPackedDecoder::GetSpaced(Converter converter, typename Converter::out_type* out, - values_count_type batch_size, + rle_size_t batch_size, const uint8_t* validity_bits, int64_t validity_bits_offset, - values_count_type null_count) - -> values_count_type { + rle_size_t 
null_count) -> rle_size_t { using ControlFlow = RleBitPackedParser::ControlFlow; ARROW_DCHECK_GT(batch_size, 0); @@ -1104,9 +1062,7 @@ template struct NoOpConverter { using in_type = T; using out_type = T; - using size_type = int32_t; - - static constexpr bool kIsIdentity = true; + using size_type = rle_size_t; static constexpr bool InputIsValid(const in_type& values) { return true; } @@ -1130,11 +1086,10 @@ struct NoOpConverter { } // namespace internal template -auto RleBitPackedDecoder::GetBatchSpaced(values_count_type batch_size, - values_count_type null_count, +auto RleBitPackedDecoder::GetBatchSpaced(rle_size_t batch_size, rle_size_t null_count, const uint8_t* valid_bits, int64_t valid_bits_offset, value_type* out) - -> values_count_type { + -> rle_size_t { if (null_count == 0) { return GetBatch(out, batch_size); } @@ -1159,7 +1114,7 @@ template struct DictionaryConverter { using out_type = V; using in_type = I; - using size_type = int32_t; + using size_type = rle_size_t; static constexpr bool kIsIdentity = false; @@ -1214,8 +1169,7 @@ template template auto RleBitPackedDecoder::GetBatchWithDict(const V* dictionary, int32_t dictionary_length, V* out, - values_count_type batch_size) - -> values_count_type { + rle_size_t batch_size) -> rle_size_t { using ControlFlow = RleBitPackedParser::ControlFlow; if (ARROW_PREDICT_FALSE(batch_size <= 0)) { @@ -1228,7 +1182,7 @@ auto RleBitPackedDecoder::GetBatchWithDict(const V* dictionary, constexpr internal::UnreachableBitRunReader validity_reader{}; internal::AllSetBitRun validity_run = {batch_size}; - values_count_type values_read = 0; + rle_size_t values_read = 0; auto batch_values_remaining = [&]() { ARROW_DCHECK_LE(values_read, batch_size); return batch_size - values_read; @@ -1281,9 +1235,9 @@ auto RleBitPackedDecoder::GetBatchWithDict(const V* dictionary, template template auto RleBitPackedDecoder::GetBatchWithDictSpaced( - const V* dictionary, int32_t dictionary_length, V* out, values_count_type batch_size, - 
values_count_type null_count, const uint8_t* valid_bits, int64_t valid_bits_offset) - -> values_count_type { + const V* dictionary, int32_t dictionary_length, V* out, rle_size_t batch_size, + rle_size_t null_count, const uint8_t* valid_bits, int64_t valid_bits_offset) + -> rle_size_t { if (null_count == 0) { return GetBatchWithDict(dictionary, dictionary_length, out, batch_size); } diff --git a/cpp/src/arrow/util/rle_encoding_test.cc b/cpp/src/arrow/util/rle_encoding_test.cc index b4455cf28991..f518a2658cec 100644 --- a/cpp/src/arrow/util/rle_encoding_test.cc +++ b/cpp/src/arrow/util/rle_encoding_test.cc @@ -214,7 +214,7 @@ TEST(BitUtil, RoundTripIntValues) { TEST(Rle, RleRun) { const std::array value = {21, 2, 0, 0}; - RleRun::values_count_type value_count = 12; + rle_size_t value_count = 12; // 12 times the value 21 fitting over 5 bits const auto run_5 = RleRun(value.data(), value_count, /* value_bit_width= */ 5); @@ -256,7 +256,7 @@ TEST(BitPacked, BitPackedRun) { const std::array value = {0b10101010, 0, 0, 0b1111111}; // 16 values of 1 bit for a total of 16 bits - BitPackedRun::values_count_type value_count_1 = 16; + rle_size_t value_count_1 = 16; const auto run_1 = BitPackedRun(value.data(), value_count_1, /* value_bit_width= */ 1); EXPECT_EQ(run_1.values_count(), value_count_1); EXPECT_EQ(run_1.values_bit_width(), 1); @@ -264,7 +264,7 @@ TEST(BitPacked, BitPackedRun) { EXPECT_EQ(run_1.raw_data_ptr(), value.data()); // 8 values of 3 bits for a total of 24 bits - BitPackedRun::values_count_type value_count_3 = 8; + rle_size_t value_count_3 = 8; const auto run_3 = BitPackedRun(value.data(), value_count_3, /* value_bit_width= */ 3); EXPECT_EQ(run_3.values_count(), value_count_3); EXPECT_EQ(run_3.values_bit_width(), 3); @@ -273,8 +273,8 @@ TEST(BitPacked, BitPackedRun) { } template -void TestRleDecoder(std::vector bytes, RleRun::values_count_type value_count, - RleRun::bit_size_type bit_width, T expected_value) { +void TestRleDecoder(std::vector bytes, rle_size_t 
value_count, + rle_size_t bit_width, T expected_value) { // Pre-requisite for this test EXPECT_GT(value_count, 6); @@ -285,7 +285,7 @@ void TestRleDecoder(std::vector bytes, RleRun::values_count_type value_ EXPECT_EQ(decoder.remaining(), value_count); - typename decltype(decoder)::values_count_type read = 0; + rle_size_t read = 0; EXPECT_EQ(decoder.Get(vals.data()), 1); read += 1; EXPECT_EQ(vals.at(0), expected_value); @@ -333,10 +333,8 @@ TEST(Rle, RleDecoder) { } template -void TestBitPackedDecoder(std::vector bytes, - BitPackedRun::values_count_type value_count, - BitPackedRun::bit_size_type bit_width, - std::vector expected) { +void TestBitPackedDecoder(std::vector bytes, rle_size_t value_count, + rle_size_t bit_width, std::vector expected) { // Pre-requisite for this test EXPECT_GT(value_count, 6); @@ -347,7 +345,7 @@ void TestBitPackedDecoder(std::vector bytes, EXPECT_EQ(decoder.remaining(), value_count); - typename decltype(decoder)::values_count_type read = 0; + rle_size_t read = 0; EXPECT_EQ(decoder.Get(vals.data()), 1); EXPECT_EQ(vals.at(0), expected.at(0 + read)); read += 1; @@ -407,12 +405,10 @@ TEST(BitPacked, BitPackedDecoder) { } template -void TestRleBitPackedParser(std::vector bytes, - RleBitPackedParser::bit_size_type bit_width, +void TestRleBitPackedParser(std::vector bytes, rle_size_t bit_width, std::vector expected) { - auto parser = RleBitPackedParser( - bytes.data(), static_cast(bytes.size()), - bit_width); + auto parser = + RleBitPackedParser(bytes.data(), static_cast(bytes.size()), bit_width); EXPECT_FALSE(parser.exhausted()); // Try to decode all data of all runs in the decoded vector @@ -898,7 +894,7 @@ void CheckRoundTrip(const Array& data, int bit_width, bool spaced, int32_t parts } // We will read the data in `parts` calls to make sure intermediate states are valid - int32_t total_read_count = 0; + rle_size_t total_read_count = 0; while (total_read_count < data_size) { const auto remaining = data_size - total_read_count; auto to_read = 
data_size / parts; @@ -906,7 +902,7 @@ void CheckRoundTrip(const Array& data, int bit_width, bool spaced, int32_t parts to_read = remaining; } - int32_t read = 0; + rle_size_t read = 0; if (spaced) { // We need to slice the input array get the proper null count and bitmap auto data_remaining = data.Slice(total_read_count, to_read); From 7f153c0413d3c045439e93710d5d75683fd0e92e Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Wed, 17 Sep 2025 19:39:08 +0200 Subject: [PATCH 53/56] Set buffer capacity --- cpp/src/arrow/util/rle_encoding_internal.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/cpp/src/arrow/util/rle_encoding_internal.h b/cpp/src/arrow/util/rle_encoding_internal.h index f657466147fb..318a83d642b2 100644 --- a/cpp/src/arrow/util/rle_encoding_internal.h +++ b/cpp/src/arrow/util/rle_encoding_internal.h @@ -890,7 +890,9 @@ auto RunGetSpaced(Converter* converter, typename Converter::out_type* out, while (run_values_remaining() > 0 && batch.values_remaining() > 0) { // Pull a batch of values from the bit packed encoded data and store it in a local // buffer to benefit from unpacking intrinsics and data locality. - static constexpr rle_size_t kBufferCapacity = 1024 / sizeof(value_type); + // Quick benchmarking on a linux x86-64 cloud instance show that this previously + // hard-coded value is appropriate. 
+ static constexpr rle_size_t kBufferCapacity = 1024; std::array buffer = {}; rle_size_t buffer_start = 0; From 41ff0cde9274246fbadccbca0e74c0188fffc40b Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Wed, 17 Sep 2025 19:53:33 +0200 Subject: [PATCH 54/56] More doc --- cpp/src/arrow/util/rle_encoding_internal.h | 28 ++++++++++++++++------ 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/cpp/src/arrow/util/rle_encoding_internal.h b/cpp/src/arrow/util/rle_encoding_internal.h index 318a83d642b2..137a4a0698f9 100644 --- a/cpp/src/arrow/util/rle_encoding_internal.h +++ b/cpp/src/arrow/util/rle_encoding_internal.h @@ -274,7 +274,7 @@ class RleRunDecoder { /// Try to advance by as many values as provided. /// Return the number of values skipped. - /// May advance by less than asked for. + /// May advance by less than asked for if there are not enough values left. [[nodiscard]] rle_size_t Advance(rle_size_t batch_size) { const auto steps = std::min(batch_size, remaining_count_); remaining_count_ -= steps; @@ -287,7 +287,8 @@ class RleRunDecoder { } /// Get a batch of values return the number of decoded elements. - /// May write fewer elements to the output than requested. + /// May write fewer elements to the output than requested if there are not enough values + /// left. [[nodiscard]] rle_size_t GetBatch(value_type* out, rle_size_t batch_size) { if (ARROW_PREDICT_FALSE(remaining_count_ == 0)) { return 0; @@ -336,6 +337,7 @@ class BitPackedRunDecoder { /// Try to advance by as many values as provided. /// Return the number of values skipped or 0 if it fail to advance. + /// May advance by less than asked for if there are not enough values left. [[nodiscard]] rle_size_t Advance(rle_size_t batch_size) { const auto steps = std::min(batch_size, remaining_count_); if (bit_reader_.Advance(steps * value_bit_width_)) { @@ -345,10 +347,12 @@ class BitPackedRunDecoder { return 0; } - /// Get the next value and return false if there are no more. 
+ /// Get the next value and return false if there are no more or an error occurred. [[nodiscard]] bool Get(value_type* out_value) { return GetBatch(out_value, 1) == 1; } /// Get a batch of values return the number of decoded elements. + /// May write fewer elements to the output than requested if there are not enough values + /// left or if an error occurred. [[nodiscard]] rle_size_t GetBatch(value_type* out, rle_size_t batch_size) { if (ARROW_PREDICT_FALSE(remaining_count_ == 0)) { return 0; @@ -405,7 +409,7 @@ class RleBitPackedDecoder { /// This is how one can check for errors. bool exhausted() const { return (run_remaining() == 0) && parser_.exhausted(); } - /// Gets the next value. Returns false if there are no more. + /// Gets the next value or returns false if there are no more or an error occurred. /// /// NB: Because the encoding only supports literal runs with lengths /// that are multiples of 8, RleEncoder sometimes pads the end of its @@ -415,14 +419,23 @@ class RleBitPackedDecoder { [[nodiscard]] bool Get(value_type* val); /// Get a batch of values return the number of decoded elements. + /// May write fewer elements to the output than requested if there are not enough values + /// left or if an error occurred. [[nodiscard]] rle_size_t GetBatch(value_type* out, rle_size_t batch_size); - /// Like GetBatch but add spacing for null entries + /// Like GetBatch but add spacing for null entries. + /// + /// Null entries will be set to an arbistrary value to avoid leaking private data. + /// May write fewer elements to the output than requested if there are not enough values + /// left or if an error occurred. 
[[nodiscard]] rle_size_t GetBatchSpaced(rle_size_t batch_size, rle_size_t null_count, const uint8_t* valid_bits, int64_t valid_bits_offset, value_type* out); /// Like GetBatch but the values are then decoded using the provided dictionary + /// + /// May write fewer elements to the output than requested if there are not enough values + /// left or if an error occurred. template [[nodiscard]] rle_size_t GetBatchWithDict(const V* dictionary, int32_t dictionary_length, V* out, @@ -430,8 +443,9 @@ class RleBitPackedDecoder { /// Like GetBatchWithDict but add spacing for null entries /// - /// Null entries will be zero-initialized in `values` to avoid leaking - /// private data. + /// Null entries will be set to an arbistrary value to avoid leaking private data. + /// May write fewer elements to the output than requested if there are not enough values + /// left or if an error occurred. template [[nodiscard]] rle_size_t GetBatchWithDictSpaced( const V* dictionary, int32_t dictionary_length, V* out, rle_size_t batch_size, From dcc6db01c15587a96bd529a327c1b7ebb89fce03 Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Fri, 19 Sep 2025 17:04:21 +0200 Subject: [PATCH 55/56] Address reviewer comments --- cpp/src/arrow/util/rle_encoding_internal.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/cpp/src/arrow/util/rle_encoding_internal.h b/cpp/src/arrow/util/rle_encoding_internal.h index 137a4a0698f9..6b722cfd2c40 100644 --- a/cpp/src/arrow/util/rle_encoding_internal.h +++ b/cpp/src/arrow/util/rle_encoding_internal.h @@ -647,7 +647,7 @@ auto RleBitPackedParser::PeekImpl(Handler&& handler) const if (ARROW_PREDICT_FALSE(header_bytes == 0)) { // Malfomrmed LEB128 data - return {}; + return {0, ControlFlow::Break}; } const bool is_bit_packed = run_len_type & 1; @@ -659,8 +659,9 @@ auto RleBitPackedParser::PeekImpl(Handler&& handler) const return {0, ControlFlow::Break}; } + ARROW_DCHECK_LT(static_cast(count) * 8, + internal::max_size_for_v); const auto 
values_count = static_cast(count * 8); - ARROW_DCHECK_LT(count, internal::max_size_for_v); // Count Already divided by 8 const auto bytes_read = header_bytes + static_cast(count) * value_bit_width_; @@ -671,13 +672,12 @@ auto RleBitPackedParser::PeekImpl(Handler&& handler) const return {bytes_read, control}; } - if (ARROW_PREDICT_FALSE( - count == 0 || - count > static_cast(std::numeric_limits::max()))) { + if (ARROW_PREDICT_FALSE(count == 0)) { // Illegal number of encoded values return {0, ControlFlow::Break}; } + // Safe because created from right shift const auto values_count = static_cast(count); const auto value_bytes = bit_util::BytesForBits(value_bit_width_); ARROW_DCHECK_LT(value_bytes, internal::max_size_for_v); From c2344a8cb4709a5fe77ebeb380adf04dfdd52933 Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Mon, 22 Sep 2025 18:36:51 +0200 Subject: [PATCH 56/56] Explicitly pass bit width (don't store multiple time) --- cpp/src/arrow/util/rle_encoding_internal.h | 139 +++++++++++---------- cpp/src/arrow/util/rle_encoding_test.cc | 91 +++++++------- 2 files changed, 122 insertions(+), 108 deletions(-) diff --git a/cpp/src/arrow/util/rle_encoding_internal.h b/cpp/src/arrow/util/rle_encoding_internal.h index 6b722cfd2c40..c231c9a63ebb 100644 --- a/cpp/src/arrow/util/rle_encoding_internal.h +++ b/cpp/src/arrow/util/rle_encoding_internal.h @@ -92,6 +92,12 @@ using rle_size_t = int32_t; template class RleRunDecoder; +/// A Single Run Length Encoded run. +/// +/// Consist of a single value repeated multiple times. +/// A previous version of this class also stored the value bit width to be self contain, +/// removing it and passing it explicitly when needed proved to speed up decoding up to +/// 10 % on some benchmarks. class RleRun { public: /// The decoder class used to decode a single run in the given type. 
@@ -102,24 +108,21 @@ class RleRun { explicit RleRun(const uint8_t* data, rle_size_t values_count, rle_size_t value_bit_width) noexcept - : values_count_(values_count), value_bit_width_(value_bit_width) { + : values_count_(values_count) { ARROW_DCHECK_GE(value_bit_width, 0); ARROW_DCHECK_GE(values_count, 0); - std::copy(data, data + raw_data_size(), data_.begin()); + std::copy(data, data + raw_data_size(value_bit_width), data_.begin()); } /// The number of repeated values in this run. constexpr rle_size_t values_count() const noexcept { return values_count_; } - /// The size in bits of each encoded value. - constexpr rle_size_t values_bit_width() const noexcept { return value_bit_width_; } - /// A pointer to the repeated value raw bytes. constexpr const uint8_t* raw_data_ptr() const noexcept { return data_.data(); } /// The number of bytes used for the raw repeated value. - constexpr rle_size_t raw_data_size() const noexcept { - auto out = bit_util::BytesForBits(value_bit_width_); + constexpr rle_size_t raw_data_size(rle_size_t value_bit_width) const noexcept { + auto out = bit_util::BytesForBits(value_bit_width); ARROW_DCHECK_LE(out, std::numeric_limits::max()); return static_cast(out); } @@ -130,13 +133,19 @@ class RleRun { std::array data_ = {}; /// The number of time the value is repeated. rle_size_t values_count_ = 0; - /// The size in bit of a packed value in the run. - rle_size_t value_bit_width_ = 0; }; template class BitPackedRunDecoder; +/// A single bit packed run. +/// +/// Consists of a view on a buffer of bytes that encode integers on ``value_bit_width`` +/// bits (that is the numbers are small enough that high order bits are all zeros and can +/// be omitted). +/// A previous version of this class also stored the value bit width to be self-contained; +/// removing it and passing it explicitly when needed proved to speed up decoding up to +/// 10 % on some benchmarks.
class BitPackedRun { public: /// The decoder class used to decode a single run in the given type. @@ -147,20 +156,17 @@ class BitPackedRun { constexpr BitPackedRun(const uint8_t* data, rle_size_t values_count, rle_size_t value_bit_width) noexcept - : data_(data), values_count_(values_count), value_bit_width_(value_bit_width) { - ARROW_CHECK_GE(value_bit_width_, 0); + : data_(data), values_count_(values_count) { + ARROW_CHECK_GE(value_bit_width, 0); ARROW_CHECK_GE(values_count_, 0); } constexpr rle_size_t values_count() const noexcept { return values_count_; } - /// The size in bits of each encoded value. - constexpr rle_size_t values_bit_width() const noexcept { return value_bit_width_; } - constexpr const uint8_t* raw_data_ptr() const noexcept { return data_; } - constexpr rle_size_t raw_data_size() const noexcept { - auto out = bit_util::BytesForBits(static_cast(value_bit_width_) * + constexpr rle_size_t raw_data_size(rle_size_t value_bit_width) const noexcept { + auto out = bit_util::BytesForBits(static_cast(value_bit_width) * static_cast(values_count_)); ARROW_CHECK_LE(out, std::numeric_limits::max()); return static_cast(out); @@ -171,8 +177,6 @@ class BitPackedRun { const uint8_t* data_ = nullptr; /// Number of values in this run. rle_size_t values_count_ = 0; - /// The size in bit of a packed value in the run - rle_size_t value_bit_width_ = 0; }; /// A parser that emits either a ``BitPackedRun`` or a ``RleRun``. @@ -249,9 +253,11 @@ class RleRunDecoder { constexpr RleRunDecoder() noexcept = default; - explicit RleRunDecoder(const RunType& run) noexcept { Reset(run); } + explicit RleRunDecoder(const RunType& run, rle_size_t value_bit_width) noexcept { + Reset(run, value_bit_width); + } - void Reset(const RunType& run) noexcept { + void Reset(const RunType& run, rle_size_t value_bit_width) noexcept { remaining_count_ = run.values_count(); if constexpr (std::is_same_v) { // ARROW-18031: just check the LSB of the next byte and move on. 
@@ -261,7 +267,7 @@ class RleRunDecoder { } else { // Memcopy is required to avoid undefined behavior. value_ = {}; - std::memcpy(&value_, run.raw_data_ptr(), run.raw_data_size()); + std::memcpy(&value_, run.raw_data_ptr(), run.raw_data_size(value_bit_width)); value_ = ::arrow::bit_util::FromLittleEndian(value_); } } @@ -275,21 +281,22 @@ class RleRunDecoder { /// Try to advance by as many values as provided. /// Return the number of values skipped. /// May advance by less than asked for if there are not enough values left. - [[nodiscard]] rle_size_t Advance(rle_size_t batch_size) { + [[nodiscard]] rle_size_t Advance(rle_size_t batch_size, rle_size_t value_bit_width) { const auto steps = std::min(batch_size, remaining_count_); remaining_count_ -= steps; return steps; } /// Get the next value and return false if there are no more. - [[nodiscard]] constexpr bool Get(value_type* out_value) { - return GetBatch(out_value, 1) == 1; + [[nodiscard]] constexpr bool Get(value_type* out_value, rle_size_t value_bit_width) { + return GetBatch(out_value, 1, value_bit_width) == 1; } /// Get a batch of values return the number of decoded elements. /// May write fewer elements to the output than requested if there are not enough values /// left. 
- [[nodiscard]] rle_size_t GetBatch(value_type* out, rle_size_t batch_size) { + [[nodiscard]] rle_size_t GetBatch(value_type* out, rle_size_t batch_size, + rle_size_t value_bit_width) { if (ARROW_PREDICT_FALSE(remaining_count_ == 0)) { return 0; } @@ -319,28 +326,26 @@ class BitPackedRunDecoder { BitPackedRunDecoder() noexcept = default; - explicit BitPackedRunDecoder(const RunType& run) noexcept { Reset(run); } + explicit BitPackedRunDecoder(const RunType& run, rle_size_t value_bit_width) noexcept { + Reset(run, value_bit_width); + } - void Reset(const RunType& run) noexcept { - value_bit_width_ = run.values_bit_width(); + void Reset(const RunType& run, rle_size_t value_bit_width) noexcept { remaining_count_ = run.values_count(); - ARROW_DCHECK_GE(value_bit_width_, 0); - ARROW_DCHECK_LE(value_bit_width_, 64); - bit_reader_.Reset(run.raw_data_ptr(), run.raw_data_size()); + ARROW_DCHECK_GE(value_bit_width, 0); + ARROW_DCHECK_LE(value_bit_width, 64); + bit_reader_.Reset(run.raw_data_ptr(), run.raw_data_size(value_bit_width)); } /// Return the number of values that can be advanced. constexpr rle_size_t remaining() const { return remaining_count_; } - /// Return the size in bit in which each encoded value is written. - constexpr rle_size_t value_bit_width() const { return value_bit_width_; } - /// Try to advance by as many values as provided. /// Return the number of values skipped or 0 if it fail to advance. /// May advance by less than asked for if there are not enough values left. - [[nodiscard]] rle_size_t Advance(rle_size_t batch_size) { + [[nodiscard]] rle_size_t Advance(rle_size_t batch_size, rle_size_t value_bit_width) { const auto steps = std::min(batch_size, remaining_count_); - if (bit_reader_.Advance(steps * value_bit_width_)) { + if (bit_reader_.Advance(steps * value_bit_width)) { remaining_count_ -= steps; return steps; } @@ -348,18 +353,21 @@ class BitPackedRunDecoder { } /// Get the next value and return false if there are no more or an error occurred. 
- [[nodiscard]] bool Get(value_type* out_value) { return GetBatch(out_value, 1) == 1; } + [[nodiscard]] constexpr bool Get(value_type* out_value, rle_size_t value_bit_width) { + return GetBatch(out_value, 1, value_bit_width) == 1; + } /// Get a batch of values return the number of decoded elements. /// May write fewer elements to the output than requested if there are not enough values /// left or if an error occurred. - [[nodiscard]] rle_size_t GetBatch(value_type* out, rle_size_t batch_size) { + [[nodiscard]] rle_size_t GetBatch(value_type* out, rle_size_t batch_size, + rle_size_t value_bit_width) { if (ARROW_PREDICT_FALSE(remaining_count_ == 0)) { return 0; } const auto to_read = std::min(remaining_count_, batch_size); - const auto actual_read = bit_reader_.GetBatch(value_bit_width_, out, to_read); + const auto actual_read = bit_reader_.GetBatch(value_bit_width, out, to_read); // There should not be any reason why the actual read would be different // but this is error resistant. remaining_count_ -= actual_read; @@ -368,7 +376,6 @@ class BitPackedRunDecoder { private: ::arrow::bit_util::BitReader bit_reader_ = {}; - rle_size_t value_bit_width_ = 0; rle_size_t remaining_count_ = 0; static_assert(std::is_integral_v, @@ -400,6 +407,7 @@ class RleBitPackedDecoder { ARROW_DCHECK_LE(value_bit_width, 64); parser_.Reset(data, data_size, value_bit_width); decoder_ = {}; + value_bit_width_ = value_bit_width; } /// Whether there is still runs to iterate over. @@ -454,6 +462,7 @@ class RleBitPackedDecoder { private: RleBitPackedParser parser_ = {}; std::variant, BitPackedRunDecoder> decoder_ = {}; + rle_size_t value_bit_width_; /// Return the number of values that are remaining in the current run. rle_size_t run_remaining() const { @@ -462,7 +471,9 @@ class RleBitPackedDecoder { /// Get a batch of values from the current run and return the number elements read. 
[[nodiscard]] rle_size_t RunGetBatch(value_type* out, rle_size_t batch_size) { - return std::visit([&](auto& dec) { return dec.GetBatch(out, batch_size); }, decoder_); + return std::visit( + [&](auto& dec) { return dec.GetBatch(out, batch_size, value_bit_width_); }, + decoder_); } /// Call the parser with a single callable for all event types. @@ -734,8 +745,8 @@ auto RleBitPackedDecoder::GetBatch(value_type* out, rle_size_t batch_size) using RunDecoder = typename decltype(run)::template DecoderType; ARROW_DCHECK_LT(values_read, batch_size); - RunDecoder decoder(run); - const auto read = decoder.GetBatch(out, batch_size - values_read); + RunDecoder decoder(run, value_bit_width_); + const auto read = decoder.GetBatch(out, batch_size - values_read, value_bit_width_); ARROW_DCHECK_LE(read, batch_size - values_read); values_read += read; out += read; @@ -824,8 +835,9 @@ struct GetSpacedResult { template auto RunGetSpaced(Converter* converter, typename Converter::out_type* out, rle_size_t batch_size, rle_size_t null_count, - BitRunReader* validity_reader, BitRun* validity_run, - RleRunDecoder* decoder) -> GetSpacedResult { + rle_size_t value_bit_width, BitRunReader* validity_reader, + BitRun* validity_run, RleRunDecoder* decoder) + -> GetSpacedResult { ARROW_DCHECK_GT(batch_size, 0); // The equality case is handled in the main loop in GetSpaced ARROW_DCHECK_LT(null_count, batch_size); @@ -873,7 +885,7 @@ auto RunGetSpaced(Converter* converter, typename Converter::out_type* out, return {0, 0}; } converter->WriteRepeated(out, out + batch.total_read(), value); - const auto actual_values_read = decoder->Advance(batch.values_read()); + const auto actual_values_read = decoder->Advance(batch.values_read(), value_bit_width); // We always cropped the number of values_read by the remaining values in the run. // What's more the RLE decoder should not encounter any errors. 
ARROW_DCHECK_EQ(actual_values_read, batch.values_read()); @@ -884,8 +896,8 @@ auto RunGetSpaced(Converter* converter, typename Converter::out_type* out, template auto RunGetSpaced(Converter* converter, typename Converter::out_type* out, rle_size_t batch_size, rle_size_t null_count, - BitRunReader* validity_reader, BitRun* validity_run, - BitPackedRunDecoder* decoder) + rle_size_t value_bit_width, BitRunReader* validity_reader, + BitRun* validity_run, BitPackedRunDecoder* decoder) -> GetSpacedResult { ARROW_DCHECK_GT(batch_size, 0); // The equality case is handled in the main loop in GetSpaced @@ -920,7 +932,7 @@ auto RunGetSpaced(Converter* converter, typename Converter::out_type* out, // buffer_start is 0 at this point so size is end buffer_end = std::min(std::min(run_values_remaining(), batch.values_remaining()), kBufferCapacity); - buffer_end = decoder->GetBatch(buffer.data(), buffer_size()); + buffer_end = decoder->GetBatch(buffer.data(), buffer_size(), value_bit_width); ARROW_DCHECK_LE(buffer_size(), kBufferCapacity); if (ARROW_PREDICT_FALSE(!converter->InputIsValid(buffer.data(), buffer_size()))) { @@ -972,14 +984,15 @@ auto RunGetSpaced(Converter* converter, typename Converter::out_type* out, template auto RunGetSpaced( Converter* converter, typename Converter::out_type* out, rle_size_t batch_size, - rle_size_t null_count, BitRunReader* validity_reader, BitRun* validity_run, + rle_size_t null_count, rle_size_t value_bit_width, BitRunReader* validity_reader, + BitRun* validity_run, std::variant, BitPackedRunDecoder>* decoder) -> GetSpacedResult { return std::visit( [&](auto& dec) { ARROW_DCHECK_GT(dec.remaining(), 0); - return RunGetSpaced(converter, out, batch_size, null_count, validity_reader, - validity_run, &dec); + return RunGetSpaced(converter, out, batch_size, null_count, value_bit_width, + validity_reader, validity_run, &dec); }, *decoder); } @@ -1023,8 +1036,8 @@ auto RleBitPackedDecoder::GetSpaced(Converter converter, // Remaining from a previous 
call that would have left some unread data from a run. if (ARROW_PREDICT_FALSE(run_remaining() > 0)) { const auto read = internal::RunGetSpaced(&converter, out, batch.total_remaining(), - batch.null_remaining(), &validity_reader, - &validity_run, &decoder_); + batch.null_remaining(), value_bit_width_, + &validity_reader, &validity_run, &decoder_); batch.AccrueReadNulls(read.null_read); batch.AccrueReadValues(read.values_read); @@ -1044,11 +1057,11 @@ auto RleBitPackedDecoder::GetSpaced(Converter converter, ParseWithCallable([&](auto run) { using RunDecoder = typename decltype(run)::template DecoderType; - RunDecoder decoder(run); + RunDecoder decoder(run, value_bit_width_); const auto read = internal::RunGetSpaced(&converter, out, batch.total_remaining(), - batch.null_remaining(), &validity_reader, - &validity_run, &decoder); + batch.null_remaining(), value_bit_width_, + &validity_reader, &validity_run, &decoder); batch.AccrueReadNulls(read.null_read); batch.AccrueReadValues(read.values_read); @@ -1205,9 +1218,9 @@ auto RleBitPackedDecoder::GetBatchWithDict(const V* dictionary, }; if (ARROW_PREDICT_FALSE(run_remaining() > 0)) { - const auto read = - internal::RunGetSpaced(&converter, out, batch_size, /* null_count= */ 0, - &validity_reader, &validity_run, &decoder_); + const auto read = internal::RunGetSpaced(&converter, out, batch_size, + /* null_count= */ 0, value_bit_width_, + &validity_reader, &validity_run, &decoder_); ARROW_DCHECK_EQ(read.null_read, 0); values_read += read.values_read; @@ -1226,11 +1239,11 @@ auto RleBitPackedDecoder::GetBatchWithDict(const V* dictionary, ParseWithCallable([&](auto run) { using RunDecoder = typename decltype(run)::template DecoderType; - RunDecoder decoder(run); + RunDecoder decoder(run, value_bit_width_); const auto read = internal::RunGetSpaced(&converter, out, batch_values_remaining(), - /* null_count= */ 0, &validity_reader, - &validity_run, &decoder); + /* null_count= */ 0, value_bit_width_, + &validity_reader, 
&validity_run, &decoder); ARROW_DCHECK_EQ(read.null_read, 0); values_read += read.values_read; diff --git a/cpp/src/arrow/util/rle_encoding_test.cc b/cpp/src/arrow/util/rle_encoding_test.cc index f518a2658cec..c7f4878b741d 100644 --- a/cpp/src/arrow/util/rle_encoding_test.cc +++ b/cpp/src/arrow/util/rle_encoding_test.cc @@ -214,36 +214,35 @@ TEST(BitUtil, RoundTripIntValues) { TEST(Rle, RleRun) { const std::array value = {21, 2, 0, 0}; - rle_size_t value_count = 12; + const rle_size_t value_count = 12; // 12 times the value 21 fitting over 5 bits - const auto run_5 = RleRun(value.data(), value_count, /* value_bit_width= */ 5); + const rle_size_t value_bit_width_5 = 5; + const auto run_5 = RleRun(value.data(), value_count, value_bit_width_5); EXPECT_EQ(run_5.values_count(), value_count); - EXPECT_EQ(run_5.values_bit_width(), 5); - EXPECT_EQ(run_5.raw_data_size(), 1); // 5 bits fit in one byte + EXPECT_EQ(run_5.raw_data_size(value_bit_width_5), 1); // 5 bits fit in one byte EXPECT_EQ(*run_5.raw_data_ptr(), 21); // 12 times the value 21 fitting over 8 bits - const auto run_8 = RleRun(value.data(), value_count, /* value_bit_width= */ 8); + const rle_size_t value_bit_width_8 = 8; + const auto run_8 = RleRun(value.data(), value_count, value_bit_width_8); EXPECT_EQ(run_8.values_count(), value_count); - EXPECT_EQ(run_8.values_bit_width(), 8); - EXPECT_EQ(run_8.raw_data_size(), 1); // 8 bits fit in 1 byte + EXPECT_EQ(run_8.raw_data_size(value_bit_width_8), 1); // 8 bits fit in 1 byte EXPECT_EQ(*run_8.raw_data_ptr(), 21); // 12 times the value 533 (21 + 2 * 2^8) fitting over 10 bits - const auto run_10 = RleRun(value.data(), value_count, /* value_bit_width= */ 10); - + const rle_size_t value_bit_width_10 = 10; + const auto run_10 = RleRun(value.data(), value_count, value_bit_width_10); EXPECT_EQ(run_10.values_count(), value_count); - EXPECT_EQ(run_10.values_bit_width(), 10); - EXPECT_EQ(run_10.raw_data_size(), 2); // 10 bits fit in 2 bytes + 
EXPECT_EQ(run_10.raw_data_size(value_bit_width_10), 2); // 10 bits fit in 2 bytes EXPECT_EQ(*(run_10.raw_data_ptr() + 0), 21); EXPECT_EQ(*(run_10.raw_data_ptr() + 1), 2); // 12 times the value 533 (21 + 2 * 2^8) fitting over 32 bits - const auto run_32 = RleRun(value.data(), value_count, /* value_bit_width= */ 32); + const rle_size_t value_bit_width_32 = 32; + const auto run_32 = RleRun(value.data(), value_count, value_bit_width_32); EXPECT_EQ(run_32.values_count(), value_count); - EXPECT_EQ(run_32.values_bit_width(), 32); - EXPECT_EQ(run_32.raw_data_size(), 4); // 32 bits fit in 4 bytes + EXPECT_EQ(run_32.raw_data_size(value_bit_width_32), 4); // 32 bits fit in 4 bytes EXPECT_EQ(*(run_32.raw_data_ptr() + 0), 21); EXPECT_EQ(*(run_32.raw_data_ptr() + 1), 2); EXPECT_EQ(*(run_32.raw_data_ptr() + 2), 0); @@ -256,19 +255,19 @@ TEST(BitPacked, BitPackedRun) { const std::array value = {0b10101010, 0, 0, 0b1111111}; // 16 values of 1 bit for a total of 16 bits - rle_size_t value_count_1 = 16; - const auto run_1 = BitPackedRun(value.data(), value_count_1, /* value_bit_width= */ 1); + const rle_size_t value_count_1 = 16; + const rle_size_t value_bit_width_1 = 1; + const auto run_1 = BitPackedRun(value.data(), value_count_1, value_bit_width_1); EXPECT_EQ(run_1.values_count(), value_count_1); - EXPECT_EQ(run_1.values_bit_width(), 1); - EXPECT_EQ(run_1.raw_data_size(), 2); // 16 bits fit in 2 bytes + EXPECT_EQ(run_1.raw_data_size(value_bit_width_1), 2); // 16 bits fit in 2 bytes EXPECT_EQ(run_1.raw_data_ptr(), value.data()); // 8 values of 3 bits for a total of 24 bits - rle_size_t value_count_3 = 8; - const auto run_3 = BitPackedRun(value.data(), value_count_3, /* value_bit_width= */ 3); + const rle_size_t value_count_3 = 8; + const rle_size_t value_bit_width_3 = 3; + const auto run_3 = BitPackedRun(value.data(), value_count_3, value_bit_width_3); EXPECT_EQ(run_3.values_count(), value_count_3); - EXPECT_EQ(run_3.values_bit_width(), 3); - EXPECT_EQ(run_3.raw_data_size(), 3); // 
24 bits fit in 3 bytes + EXPECT_EQ(run_3.raw_data_size(value_bit_width_3), 3); // 24 bits fit in 3 bytes EXPECT_EQ(run_3.raw_data_ptr(), value.data()); } @@ -280,41 +279,41 @@ void TestRleDecoder(std::vector bytes, rle_size_t value_count, const auto run = RleRun(bytes.data(), value_count, bit_width); - auto decoder = RleRunDecoder(run); + auto decoder = RleRunDecoder(run, bit_width); std::vector vals = {0, 0}; EXPECT_EQ(decoder.remaining(), value_count); rle_size_t read = 0; - EXPECT_EQ(decoder.Get(vals.data()), 1); + EXPECT_EQ(decoder.Get(vals.data(), bit_width), 1); read += 1; EXPECT_EQ(vals.at(0), expected_value); EXPECT_EQ(decoder.remaining(), value_count - read); - EXPECT_EQ(decoder.Advance(3), 3); + EXPECT_EQ(decoder.Advance(3, bit_width), 3); read += 3; EXPECT_EQ(decoder.remaining(), value_count - read); vals = {0, 0}; - EXPECT_EQ(decoder.GetBatch(vals.data(), 2), vals.size()); + EXPECT_EQ(decoder.GetBatch(vals.data(), 2, bit_width), vals.size()); EXPECT_EQ(vals.at(0), expected_value); EXPECT_EQ(vals.at(1), expected_value); read += static_cast(vals.size()); EXPECT_EQ(decoder.remaining(), value_count - read); // Exhaust iteration - EXPECT_EQ(decoder.Advance(value_count - read), value_count - read); + EXPECT_EQ(decoder.Advance(value_count - read, bit_width), value_count - read); EXPECT_EQ(decoder.remaining(), 0); - EXPECT_EQ(decoder.Advance(1), 0); + EXPECT_EQ(decoder.Advance(1, bit_width), 0); vals = {0, 0}; - EXPECT_EQ(decoder.Get(vals.data()), 0); + EXPECT_EQ(decoder.Get(vals.data(), bit_width), 0); EXPECT_EQ(vals.at(0), 0); // Reset the decoder - decoder.Reset(run); + decoder.Reset(run, bit_width); EXPECT_EQ(decoder.remaining(), value_count); vals = {0, 0}; - EXPECT_EQ(decoder.GetBatch(vals.data(), 2), vals.size()); + EXPECT_EQ(decoder.GetBatch(vals.data(), 2, bit_width), vals.size()); EXPECT_EQ(vals.at(0), expected_value); EXPECT_EQ(vals.at(1), expected_value); } @@ -340,42 +339,42 @@ void TestBitPackedDecoder(std::vector bytes, rle_size_t value_count, 
const auto run = BitPackedRun(bytes.data(), value_count, bit_width); - auto decoder = BitPackedRunDecoder(run); + auto decoder = BitPackedRunDecoder(run, bit_width); std::vector vals = {0, 0}; EXPECT_EQ(decoder.remaining(), value_count); rle_size_t read = 0; - EXPECT_EQ(decoder.Get(vals.data()), 1); + EXPECT_EQ(decoder.Get(vals.data(), bit_width), 1); EXPECT_EQ(vals.at(0), expected.at(0 + read)); read += 1; EXPECT_EQ(decoder.remaining(), value_count - read); - EXPECT_EQ(decoder.Advance(3), 3); + EXPECT_EQ(decoder.Advance(3, bit_width), 3); read += 3; EXPECT_EQ(decoder.remaining(), value_count - read); vals = {0, 0}; - EXPECT_EQ(decoder.GetBatch(vals.data(), 2), vals.size()); + EXPECT_EQ(decoder.GetBatch(vals.data(), 2, bit_width), vals.size()); EXPECT_EQ(vals.at(0), expected.at(0 + read)); EXPECT_EQ(vals.at(1), expected.at(1 + read)); read += static_cast(vals.size()); EXPECT_EQ(decoder.remaining(), value_count - read); // Exhaust iteration - EXPECT_EQ(decoder.Advance(value_count - read), value_count - read); + EXPECT_EQ(decoder.Advance(value_count - read, bit_width), value_count - read); EXPECT_EQ(decoder.remaining(), 0); - EXPECT_EQ(decoder.Advance(1), 0); + EXPECT_EQ(decoder.Advance(1, bit_width), 0); vals = {0, 0}; - EXPECT_EQ(decoder.Get(vals.data()), 0); + EXPECT_EQ(decoder.Get(vals.data(), bit_width), 0); EXPECT_EQ(vals.at(0), 0); // Reset the decoder - decoder.Reset(run); + decoder.Reset(run, bit_width); read = 0; EXPECT_EQ(decoder.remaining(), value_count); vals = {0, 0}; - EXPECT_EQ(decoder.GetBatch(vals.data(), 2), vals.size()); + EXPECT_EQ(decoder.GetBatch(vals.data(), 2, bit_width), vals.size()); EXPECT_EQ(vals.at(0), expected.at(0 + read)); EXPECT_EQ(vals.at(1), expected.at(1 + read)); } @@ -420,14 +419,16 @@ void TestRleBitPackedParser(std::vector bytes, rle_size_t bit_width, decltype(rle_decoder)* rle_decoder_ptr_; decltype(bit_packed_decoder)* bit_packed_decoder_ptr_; decltype(decoded)* decoded_ptr_; + decltype(bit_width) bit_width_; auto 
OnRleRun(RleRun run) { - rle_decoder_ptr_->Reset(run); + rle_decoder_ptr_->Reset(run, bit_width_); const auto n_decoded = decoded_ptr_->size(); const auto n_to_decode = rle_decoder_ptr_->remaining(); decoded_ptr_->resize(n_decoded + n_to_decode); - EXPECT_EQ(rle_decoder_ptr_->GetBatch(decoded_ptr_->data() + n_decoded, n_to_decode), + EXPECT_EQ(rle_decoder_ptr_->GetBatch(decoded_ptr_->data() + n_decoded, n_to_decode, + bit_width_), n_to_decode); EXPECT_EQ(rle_decoder_ptr_->remaining(), 0); @@ -435,19 +436,19 @@ void TestRleBitPackedParser(std::vector bytes, rle_size_t bit_width, } auto OnBitPackedRun(BitPackedRun run) { - bit_packed_decoder_ptr_->Reset(run); + bit_packed_decoder_ptr_->Reset(run, bit_width_); const auto n_decoded = decoded_ptr_->size(); const auto n_to_decode = bit_packed_decoder_ptr_->remaining(); decoded_ptr_->resize(n_decoded + n_to_decode); EXPECT_EQ(bit_packed_decoder_ptr_->GetBatch(decoded_ptr_->data() + n_decoded, - n_to_decode), + n_to_decode, bit_width_), n_to_decode); EXPECT_EQ(bit_packed_decoder_ptr_->remaining(), 0); return RleBitPackedParser::ControlFlow::Continue; } - } handler{&rle_decoder, &bit_packed_decoder, &decoded}; + } handler{&rle_decoder, &bit_packed_decoder, &decoded, bit_width}; // Iterate over all runs parser.Parse(handler);