From 50eff67799eaaf317c39099628553446bf946111 Mon Sep 17 00:00:00 2001 From: shouzhi Date: Fri, 22 May 2026 14:40:44 +0800 Subject: [PATCH 1/2] ORC-2167: [C++] Fix integer overflow in PostScript footer length validation Added overflow-safe bounds checking in three locations in Reader.cc: 1. readMetadata() - check footer/metadata/postscript lengths before bounds validation 2. createReader() - check footer/postscript lengths before calculating tailSize 3. startNextStripe() - check stripe offset/length values Added unit tests to verify malformed ORC files with extremely large footer_length or metadata_length values throw ParseError instead of causing integer overflow or crash: - testMalformedFooterLengthOverflow - testMalformedMetadataLengthOverflow --- c++/src/Reader.cc | 91 ++++++++++++++++++++++++++++++--- c++/test/TestReader.cc | 113 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 196 insertions(+), 8 deletions(-) diff --git a/c++/src/Reader.cc b/c++/src/Reader.cc index 511618e4ac..94818672a9 100644 --- a/c++/src/Reader.cc +++ b/c++/src/Reader.cc @@ -884,13 +884,28 @@ namespace orc { void ReaderImpl::readMetadata() const { uint64_t metadataSize = contents_->postscript->metadata_length(); uint64_t footerLength = contents_->postscript->footer_length(); - if (fileLength_ < metadataSize + footerLength + postscriptLength_ + 1) { + + // Check for potential overflow before bounds validation + if (footerLength > fileLength_ || metadataSize > fileLength_ || + postscriptLength_ > fileLength_) { + std::stringstream msg; + msg << "Invalid length values: footerLength=" << footerLength + << ", metadataLength=" << metadataSize + << ", postscriptLength=" << postscriptLength_ + << ", fileLength=" << fileLength_; + throw ParseError(msg.str()); + } + + // Use subtraction-based check to avoid integer overflow + if (fileLength_ - footerLength < postscriptLength_ + 1 || + fileLength_ - footerLength - postscriptLength_ - 1 < metadataSize) { std::stringstream msg; msg << "Invalid Metadata length: fileLength=" << fileLength_ << ", metadataLength=" << metadataSize << ", footerLength=" << footerLength << ", postscriptLength=" << postscriptLength_; throw ParseError(msg.str()); } + uint64_t metadataStart = fileLength_ - metadataSize - footerLength - postscriptLength_ - 1; if (metadataSize != 0) { std::unique_ptr pbStream = createDecompressor( @@ -1226,18 +1241,59 @@ namespace orc { do { currentStripeInfo_ = footer_->stripes(static_cast(currentStripe_)); uint64_t fileLength = contents_->stream->getLength(); - if (currentStripeInfo_.offset() + currentStripeInfo_.index_length() + - currentStripeInfo_.data_length() + currentStripeInfo_.footer_length() >= - fileLength) { + + // Check for potential overflow in stripe length calculations + uint64_t stripeOffset = currentStripeInfo_.offset(); + uint64_t indexLength = currentStripeInfo_.index_length(); + uint64_t dataLength = currentStripeInfo_.data_length(); + uint64_t footerLength = currentStripeInfo_.footer_length(); + + if (stripeOffset > fileLength || indexLength > fileLength || + dataLength > fileLength || footerLength > fileLength) { + std::stringstream msg; + msg << "Invalid stripe length values at stripe index " << currentStripe_ + << ": offset=" << stripeOffset << ", indexLength=" << indexLength + << ", dataLength=" << dataLength << ", footerLength=" << footerLength + << ", fileLength=" << fileLength; + throw ParseError(msg.str()); + } + + // Use subtraction-based checks to avoid addition overflow + uint64_t remainingSpace = fileLength - stripeOffset; + if (remainingSpace <= indexLength) { + std::stringstream msg; + msg << "Malformed StripeInformation at stripe index " << currentStripe_ + << ": fileLength=" << fileLength + << ", StripeInfo=(offset=" << stripeOffset + << ", indexLength=" << indexLength + << ", dataLength=" << dataLength + << ", footerLength=" << footerLength << ")"; + throw ParseError(msg.str()); + } + + if (remainingSpace - indexLength <= dataLength) { std::stringstream msg; msg << "Malformed StripeInformation at stripe index " << currentStripe_ << ": fileLength=" << fileLength - << ", StripeInfo=(offset=" << currentStripeInfo_.offset() - << ", indexLength=" << currentStripeInfo_.index_length() - << ", dataLength=" << currentStripeInfo_.data_length() - << ", footerLength=" << currentStripeInfo_.footer_length() << ")"; + << ", StripeInfo=(offset=" << stripeOffset + << ", indexLength=" << indexLength + << ", dataLength=" << dataLength + << ", footerLength=" << footerLength << ")"; throw ParseError(msg.str()); } + + if (remainingSpace - indexLength - dataLength <= footerLength) { + std::stringstream msg; + msg << "Malformed StripeInformation at stripe index " << currentStripe_ + << ": fileLength=" << fileLength + << ", StripeInfo=(offset=" << stripeOffset + << ", indexLength=" << indexLength + << ", dataLength=" << dataLength + << ", footerLength=" << footerLength << ")"; + throw ParseError(msg.str()); + } + + uint64_t stripeTotalLength = indexLength + dataLength + footerLength; rowsInCurrentStripe_ = currentStripeInfo_.number_of_rows(); processingStripe_ = currentStripe_; @@ -1635,6 +1691,25 @@ namespace orc { postscriptLength = buffer->data()[readSize - 1] & 0xff; contents->postscript = readPostscript(stream.get(), buffer.get(), postscriptLength); uint64_t footerSize = contents->postscript->footer_length(); + + // Check for potential overflow before calculating tailSize + if (footerSize > fileLength || postscriptLength > fileLength) { + std::stringstream msg; + msg << "Invalid length values: footerSize=" << footerSize + << ", postscriptLength=" << postscriptLength + << ", fileLength=" << fileLength; + throw ParseError(msg.str()); + } + + // Use subtraction-based check to avoid addition overflow + if (fileLength - footerSize < postscriptLength + 1) { + std::stringstream msg; + msg << "Invalid tail size: footerSize=" << footerSize + << ", postscriptLength=" << postscriptLength + << ", fileLength=" << fileLength; + throw ParseError(msg.str()); + } + uint64_t tailSize = 1 + postscriptLength + footerSize; if (tailSize >= fileLength) { std::stringstream msg; diff --git a/c++/test/TestReader.cc b/c++/test/TestReader.cc index ab6e5ac662..b7e5b2b1c1 100644 --- a/c++/test/TestReader.cc +++ b/c++/test/TestReader.cc @@ -1240,4 +1240,117 @@ namespace orc { EXPECT_LT(largeLimitIOCount, smallLimitIOCount); } + /** + * Test that reading a malformed ORC file with extremely large footer_length + * throws ParseError instead of causing integer overflow or crash. + * This tests the fix for ORC-2167. + */ + TEST(TestReader, testMalformedFooterLengthOverflow) { + // Construct a minimal malformed ORC file with footer_length = UINT64_MAX + // ORC file tail structure: + // - Footer (not included here, will be invalid anyway) + // - PostScript (protobuf encoded) + // - 1 byte: postscript length + // + // PostScript protobuf fields: + // - field 1: footer_length (varint) - we set this to UINT64_MAX + // - field 8000: magic "ORC" + // + // Protobuf encoding for footer_length = UINT64_MAX: + // - tag = (field_number << 3 | wire_type) = (1 << 3 | 0) = 0x08 + // - value = varint(UINT64_MAX) = 0xFF,FF,FF,FF,FF,FF,FF,FF,FF,01 (10 bytes) + // + // Protobuf encoding for magic "ORC": + // - field 8000 is a special field, but simpler to include "ORC" at end + + // Create a minimal byte sequence simulating malicious file + // We need at least: PostScript + postscriptLength byte + // PostScript: footer_length(UINT64_MAX) encoded as protobuf varint + + std::vector maliciousData; + + // Add some dummy footer bytes (not valid, but we're testing bounds check) + for (int i = 0; i < 20; i++) { + maliciousData.push_back(0x00); + } + + // PostScript with footer_length = UINT64_MAX + // tag (field 1, varint): 0x08 + maliciousData.push_back(0x08); + // value: UINT64_MAX as varint (10 bytes) + for (int i = 0; i < 9; i++) { + maliciousData.push_back(0xFF); + } + maliciousData.push_back(0x01); + + // Add compression field (field 2) - NONE = 0 + maliciousData.push_back(0x10); // tag: field 2, wire type 0 + maliciousData.push_back(0x00); // value: compression NONE + + // Add magic "ORC" at the end (this is a simplified approach) + maliciousData.push_back('O'); + maliciousData.push_back('R'); + maliciousData.push_back('C'); + + // PostScript length byte - points to all bytes before it (excluding magic) + // This is a simplified structure; real ORC has magic inside PostScript + uint8_t postscriptLen = 14; // footer_length encoding + compression encoding + maliciousData.push_back(static_cast(postscriptLen)); + + // Create reader and expect ParseError + auto stream = std::make_unique( + maliciousData.data(), maliciousData.size()); + + ReaderOptions options; + EXPECT_THROW(createReader(std::move(stream), options), ParseError); + } + + /** + * Test that reading a malformed ORC file with extremely large metadata_length + * throws ParseError instead of causing integer overflow. + */ + TEST(TestReader, testMalformedMetadataLengthOverflow) { + // Similar to testMalformedFooterLengthOverflow but with metadata_length overflow + // PostScript field 3 is metadata_length (actually it's field 4 in proto) + // Let's create a file with both footer_length and metadata_length set to large values + + std::vector maliciousData; + + // Add some dummy bytes + for (int i = 0; i < 20; i++) { + maliciousData.push_back(0x00); + } + + // PostScript: + // footer_length = 100 (reasonable small value) + maliciousData.push_back(0x08); // tag: field 1 + maliciousData.push_back(0x64); // value: 100 + + // metadata_length = UINT64_MAX (field 4) + maliciousData.push_back(0x20); // tag: field 4, wire type 0 (4 << 3 | 0 = 32 = 0x20) + for (int i = 0; i < 9; i++) { + maliciousData.push_back(0xFF); + } + maliciousData.push_back(0x01); + + // compression = NONE + maliciousData.push_back(0x10); + maliciousData.push_back(0x00); + + // magic "ORC" + maliciousData.push_back('O'); + maliciousData.push_back('R'); + maliciousData.push_back('C'); + + // postscript length + uint8_t postscriptLen = 16; + maliciousData.push_back(static_cast(postscriptLen)); + + auto stream = std::make_unique( + maliciousData.data(), maliciousData.size()); + + ReaderOptions options; + EXPECT_THROW(createReader(std::move(stream), options), ParseError); + } + } // namespace orc From 50a2ca00483868ca691624a91689a95b8ce9ec78 Mon Sep 17 00:00:00 2001 From: shouzhi Date: Mon, 25 May 2026 17:01:12 +0800 Subject: [PATCH 2/2] ORC-2167: [C++] Refactor overflow checks to use __builtin_add_overflow Use compiler built-in for safer and cleaner overflow detection in Reader.cc metadata, stripe info, and postscript footer validations. Co-Authored-By: Claude Opus 4.7 --- c++/src/Reader.cc | 89 ++++++++++------------------------------------- 1 file changed, 18 insertions(+), 71 deletions(-) diff --git a/c++/src/Reader.cc b/c++/src/Reader.cc index 94818672a9..8ec3b540bc 100644 --- a/c++/src/Reader.cc +++ b/c++/src/Reader.cc @@ -885,20 +885,12 @@ namespace orc { uint64_t metadataSize = contents_->postscript->metadata_length(); uint64_t footerLength = contents_->postscript->footer_length(); - // Check for potential overflow before bounds validation - if (footerLength > fileLength_ || metadataSize > fileLength_ || - postscriptLength_ > fileLength_) { - std::stringstream msg; - msg << "Invalid length values: footerLength=" << footerLength - << ", metadataLength=" << metadataSize - << ", postscriptLength=" << postscriptLength_ - << ", fileLength=" << fileLength_; - throw ParseError(msg.str()); - } - - // Use subtraction-based check to avoid integer overflow - if (fileLength_ - footerLength < postscriptLength_ + 1 || - fileLength_ - footerLength - postscriptLength_ - 1 < metadataSize) { + // Check for overflow in length calculations + uint64_t totalTail; + if (__builtin_add_overflow(footerLength, metadataSize, &totalTail) || + __builtin_add_overflow(totalTail, postscriptLength_, &totalTail) || + __builtin_add_overflow(totalTail, 1ULL, &totalTail) || + totalTail > fileLength_) { std::stringstream msg; msg << "Invalid Metadata length: fileLength=" << fileLength_ << ", metadataLength=" << metadataSize << ", footerLength=" << footerLength @@ -906,7 +898,7 @@ namespace orc { throw ParseError(msg.str()); } - uint64_t metadataStart = fileLength_ - metadataSize - footerLength - postscriptLength_ - 1; + uint64_t metadataStart = fileLength_ - totalTail; if (metadataSize != 0) { std::unique_ptr pbStream = createDecompressor( contents_->compression, @@ -1242,47 +1234,17 @@ namespace orc { currentStripeInfo_ = footer_->stripes(static_cast(currentStripe_)); uint64_t fileLength = contents_->stream->getLength(); - // Check for potential overflow in stripe length calculations uint64_t stripeOffset = currentStripeInfo_.offset(); uint64_t indexLength = currentStripeInfo_.index_length(); uint64_t dataLength = currentStripeInfo_.data_length(); uint64_t footerLength = currentStripeInfo_.footer_length(); - if (stripeOffset > fileLength || indexLength > fileLength || - dataLength > fileLength || footerLength > fileLength) { - std::stringstream msg; - msg << "Invalid stripe length values at stripe index " << currentStripe_ - << ": offset=" << stripeOffset << ", indexLength=" << indexLength - << ", dataLength=" << dataLength << ", footerLength=" << footerLength - << ", fileLength=" << fileLength; - throw ParseError(msg.str()); - } - - // Use subtraction-based checks to avoid addition overflow - uint64_t remainingSpace = fileLength - stripeOffset; - if (remainingSpace <= indexLength) { - std::stringstream msg; - msg << "Malformed StripeInformation at stripe index " << currentStripe_ - << ": fileLength=" << fileLength - << ", StripeInfo=(offset=" << stripeOffset - << ", indexLength=" << indexLength - << ", dataLength=" << dataLength - << ", footerLength=" << footerLength << ")"; - throw ParseError(msg.str()); - } - - if (remainingSpace - indexLength <= dataLength) { - std::stringstream msg; - msg << "Malformed StripeInformation at stripe index " << currentStripe_ - << ": fileLength=" << fileLength - << ", StripeInfo=(offset=" << stripeOffset - << ", indexLength=" << indexLength - << ", dataLength=" << dataLength - << ", footerLength=" << footerLength << ")"; - throw ParseError(msg.str()); - } - - if (remainingSpace - indexLength - dataLength <= footerLength) { + // Check for overflow and bounds validity + uint64_t stripeTotalLength; + if (__builtin_add_overflow(indexLength, dataLength, &stripeTotalLength) || + __builtin_add_overflow(stripeTotalLength, footerLength, &stripeTotalLength) || + __builtin_add_overflow(stripeOffset, stripeTotalLength, &stripeTotalLength) || + stripeTotalLength >= fileLength) { std::stringstream msg; msg << "Malformed StripeInformation at stripe index " << currentStripe_ << ": fileLength=" << fileLength @@ -1292,8 +1254,6 @@ namespace orc { << ", footerLength=" << footerLength << ")"; throw ParseError(msg.str()); } - - uint64_t stripeTotalLength = indexLength + dataLength + footerLength; rowsInCurrentStripe_ = currentStripeInfo_.number_of_rows(); processingStripe_ = currentStripe_; @@ -1692,30 +1652,17 @@ namespace orc { contents->postscript = readPostscript(stream.get(), buffer.get(), postscriptLength); uint64_t footerSize = contents->postscript->footer_length(); - // Check for potential overflow before calculating tailSize - if (footerSize > fileLength || postscriptLength > fileLength) { - std::stringstream msg; - msg << "Invalid length values: footerSize=" << footerSize - << ", postscriptLength=" << postscriptLength - << ", fileLength=" << fileLength; - throw ParseError(msg.str()); - } - - // Use subtraction-based check to avoid addition overflow - if (fileLength - footerSize < postscriptLength + 1) { + // Check for overflow before calculating tailSize + uint64_t tailSize; + if (__builtin_add_overflow(1ULL, postscriptLength, &tailSize) || + __builtin_add_overflow(tailSize, footerSize, &tailSize) || + tailSize >= fileLength) { std::stringstream msg; msg << "Invalid tail size: footerSize=" << footerSize << ", postscriptLength=" << postscriptLength << ", fileLength=" << fileLength; throw ParseError(msg.str()); } - - uint64_t tailSize = 1 + postscriptLength + footerSize; - if (tailSize >= fileLength) { - std::stringstream msg; - msg << "Invalid ORC tailSize=" << tailSize << ", fileLength=" << fileLength; - throw ParseError(msg.str()); - } uint64_t footerOffset; if (tailSize > readSize) {