diff --git a/c++/src/Reader.cc b/c++/src/Reader.cc index 511618e4ac..8ec3b540bc 100644 --- a/c++/src/Reader.cc +++ b/c++/src/Reader.cc @@ -884,14 +884,21 @@ namespace orc { void ReaderImpl::readMetadata() const { uint64_t metadataSize = contents_->postscript->metadata_length(); uint64_t footerLength = contents_->postscript->footer_length(); - if (fileLength_ < metadataSize + footerLength + postscriptLength_ + 1) { + + // Check for overflow in length calculations + uint64_t totalTail; + if (__builtin_add_overflow(footerLength, metadataSize, &totalTail) || + __builtin_add_overflow(totalTail, postscriptLength_, &totalTail) || + __builtin_add_overflow(totalTail, 1ULL, &totalTail) || + totalTail > fileLength_) { std::stringstream msg; msg << "Invalid Metadata length: fileLength=" << fileLength_ << ", metadataLength=" << metadataSize << ", footerLength=" << footerLength << ", postscriptLength=" << postscriptLength_; throw ParseError(msg.str()); } - uint64_t metadataStart = fileLength_ - metadataSize - footerLength - postscriptLength_ - 1; + + uint64_t metadataStart = fileLength_ - totalTail; if (metadataSize != 0) { std::unique_ptr pbStream = createDecompressor( contents_->compression, @@ -1226,16 +1233,25 @@ namespace orc { do { currentStripeInfo_ = footer_->stripes(static_cast(currentStripe_)); uint64_t fileLength = contents_->stream->getLength(); - if (currentStripeInfo_.offset() + currentStripeInfo_.index_length() + - currentStripeInfo_.data_length() + currentStripeInfo_.footer_length() >= - fileLength) { + + uint64_t stripeOffset = currentStripeInfo_.offset(); + uint64_t indexLength = currentStripeInfo_.index_length(); + uint64_t dataLength = currentStripeInfo_.data_length(); + uint64_t footerLength = currentStripeInfo_.footer_length(); + + // Check for overflow and bounds validity + uint64_t stripeTotalLength; + if (__builtin_add_overflow(indexLength, dataLength, &stripeTotalLength) || + __builtin_add_overflow(stripeTotalLength, footerLength, &stripeTotalLength) || + __builtin_add_overflow(stripeOffset, stripeTotalLength, &stripeTotalLength) || + stripeTotalLength >= fileLength) { std::stringstream msg; msg << "Malformed StripeInformation at stripe index " << currentStripe_ << ": fileLength=" << fileLength - << ", StripeInfo=(offset=" << currentStripeInfo_.offset() - << ", indexLength=" << currentStripeInfo_.index_length() - << ", dataLength=" << currentStripeInfo_.data_length() - << ", footerLength=" << currentStripeInfo_.footer_length() << ")"; + << ", StripeInfo=(offset=" << stripeOffset + << ", indexLength=" << indexLength + << ", dataLength=" << dataLength + << ", footerLength=" << footerLength << ")"; throw ParseError(msg.str()); } rowsInCurrentStripe_ = currentStripeInfo_.number_of_rows(); @@ -1635,10 +1651,16 @@ namespace orc { postscriptLength = buffer->data()[readSize - 1] & 0xff; contents->postscript = readPostscript(stream.get(), buffer.get(), postscriptLength); uint64_t footerSize = contents->postscript->footer_length(); - uint64_t tailSize = 1 + postscriptLength + footerSize; - if (tailSize >= fileLength) { + + // Check for overflow before calculating tailSize + uint64_t tailSize; + if (__builtin_add_overflow(1ULL, postscriptLength, &tailSize) || + __builtin_add_overflow(tailSize, footerSize, &tailSize) || + tailSize >= fileLength) { std::stringstream msg; - msg << "Invalid ORC tailSize=" << tailSize << ", fileLength=" << fileLength; + msg << "Invalid tail size: footerSize=" << footerSize + << ", postscriptLength=" << postscriptLength + << ", fileLength=" << fileLength; throw ParseError(msg.str()); } uint64_t footerOffset; diff --git a/c++/test/TestReader.cc b/c++/test/TestReader.cc index ab6e5ac662..b7e5b2b1c1 100644 --- a/c++/test/TestReader.cc +++ b/c++/test/TestReader.cc @@ -1240,4 +1240,117 @@ namespace orc { EXPECT_LT(largeLimitIOCount, smallLimitIOCount); } + /** + * Test that reading a malformed ORC file with extremely large footer_length + * throws ParseError instead of causing integer overflow or crash. + * This tests the fix for ORC-2167. + */ + TEST(TestReader, testMalformedFooterLengthOverflow) { + // Construct a minimal malformed ORC file with footer_length = UINT64_MAX + // ORC file tail structure: + // - Footer (not included here, will be invalid anyway) + // - PostScript (protobuf encoded) + // - 1 byte: postscript length + // + // PostScript protobuf fields: + // - field 1: footer_length (varint) - we set this to UINT64_MAX + // - field 8000: magic "ORC" + // + // Protobuf encoding for footer_length = UINT64_MAX: + // - tag = (field_number << 3 | wire_type) = (1 << 3 | 0) = 0x08 + // - value = varint(UINT64_MAX) = 0xFF,FF,FF,FF,FF,FF,FF,FF,FF,01 (10 bytes) + // + // Protobuf encoding for magic "ORC": + // - field 8000 is a special field, but simpler to include "ORC" at end + + // Create a minimal byte sequence simulating malicious file + // We need at least: PostScript + postscriptLength byte + // PostScript: footer_length(UINT64_MAX) encoded as protobuf varint + + std::vector maliciousData; + + // Add some dummy footer bytes (not valid, but we're testing bounds check) + for (int i = 0; i < 20; i++) { + maliciousData.push_back(0x00); + } + + // PostScript with footer_length = UINT64_MAX + // tag (field 1, varint): 0x08 + maliciousData.push_back(0x08); + // value: UINT64_MAX as varint (10 bytes) + for (int i = 0; i < 9; i++) { + maliciousData.push_back(0xFF); + } + maliciousData.push_back(0x01); + + // Add compression field (field 2) - NONE = 0 + maliciousData.push_back(0x10); // tag: field 2, wire type 0 + maliciousData.push_back(0x00); // value: compression NONE + + // Add magic "ORC" at the end (this is a simplified approach) + maliciousData.push_back('O'); + maliciousData.push_back('R'); + maliciousData.push_back('C'); + + // PostScript length byte - points to all bytes before it (excluding magic) + // This is a simplified structure; real ORC has magic inside PostScript + uint8_t postscriptLen = 14; // footer_length encoding + compression encoding + maliciousData.push_back(static_cast(postscriptLen)); + + // Create reader and expect ParseError + auto stream = std::make_unique( + maliciousData.data(), maliciousData.size()); + + ReaderOptions options; + EXPECT_THROW(createReader(std::move(stream), options), ParseError); + } + + /** + * Test that reading a malformed ORC file with extremely large metadata_length + * throws ParseError instead of causing integer overflow. + */ + TEST(TestReader, testMalformedMetadataLengthOverflow) { + // Similar to testMalformedFooterLengthOverflow but with metadata_length overflow + // PostScript field 3 is metadata_length (actually it's field 4 in proto) + // Let's create a file with both footer_length and metadata_length set to large values + + std::vector maliciousData; + + // Add some dummy bytes + for (int i = 0; i < 20; i++) { + maliciousData.push_back(0x00); + } + + // PostScript: + // footer_length = 100 (reasonable small value) + maliciousData.push_back(0x08); // tag: field 1 + maliciousData.push_back(0x64); // value: 100 + + // metadata_length = UINT64_MAX (field 4) + maliciousData.push_back(0x20); // tag: field 4, wire type 0 (4 << 3 | 0 = 32 = 0x20) + for (int i = 0; i < 9; i++) { + maliciousData.push_back(0xFF); + } + maliciousData.push_back(0x01); + + // compression = NONE + maliciousData.push_back(0x10); + maliciousData.push_back(0x00); + + // magic "ORC" + maliciousData.push_back('O'); + maliciousData.push_back('R'); + maliciousData.push_back('C'); + + // postscript length + uint8_t postscriptLen = 16; + maliciousData.push_back(static_cast(postscriptLen)); + + auto stream = std::make_unique( + maliciousData.data(), maliciousData.size()); + + ReaderOptions options; + EXPECT_THROW(createReader(std::move(stream), options), ParseError); + } + } // namespace orc