From d9d390b8951497652c082ebcc6c37295d37c280c Mon Sep 17 00:00:00 2001 From: Keuin Date: Mon, 25 May 2026 14:10:19 +0800 Subject: [PATCH] Fix DataPageHeaderV2.num_nulls=-1 when column statistics are disabled --- .../column/impl/ColumnValueCollector.java | 14 ++++ .../parquet/column/impl/ColumnWriterBase.java | 2 + .../parquet/column/impl/ColumnWriterV1.java | 1 + .../parquet/column/impl/ColumnWriterV2.java | 3 +- .../parquet/hadoop/TestParquetWriter.java | 69 +++++++++++++++++++ 5 files changed, 88 insertions(+), 1 deletion(-) diff --git a/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnValueCollector.java b/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnValueCollector.java index 72d2dd4e55..b863d79d25 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnValueCollector.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnValueCollector.java @@ -45,6 +45,10 @@ class ColumnValueCollector { private SizeStatistics.Builder sizeStatisticsBuilder; private GeospatialStatistics.Builder geospatialStatisticsBuilder; + // track the required `num_nulls` field in DataPageHeaderV2 + // https://github.com/apache/parquet-format/blob/master/src/main/thrift/parquet.thrift + private int nullCount; + ColumnValueCollector(ColumnDescriptor path, BloomFilterWriter bloomFilterWriter, ParquetProperties props) { this.path = path; this.statisticsEnabled = props.getStatisticsEnabled(path); @@ -54,6 +58,7 @@ class ColumnValueCollector { } void resetPageStatistics() { + this.nullCount = 0; this.statistics = statisticsEnabled ? Statistics.createStats(path.getPrimitiveType()) : Statistics.noopStats(path.getPrimitiveType()); @@ -68,6 +73,7 @@ void resetPageStatistics() { } void writeNull(int repetitionLevel, int definitionLevel) { + ++nullCount; statistics.incrementNumNulls(); sizeStatisticsBuilder.add(repetitionLevel, definitionLevel); } @@ -198,6 +204,14 @@ void finalizeColumnChunk() { } } + /** + * Returns the number of null values written in the current page. + * This counter is to supply the required field `num_nulls` in DataPageHeaderV2. + */ + int getNullCount() { + return nullCount; + } + Statistics getStatistics() { return statistics; } diff --git a/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriterBase.java b/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriterBase.java index 8fc4aa2722..0d5a217b8e 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriterBase.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriterBase.java @@ -379,6 +379,7 @@ void writePage() { writePage( pageRowCount, valueCount, + collector.getNullCount(), collector.getStatistics(), collector.getSizeStatistics(), collector.getGeospatialStatistics(), @@ -403,6 +404,7 @@ void writePage() { abstract void writePage( int rowCount, int valueCount, + int nullCount, Statistics statistics, SizeStatistics sizeStatistics, GeospatialStatistics geospatialStatistics, diff --git a/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriterV1.java b/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriterV1.java index 882be23811..eaba1309fc 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriterV1.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriterV1.java @@ -61,6 +61,7 @@ ValuesWriter createDLWriter(ParquetProperties props, ColumnDescriptor path) { void writePage( int rowCount, int valueCount, + int nullCount, Statistics statistics, SizeStatistics sizeStatistics, GeospatialStatistics geospatialStatistics, diff --git a/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriterV2.java b/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriterV2.java index e7af6aaadf..313baf9ea8 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriterV2.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriterV2.java @@ -87,6 +87,7 @@ ValuesWriter createDLWriter(ParquetProperties props, ColumnDescriptor path) { void writePage( int rowCount, int valueCount, + int nullCount, Statistics statistics, SizeStatistics sizeStatistics, GeospatialStatistics geospatialStatistics, @@ -100,7 +101,7 @@ void writePage( Encoding encoding = values.getEncoding(); pageWriter.writePageV2( rowCount, - Math.toIntExact(statistics.getNumNulls()), + nullCount, valueCount, repetitionLevels.getBytes(), definitionLevels.getBytes(), diff --git a/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestParquetWriter.java b/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestParquetWriter.java index a7888b58d8..26eee95268 100644 --- a/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestParquetWriter.java +++ b/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestParquetWriter.java @@ -58,9 +58,14 @@ import org.apache.parquet.ParquetReadOptions; import org.apache.parquet.bytes.HeapByteBufferAllocator; import org.apache.parquet.bytes.TrackingByteBufferAllocator; +import org.apache.parquet.column.ColumnDescriptor; import org.apache.parquet.column.Encoding; import org.apache.parquet.column.ParquetProperties; import org.apache.parquet.column.ParquetProperties.WriterVersion; +import org.apache.parquet.column.page.DataPage; +import org.apache.parquet.column.page.DataPageV2; +import org.apache.parquet.column.page.PageReadStore; +import org.apache.parquet.column.page.PageReader; import org.apache.parquet.column.values.bloomfilter.BloomFilter; import org.apache.parquet.crypto.AesCipher; import org.apache.parquet.crypto.ColumnEncryptionProperties; @@ -858,4 +863,68 @@ public void testNoFlushAfterException() throws Exception { FileSystem fs = file.getFileSystem(conf); assertTrue(!fs.exists(file) || fs.getFileStatus(file).getLen() == 0); } + + @Test + public void testV2PageNullCountWithStatisticsDisabled() throws Exception { + // Regression test: when using PARQUET_2_0 with statistics disabled on a nullable column, + // DataPageHeaderV2.num_nulls must still contain the correct null count (not -1). + MessageType schema = Types.buildMessage() + .required(INT32) + .named("id") + .optional(BINARY) + .as(stringType()) + .named("value") + .named("test_schema"); + + File file = temp.newFile(); + temp.delete(); + Path path = new Path(file.getAbsolutePath()); + + int totalRecords = 10; + int expectedNulls = 4; // records where i % 3 == 0: i=0,3,6,9 + + // Write with PARQUET_2_0 and statistics disabled on the nullable "value" column + try (ParquetWriter writer = ExampleParquetWriter.builder(path) + .withType(schema) + .withWriterVersion(PARQUET_2_0) + .withStatisticsEnabled("value", false) + .withPageSize(1024 * 1024) // large page to keep all records in one page + .build()) { + SimpleGroupFactory factory = new SimpleGroupFactory(schema); + for (int i = 0; i < totalRecords; i++) { + Group group = factory.newGroup().append("id", i); + if (i % 3 != 0) { + group.append("value", "hello-" + i); + } + writer.write(group); + } + } + + // Read back the page-level metadata and verify num_nulls + try (ParquetFileReader reader = ParquetFileReader.open(HadoopInputFile.fromPath(path, new Configuration()))) { + MessageType fileSchema = reader.getFooter().getFileMetaData().getSchema(); + + // Find the "value" column descriptor + ColumnDescriptor valueColumn = fileSchema.getColumns().stream() + .filter(c -> c.getPath()[0].equals("value")) + .findFirst() + .orElseThrow(() -> new AssertionError("Column 'value' not found")); + + PageReadStore rowGroup = reader.readNextRowGroup(); + PageReader pageReader = rowGroup.getPageReader(valueColumn); + DataPage page = pageReader.readPage(); + + // Verify it's a V2 page (because we used PARQUET_2_0) + assertTrue( + "PARQUET_2_0 writer should produce DataPageV2 pages, got: " + + page.getClass().getSimpleName(), + page instanceof DataPageV2); + + DataPageV2 pageV2 = (DataPageV2) page; + assertEquals( + "DataPageV2.num_nulls should be the actual null count even when statistics are disabled", + expectedNulls, + pageV2.getNullCount()); + } + } }