Describe the bug, including details regarding any error messages, version, and platform.
num_nulls is a required field in the Parquet v2 data page header (https://github.com/apache/parquet-format/blob/master/src/main/thrift/parquet.thrift#L729). However, the current implementation tracks this field only when statistics are enabled. When statistics are disabled, this required field is set to -1, the default value in the tracker. The written num_nulls is therefore incorrect, and readers that perform explicit sanity checks, such as Rust arrow-rs, fail on such files.
This bug can be reproduced using:
package com.keuin.testparquet;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericRecord;
import org.apache.hadoop.conf.Configuration;
import org.apache.parquet.avro.AvroParquetWriter;
import org.apache.parquet.column.page.DataPage;
import org.apache.parquet.column.page.DataPageV2;
import org.apache.parquet.column.page.PageReadStore;
import org.apache.parquet.column.page.PageReader;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.ParquetWriter;
import org.apache.parquet.hadoop.metadata.CompressionCodecName;
import org.apache.parquet.hadoop.util.HadoopInputFile;
import org.apache.parquet.io.InputFile;
import org.apache.parquet.schema.MessageType;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.io.TempDir;
import java.nio.file.Path;
import static org.apache.parquet.column.ParquetProperties.WriterVersion.PARQUET_1_0;
import static org.apache.parquet.column.ParquetProperties.WriterVersion.PARQUET_2_0;
import static org.junit.jupiter.api.Assertions.*;
/**
* Minimal reproduction of parquet-java bug:
*
* When using WriterVersion.PARQUET_2_0 (which produces DataPageV2 pages) combined with
* .withStatisticsEnabled("column", false), the DataPageHeaderV2.num_nulls field is
* written as -1 (0xFFFFFFFF).
*
* Root cause: ColumnWriterV2.writePage() calls Math.toIntExact(statistics.getNumNulls()),
* and NoopStatistics.getNumNulls() unconditionally returns -1 (meaning "not set").
* But DataPageHeaderV2.num_nulls is a required i32 field in the Thrift schema,
* so -1 gets serialized as-is.
*
* Affected versions: parquet-java 1.15.2 (and likely all versions with NoopStatistics)
* Bug location: ColumnWriterV2.java line ~107, NoopStatistics.java line ~97
*/
class ParquetV2NumNullsBugTest {
/**
* Minimal Avro schema: one non-null int field + one nullable string field.
*/
private static final String SCHEMA_JSON = "{\n"
+ " \"type\": \"record\",\n"
+ " \"name\": \"TestRecord\",\n"
+ " \"fields\": [\n"
+ " {\"name\": \"id\", \"type\": \"int\"},\n"
+ " {\"name\": \"value\", \"type\": [\"null\", \"string\"], \"default\": null}\n"
+ " ]\n"
+ "}";
/**
* BUG REPRODUCTION: PARQUET_2_0 + withStatisticsEnabled(false) → num_nulls = -1
*
* This test demonstrates the bug. With statistics disabled on a nullable column,
* the DataPageV2 header's num_nulls field is incorrectly set to -1.
*/
@Test
void bugRepro_parquet2_statisticsDisabled_numNullsIsMinusOne(@TempDir Path tempDir) throws Exception {
Schema schema = new Schema.Parser().parse(SCHEMA_JSON);
org.apache.hadoop.fs.Path parquetPath =
new org.apache.hadoop.fs.Path(tempDir.resolve("bug.parquet").toUri());
// Write: PARQUET_2_0 + statistics disabled on "value" column
try (ParquetWriter<GenericRecord> writer = AvroParquetWriter.<GenericRecord>builder(parquetPath)
.withSchema(schema)
.withCompressionCodec(CompressionCodecName.UNCOMPRESSED)
.withWriterVersion(PARQUET_2_0)
.withStatisticsEnabled("value", false) // ← triggers NoopStatistics
.withPageSize(1024 * 1024) // large page to keep all records in one page
.build()) {
for (int i = 0; i < 10; i++) {
GenericRecord record = new GenericData.Record(schema);
record.put("id", i);
// 4 nulls: i = 0, 3, 6, 9
record.put("value", i % 3 == 0 ? null : "hello-" + i);
writer.write(record);
}
}
// Read back using low-level page API
Configuration conf = new Configuration();
InputFile inputFile = HadoopInputFile.fromPath(parquetPath, conf);
try (ParquetFileReader reader = ParquetFileReader.open(inputFile)) {
MessageType fileSchema = reader.getFileMetaData().getSchema();
// "value" is the second leaf column in the schema
// For union ["null","string"], Avro maps to optional binary field
var columns = fileSchema.getColumns();
var valueColumn = columns.stream()
.filter(c -> c.getPath()[0].equals("value"))
.findFirst()
.orElseThrow(() -> new AssertionError("Column 'value' not found"));
PageReadStore rowGroup = reader.readNextRowGroup();
assertNotNull(rowGroup, "Should have at least one row group");
PageReader pageReader = rowGroup.getPageReader(valueColumn);
DataPage page = pageReader.readPage();
assertNotNull(page, "Should have at least one data page");
// Verify it's a V2 page (because we used PARQUET_2_0)
assertInstanceOf(DataPageV2.class, page,
"PARQUET_2_0 writer should produce DataPageV2 pages");
DataPageV2 pageV2 = (DataPageV2) page;
int numNulls = pageV2.getNullCount();
System.out.println("=== BUG REPRODUCTION ===");
System.out.println("WriterVersion: PARQUET_2_0");
System.out.println("Statistics enabled: false (for 'value' column)");
System.out.println("Expected num_nulls: 4 (records with i=0,3,6,9)");
System.out.println("Actual num_nulls: " + numNulls);
System.out.println("Bug present: " + (numNulls == -1 ? "YES" : "NO"));
System.out.println();
// This assertion demonstrates the bug exists:
// num_nulls should be 4, but is actually -1
assertEquals(-1, numNulls,
"BUG: NoopStatistics.getNumNulls() returns -1, which gets written to "
+ "DataPageHeaderV2.num_nulls. This value is the 'required i32 num_nulls' "
+ "field in the Thrift page header.");
}
}
/**
* CONTROL: PARQUET_2_0 + statistics ENABLED → num_nulls is correct
*
* When statistics are enabled (default), num_nulls correctly reflects the actual null count.
*/
@Test
void control_parquet2_statisticsEnabled_numNullsIsCorrect(@TempDir Path tempDir) throws Exception {
Schema schema = new Schema.Parser().parse(SCHEMA_JSON);
org.apache.hadoop.fs.Path parquetPath =
new org.apache.hadoop.fs.Path(tempDir.resolve("control.parquet").toUri());
// Write: PARQUET_2_0 + statistics enabled (default)
try (ParquetWriter<GenericRecord> writer = AvroParquetWriter.<GenericRecord>builder(parquetPath)
.withSchema(schema)
.withCompressionCodec(CompressionCodecName.UNCOMPRESSED)
.withWriterVersion(PARQUET_2_0)
// No withStatisticsEnabled(false) → default statistics are collected
.withPageSize(1024 * 1024)
.build()) {
for (int i = 0; i < 10; i++) {
GenericRecord record = new GenericData.Record(schema);
record.put("id", i);
record.put("value", i % 3 == 0 ? null : "hello-" + i);
writer.write(record);
}
}
// Read back
Configuration conf = new Configuration();
InputFile inputFile = HadoopInputFile.fromPath(parquetPath, conf);
try (ParquetFileReader reader = ParquetFileReader.open(inputFile)) {
MessageType fileSchema = reader.getFileMetaData().getSchema();
var valueColumn = fileSchema.getColumns().stream()
.filter(c -> c.getPath()[0].equals("value"))
.findFirst()
.orElseThrow();
PageReadStore rowGroup = reader.readNextRowGroup();
PageReader pageReader = rowGroup.getPageReader(valueColumn);
DataPage page = pageReader.readPage();
assertInstanceOf(DataPageV2.class, page);
DataPageV2 pageV2 = (DataPageV2) page;
int numNulls = pageV2.getNullCount();
System.out.println("=== CONTROL (statistics enabled) ===");
System.out.println("WriterVersion: PARQUET_2_0");
System.out.println("Statistics enabled: true (default)");
System.out.println("Expected num_nulls: 4");
System.out.println("Actual num_nulls: " + numNulls);
System.out.println();
assertEquals(4, numNulls,
"With statistics enabled, num_nulls should correctly be 4");
}
}
/**
* CONTROL: PARQUET_1_0 + statistics disabled → V1 page, no num_nulls in header
*
* DataPageV1 does not have a required num_nulls field, so this combination
* does not trigger the bug. Statistics are simply omitted from the optional field.
*/
@Test
void control_parquet1_statisticsDisabled_noIssue(@TempDir Path tempDir) throws Exception {
Schema schema = new Schema.Parser().parse(SCHEMA_JSON);
org.apache.hadoop.fs.Path parquetPath =
new org.apache.hadoop.fs.Path(tempDir.resolve("v1.parquet").toUri());
// Write: PARQUET_1_0 + statistics disabled
try (ParquetWriter<GenericRecord> writer = AvroParquetWriter.<GenericRecord>builder(parquetPath)
.withSchema(schema)
.withCompressionCodec(CompressionCodecName.UNCOMPRESSED)
.withWriterVersion(PARQUET_1_0)
.withStatisticsEnabled("value", false)
.withPageSize(1024 * 1024)
.build()) {
for (int i = 0; i < 10; i++) {
GenericRecord record = new GenericData.Record(schema);
record.put("id", i);
record.put("value", i % 3 == 0 ? null : "hello-" + i);
writer.write(record);
}
}
// Read back — V1 pages don't have a separate num_nulls field in the header
Configuration conf = new Configuration();
InputFile inputFile = HadoopInputFile.fromPath(parquetPath, conf);
try (ParquetFileReader reader = ParquetFileReader.open(inputFile)) {
MessageType fileSchema = reader.getFileMetaData().getSchema();
var valueColumn = fileSchema.getColumns().stream()
.filter(c -> c.getPath()[0].equals("value"))
.findFirst()
.orElseThrow();
PageReadStore rowGroup = reader.readNextRowGroup();
PageReader pageReader = rowGroup.getPageReader(valueColumn);
DataPage page = pageReader.readPage();
// V1 writer produces DataPageV1, not DataPageV2
assertFalse(page instanceof DataPageV2,
"PARQUET_1_0 should NOT produce DataPageV2 pages");
System.out.println("=== CONTROL (PARQUET_1_0) ===");
System.out.println("WriterVersion: PARQUET_1_0");
System.out.println("Statistics enabled: false");
System.out.println("Page type: " + page.getClass().getSimpleName());
System.out.println("Result: No issue — V1 pages don't have required num_nulls");
System.out.println();
}
}
}
Component(s)
No response
Describe the bug, including details regarding any error messages, version, and platform.
num_nulls is a required field in the Parquet v2 data page header (https://github.com/apache/parquet-format/blob/master/src/main/thrift/parquet.thrift#L729). However, the current implementation tracks this field only when statistics are enabled. When statistics are disabled, this required field is set to -1, the default value in the tracker. The written num_nulls is therefore incorrect, and readers that perform explicit sanity checks, such as Rust arrow-rs, fail on such files.
This bug can be reproduced using:
Component(s)
No response