Skip to content

DataPageHeaderV2.num_nulls=-1 when column statistics are disabled #3585

@keuin

Description

@keuin

Describe the bug, including details regarding any error messages, version, and platform.

num_nulls is a required field in the Parquet v2 data page header (https://github.com/apache/parquet-format/blob/master/src/main/thrift/parquet.thrift#L729). However, the current implementation tracks this field only when statistics are enabled. When statistics are disabled, this required field is set to -1, the tracker's default value. As a result, an incorrect num_nulls is written, and readers that perform explicit sanity checks on it, such as Rust arrow-rs, fail to read the file.

This bug can be reproduced using:

package com.keuin.testparquet;

import org.apache.avro.Schema;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericRecord;
import org.apache.hadoop.conf.Configuration;
import org.apache.parquet.avro.AvroParquetWriter;
import org.apache.parquet.column.page.DataPage;
import org.apache.parquet.column.page.DataPageV2;
import org.apache.parquet.column.page.PageReadStore;
import org.apache.parquet.column.page.PageReader;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.ParquetWriter;
import org.apache.parquet.hadoop.metadata.CompressionCodecName;
import org.apache.parquet.hadoop.util.HadoopInputFile;
import org.apache.parquet.io.InputFile;
import org.apache.parquet.schema.MessageType;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.io.TempDir;

import java.nio.file.Path;

import static org.apache.parquet.column.ParquetProperties.WriterVersion.PARQUET_1_0;
import static org.apache.parquet.column.ParquetProperties.WriterVersion.PARQUET_2_0;
import static org.junit.jupiter.api.Assertions.*;

/**
 * Minimal reproduction of parquet-java bug:
 *
 * When using WriterVersion.PARQUET_2_0 (which produces DataPageV2 pages) combined with
 * .withStatisticsEnabled("column", false), the DataPageHeaderV2.num_nulls field is
 * written as -1 (0xFFFFFFFF).
 *
 * Root cause: ColumnWriterV2.writePage() calls Math.toIntExact(statistics.getNumNulls()),
 * and NoopStatistics.getNumNulls() unconditionally returns -1 (meaning "not set").
 * But DataPageHeaderV2.num_nulls is a required i32 field in the Thrift schema,
 * so -1 gets serialized as-is.
 *
 * Affected versions: parquet-java 1.15.2 (and likely all versions with NoopStatistics)
 * Bug location: ColumnWriterV2.java line ~107, NoopStatistics.java line ~97
 */
class ParquetV2NumNullsBugTest {

    /**
     * Minimal Avro schema: one non-null int field + one nullable string field.
     */
    private static final String SCHEMA_JSON = "{\n"
            + "  \"type\": \"record\",\n"
            + "  \"name\": \"TestRecord\",\n"
            + "  \"fields\": [\n"
            + "    {\"name\": \"id\", \"type\": \"int\"},\n"
            + "    {\"name\": \"value\", \"type\": [\"null\", \"string\"], \"default\": null}\n"
            + "  ]\n"
            + "}";

    /**
     * BUG REPRODUCTION: PARQUET_2_0 + withStatisticsEnabled(false) → num_nulls = -1
     *
     * This test demonstrates the bug. With statistics disabled on a nullable column,
     * the DataPageV2 header's num_nulls field is incorrectly set to -1.
     */
    @Test
    void bugRepro_parquet2_statisticsDisabled_numNullsIsMinusOne(@TempDir Path tempDir) throws Exception {
        Schema schema = new Schema.Parser().parse(SCHEMA_JSON);
        org.apache.hadoop.fs.Path parquetPath =
                new org.apache.hadoop.fs.Path(tempDir.resolve("bug.parquet").toUri());

        // Write: PARQUET_2_0 + statistics disabled on "value" column
        try (ParquetWriter<GenericRecord> writer = AvroParquetWriter.<GenericRecord>builder(parquetPath)
                .withSchema(schema)
                .withCompressionCodec(CompressionCodecName.UNCOMPRESSED)
                .withWriterVersion(PARQUET_2_0)
                .withStatisticsEnabled("value", false)  // ← triggers NoopStatistics
                .withPageSize(1024 * 1024) // large page to keep all records in one page
                .build()) {

            for (int i = 0; i < 10; i++) {
                GenericRecord record = new GenericData.Record(schema);
                record.put("id", i);
                // 4 nulls: i = 0, 3, 6, 9
                record.put("value", i % 3 == 0 ? null : "hello-" + i);
                writer.write(record);
            }
        }

        // Read back using low-level page API
        Configuration conf = new Configuration();
        InputFile inputFile = HadoopInputFile.fromPath(parquetPath, conf);
        try (ParquetFileReader reader = ParquetFileReader.open(inputFile)) {
            MessageType fileSchema = reader.getFileMetaData().getSchema();

            // "value" is the second leaf column in the schema
            // For union ["null","string"], Avro maps to optional binary field
            var columns = fileSchema.getColumns();
            var valueColumn = columns.stream()
                    .filter(c -> c.getPath()[0].equals("value"))
                    .findFirst()
                    .orElseThrow(() -> new AssertionError("Column 'value' not found"));

            PageReadStore rowGroup = reader.readNextRowGroup();
            assertNotNull(rowGroup, "Should have at least one row group");

            PageReader pageReader = rowGroup.getPageReader(valueColumn);
            DataPage page = pageReader.readPage();
            assertNotNull(page, "Should have at least one data page");

            // Verify it's a V2 page (because we used PARQUET_2_0)
            assertInstanceOf(DataPageV2.class, page,
                    "PARQUET_2_0 writer should produce DataPageV2 pages");

            DataPageV2 pageV2 = (DataPageV2) page;
            int numNulls = pageV2.getNullCount();

            System.out.println("=== BUG REPRODUCTION ===");
            System.out.println("WriterVersion:        PARQUET_2_0");
            System.out.println("Statistics enabled:   false (for 'value' column)");
            System.out.println("Expected num_nulls:   4 (records with i=0,3,6,9)");
            System.out.println("Actual num_nulls:     " + numNulls);
            System.out.println("Bug present:          " + (numNulls == -1 ? "YES" : "NO"));
            System.out.println();

            // This assertion demonstrates the bug exists:
            // num_nulls should be 4, but is actually -1
            assertEquals(-1, numNulls,
                    "BUG: NoopStatistics.getNumNulls() returns -1, which gets written to "
                    + "DataPageHeaderV2.num_nulls. This value is the 'required i32 num_nulls' "
                    + "field in the Thrift page header.");
        }
    }

    /**
     * CONTROL: PARQUET_2_0 + statistics ENABLED → num_nulls is correct
     *
     * When statistics are enabled (default), num_nulls correctly reflects the actual null count.
     */
    @Test
    void control_parquet2_statisticsEnabled_numNullsIsCorrect(@TempDir Path tempDir) throws Exception {
        Schema schema = new Schema.Parser().parse(SCHEMA_JSON);
        org.apache.hadoop.fs.Path parquetPath =
                new org.apache.hadoop.fs.Path(tempDir.resolve("control.parquet").toUri());

        // Write: PARQUET_2_0 + statistics enabled (default)
        try (ParquetWriter<GenericRecord> writer = AvroParquetWriter.<GenericRecord>builder(parquetPath)
                .withSchema(schema)
                .withCompressionCodec(CompressionCodecName.UNCOMPRESSED)
                .withWriterVersion(PARQUET_2_0)
                // No withStatisticsEnabled(false) → default statistics are collected
                .withPageSize(1024 * 1024)
                .build()) {

            for (int i = 0; i < 10; i++) {
                GenericRecord record = new GenericData.Record(schema);
                record.put("id", i);
                record.put("value", i % 3 == 0 ? null : "hello-" + i);
                writer.write(record);
            }
        }

        // Read back
        Configuration conf = new Configuration();
        InputFile inputFile = HadoopInputFile.fromPath(parquetPath, conf);
        try (ParquetFileReader reader = ParquetFileReader.open(inputFile)) {
            MessageType fileSchema = reader.getFileMetaData().getSchema();
            var valueColumn = fileSchema.getColumns().stream()
                    .filter(c -> c.getPath()[0].equals("value"))
                    .findFirst()
                    .orElseThrow();

            PageReadStore rowGroup = reader.readNextRowGroup();
            PageReader pageReader = rowGroup.getPageReader(valueColumn);
            DataPage page = pageReader.readPage();

            assertInstanceOf(DataPageV2.class, page);
            DataPageV2 pageV2 = (DataPageV2) page;
            int numNulls = pageV2.getNullCount();

            System.out.println("=== CONTROL (statistics enabled) ===");
            System.out.println("WriterVersion:        PARQUET_2_0");
            System.out.println("Statistics enabled:   true (default)");
            System.out.println("Expected num_nulls:   4");
            System.out.println("Actual num_nulls:     " + numNulls);
            System.out.println();

            assertEquals(4, numNulls,
                    "With statistics enabled, num_nulls should correctly be 4");
        }
    }

    /**
     * CONTROL: PARQUET_1_0 + statistics disabled → V1 page, no num_nulls in header
     *
     * DataPageV1 does not have a required num_nulls field, so this combination
     * does not trigger the bug. Statistics are simply omitted from the optional field.
     */
    @Test
    void control_parquet1_statisticsDisabled_noIssue(@TempDir Path tempDir) throws Exception {
        Schema schema = new Schema.Parser().parse(SCHEMA_JSON);
        org.apache.hadoop.fs.Path parquetPath =
                new org.apache.hadoop.fs.Path(tempDir.resolve("v1.parquet").toUri());

        // Write: PARQUET_1_0 + statistics disabled
        try (ParquetWriter<GenericRecord> writer = AvroParquetWriter.<GenericRecord>builder(parquetPath)
                .withSchema(schema)
                .withCompressionCodec(CompressionCodecName.UNCOMPRESSED)
                .withWriterVersion(PARQUET_1_0)
                .withStatisticsEnabled("value", false)
                .withPageSize(1024 * 1024)
                .build()) {

            for (int i = 0; i < 10; i++) {
                GenericRecord record = new GenericData.Record(schema);
                record.put("id", i);
                record.put("value", i % 3 == 0 ? null : "hello-" + i);
                writer.write(record);
            }
        }

        // Read back — V1 pages don't have a separate num_nulls field in the header
        Configuration conf = new Configuration();
        InputFile inputFile = HadoopInputFile.fromPath(parquetPath, conf);
        try (ParquetFileReader reader = ParquetFileReader.open(inputFile)) {
            MessageType fileSchema = reader.getFileMetaData().getSchema();
            var valueColumn = fileSchema.getColumns().stream()
                    .filter(c -> c.getPath()[0].equals("value"))
                    .findFirst()
                    .orElseThrow();

            PageReadStore rowGroup = reader.readNextRowGroup();
            PageReader pageReader = rowGroup.getPageReader(valueColumn);
            DataPage page = pageReader.readPage();

            // V1 writer produces DataPageV1, not DataPageV2
            assertFalse(page instanceof DataPageV2,
                    "PARQUET_1_0 should NOT produce DataPageV2 pages");

            System.out.println("=== CONTROL (PARQUET_1_0) ===");
            System.out.println("WriterVersion:        PARQUET_1_0");
            System.out.println("Statistics enabled:   false");
            System.out.println("Page type:            " + page.getClass().getSimpleName());
            System.out.println("Result:               No issue — V1 pages don't have required num_nulls");
            System.out.println();
        }
    }
}

Component(s)

No response

Metadata

Metadata

Assignees

No one assigned

    Labels

    Type

    No type
    No fields configured for issues without a type.

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions