apache · lurnagao-dahua · Feb 1, 2026 · Feb 2, 2026 · Mar 2, 2026 · Mar 2, 2026
diff --git a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/VectorizedArrowReader.java b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/VectorizedArrowReader.java
@@ -43,6 +43,7 @@
 import org.apache.iceberg.MetadataColumns;
 import org.apache.iceberg.arrow.ArrowAllocation;
 import org.apache.iceberg.arrow.ArrowSchemaUtil;
+import org.apache.iceberg.arrow.vectorized.parquet.ParquetReadState;
 import org.apache.iceberg.arrow.vectorized.parquet.VectorizedColumnIterator;
 import org.apache.iceberg.parquet.ParquetUtil;
 import org.apache.iceberg.parquet.VectorizedReader;
@@ -372,11 +373,13 @@ private void allocateVectorBasedOnTypeName(PrimitiveType primitive, Field arrowF
 
   @Override
   public void setRowGroupInfo(PageReadStore source, Map<ColumnPath, ColumnChunkMetaData> metadata) {
+    ParquetReadState readState = new ParquetReadState(source.getRowIndexes().orElse(null));
     ColumnChunkMetaData chunkMetaData = metadata.get(ColumnPath.get(columnDescriptor.getPath()));
     this.dictionary =
         vectorizedColumnIterator.setRowGroupInfo(
             source.getPageReader(columnDescriptor),
-            !ParquetUtil.hasNonDictionaryPages(chunkMetaData));
+            !ParquetUtil.hasNonDictionaryPages(chunkMetaData),
+            readState); // built from the row group's row indexes, or from null when absent
   }
 
   @Override
@@ -649,6 +652,9 @@ private static final class PositionVectorReader extends VectorizedArrowReader {
     private long rowStart;
     private int batchSize;
     private NullabilityHolder nulls;
+    private ParquetReadState readState;
+    private long readOrder;
+    private long curRowPosInRowGroup;
 
     PositionVectorReader(boolean setArrowValidityVector) {
       super(MetadataColumns.ROW_POSITION);
@@ -667,7 +673,11 @@ public VectorHolder read(VectorHolder reuse, int numValsToRead) {
 
       ArrowBuf dataBuffer = vec.getDataBuffer();
       for (int i = 0; i < numValsToRead; i += 1) {
-        dataBuffer.setLong((long) i * Long.BYTES, rowStart + i);
+        curRowPosInRowGroup =
+            readState.getReadOrderToRowGroupPosMap().getOrDefault(readOrder, curRowPosInRowGroup);
+        dataBuffer.setLong((long) i * Long.BYTES, rowStart + curRowPosInRowGroup);
+        readOrder++;
+        curRowPosInRowGroup++;
       }
 
       if (setArrowValidityVector) {
@@ -677,7 +687,6 @@ public VectorHolder read(VectorHolder reuse, int numValsToRead) {
         }
       }
 
-      rowStart += numValsToRead;
       vec.setValueCount(numValsToRead);
 
       return new VectorHolder.PositionVectorHolder(vec, MetadataColumns.ROW_POSITION, nulls);
@@ -700,6 +709,9 @@ public void setRowGroupInfo(
                   () ->
                       new IllegalArgumentException(
                           "PageReadStore does not contain row index offset"));
+      readState = new ParquetReadState(source.getRowIndexes().orElse(null));
+      readOrder = 0;
+      curRowPosInRowGroup = 0;
     }
 
     @Override

diff --git a/...n/java/org/apache/iceberg/arrow/vectorized/parquet/BaseVectorizedParquetValuesReader.java b/...n/java/org/apache/iceberg/arrow/vectorized/parquet/BaseVectorizedParquetValuesReader.java
@@ -217,7 +217,7 @@ public boolean readBoolean() {
 
   @Override
   public void skip() {
-    throw new UnsupportedOperationException();
+    this.readInteger(); // skips one value by reading it and discarding the result
   }
 
   @Override
@@ -240,4 +240,24 @@ public int readInteger() {
     }
     throw new RuntimeException("Unrecognized mode: " + mode);
   }
+
+  /** Skip `total` values, calling readNextGroup() whenever currentCount reaches zero. */
+  public void skipValues(int total) {
+    int left = total;
+    while (left > 0) {
+      if (this.currentCount == 0) {
+        this.readNextGroup();
+      }
+      int num = Math.min(left, this.currentCount); // never skip more than currentCount at once
+      switch (mode) { // NOTE(review): no default case for unrecognized modes
+        case RLE:
+          break; // no buffer index to advance here; only currentCount is reduced below
+        case PACKED:
+          packedValuesBufferIdx += num;
+          break;
+      }
+      currentCount -= num;
+      left -= num;
+    }
+  }
 }
diff --git a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/ParquetReadState.java b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/ParquetReadState.java
@@ -0,0 +1,204 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.iceberg.arrow.vectorized.parquet;
+
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+import java.util.PrimitiveIterator;
+import org.apache.iceberg.relocated.com.google.common.collect.Lists;
+import org.apache.iceberg.relocated.com.google.common.collect.Maps;
+
+/** Tracks page/batch read progress and the row ranges derived from optional row indexes. */
+public class ParquetReadState {
+  /** A special row range used when there are no row indexes (hence all rows must be included). */
+  private static final RowRange MAX_ROW_RANGE = new RowRange(Long.MIN_VALUE, Long.MAX_VALUE);
+
+  /**
+   * A sentinel range used when the row indexes are present AND all the row ranges have been
+   * processed; it indicates that all rows that come after the last row range should be skipped.
+   */
+  private static final RowRange END_ROW_RANGE = new RowRange(Long.MAX_VALUE, Long.MIN_VALUE);
+
+  /**
+   * The current index over all rows within the column chunk. This is used to check if the current
+   * row should be skipped by comparing against the row ranges.
+   */
+  private long currentRowIndex;
+
+  /** The offset in the current batch at which to put the next value in the value vector. */
+  private int valueOffset;
+
+  /** The remaining number of values to read in the current page. */
+  private int valuesToReadInPage;
+
+  /** Iterator over all row ranges; null when no row indexes were supplied. */
+  private final Iterator<RowRange> rowRanges;
+
+  /** The current row range. */
+  private RowRange currentRange;
+
+  /** The remaining number of rows to read in the current batch. */
+  private int rowsToReadInBatch;
+
+  /** The actual number of rows read in this batch on the current page, including skipped rows. */
+  private int rowsWithSkipsInThisBatch;
+
+  /**
+   * Maps the read order at which each row range starts to that range's first row group position.
+   * E.g. rowIndexes [0, 1, 2, 4, 5, 7, 8, 9] -> rowRanges [0-2], [4-5], [7-9] -> {0=0, 3=4, 5=7}
+   */
+  private final Map<Long, Long> readOrderToRowGroupPosMap = Maps.newHashMap();
+
+  public ParquetReadState(PrimitiveIterator.OfLong rowIndexes) {
+    this.rowRanges = constructRanges(rowIndexes);
+    nextRange(); // position on the first range, or on a sentinel range
+  }
+
+  /** Advance to the next range: MAX_ROW_RANGE if no row indexes, END_ROW_RANGE once exhausted. */
+  void nextRange() {
+    if (rowRanges == null) {
+      currentRange = MAX_ROW_RANGE;
+    } else if (!rowRanges.hasNext()) {
+      currentRange = END_ROW_RANGE;
+    } else {
+      currentRange = rowRanges.next();
+    }
+  }
+
+  public long currentRangeStart() {
+    return currentRange.getStart();
+  }
+
+  public long currentRangeEnd() {
+    return currentRange.getEnd();
+  }
+
+  /** Must be called at the beginning of a new page; sets the values left and first row index. */
+  void resetForNewPage(int totalValuesInPage, long pageFirstRowIndex) {
+    this.valuesToReadInPage = totalValuesInPage;
+    this.currentRowIndex = pageFirstRowIndex;
+  }
+
+  /** Must be called at the beginning of a new batch; resets the value offset and rows to read. */
+  void resetForNewBatch(int batchSize) {
+    this.valueOffset = 0;
+    this.rowsToReadInBatch = batchSize;
+  }
+
+  /**
+   * Construct a list of row ranges from the given `rowIndexes`, recording each range's starting
+   * read order in `readOrderToRowGroupPosMap`. For example, the `rowIndexes` `[0, 1, 2, 4, 5, 7,
+   * 8, 9]` are converted into 3 row ranges: `[0-2], [4-5], [7-9]`. Returns null for null input.
+   */
+  private Iterator<RowRange> constructRanges(PrimitiveIterator.OfLong rowIndexes) {
+    if (rowIndexes == null) {
+      return null;
+    }
+
+    List<RowRange> ranges = Lists.newArrayList();
+    long currentStart = Long.MIN_VALUE; // MIN_VALUE means no range has been started yet
+    long previous = Long.MIN_VALUE; // MIN_VALUE means no index has been seen yet
+    long readOrder = 0; // read order at which the range being built starts
+
+    while (rowIndexes.hasNext()) {
+      long idx = rowIndexes.nextLong();
+      if (currentStart == Long.MIN_VALUE) {
+        currentStart = idx;
+      } else if (previous + 1 != idx) { // gap found: close the current range, start a new one
+        RowRange range = new RowRange(currentStart, previous);
+        readOrderToRowGroupPosMap.put(readOrder, currentStart);
+        readOrder += previous - currentStart + 1;
+        ranges.add(range);
+        currentStart = idx;
+      }
+      previous = idx;
+    }
+
+    if (previous != Long.MIN_VALUE) { // at least one index was seen: emit the trailing range
+      ranges.add(new RowRange(currentStart, previous));
+      readOrderToRowGroupPosMap.put(readOrder, currentStart);
+    }
+
+    return ranges.iterator();
+  }
+
+  int getValuesToReadInPage() {
+    return valuesToReadInPage;
+  }
+
+  void setValuesToReadInPage(int valuesToReadInPage) {
+    this.valuesToReadInPage = valuesToReadInPage;
+  }
+
+  long getCurrentRowIndex() {
+    return currentRowIndex;
+  }
+
+  void setCurrentRowIndex(long currentRowIndex) {
+    this.currentRowIndex = currentRowIndex;
+  }
+
+  int getRowsToReadInBatch() {
+    return rowsToReadInBatch;
+  }
+
+  public void setRowsToReadInBatch(int rowsToReadInBatch) {
+    this.rowsToReadInBatch = rowsToReadInBatch;
+  }
+
+  public int getRowsWithSkipsInThisBatch() {
+    return rowsWithSkipsInThisBatch;
+  }
+
+  public void setRowsWithSkipsInThisBatch(int rowsWithSkipsInThisBatch) {
+    this.rowsWithSkipsInThisBatch = rowsWithSkipsInThisBatch;
+  }
+
+  public int getValueOffset() {
+    return valueOffset;
+  }
+
+  public void setValueOffset(int valueOffset) {
+    this.valueOffset = valueOffset;
+  }
+
+  public Map<Long, Long> getReadOrderToRowGroupPosMap() {
+    return readOrderToRowGroupPosMap; // NOTE(review): exposes the mutable internal map directly
+  }
+
+  /** Helper struct to represent an inclusive range of row indexes `[start, end]`. */
+  private static class RowRange {
+    private final long start;
+    private final long end;
+
+    RowRange(long start, long end) {
+      this.start = start;
+      this.end = end;
+    }
+
+    public long getStart() {
+      return start;
+    }
+
+    public long getEnd() {
+      return end;
+    }
+  }
+}