From e3f72438b503419a1a02c76a98f1d179babaf712 Mon Sep 17 00:00:00 2001
From: samarthjain <samarth@apache.org>
Date: Wed, 4 Mar 2020 15:33:22 -0800
Subject: [PATCH 01/12] iceberg-spark changes for vectorized reads

---
 .../arrow/vectorized/VectorHolder.java        |   2 +-
 build.gradle                                  |   1 +
 .../spark/source/IcebergSourceBenchmark.java  |  20 +-
 .../VectorizedDictionaryEncodedBenchmark.java |  28 +
 ...rizedDictionaryEncodedFloatsBenchmark.java |  62 ++
 ...zedDictionaryEncodedIntegersBenchmark.java |  62 ++
 ...orizedDictionaryEncodedLongsBenchmark.java |  62 ++
 ...izedDictionaryEncodedStringsBenchmark.java | 101 +++
 ...llbackToPlainEncodingStringsBenchmark.java |  93 +++
 .../VectorizedIcebergSourceBenchmark.java     | 223 ++++++
 .../VectorizedReadFloatsBenchmark.java        |  60 ++
 ...dReadFloatsTwentyPercentNullBenchmark.java |  42 +
 ...torizedReadIntBackedDecimalsBenchmark.java |  59 ++
 .../VectorizedReadIntegersBenchmark.java      |  59 ++
 ...eadIntegersTwentyPercentNullBenchmark.java |  42 +
 .../VectorizedReadLongsBenchmark.java         |  57 ++
 ...edReadLongsTwentyPercentNullBenchmark.java |  42 +
 .../VectorizedReadPrimitivesBenchmark.java    |  79 ++
 .../VectorizedReadStringsBenchmark.java       |  60 ++
 ...ReadStringsTwentyPercentNullBenchmark.java |  42 +
 .../iceberg/spark/arrow/ArrowUtils.java       | 113 +++
 .../data/vectorized/ColumnarBatchReaders.java |  96 +++
 .../vectorized/IcebergArrowColumnVector.java  | 753 ++++++++++++++++++
 .../vectorized/NullValuesColumnVector.java    | 130 +++
 .../VectorizedSparkParquetReaders.java        | 146 ++++
 .../spark/source/BaseTaskDataReader.java      | 115 +++
 .../source/ColumnarBatchTaskDataReader.java   |  96 +++
 .../source/InternalRowTaskDataReader.java     | 296 +++++++
 .../apache/iceberg/spark/source/Reader.java   | 160 +++-
 .../iceberg/spark/data/DictionaryData.java    | 297 +++++++
 .../apache/iceberg/spark/data/RandomData.java | 193 +++++
 .../iceberg/spark/data/TestHelpers.java       |  45 ++
 ...quetDictionaryEncodedVectorizedReader.java |  58 ++
 ...DictionaryEncodingForVectorizedReader.java |  59 ++
 .../TestSparkParquetVectorizedReader.java     | 134 ++++
 .../spark/source/TestReadProjection.java      |  10 +
 .../spark/source/TestSparkDataWrite.java      |   3 +
 .../spark/source/TestStructuredStreaming.java |   3 +
 38 files changed, 3873 insertions(+), 30 deletions(-)
 create mode 100644 spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedDictionaryEncodedBenchmark.java
 create mode 100644 spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedDictionaryEncodedFloatsBenchmark.java
 create mode 100644 spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedDictionaryEncodedIntegersBenchmark.java
 create mode 100644 spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedDictionaryEncodedLongsBenchmark.java
 create mode 100644 spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedDictionaryEncodedStringsBenchmark.java
 create mode 100644 spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedFallbackToPlainEncodingStringsBenchmark.java
 create mode 100644 spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedIcebergSourceBenchmark.java
 create mode 100644 spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedReadFloatsBenchmark.java
 create mode 100644 spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedReadFloatsTwentyPercentNullBenchmark.java
 create mode 100644 spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedReadIntBackedDecimalsBenchmark.java
 create mode 100644 spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedReadIntegersBenchmark.java
 create mode 100644 spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedReadIntegersTwentyPercentNullBenchmark.java
 create mode 100644 spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedReadLongsBenchmark.java
 create mode 100644 spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedReadLongsTwentyPercentNullBenchmark.java
 create mode 100644 spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedReadPrimitivesBenchmark.java
 create mode 100644 spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedReadStringsBenchmark.java
 create mode 100644 spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedReadStringsTwentyPercentNullBenchmark.java
 create mode 100644 spark/src/main/java/org/apache/iceberg/spark/arrow/ArrowUtils.java
 create mode 100644 spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ColumnarBatchReaders.java
 create mode 100644 spark/src/main/java/org/apache/iceberg/spark/data/vectorized/IcebergArrowColumnVector.java
 create mode 100644 spark/src/main/java/org/apache/iceberg/spark/data/vectorized/NullValuesColumnVector.java
 create mode 100644 spark/src/main/java/org/apache/iceberg/spark/data/vectorized/VectorizedSparkParquetReaders.java
 create mode 100644 spark/src/main/java/org/apache/iceberg/spark/source/BaseTaskDataReader.java
 create mode 100644 spark/src/main/java/org/apache/iceberg/spark/source/ColumnarBatchTaskDataReader.java
 create mode 100644 spark/src/main/java/org/apache/iceberg/spark/source/InternalRowTaskDataReader.java
 create mode 100644 spark/src/test/java/org/apache/iceberg/spark/data/DictionaryData.java
 create mode 100644 spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetDictionaryEncodedVectorizedReader.java
 create mode 100644 spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetFallbackToDictionaryEncodingForVectorizedReader.java
 create mode 100644 spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetVectorizedReader.java

diff --git a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/VectorHolder.java b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/VectorHolder.java
index d59292f14101..337111097e47 100644
--- a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/VectorHolder.java
+++ b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/VectorHolder.java
@@ -24,7 +24,7 @@
 import org.apache.parquet.column.Dictionary;
 
 /**
- * Container class for holding the Arrow vector holding a batch of values along with other state needed for reading
+ * Container class for holding the Arrow vector storing a batch of values along with other state needed for reading
  * values out of it.
  */
 public class VectorHolder {
diff --git a/build.gradle b/build.gradle
index c5bed3291b45..ae33b5f130f1 100644
--- a/build.gradle
+++ b/build.gradle
@@ -410,6 +410,7 @@ project(':iceberg-spark') {
     compile project(':iceberg-parquet')
     compile project(':iceberg-arrow')
     compile project(':iceberg-hive')
+    compile project(':iceberg-arrow')
 
     compileOnly "org.apache.avro:avro"
     compileOnly("org.apache.spark:spark-hive_2.11") {
diff --git a/spark/src/jmh/java/org/apache/iceberg/spark/source/IcebergSourceBenchmark.java b/spark/src/jmh/java/org/apache/iceberg/spark/source/IcebergSourceBenchmark.java
index 57863e0a0169..91568db0517c 100644
--- a/spark/src/jmh/java/org/apache/iceberg/spark/source/IcebergSourceBenchmark.java
+++ b/spark/src/jmh/java/org/apache/iceberg/spark/source/IcebergSourceBenchmark.java
@@ -27,6 +27,7 @@
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.iceberg.Table;
+import org.apache.iceberg.TableProperties;
 import org.apache.iceberg.UpdateProperties;
 import org.apache.iceberg.spark.SparkSchemaUtil;
 import org.apache.spark.sql.Dataset;
@@ -92,15 +93,24 @@ protected void cleanupFiles() throws IOException {
     }
   }
 
-  protected void setupSpark() {
-    spark = SparkSession.builder()
-        .config("spark.ui.enabled", false)
-        .master("local")
-        .getOrCreate();
+  // Builds the local session; when dictionary encoding is off, Parquet dictionary settings are overridden.
+  protected void setupSpark(boolean enableDictionaryEncoding) {
+    SparkSession.Builder sessionBuilder = SparkSession.builder().master("local").config("spark.ui.enabled", false);
+    if (!enableDictionaryEncoding) {
+      // explicit flag plus dictionary sizes of 1, presumably to keep written Parquet free of dictionaries
+      sessionBuilder.config("parquet.enable.dictionary", false);
+      sessionBuilder.config("parquet.dictionary.page.size", "1");
+      sessionBuilder.config(TableProperties.PARQUET_DICT_SIZE_BYTES, "1");
+    }
+    spark = sessionBuilder.getOrCreate();
     Configuration sparkHadoopConf = spark.sessionState().newHadoopConf();
     hadoopConf.forEach(entry -> sparkHadoopConf.set(entry.getKey(), entry.getValue()));
   }
 
+  protected void setupSpark() {
+    setupSpark(false);
+  }
+
   protected void tearDownSpark() {
     spark.stop();
   }
diff --git a/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedDictionaryEncodedBenchmark.java b/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedDictionaryEncodedBenchmark.java
new file mode 100644
index 000000000000..3c7b72ac29ed
--- /dev/null
+++ b/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedDictionaryEncodedBenchmark.java
@@ -0,0 +1,28 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.iceberg.spark.source.parquet.vectorized;
+
+/** Base for vectorized read benchmarks that call setupSpark(true), i.e. with dictionary encoding enabled. */
+public abstract class VectorizedDictionaryEncodedBenchmark extends VectorizedIcebergSourceBenchmark {
+  @Override
+  protected void setupSpark() {
+    setupSpark(true);
+  }
+}
diff --git a/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedDictionaryEncodedFloatsBenchmark.java b/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedDictionaryEncodedFloatsBenchmark.java
new file mode 100644
index 000000000000..22555ef0bac4
--- /dev/null
+++ b/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedDictionaryEncodedFloatsBenchmark.java
@@ -0,0 +1,62 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.iceberg.spark.source.parquet.vectorized;
+
+import com.google.common.collect.Maps;
+import java.util.Map;
+import org.apache.iceberg.PartitionSpec;
+import org.apache.iceberg.Schema;
+import org.apache.iceberg.Table;
+import org.apache.iceberg.TableProperties;
+import org.apache.iceberg.hadoop.HadoopTables;
+import org.apache.iceberg.types.Types;
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.Row;
+
+import static org.apache.iceberg.types.Types.NestedField.optional;
+import static org.apache.spark.sql.functions.col;
+import static org.apache.spark.sql.functions.lit;
+import static org.apache.spark.sql.functions.pmod;
+import static org.apache.spark.sql.functions.when;
+
+/** Benchmark over a table whose floatCol holds only 0.0 and 1.0, chosen by the parity of longCol. */
+public class VectorizedDictionaryEncodedFloatsBenchmark extends VectorizedDictionaryEncodedBenchmark {
+  @Override
+  protected final Table initTable() {
+    Map<String, String> tableProps = Maps.newHashMap();
+    tableProps.put(TableProperties.METADATA_COMPRESSION, "gzip");
+    Schema tableSchema = new Schema(
+        optional(1, "longCol", Types.LongType.get()), optional(2, "floatCol", Types.FloatType.get()));
+    return new HadoopTables(hadoopConf())
+        .create(tableSchema, PartitionSpec.unpartitioned(), tableProps, newTableLocation());
+  }
+
+  @Override
+  protected void appendData() {
+    // one append per file; floatCol is 0.0 where longCol is even and 1.0 otherwise
+    int filesWritten = 0;
+    while (filesWritten < NUM_FILES) {
+      Dataset<Row> evenOddFloats = spark().range(NUM_ROWS).withColumnRenamed("id", "longCol").drop("id")
+          .withColumn("floatCol", when(pmod(col("longCol"), lit(2)).equalTo(lit(0)), lit(0.0f)).otherwise(lit(1.0f)));
+      appendAsFile(evenOddFloats);
+      filesWritten++;
+    }
+  }
+}
diff --git a/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedDictionaryEncodedIntegersBenchmark.java b/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedDictionaryEncodedIntegersBenchmark.java
new file mode 100644
index 000000000000..6095b472bcb0
--- /dev/null
+++ b/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedDictionaryEncodedIntegersBenchmark.java
@@ -0,0 +1,62 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.iceberg.spark.source.parquet.vectorized;
+
+import com.google.common.collect.Maps;
+import java.util.Map;
+import org.apache.iceberg.PartitionSpec;
+import org.apache.iceberg.Schema;
+import org.apache.iceberg.Table;
+import org.apache.iceberg.TableProperties;
+import org.apache.iceberg.hadoop.HadoopTables;
+import org.apache.iceberg.types.Types;
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.Row;
+
+import static org.apache.iceberg.types.Types.NestedField.optional;
+import static org.apache.spark.sql.functions.col;
+import static org.apache.spark.sql.functions.lit;
+import static org.apache.spark.sql.functions.pmod;
+import static org.apache.spark.sql.functions.when;
+
+/** Benchmark over a table whose intCol holds only 0 and 1, chosen by the parity of longCol. */
+public class VectorizedDictionaryEncodedIntegersBenchmark extends VectorizedDictionaryEncodedBenchmark {
+  @Override
+  protected final Table initTable() {
+    Map<String, String> tableProps = Maps.newHashMap();
+    tableProps.put(TableProperties.METADATA_COMPRESSION, "gzip");
+    Schema tableSchema = new Schema(
+        optional(1, "longCol", Types.LongType.get()), optional(2, "intCol", Types.IntegerType.get()));
+    return new HadoopTables(hadoopConf())
+        .create(tableSchema, PartitionSpec.unpartitioned(), tableProps, newTableLocation());
+  }
+
+  @Override
+  protected void appendData() {
+    // one append per file; intCol is 0 where longCol is even and 1 otherwise
+    int filesWritten = 0;
+    while (filesWritten < NUM_FILES) {
+      Dataset<Row> evenOddInts = spark().range(NUM_ROWS).withColumnRenamed("id", "longCol").drop("id")
+          .withColumn("intCol", when(pmod(col("longCol"), lit(2)).equalTo(lit(0)), lit(0)).otherwise(lit(1)));
+      appendAsFile(evenOddInts);
+      filesWritten++;
+    }
+  }
+}
diff --git a/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedDictionaryEncodedLongsBenchmark.java b/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedDictionaryEncodedLongsBenchmark.java
new file mode 100644
index 000000000000..20b0d5f7f952
--- /dev/null
+++ b/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedDictionaryEncodedLongsBenchmark.java
@@ -0,0 +1,62 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.iceberg.spark.source.parquet.vectorized;
+
+import com.google.common.collect.Maps;
+import java.util.Map;
+import org.apache.iceberg.PartitionSpec;
+import org.apache.iceberg.Schema;
+import org.apache.iceberg.Table;
+import org.apache.iceberg.TableProperties;
+import org.apache.iceberg.hadoop.HadoopTables;
+import org.apache.iceberg.types.Types;
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.Row;
+
+import static org.apache.iceberg.types.Types.NestedField.optional;
+import static org.apache.spark.sql.functions.col;
+import static org.apache.spark.sql.functions.lit;
+import static org.apache.spark.sql.functions.pmod;
+import static org.apache.spark.sql.functions.when;
+
+/** Benchmark over a table whose longCol2 holds only 0 and 1, chosen by the parity of longCol. */
+public class VectorizedDictionaryEncodedLongsBenchmark extends VectorizedDictionaryEncodedBenchmark {
+  @Override
+  protected final Table initTable() {
+    Schema schema = new Schema(
+        optional(1, "longCol", Types.LongType.get()), optional(2, "longCol2", Types.LongType.get()));
+    Map<String, String> properties = Maps.newHashMap();
+    properties.put(TableProperties.METADATA_COMPRESSION, "gzip");
+    return new HadoopTables(hadoopConf())
+        .create(schema, PartitionSpec.unpartitioned(), properties, newTableLocation());
+  }
+
+  @Override
+  protected void appendData() {
+    for (int fileNum = 1; fileNum <= NUM_FILES; fileNum++) {
+      // both branches are long literals, matching the declared LongType of longCol2
+      Dataset<Row> df = spark().range(NUM_ROWS)
+          .withColumnRenamed("id", "longCol")
+          .drop("id")
+          .withColumn("longCol2", when(pmod(col("longCol"), lit(2)).equalTo(lit(0L)), lit(0L)).otherwise(lit(1L)));
+      appendAsFile(df);
+    }
+  }
+}
diff --git a/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedDictionaryEncodedStringsBenchmark.java b/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedDictionaryEncodedStringsBenchmark.java
new file mode 100644
index 000000000000..30cc01ec4d3d
--- /dev/null
+++ b/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedDictionaryEncodedStringsBenchmark.java
@@ -0,0 +1,101 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.iceberg.spark.source.parquet.vectorized;
+
+import com.google.common.collect.Maps;
+import java.util.Map;
+import org.apache.iceberg.PartitionSpec;
+import org.apache.iceberg.Schema;
+import org.apache.iceberg.Table;
+import org.apache.iceberg.TableProperties;
+import org.apache.iceberg.hadoop.HadoopTables;
+import org.apache.iceberg.types.Types;
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.Row;
+
+import static org.apache.iceberg.types.Types.NestedField.optional;
+import static org.apache.spark.sql.functions.col;
+import static org.apache.spark.sql.functions.lit;
+import static org.apache.spark.sql.functions.pmod;
+import static org.apache.spark.sql.functions.when;
+
+public class VectorizedDictionaryEncodedStringsBenchmark extends VectorizedDictionaryEncodedBenchmark {
+  @Override
+  protected final Table initTable() {
+    Schema schema = new Schema(
+        optional(1, "longCol", Types.LongType.get()), optional(2, "stringCol", Types.StringType.get()));
+    PartitionSpec partitionSpec = PartitionSpec.unpartitioned();
+    HadoopTables tables = new HadoopTables(hadoopConf());
+    Map<String, String> properties = Maps.newHashMap();
+    properties.put(TableProperties.METADATA_COMPRESSION, "gzip");
+    return tables.create(schema, partitionSpec, properties, newTableLocation());
+  }
+
+  @Override
+  protected void appendData() {
+    for (int fileNum = 1; fileNum <= NUM_FILES; fileNum++) {
+      Dataset<Row> df = spark().range(NUM_ROWS)
+          .withColumn(
+              "longCol",
+              when(pmod(col("id"), lit(9))
+                  .equalTo(lit(0)), lit(0L))
+                  // NOTE(review): longCol is 0..8 but stringCol below maps 1..9; 0 matches no branch -- confirm
+                  .when(pmod(col("id"), lit(9))
+                      .equalTo(lit(1)), lit(1L))
+                  .when(pmod(col("id"), lit(9))
+                      .equalTo(lit(2)), lit(2L))
+                  .when(pmod(col("id"), lit(9))
+                      .equalTo(lit(3)), lit(3L))
+                  .when(pmod(col("id"), lit(9))
+                      .equalTo(lit(4)), lit(4L))
+                  .when(pmod(col("id"), lit(9))
+                      .equalTo(lit(5)), lit(5L))
+                  .when(pmod(col("id"), lit(9))
+                      .equalTo(lit(6)), lit(6L))
+                  .when(pmod(col("id"), lit(9))
+                      .equalTo(lit(7)), lit(7L))
+                  .when(pmod(col("id"), lit(9))
+                      .equalTo(lit(8)), lit(8L))
+                  .otherwise(lit(2L)))
+          .drop("id")
+          .withColumn(
+              "stringCol",
+              when(col("longCol")
+                  .equalTo(lit(1L)), lit("1"))
+                  .when(col("longCol")
+                      .equalTo(lit(2L)), lit("2"))
+                  .when(col("longCol")
+                      .equalTo(lit(3L)), lit("3"))
+                  .when(col("longCol")
+                      .equalTo(lit(4L)), lit("4"))
+                  .when(col("longCol")
+                      .equalTo(lit(5L)), lit("5"))
+                  .when(col("longCol")
+                      .equalTo(lit(6L)), lit("6"))
+                  .when(col("longCol")
+                      .equalTo(lit(7L)), lit("7"))
+                  .when(col("longCol")
+                      .equalTo(lit(8L)), lit("8"))
+                  .when(col("longCol")
+                      .equalTo(lit(9L)), lit("9")));
+      appendAsFile(df);
+    }
+  }
+}
diff --git a/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedFallbackToPlainEncodingStringsBenchmark.java b/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedFallbackToPlainEncodingStringsBenchmark.java
new file mode 100644
index 000000000000..6660df8823e0
--- /dev/null
+++ b/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedFallbackToPlainEncodingStringsBenchmark.java
@@ -0,0 +1,93 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.iceberg.spark.source.parquet.vectorized;
+
+import com.google.common.collect.Maps;
+import java.util.Map;
+import java.util.UUID;
+import org.apache.iceberg.PartitionSpec;
+import org.apache.iceberg.Schema;
+import org.apache.iceberg.Table;
+import org.apache.iceberg.TableProperties;
+import org.apache.iceberg.hadoop.HadoopTables;
+import org.apache.iceberg.types.Types;
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.Row;
+
+import static org.apache.iceberg.types.Types.NestedField.optional;
+import static org.apache.spark.sql.functions.col;
+import static org.apache.spark.sql.functions.expr;
+import static org.apache.spark.sql.functions.lit;
+import static org.apache.spark.sql.functions.pmod;
+import static org.apache.spark.sql.functions.when;
+
+/**
+ * Benchmark whose table mixes the repeated strings "1" and "2" with UUID strings in stringCol.
+ * <p>
+ * longCol is 3 for ids above 10000000/2; otherwise it is 1 when id mod 9 is 0..4 and 2 when it is 5..8.
+ * stringCol is "1" for longCol 1, "2" for longCol 2 and a UUID string for everything else.
+ */
+public class VectorizedFallbackToPlainEncodingStringsBenchmark extends VectorizedDictionaryEncodedBenchmark {
+  @Override
+  protected final Table initTable() {
+    Map<String, String> tableProps = Maps.newHashMap();
+    tableProps.put(TableProperties.METADATA_COMPRESSION, "gzip");
+    Schema tableSchema = new Schema(
+        optional(1, "longCol", Types.LongType.get()),
+        optional(2, "stringCol", Types.StringType.get()));
+    return new HadoopTables(hadoopConf())
+        .create(tableSchema, PartitionSpec.unpartitioned(), tableProps, newTableLocation());
+  }
+
+  @Override
+  protected void appendData() {
+    // appends NUM_FILES files, each built from a range of NUM_ROWS ids
+    int filesWritten = 0;
+    while (filesWritten < NUM_FILES) {
+      Dataset<Row> mixedStrings = spark().range(NUM_ROWS)
+          .withColumn(
+              "longCol",
+              when(expr("id > 10000000/2"), lit(3L))
+                  .when(pmod(col("id"), lit(9)).equalTo(lit(0)), lit(1L))
+                  .when(pmod(col("id"), lit(9)).equalTo(lit(1)), lit(1L))
+                  .when(pmod(col("id"), lit(9)).equalTo(lit(2)), lit(1L))
+                  .when(pmod(col("id"), lit(9)).equalTo(lit(3)), lit(1L))
+                  .when(pmod(col("id"), lit(9)).equalTo(lit(4)), lit(1L))
+                  .when(pmod(col("id"), lit(9)).equalTo(lit(5)), lit(2L))
+                  .when(pmod(col("id"), lit(9)).equalTo(lit(6)), lit(2L))
+                  .when(pmod(col("id"), lit(9)).equalTo(lit(7)), lit(2L))
+                  .when(pmod(col("id"), lit(9)).equalTo(lit(8)), lit(2L))
+                  .otherwise(lit(2L)))
+          .drop("id")
+          .withColumn(
+              "stringCol",
+              // NOTE(review): UUID.randomUUID() is evaluated once per lit(...) call below, so each UUID looks
+              // like a constant for the whole file rather than a distinct value per row -- confirm this
+              // still produces enough distinct strings to force the fallback to plain encoding
+              when(col("longCol").equalTo(lit(1L)), lit("1"))
+                  .when(col("longCol").equalTo(lit(2L)), lit("2"))
+                  .when(col("longCol").equalTo(lit(3L)), lit(UUID.randomUUID().toString()))
+                  .otherwise(lit(UUID.randomUUID().toString())));
+      appendAsFile(mixedStrings);
+      filesWritten++;
+    }
+  }
+}
+
diff --git a/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedIcebergSourceBenchmark.java b/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedIcebergSourceBenchmark.java
new file mode 100644
index 000000000000..a5b0c81783f5
--- /dev/null
+++ b/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedIcebergSourceBenchmark.java
@@ -0,0 +1,223 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.iceberg.spark.source.parquet.vectorized;
+
+import com.google.common.collect.Maps;
+import java.io.IOException;
+import java.util.Map;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.iceberg.spark.source.IcebergSourceBenchmark;
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.internal.SQLConf;
+import org.openjdk.jmh.annotations.Benchmark;
+import org.openjdk.jmh.annotations.Setup;
+import org.openjdk.jmh.annotations.TearDown;
+import org.openjdk.jmh.annotations.Threads;
+
+import static org.apache.iceberg.TableProperties.SPLIT_OPEN_FILE_COST;
+
+/**
+ * Parent class of the benchmarks that compare the performance of reading Parquet data with a flat schema
+ * using vectorized Iceberg read path and the built-in file source in Spark.
+ * <p>
+ * To run all the benchmarks that extend this class:
+ * <code>
+ * ./gradlew :iceberg-spark:jmh -PjmhIncludeRegex=Vectorized.*Benchmark
+ * -PjmhOutputPath=benchmark/iceberg-source-flat-parquet-data-read-benchmark-result.txt
+ * </code>
+ */
+
+public abstract class VectorizedIcebergSourceBenchmark extends IcebergSourceBenchmark {
+  static final int NUM_FILES = 10;
+  static final int NUM_ROWS = 10000000;
+
+  // File open cost (128 MB) used for both the Iceberg table property and the Spark SQL conf.
+  private static final String OPEN_FILE_COST = Integer.toString(128 * 1024 * 1024);
+
+  // The single column selected by the "WithProjection" benchmarks.
+  private static final String PROJECTED_COLUMN = "longCol";
+
+  /**
+   * Sets the Arrow flags, then sets up Spark and appends the benchmark data.
+   */
+  @Setup
+  public void setupBenchmark() {
+    // Arrow reads these flags when its classes are initialized, so set them before any other setup work;
+    // a flag set after that point would be silently ignored.
+    // Allow unsafe memory access to avoid the costly check arrow does to check if index is within bounds
+    System.setProperty("arrow.enable_unsafe_memory_access", "true");
+    // Disable expensive null check for every get(index) call.
+    // Iceberg manages nullability checks itself instead of relying on arrow.
+    System.setProperty("arrow.enable_null_check_for_get", "false");
+    setupSpark();
+    appendData();
+  }
+
+  /**
+   * Tears down Spark and then cleans up the benchmark files.
+   */
+  @TearDown
+  public void tearDownBenchmark() throws IOException {
+    tearDownSpark();
+    cleanupFiles();
+  }
+
+  /**
+   * Supplies a Hadoop configuration with default settings.
+   */
+  @Override
+  protected Configuration initHadoopConf() {
+    return new Configuration();
+  }
+
+  /**
+   * Appends the data that the benchmarks read; called from {@link #setupBenchmark()} after Spark is set up.
+   */
+  protected abstract void appendData();
+
+  // Iceberg source, all columns, 100 records per batch.
+  @Benchmark
+  @Threads(1)
+  public void readIcebergVectorized100() {
+    readIceberg("100", false);
+  }
+
+  // Iceberg source, all columns, 1000 records per batch.
+  @Benchmark
+  @Threads(1)
+  public void readIcebergVectorized1k() {
+    readIceberg("1000", false);
+  }
+
+  // Iceberg source, all columns, 5000 records per batch.
+  // Despite "FileSource" in its name, this benchmark reads through the Iceberg source.
+  @Benchmark
+  @Threads(1)
+  public void readFileSourceIcebergVectorized5k() {
+    readIceberg("5000", false);
+  }
+
+  // Iceberg source, all columns, 10000 records per batch.
+  @Benchmark
+  @Threads(1)
+  public void readIcebergVectorized10k() {
+    readIceberg("10000", false);
+  }
+
+  // Spark file source, all columns, vectorized reader with 5000-row batches and off-heap column vectors.
+  @Benchmark
+  @Threads(1)
+  public void readFileSourceVectorized5k() {
+    Map<String, String> conf = fileSourceConf("true");
+    // On top of the shared conf, pin the batch size and enable off-heap column vectors.
+    conf.put(SQLConf.PARQUET_VECTORIZED_READER_BATCH_SIZE().key(), "5000");
+    conf.put(SQLConf.COLUMN_VECTOR_OFFHEAP_ENABLED().key(), "true");
+    readFileSource(conf, false);
+  }
+
+  // Spark file source, all columns, vectorized reader disabled.
+  @Benchmark
+  @Threads(1)
+  public void readFileSourceNonVectorized() {
+    readFileSource(fileSourceConf("false"), false);
+  }
+
+  // Iceberg source, longCol only, 1000 records per batch.
+  @Benchmark
+  @Threads(1)
+  public void readWithProjectionIcebergVectorized1k() {
+    readIceberg("1000", true);
+  }
+
+  // Iceberg source, longCol only, 5000 records per batch.
+  @Benchmark
+  @Threads(1)
+  public void readWithProjectionIcebergVectorized5k() {
+    readIceberg("5000", true);
+  }
+
+  // Iceberg source, longCol only, 10000 records per batch.
+  @Benchmark
+  @Threads(1)
+  public void readWithProjectionIcebergVectorized10k() {
+    readIceberg("10000", true);
+  }
+
+  // Spark file source, longCol only, vectorized reader enabled.
+  @Benchmark
+  @Threads(1)
+  public void readWithProjectionFileSourceVectorized() {
+    readFileSource(fileSourceConf("true"), true);
+  }
+
+  // Spark file source, longCol only, vectorized reader disabled.
+  @Benchmark
+  @Threads(1)
+  public void readWithProjectionFileSourceNonVectorized() {
+    readFileSource(fileSourceConf("false"), true);
+  }
+
+  /**
+   * Reads the benchmark table through the Iceberg source and materializes the result.
+   *
+   * @param numRecordsPerBatch value passed as the {@code iceberg.read.numrecordsperbatch} read option
+   * @param projectLongCol whether to select only {@code longCol} before materializing
+   */
+  private void readIceberg(String numRecordsPerBatch, boolean projectLongCol) {
+    Map<String, String> tableProperties = Maps.newHashMap();
+    tableProperties.put(SPLIT_OPEN_FILE_COST, OPEN_FILE_COST);
+    withTableProperties(tableProperties, () -> {
+      String tableLocation = table().location();
+      Dataset<Row> df = spark().read().format("iceberg")
+          .option("iceberg.read.numrecordsperbatch", numRecordsPerBatch)
+          .load(tableLocation);
+      Dataset<Row> result = projectLongCol ? df.select(PROJECTED_COLUMN) : df;
+      materialize(result);
+    });
+  }
+
+  /**
+   * Reads the Parquet data files through Spark's built-in file source and materializes the result.
+   *
+   * @param conf Spark SQL conf entries applied while the read runs
+   * @param projectLongCol whether to select only {@code longCol} before materializing
+   */
+  private void readFileSource(Map<String, String> conf, boolean projectLongCol) {
+    withSQLConf(conf, () -> {
+      Dataset<Row> df = spark().read().parquet(dataLocation());
+      Dataset<Row> result = projectLongCol ? df.select(PROJECTED_COLUMN) : df;
+      materialize(result);
+    });
+  }
+
+  /**
+   * Builds the Spark SQL conf entries shared by all file source benchmarks.
+   *
+   * @param vectorizedReaderEnabled {@code "true"} or {@code "false"} for Spark's vectorized Parquet reader
+   * @return a new mutable map that callers may extend with further entries
+   */
+  private static Map<String, String> fileSourceConf(String vectorizedReaderEnabled) {
+    Map<String, String> conf = Maps.newHashMap();
+    conf.put(SQLConf.PARQUET_VECTORIZED_READER_ENABLED().key(), vectorizedReaderEnabled);
+    conf.put(SQLConf.FILES_OPEN_COST_IN_BYTES().key(), OPEN_FILE_COST);
+    return conf;
+  }
+}
diff --git a/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedReadFloatsBenchmark.java b/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedReadFloatsBenchmark.java
new file mode 100644
index 000000000000..15111514a11c
--- /dev/null
+++ b/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedReadFloatsBenchmark.java
@@ -0,0 +1,60 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.iceberg.spark.source.parquet.vectorized;
+
+import com.google.common.collect.Maps;
+import java.util.Map;
+import org.apache.iceberg.PartitionSpec;
+import org.apache.iceberg.Schema;
+import org.apache.iceberg.Table;
+import org.apache.iceberg.TableProperties;
+import org.apache.iceberg.hadoop.HadoopTables;
+import org.apache.iceberg.types.Types;
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.Row;
+
+import static org.apache.iceberg.types.Types.NestedField.optional;
+import static org.apache.spark.sql.functions.expr;
+
+public class VectorizedReadFloatsBenchmark extends VectorizedIcebergSourceBenchmark {
+  // Creates the unpartitioned benchmark table with optional long and float columns.
+  @Override
+  protected final Table initTable() {
+    Map<String, String> tableProps = Maps.newHashMap();
+    tableProps.put(TableProperties.METADATA_COMPRESSION, "gzip");
+    tableProps.put(TableProperties.PARQUET_DICT_SIZE_BYTES, "1");
+    Schema tableSchema = new Schema(
+        optional(1, "longCol", Types.LongType.get()),
+        optional(2, "floatCol", Types.FloatType.get()));
+    return new HadoopTables(hadoopConf())
+        .create(tableSchema, PartitionSpec.unpartitioned(), tableProps, newTableLocation());
+  }
+
+  // Calls appendAsFile NUM_FILES times, each time with range(NUM_ROWS) as longCol plus its FLOAT cast.
+  @Override
+  protected void appendData() {
+    int filesWritten = 0;
+    while (filesWritten < NUM_FILES) {
+      Dataset<Row> rows = spark().range(NUM_ROWS).withColumnRenamed("id", "longCol");
+      appendAsFile(rows.withColumn("floatCol", expr("CAST(longCol AS FLOAT)")));
+      filesWritten++;
+    }
+  }
+}
diff --git a/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedReadFloatsTwentyPercentNullBenchmark.java b/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedReadFloatsTwentyPercentNullBenchmark.java
new file mode 100644
index 000000000000..d4c1d411214f
--- /dev/null
+++ b/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedReadFloatsTwentyPercentNullBenchmark.java
@@ -0,0 +1,42 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.iceberg.spark.source.parquet.vectorized;
+
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.Row;
+
+import static org.apache.spark.sql.functions.col;
+import static org.apache.spark.sql.functions.expr;
+import static org.apache.spark.sql.functions.lit;
+import static org.apache.spark.sql.functions.pmod;
+import static org.apache.spark.sql.functions.when;
+
+public class VectorizedReadFloatsTwentyPercentNullBenchmark extends VectorizedReadFloatsBenchmark {
+  @Override
+  protected void appendData() {
+    for (int fileNum = 1; fileNum <= NUM_FILES; fileNum++) {
+      // Null out every fifth id (id % 5 == 0) so that 20% of longCol, and of floatCol cast from it, is null.
+      Dataset<Row> df = spark().range(NUM_ROWS)
+          .withColumn("longCol", when(pmod(col("id"), lit(5)).equalTo(lit(0)), lit(null)).otherwise(col("id")))
+          .drop("id").withColumn("floatCol", expr("CAST(longCol AS FLOAT)"));
+      appendAsFile(df);
+    }
+  }
+}
diff --git a/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedReadIntBackedDecimalsBenchmark.java b/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedReadIntBackedDecimalsBenchmark.java
new file mode 100644
index 000000000000..2fcab1615977
--- /dev/null
+++ b/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedReadIntBackedDecimalsBenchmark.java
@@ -0,0 +1,59 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.iceberg.spark.source.parquet.vectorized;
+
+import com.google.common.collect.Maps;
+import java.util.Map;
+import org.apache.iceberg.PartitionSpec;
+import org.apache.iceberg.Schema;
+import org.apache.iceberg.Table;
+import org.apache.iceberg.TableProperties;
+import org.apache.iceberg.hadoop.HadoopTables;
+import org.apache.iceberg.types.Types;
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.Row;
+
+import static org.apache.iceberg.types.Types.NestedField.optional;
+import static org.apache.spark.sql.functions.expr;
+
+public class VectorizedReadIntBackedDecimalsBenchmark extends VectorizedIcebergSourceBenchmark {
+  // Creates the unpartitioned benchmark table with optional long and decimal(9, 0) columns.
+  @Override
+  protected final Table initTable() {
+    Map<String, String> tableProps = Maps.newHashMap();
+    tableProps.put(TableProperties.METADATA_COMPRESSION, "gzip");
+    tableProps.put(TableProperties.PARQUET_DICT_SIZE_BYTES, "1");
+    Schema tableSchema = new Schema(
+        optional(1, "longCol", Types.LongType.get()),
+        optional(2, "decimalCol", Types.DecimalType.of(9, 0)));
+    return new HadoopTables(hadoopConf())
+        .create(tableSchema, PartitionSpec.unpartitioned(), tableProps, newTableLocation());
+  }
+
+  // Calls appendAsFile NUM_FILES times, each time with range(NUM_ROWS) as longCol plus its DECIMAL(9, 0) cast.
+  @Override
+  protected void appendData() {
+    for (int remaining = NUM_FILES; remaining > 0; remaining--) {
+      appendAsFile(spark().range(NUM_ROWS)
+          .withColumnRenamed("id", "longCol")
+          .withColumn("decimalCol", expr("CAST(longCol AS DECIMAL(9, 0))")));
+    }
+  }
+}
diff --git a/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedReadIntegersBenchmark.java b/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedReadIntegersBenchmark.java
new file mode 100644
index 000000000000..df8848663ea7
--- /dev/null
+++ b/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedReadIntegersBenchmark.java
@@ -0,0 +1,59 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.iceberg.spark.source.parquet.vectorized;
+
+import com.google.common.collect.Maps;
+import java.util.Map;
+import org.apache.iceberg.PartitionSpec;
+import org.apache.iceberg.Schema;
+import org.apache.iceberg.Table;
+import org.apache.iceberg.TableProperties;
+import org.apache.iceberg.hadoop.HadoopTables;
+import org.apache.iceberg.types.Types;
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.Row;
+
+import static org.apache.iceberg.types.Types.NestedField.optional;
+import static org.apache.spark.sql.functions.expr;
+
+public class VectorizedReadIntegersBenchmark extends VectorizedIcebergSourceBenchmark {
+  // Creates the unpartitioned benchmark table with optional long and int columns.
+  @Override
+  protected final Table initTable() {
+    Map<String, String> tableProps = Maps.newHashMap();
+    tableProps.put(TableProperties.METADATA_COMPRESSION, "gzip");
+    tableProps.put(TableProperties.PARQUET_DICT_SIZE_BYTES, "1");
+    Schema tableSchema = new Schema(
+        optional(1, "longCol", Types.LongType.get()),
+        optional(2, "intCol", Types.IntegerType.get()));
+    return new HadoopTables(hadoopConf())
+        .create(tableSchema, PartitionSpec.unpartitioned(), tableProps, newTableLocation());
+  }
+
+  // Calls appendAsFile NUM_FILES times, each time with range(NUM_ROWS) as longCol plus its INT cast.
+  @Override
+  protected void appendData() {
+    for (int remaining = NUM_FILES; remaining > 0; remaining--) {
+      appendAsFile(spark().range(NUM_ROWS)
+          .withColumnRenamed("id", "longCol")
+          .withColumn("intCol", expr("CAST(longCol AS INT)")));
+    }
+  }
+}
diff --git a/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedReadIntegersTwentyPercentNullBenchmark.java b/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedReadIntegersTwentyPercentNullBenchmark.java
new file mode 100644
index 000000000000..61ae0cf50c60
--- /dev/null
+++ b/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedReadIntegersTwentyPercentNullBenchmark.java
@@ -0,0 +1,42 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.iceberg.spark.source.parquet.vectorized;
+
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.Row;
+
+import static org.apache.spark.sql.functions.col;
+import static org.apache.spark.sql.functions.expr;
+import static org.apache.spark.sql.functions.lit;
+import static org.apache.spark.sql.functions.pmod;
+import static org.apache.spark.sql.functions.when;
+
+public class VectorizedReadIntegersTwentyPercentNullBenchmark extends VectorizedReadIntegersBenchmark {
+  @Override
+  protected void appendData() {
+    for (int fileNum = 1; fileNum <= NUM_FILES; fileNum++) {
+      // Null out every fifth id (id % 5 == 0) so that 20% of longCol, and of intCol cast from it, is null.
+      Dataset<Row> df = spark().range(NUM_ROWS)
+          .withColumn("longCol", when(pmod(col("id"), lit(5)).equalTo(lit(0)), lit(null)).otherwise(col("id")))
+          .drop("id").withColumn("intCol", expr("CAST(longCol AS INT)"));
+      appendAsFile(df);
+    }
+  }
+}
diff --git a/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedReadLongsBenchmark.java b/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedReadLongsBenchmark.java
new file mode 100644
index 000000000000..cdac8ef05c52
--- /dev/null
+++ b/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedReadLongsBenchmark.java
@@ -0,0 +1,57 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.iceberg.spark.source.parquet.vectorized;
+
+import com.google.common.collect.Maps;
+import java.util.Map;
+import org.apache.iceberg.PartitionSpec;
+import org.apache.iceberg.Schema;
+import org.apache.iceberg.Table;
+import org.apache.iceberg.TableProperties;
+import org.apache.iceberg.hadoop.HadoopTables;
+import org.apache.iceberg.types.Types;
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.Row;
+
+import static org.apache.iceberg.types.Types.NestedField.optional;
+
+public class VectorizedReadLongsBenchmark extends VectorizedIcebergSourceBenchmark {
+  // Creates the unpartitioned benchmark table with a single optional long column.
+  @Override
+  protected final Table initTable() {
+    Map<String, String> tableProps = Maps.newHashMap();
+    tableProps.put(TableProperties.METADATA_COMPRESSION, "gzip");
+    tableProps.put(TableProperties.PARQUET_DICT_SIZE_BYTES, "1");
+    Schema tableSchema = new Schema(optional(1, "longCol", Types.LongType.get()));
+    return new HadoopTables(hadoopConf())
+        .create(tableSchema, PartitionSpec.unpartitioned(), tableProps, newTableLocation());
+  }
+
+  // Calls appendAsFile NUM_FILES times, each time with range(NUM_ROWS) renamed to longCol.
+  @Override
+  protected void appendData() {
+    int filesWritten = 0;
+    while (filesWritten < NUM_FILES) {
+      Dataset<Row> rows = spark().range(NUM_ROWS).withColumnRenamed("id", "longCol");
+      appendAsFile(rows);
+      filesWritten++;
+    }
+  }
+}
diff --git a/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedReadLongsTwentyPercentNullBenchmark.java b/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedReadLongsTwentyPercentNullBenchmark.java
new file mode 100644
index 000000000000..ecf9c6b21084
--- /dev/null
+++ b/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedReadLongsTwentyPercentNullBenchmark.java
@@ -0,0 +1,42 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.iceberg.spark.source.parquet.vectorized;
+
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.Row;
+
+import static org.apache.spark.sql.functions.col;
+import static org.apache.spark.sql.functions.lit;
+import static org.apache.spark.sql.functions.pmod;
+import static org.apache.spark.sql.functions.when;
+
+public class VectorizedReadLongsTwentyPercentNullBenchmark extends VectorizedReadLongsBenchmark {
+
+  @Override
+  protected void appendData() {
+    for (int fileNum = 1; fileNum <= NUM_FILES; fileNum++) {
+      // Null out every fifth id (id % 5 == 0) so that 20% of longCol is null, as the class name states.
+      Dataset<Row> df = spark().range(NUM_ROWS)
+          .withColumn("longCol", when(pmod(col("id"), lit(5)).equalTo(lit(0)), lit(null)).otherwise(col("id")))
+          .drop("id");
+      appendAsFile(df);
+    }
+  }
+}
diff --git a/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedReadPrimitivesBenchmark.java b/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedReadPrimitivesBenchmark.java
new file mode 100644
index 000000000000..3619f584fa81
--- /dev/null
+++ b/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedReadPrimitivesBenchmark.java
@@ -0,0 +1,79 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.iceberg.spark.source.parquet.vectorized;
+
+import com.google.common.collect.Maps;
+import java.util.Map;
+import org.apache.iceberg.PartitionSpec;
+import org.apache.iceberg.Schema;
+import org.apache.iceberg.Table;
+import org.apache.iceberg.TableProperties;
+import org.apache.iceberg.hadoop.HadoopTables;
+import org.apache.iceberg.types.Types;
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.Row;
+
+import static org.apache.iceberg.types.Types.NestedField.optional;
+import static org.apache.spark.sql.functions.col;
+import static org.apache.spark.sql.functions.current_date;
+import static org.apache.spark.sql.functions.date_add;
+import static org.apache.spark.sql.functions.expr;
+import static org.apache.spark.sql.functions.lit;
+import static org.apache.spark.sql.functions.pmod;
+import static org.apache.spark.sql.functions.when;
+
+public class VectorizedReadPrimitivesBenchmark extends VectorizedIcebergSourceBenchmark {
+  // Declares each column with the type its name states; appendData() casts to the same types.
+  @Override
+  protected final Table initTable() {
+    Schema schema = new Schema(
+        optional(1, "longCol", Types.LongType.get()),
+        optional(2, "intCol", Types.IntegerType.get()),
+        optional(3, "floatCol", Types.FloatType.get()),
+        optional(4, "doubleCol", Types.DoubleType.get()),
+        optional(5, "decimalCol", Types.DecimalType.of(20, 5)),
+        optional(6, "dateCol", Types.DateType.get()),
+        optional(7, "timestampCol", Types.TimestampType.withZone()),
+        optional(8, "stringCol", Types.StringType.get()));
+    HadoopTables tables = new HadoopTables(hadoopConf());
+    Map<String, String> properties = Maps.newHashMap();
+    properties.put(TableProperties.METADATA_COMPRESSION, "gzip");
+    properties.put(TableProperties.PARQUET_DICT_SIZE_BYTES, "1");
+    return tables.create(schema, PartitionSpec.unpartitioned(), properties, newTableLocation());
+  }
+
+  // Casts must match the types declared in initTable(); columns cast from longCol are null for even ids.
+  @Override
+  protected void appendData() {
+    for (int fileNum = 1; fileNum <= NUM_FILES; fileNum++) {
+      Dataset<Row> df = spark().range(NUM_ROWS)
+          .withColumn("longCol", when(pmod(col("id"), lit(2)).equalTo(lit(0)), lit(null)).otherwise(col("id")))
+          .drop("id")
+          .withColumn("intCol", expr("CAST(longCol AS INT)"))
+          .withColumn("floatCol", expr("CAST(longCol AS FLOAT)"))
+          .withColumn("doubleCol", expr("CAST(longCol AS DOUBLE)"))
+          .withColumn("decimalCol", expr("CAST(longCol AS DECIMAL(20, 5))"))
+          .withColumn("dateCol", date_add(current_date(), fileNum))
+          .withColumn("timestampCol", expr("TO_TIMESTAMP(dateCol)"))
+          .withColumn("stringCol", expr("CAST(longCol AS STRING)"));
+      appendAsFile(df);
+    }
+  }
+}
diff --git a/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedReadStringsBenchmark.java b/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedReadStringsBenchmark.java
new file mode 100644
index 000000000000..94c95f93e64f
--- /dev/null
+++ b/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedReadStringsBenchmark.java
@@ -0,0 +1,60 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.iceberg.spark.source.parquet.vectorized;
+
+import com.google.common.collect.Maps;
+import java.util.Map;
+import org.apache.iceberg.PartitionSpec;
+import org.apache.iceberg.Schema;
+import org.apache.iceberg.Table;
+import org.apache.iceberg.TableProperties;
+import org.apache.iceberg.hadoop.HadoopTables;
+import org.apache.iceberg.types.Types;
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.Row;
+
+import static org.apache.iceberg.types.Types.NestedField.optional;
+import static org.apache.spark.sql.functions.expr;
+
+public class VectorizedReadStringsBenchmark extends VectorizedIcebergSourceBenchmark {
+  // Creates the unpartitioned benchmark table with optional long and string columns.
+  @Override
+  protected final Table initTable() {
+    Map<String, String> tableProps = Maps.newHashMap();
+    tableProps.put(TableProperties.METADATA_COMPRESSION, "gzip");
+    tableProps.put(TableProperties.PARQUET_DICT_SIZE_BYTES, "1");
+    Schema tableSchema = new Schema(
+        optional(1, "longCol", Types.LongType.get()),
+        optional(2, "stringCol", Types.StringType.get()));
+    return new HadoopTables(hadoopConf())
+        .create(tableSchema, PartitionSpec.unpartitioned(), tableProps, newTableLocation());
+  }
+
+  // Calls appendAsFile NUM_FILES times, each time with range(NUM_ROWS) as longCol plus its STRING cast.
+  @Override
+  protected void appendData() {
+    int filesWritten = 0;
+    while (filesWritten < NUM_FILES) {
+      Dataset<Row> rows = spark().range(NUM_ROWS).withColumnRenamed("id", "longCol");
+      appendAsFile(rows.withColumn("stringCol", expr("CAST(longCol AS STRING)")));
+      filesWritten++;
+    }
+  }
+}
diff --git a/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedReadStringsTwentyPercentNullBenchmark.java b/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedReadStringsTwentyPercentNullBenchmark.java
new file mode 100644
index 000000000000..d2a4037b89f6
--- /dev/null
+++ b/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedReadStringsTwentyPercentNullBenchmark.java
@@ -0,0 +1,42 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.iceberg.spark.source.parquet.vectorized;
+
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.Row;
+
+import static org.apache.spark.sql.functions.col;
+import static org.apache.spark.sql.functions.expr;
+import static org.apache.spark.sql.functions.lit;
+import static org.apache.spark.sql.functions.pmod;
+import static org.apache.spark.sql.functions.when;
+
+public class VectorizedReadStringsTwentyPercentNullBenchmark extends VectorizedReadStringsBenchmark {
+  @Override
+  protected void appendData() {
+    for (int fileNum = 1; fileNum <= NUM_FILES; fileNum++) {
+      // Null every 5th id (20% of rows) under the name longCol, which the table schema and the CAST below expect.
+      Dataset<Row> df = spark().range(NUM_ROWS)
+          .select(when(pmod(col("id"), lit(5)).equalTo(lit(0)), lit(null)).otherwise(col("id")).as("longCol"))
+          .withColumn("stringCol", expr("CAST(longCol AS STRING)"));
+      appendAsFile(df);
+    }
+  }
+}
diff --git a/spark/src/main/java/org/apache/iceberg/spark/arrow/ArrowUtils.java b/spark/src/main/java/org/apache/iceberg/spark/arrow/ArrowUtils.java
new file mode 100644
index 000000000000..02fbc435dc81
--- /dev/null
+++ b/spark/src/main/java/org/apache/iceberg/spark/arrow/ArrowUtils.java
@@ -0,0 +1,113 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.iceberg.spark.arrow;
+
+import org.apache.arrow.memory.RootAllocator;
+import org.apache.arrow.vector.types.DateUnit;
+import org.apache.arrow.vector.types.FloatingPointPrecision;
+import org.apache.arrow.vector.types.TimeUnit;
+import org.apache.arrow.vector.types.pojo.ArrowType;
+import org.apache.arrow.vector.types.pojo.Field;
+import org.apache.spark.sql.types.ArrayType;
+import org.apache.spark.sql.types.DataType;
+import org.apache.spark.sql.types.DataTypes;
+import org.apache.spark.sql.types.DecimalType;
+import org.apache.spark.sql.types.Metadata;
+import org.apache.spark.sql.types.StructField;
+import org.apache.spark.sql.types.StructType;
+
+public class ArrowUtils {
+
+  private static ArrowUtils instance;
+  private RootAllocator rootAllocator;
+
+  private ArrowUtils() {
+    rootAllocator = new RootAllocator(Long.MAX_VALUE);
+  }
+
+  public static ArrowUtils instance() {
+    if (instance == null) {
+      instance = new ArrowUtils();
+    }
+    return instance;
+  }
+
+  public RootAllocator rootAllocator() {
+    return rootAllocator;
+  }
+
+  @SuppressWarnings("checkstyle:CyclomaticComplexity")
+  public DataType fromArrowType(ArrowType data) {
+
+    if (data instanceof ArrowType.Bool) {
+      return DataTypes.BooleanType;
+    } else if (data instanceof ArrowType.Int) {
+      ArrowType.Int intData = (ArrowType.Int) data;
+      if (intData.getIsSigned() && intData.getBitWidth() == 8) {
+        return DataTypes.ByteType;
+      } else if (intData.getIsSigned() && intData.getBitWidth() == 8 * 2) {
+        return DataTypes.ShortType;
+      } else if (intData.getIsSigned() && intData.getBitWidth() == 8 * 4) {
+        return DataTypes.IntegerType;
+      } else if (intData.getIsSigned() && intData.getBitWidth() == 8 * 8) {
+        return DataTypes.LongType;
+      }
+    } else if (data instanceof ArrowType.FloatingPoint) {
+      ArrowType.FloatingPoint floatData = (ArrowType.FloatingPoint) data;
+      if (floatData.getPrecision() == FloatingPointPrecision.SINGLE) {
+        return DataTypes.FloatType;
+      } else if (floatData.getPrecision() == FloatingPointPrecision.DOUBLE) {
+        return DataTypes.DoubleType;
+      }
+    } else if (data instanceof ArrowType.Utf8) {
+      return DataTypes.StringType;
+    } else if (data instanceof ArrowType.Binary) {
+      return DataTypes.BinaryType;
+    } else if (data instanceof ArrowType.Decimal) {
+      ArrowType.Decimal decimalData = (ArrowType.Decimal) data;
+      return new DecimalType(decimalData.getPrecision(), decimalData.getScale());
+    } else if (data instanceof ArrowType.Date && ((ArrowType.Date) data).getUnit() == DateUnit.DAY) {
+      return DataTypes.DateType;
+    } else if (data instanceof ArrowType.Timestamp && ((ArrowType.Timestamp) data).getUnit() == TimeUnit.MICROSECOND) {
+      return DataTypes.TimestampType;
+    }
+
+    throw new UnsupportedOperationException("Unsupported data type: " + data);
+  }
+
+  public DataType fromArrowField(Field field) {
+    ArrowType arrowType = field.getType();
+    if (arrowType instanceof ArrowType.List) {
+      Field elementField = field.getChildren().get(0);
+      DataType elementType = fromArrowField(elementField);
+      return new ArrayType(elementType, elementField.isNullable());
+    } else if (arrowType instanceof ArrowType.Struct) {
+      StructField[] fields = new StructField[field.getChildren().size()];
+      int index = 0;
+      for (Field f : field.getChildren()) {
+        DataType dt = fromArrowField(f);
+        fields[index++] = new StructField(f.getName(), dt, f.isNullable(), Metadata.empty());
+      }
+      return new StructType(fields);
+    } else {
+      return fromArrowType(arrowType);
+    }
+  }
+}
diff --git a/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ColumnarBatchReaders.java b/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ColumnarBatchReaders.java
new file mode 100644
index 000000000000..478b5a9b8c50
--- /dev/null
+++ b/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ColumnarBatchReaders.java
@@ -0,0 +1,96 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.iceberg.spark.data.vectorized;
+
+import java.lang.reflect.Array;
+import java.util.List;
+import java.util.Map;
+import org.apache.arrow.vector.FieldVector;
+import org.apache.iceberg.arrow.vectorized.VectorHolder;
+import org.apache.iceberg.arrow.vectorized.VectorizedArrowReader;
+import org.apache.iceberg.parquet.VectorizedReader;
+import org.apache.parquet.column.page.PageReadStore;
+import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData;
+import org.apache.parquet.hadoop.metadata.ColumnPath;
+import org.apache.spark.sql.vectorized.ColumnVector;
+import org.apache.spark.sql.vectorized.ColumnarBatch;
+
+/**
+ * {@link VectorizedReader} that returns Spark's {@link ColumnarBatch} to support Spark's vectorized read path. The
+ * {@link ColumnarBatch} returned is created by passing in the Arrow vectors populated via delegated read calls to
+ * {@linkplain VectorizedArrowReader VectorReader(s)}.
+ */
+public class ColumnarBatchReaders implements VectorizedReader<ColumnarBatch> {
+  private final VectorizedArrowReader[] readers;
+  private final int batchSize;
+
+  /** @throws ClassCastException if any element of {@code readers} is not a {@link VectorizedArrowReader} */
+  public ColumnarBatchReaders(List<VectorizedReader> readers, int bSize) {
+    this.readers = (VectorizedArrowReader[]) Array.newInstance(VectorizedArrowReader.class, readers.size());
+    int idx = 0;
+    for (VectorizedReader reader : readers) {
+      this.readers[idx++] = (VectorizedArrowReader) reader;
+    }
+    this.batchSize = bSize;
+  }
+
+  @Override
+  public final void setRowGroupInfo(PageReadStore pageStore, Map<ColumnPath, ColumnChunkMetaData> metaData) {
+    for (VectorizedArrowReader reader : readers) {
+      if (reader != null) {
+        reader.setRowGroupInfo(pageStore, metaData);
+      }
+    }
+  }
+
+  @Override
+  public void reuseContainers(boolean reuse) {
+    for (VectorizedArrowReader reader : readers) {
+      reader.reuseContainers(reuse);
+    }
+  }
+
+  @Override
+  public final ColumnarBatch read(int numValsToRead) {
+    ColumnVector[] arrowColumnVectors = new ColumnVector[readers.length];
+    // Default to the requested count so a batch whose columns all lack an Arrow vector is not reported as empty.
+    int numRows = numValsToRead;
+    for (int i = 0; i < readers.length; i += 1) {
+      VectorHolder holder = readers[i].read(numValsToRead);
+      FieldVector vector = holder.vector();
+      if (vector == null) {
+        arrowColumnVectors[i] = new NullValuesColumnVector(batchSize);
+      } else {
+        arrowColumnVectors[i] = new IcebergArrowColumnVector(holder);
+        numRows = vector.getValueCount();
+      }
+    }
+    ColumnarBatch batch = new ColumnarBatch(arrowColumnVectors);
+    batch.setNumRows(numRows);
+    return batch;
+  }
+
+  @Override
+  public void close() {
+    for (VectorizedArrowReader reader : readers) {
+      reader.close();
+    }
+  }
+}
diff --git a/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/IcebergArrowColumnVector.java b/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/IcebergArrowColumnVector.java
new file mode 100644
index 000000000000..ac4002a34ec9
--- /dev/null
+++ b/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/IcebergArrowColumnVector.java
@@ -0,0 +1,753 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.iceberg.spark.data.vectorized;
+
+import io.netty.buffer.ArrowBuf;
+import java.math.BigInteger;
+import org.apache.arrow.vector.BigIntVector;
+import org.apache.arrow.vector.BitVector;
+import org.apache.arrow.vector.DateDayVector;
+import org.apache.arrow.vector.FixedSizeBinaryVector;
+import org.apache.arrow.vector.Float4Vector;
+import org.apache.arrow.vector.Float8Vector;
+import org.apache.arrow.vector.IntVector;
+import org.apache.arrow.vector.SmallIntVector;
+import org.apache.arrow.vector.TimeStampMicroTZVector;
+import org.apache.arrow.vector.TinyIntVector;
+import org.apache.arrow.vector.ValueVector;
+import org.apache.arrow.vector.VarBinaryVector;
+import org.apache.arrow.vector.complex.ListVector;
+import org.apache.arrow.vector.complex.StructVector;
+import org.apache.arrow.vector.holders.NullableVarCharHolder;
+import org.apache.iceberg.arrow.vectorized.IcebergArrowVectors;
+import org.apache.iceberg.arrow.vectorized.NullabilityHolder;
+import org.apache.iceberg.arrow.vectorized.VectorHolder;
+import org.apache.iceberg.spark.arrow.ArrowUtils;
+import org.apache.parquet.Preconditions;
+import org.apache.parquet.column.ColumnDescriptor;
+import org.apache.parquet.column.Dictionary;
+import org.apache.parquet.io.api.Binary;
+import org.apache.parquet.schema.DecimalMetadata;
+import org.apache.parquet.schema.PrimitiveType;
+import org.apache.spark.sql.types.Decimal;
+import org.apache.spark.sql.vectorized.ArrowColumnVector;
+import org.apache.spark.sql.vectorized.ColumnVector;
+import org.apache.spark.sql.vectorized.ColumnarArray;
+import org.apache.spark.sql.vectorized.ColumnarMap;
+import org.apache.spark.unsafe.types.UTF8String;
+
+/**
+ * Implementation of Spark's {@link ColumnVector} interface. The code for this class is heavily inspired by Spark's
+ * {@link ArrowColumnVector}. The main difference is in how nullability checks are made in this class by relying on
+ * {@link NullabilityHolder} instead of the validity vector in the Arrow vector.
+ */
+public class IcebergArrowColumnVector extends ColumnVector {
+  private final ArrowVectorAccessor accessor;
+  private final NullabilityHolder nullabilityHolder;
+  private final Dictionary dictionary;
+  private final boolean isVectorDictEncoded;
+  private ArrowColumnVector[] childColumns;
+
+  /** Builds the column from the Arrow vector, nullability info and Parquet dictionary carried by {@code holder}. */
+  public IcebergArrowColumnVector(VectorHolder holder) {
+    super(ArrowUtils.instance().fromArrowField(holder.vector().getField()));
+    this.nullabilityHolder = holder.nullabilityHolder();
+    this.dictionary = holder.dictionary();
+    this.isVectorDictEncoded = holder.isDictionaryEncoded();
+    this.accessor = getVectorAccessor(holder.descriptor(), holder.vector());
+  }
+
+  @Override
+  public void close() {
+    if (childColumns != null) {
+      for (int i = 0; i < childColumns.length; i++) {
+        childColumns[i].close();
+        childColumns[i] = null;
+      }
+      childColumns = null;
+    }
+    accessor.close();
+  }
+
+  @Override
+  public boolean hasNull() {
+    return nullabilityHolder.hasNulls();
+  }
+
+  @Override
+  public int numNulls() {
+    return nullabilityHolder.numNulls();
+  }
+
+  @Override
+  public boolean isNullAt(int rowId) {
+    return nullabilityHolder.isNullAt(rowId) == 1;
+  }
+
+  @Override
+  public boolean getBoolean(int rowId) {
+    return accessor.getBoolean(rowId);
+  }
+
+  @Override
+  public byte getByte(int rowId) {
+    return accessor.getByte(rowId);
+  }
+
+  @Override
+  public short getShort(int rowId) {
+    return accessor.getShort(rowId);
+  }
+
+  @Override
+  public int getInt(int rowId) {
+    return accessor.getInt(rowId);
+  }
+
+  @Override
+  public long getLong(int rowId) {
+    return accessor.getLong(rowId);
+  }
+
+  @Override
+  public float getFloat(int rowId) {
+    return accessor.getFloat(rowId);
+  }
+
+  @Override
+  public double getDouble(int rowId) {
+    return accessor.getDouble(rowId);
+  }
+
+  @Override
+  public ColumnarArray getArray(int rowId) {
+    if (isNullAt(rowId)) {
+      return null;
+    }
+    return accessor.getArray(rowId);
+  }
+
+  @Override
+  public ColumnarMap getMap(int rowId) {
+    throw new UnsupportedOperationException();
+  }
+
+  @Override
+  public Decimal getDecimal(int rowId, int precision, int scale) {
+    if (isNullAt(rowId)) {
+      return null;
+    }
+    return accessor.getDecimal(rowId, precision, scale);
+  }
+
+  @Override
+  public UTF8String getUTF8String(int rowId) {
+    if (isNullAt(rowId)) {
+      return null;
+    }
+    return accessor.getUTF8String(rowId);
+  }
+
+  @Override
+  public byte[] getBinary(int rowId) {
+    if (isNullAt(rowId)) {
+      return null;
+    }
+    return accessor.getBinary(rowId);
+  }
+
+  @Override
+  public ArrowColumnVector getChild(int ordinal) {
+    // childColumns is assigned only when the wrapped vector is a StructVector; it is null otherwise.
+    return childColumns[ordinal];
+  }
+
+  /** Base accessor: each typed getter throws unless the subclass for the wrapped vector type overrides it. */
+  private abstract class ArrowVectorAccessor {
+    private final ValueVector vector;
+
+    ArrowVectorAccessor(ValueVector vector) {
+      this.vector = vector;
+    }
+
+    final boolean isNullAt(int rowId) {
+      return nullabilityHolder.isNullAt(rowId) == 1;
+    }
+
+    final void close() {
+      vector.close();
+    }
+
+    boolean getBoolean(int rowId) {
+      throw new UnsupportedOperationException();
+    }
+
+    byte getByte(int rowId) {
+      throw new UnsupportedOperationException();
+    }
+
+    short getShort(int rowId) {
+      throw new UnsupportedOperationException();
+    }
+
+    int getInt(int rowId) {
+      throw new UnsupportedOperationException();
+    }
+
+    long getLong(int rowId) {
+      throw new UnsupportedOperationException();
+    }
+
+    float getFloat(int rowId) {
+      throw new UnsupportedOperationException();
+    }
+
+    double getDouble(int rowId) {
+      throw new UnsupportedOperationException();
+    }
+
+    Decimal getDecimal(int rowId, int precision, int scale) {
+      throw new UnsupportedOperationException();
+    }
+
+    UTF8String getUTF8String(int rowId) {
+      throw new UnsupportedOperationException();
+    }
+
+    byte[] getBinary(int rowId) {
+      throw new UnsupportedOperationException();
+    }
+
+    ColumnarArray getArray(int rowId) {
+      throw new UnsupportedOperationException();
+    }
+  }
+
+  /** Chooses the accessor: by Parquet type for dictionary ids, otherwise by the Arrow vector class. */
+  @SuppressWarnings("checkstyle:CyclomaticComplexity")
+  private ArrowVectorAccessor getVectorAccessor(ColumnDescriptor desc, ValueVector vector) {
+    PrimitiveType primitive = desc.getPrimitiveType();
+    if (isVectorDictEncoded) {
+      // The vector stores dictionary ids; the decoder depends on the Parquet logical (else physical) type.
+      Preconditions.checkState(vector instanceof IntVector, "Dictionary ids should be stored in IntVectors only");
+      if (primitive.getOriginalType() != null) {
+        switch (primitive.getOriginalType()) {
+          case ENUM:
+          case JSON:
+          case UTF8:
+          case BSON:
+            return new DictionaryStringAccessor((IntVector) vector);
+          case INT_8:
+          case INT_16:
+          case INT_32:
+          case DATE:
+            return new DictionaryIntAccessor((IntVector) vector);
+          case INT_64:
+          case TIMESTAMP_MILLIS:
+          case TIMESTAMP_MICROS:
+            return new DictionaryLongAccessor((IntVector) vector);
+          case DECIMAL:
+            DecimalMetadata decimal = primitive.getDecimalMetadata();
+            switch (primitive.getPrimitiveTypeName()) {
+              case BINARY:
+              case FIXED_LEN_BYTE_ARRAY:
+                return new DictionaryDecimalBinaryAccessor(
+                    (IntVector) vector, decimal.getPrecision(), decimal.getScale());
+              case INT64:
+                return new DictionaryDecimalLongAccessor(
+                    (IntVector) vector, decimal.getPrecision(), decimal.getScale());
+              case INT32:
+                return new DictionaryDecimalIntAccessor(
+                    (IntVector) vector, decimal.getPrecision(), decimal.getScale());
+              default:
+                throw new UnsupportedOperationException(
+                    "Unsupported base type for decimal: " + primitive.getPrimitiveTypeName());
+            }
+          default:
+            throw new UnsupportedOperationException(
+                "Unsupported logical type: " + primitive.getOriginalType());
+        }
+      } else {
+        switch (primitive.getPrimitiveTypeName()) {
+          case FIXED_LEN_BYTE_ARRAY:
+          case BINARY:
+            return new DictionaryBinaryAccessor((IntVector) vector);
+          case INT32:
+            return new DictionaryIntAccessor((IntVector) vector);
+          case FLOAT:
+            return new DictionaryFloatAccessor((IntVector) vector);
+          case INT64:
+            return new DictionaryLongAccessor((IntVector) vector);
+          case DOUBLE:
+            return new DictionaryDoubleAccessor((IntVector) vector);
+          default:
+            throw new UnsupportedOperationException("Unsupported type: " + primitive);
+        }
+      }
+    } else {
+      // Vectors that are not dictionary encoded are dispatched on the concrete Arrow vector class.
+      if (vector instanceof BitVector) {
+        return new BooleanAccessor((BitVector) vector);
+      } else if (vector instanceof TinyIntVector) {
+        return new ByteAccessor((TinyIntVector) vector);
+      } else if (vector instanceof SmallIntVector) {
+        return new ShortAccessor((SmallIntVector) vector);
+      } else if (vector instanceof IntVector) {
+        return new IntAccessor((IntVector) vector);
+      } else if (vector instanceof BigIntVector) {
+        return new LongAccessor((BigIntVector) vector);
+      } else if (vector instanceof Float4Vector) {
+        return new FloatAccessor((Float4Vector) vector);
+      } else if (vector instanceof Float8Vector) {
+        return new DoubleAccessor((Float8Vector) vector);
+      } else if (vector instanceof IcebergArrowVectors.DecimalArrowVector) {
+        return new DecimalAccessor((IcebergArrowVectors.DecimalArrowVector) vector);
+      } else if (vector instanceof IcebergArrowVectors.VarcharArrowVector) {
+        return new StringAccessor((IcebergArrowVectors.VarcharArrowVector) vector);
+      } else if (vector instanceof IcebergArrowVectors.VarBinaryArrowVector) {
+        return new BinaryAccessor((IcebergArrowVectors.VarBinaryArrowVector) vector);
+      } else if (vector instanceof DateDayVector) {
+        return new DateAccessor((DateDayVector) vector);
+      } else if (vector instanceof TimeStampMicroTZVector) {
+        return new TimestampAccessor((TimeStampMicroTZVector) vector);
+      } else if (vector instanceof ListVector) {
+        return new ArrayAccessor((ListVector) vector);
+      } else if (vector instanceof StructVector) {
+        StructVector structVector = (StructVector) vector;
+        childColumns = new ArrowColumnVector[structVector.size()];
+        for (int i = 0; i < childColumns.length; ++i) {
+          childColumns[i] = new ArrowColumnVector(structVector.getVectorById(i));
+        }
+        return new StructAccessor(structVector);
+      }
+    }
+    throw new UnsupportedOperationException("Unsupported vector: " + vector.getClass());
+  }
+
+  /** Reads booleans from a {@link BitVector}, where a stored 1 means true. */
+  private class BooleanAccessor extends ArrowVectorAccessor {
+    private final BitVector vector;
+
+    BooleanAccessor(BitVector vector) {
+      super(vector);
+      this.vector = vector;
+    }
+
+    @Override
+    final boolean getBoolean(int rowId) {
+      return vector.get(rowId) == 1;
+    }
+  }
+
+  /** Reads bytes from a {@link TinyIntVector}. */
+  private class ByteAccessor extends ArrowVectorAccessor {
+    private final TinyIntVector vector;
+
+    ByteAccessor(TinyIntVector vector) {
+      super(vector);
+      this.vector = vector;
+    }
+
+    @Override
+    final byte getByte(int rowId) {
+      return vector.get(rowId);
+    }
+  }
+
+  /** Reads shorts from a {@link SmallIntVector}. */
+  private class ShortAccessor extends ArrowVectorAccessor {
+    private final SmallIntVector vector;
+
+    ShortAccessor(SmallIntVector vector) {
+      super(vector);
+      this.vector = vector;
+    }
+
+    @Override
+    final short getShort(int rowId) {
+      return vector.get(rowId);
+    }
+  }
+
+  /** Reads ints stored directly in an {@link IntVector}. */
+  private class IntAccessor extends ArrowVectorAccessor {
+    private final IntVector vector;
+
+    IntAccessor(IntVector vector) {
+      super(vector);
+      this.vector = vector;
+    }
+
+    @Override
+    final int getInt(int rowId) {
+      return vector.get(rowId);
+    }
+  }
+
+  /** Reads ints by decoding the dictionary id stored at each row. */
+  private class DictionaryIntAccessor extends ArrowVectorAccessor {
+    private final IntVector vector;
+
+    DictionaryIntAccessor(IntVector vector) {
+      super(vector);
+      this.vector = vector;
+    }
+
+    @Override
+    final int getInt(int rowId) {
+      return dictionary.decodeToInt(vector.get(rowId));
+    }
+  }
+
+  /** Reads longs from a {@link BigIntVector}. */
+  private class LongAccessor extends ArrowVectorAccessor {
+    private final BigIntVector vector;
+
+    LongAccessor(BigIntVector vector) {
+      super(vector);
+      this.vector = vector;
+    }
+
+    @Override
+    final long getLong(int rowId) {
+      return vector.get(rowId);
+    }
+  }
+
+  /** Reads longs by decoding the dictionary id stored at each row. */
+  private class DictionaryLongAccessor extends ArrowVectorAccessor {
+    private final IntVector vector;
+
+    DictionaryLongAccessor(IntVector vector) {
+      super(vector);
+      this.vector = vector;
+    }
+
+    @Override
+    final long getLong(int rowId) {
+      return dictionary.decodeToLong(vector.get(rowId));
+    }
+  }
+
+  /** Reads floats from a {@link Float4Vector}. */
+  private class FloatAccessor extends ArrowVectorAccessor {
+    private final Float4Vector vector;
+
+    FloatAccessor(Float4Vector vector) {
+      super(vector);
+      this.vector = vector;
+    }
+
+    @Override
+    final float getFloat(int rowId) {
+      return vector.get(rowId);
+    }
+  }
+
+  /** Reads floats by decoding the dictionary id stored at each row. */
+  private class DictionaryFloatAccessor extends ArrowVectorAccessor {
+    private final IntVector vector;
+
+    DictionaryFloatAccessor(IntVector vector) {
+      super(vector);
+      this.vector = vector;
+    }
+
+    @Override
+    final float getFloat(int rowId) {
+      return dictionary.decodeToFloat(vector.get(rowId));
+    }
+  }
+
+  /** Reads doubles from a {@link Float8Vector}. */
+  private class DoubleAccessor extends ArrowVectorAccessor {
+    private final Float8Vector vector;
+
+    DoubleAccessor(Float8Vector vector) {
+      super(vector);
+      this.vector = vector;
+    }
+
+    @Override
+    final double getDouble(int rowId) {
+      return vector.get(rowId);
+    }
+  }
+
+  /** Reads doubles by decoding the dictionary id stored at each row. */
+  private class DictionaryDoubleAccessor extends ArrowVectorAccessor {
+    private final IntVector vector;
+
+    DictionaryDoubleAccessor(IntVector vector) {
+      super(vector);
+      this.vector = vector;
+    }
+
+    @Override
+    final double getDouble(int rowId) {
+      return dictionary.decodeToDouble(vector.get(rowId));
+    }
+  }
+
+  /** Reads decimals from a {@link IcebergArrowVectors.DecimalArrowVector}; null rows yield null. */
+  private class DecimalAccessor extends ArrowVectorAccessor {
+    private final IcebergArrowVectors.DecimalArrowVector vector;
+
+    DecimalAccessor(IcebergArrowVectors.DecimalArrowVector vector) {
+      super(vector);
+      this.vector = vector;
+    }
+
+    @Override
+    final Decimal getDecimal(int rowId, int precision, int scale) {
+      if (isNullAt(rowId)) {
+        return null;
+      }
+      return Decimal.apply(vector.getObject(rowId), precision, scale);
+    }
+  }
+
+  /** Reads strings from a {@link IcebergArrowVectors.VarcharArrowVector} through a reusable holder. */
+  private class StringAccessor extends ArrowVectorAccessor {
+    private final IcebergArrowVectors.VarcharArrowVector vector;
+    private final NullableVarCharHolder stringResult = new NullableVarCharHolder();
+
+    StringAccessor(IcebergArrowVectors.VarcharArrowVector vector) {
+      super(vector);
+      this.vector = vector;
+    }
+
+    @Override
+    final UTF8String getUTF8String(int rowId) {
+      vector.get(rowId, stringResult);
+      if (stringResult.isSet == 0) {
+        return null;
+      }
+      // The returned string points at the Arrow buffer's memory address rather than at a copy of the bytes.
+      return UTF8String.fromAddress(
+          null, stringResult.buffer.memoryAddress() + stringResult.start, stringResult.end - stringResult.start);
+    }
+  }
+
+  /** Reads strings by decoding the dictionary id stored at each row; null rows yield null. */
+  private class DictionaryStringAccessor extends ArrowVectorAccessor {
+    private final IntVector vector;
+
+    DictionaryStringAccessor(IntVector vector) {
+      super(vector);
+      this.vector = vector;
+    }
+
+    @Override
+    final UTF8String getUTF8String(int rowId) {
+      if (isNullAt(rowId)) {
+        return null;
+      }
+      Binary binary = dictionary.decodeToBinary(vector.get(rowId));
+      return UTF8String.fromBytes(binary.getBytesUnsafe());
+    }
+  }
+
+  /** Reads fixed-size binary values; currently not instantiated by getVectorAccessor. */
+  private class FixedSizeBinaryAccessor extends ArrowVectorAccessor {
+    private final FixedSizeBinaryVector vector;
+
+    FixedSizeBinaryAccessor(FixedSizeBinaryVector vector) {
+      super(vector);
+      this.vector = vector;
+    }
+
+    @Override
+    final byte[] getBinary(int rowId) {
+      return vector.getObject(rowId);
+    }
+  }
+
+  /** Reads variable-length binary values from a {@link VarBinaryVector}. */
+  private class BinaryAccessor extends ArrowVectorAccessor {
+    private final VarBinaryVector vector;
+
+    BinaryAccessor(VarBinaryVector vector) {
+      super(vector);
+      this.vector = vector;
+    }
+
+    @Override
+    final byte[] getBinary(int rowId) {
+      return vector.getObject(rowId);
+    }
+  }
+
+  /** Reads binary values by decoding the dictionary id stored at each row. */
+  private class DictionaryBinaryAccessor extends ArrowVectorAccessor {
+    private final IntVector vector;
+
+    DictionaryBinaryAccessor(IntVector vector) {
+      super(vector);
+      this.vector = vector;
+    }
+
+    @Override
+    final byte[] getBinary(int rowId) {
+      Binary binary = dictionary.decodeToBinary(vector.get(rowId));
+      return binary.getBytesUnsafe();
+    }
+  }
+
+  /** Reads dates as the int stored in a {@link DateDayVector}. */
+  private class DateAccessor extends ArrowVectorAccessor {
+    private final DateDayVector vector;
+
+    DateAccessor(DateDayVector vector) {
+      super(vector);
+      this.vector = vector;
+    }
+
+    @Override
+    final int getInt(int rowId) {
+      return vector.get(rowId);
+    }
+  }
+
+  /** Date-named variant of {@link DictionaryIntAccessor}; currently not instantiated by getVectorAccessor. */
+  private class DictionaryDateAccessor extends DictionaryIntAccessor {
+    DictionaryDateAccessor(IntVector vector) {
+      super(vector);
+    }
+  }
+
+  /** Reads timestamps as the long stored in a {@link TimeStampMicroTZVector}. */
+  private class TimestampAccessor extends ArrowVectorAccessor {
+    private final TimeStampMicroTZVector vector;
+
+    TimestampAccessor(TimeStampMicroTZVector vector) {
+      super(vector);
+      this.vector = vector;
+    }
+
+    @Override
+    final long getLong(int rowId) {
+      return vector.get(rowId);
+    }
+  }
+
+  /** Timestamp-named variant of {@link DictionaryLongAccessor}; currently not instantiated by getVectorAccessor. */
+  private class DictionaryTimestampAccessor extends DictionaryLongAccessor {
+    DictionaryTimestampAccessor(IntVector vector) {
+      super(vector);
+    }
+  }
+
+  /** Exposes each row of a {@link ListVector} as a slice of the list's data vector. */
+  private class ArrayAccessor extends ArrowVectorAccessor {
+    private final ListVector vector;
+    private final ArrowColumnVector arrayData;
+
+    ArrayAccessor(ListVector vector) {
+      super(vector);
+      this.vector = vector;
+      this.arrayData = new ArrowColumnVector(vector.getDataVector());
+    }
+
+    @Override
+    final ColumnarArray getArray(int rowId) {
+      ArrowBuf offsets = vector.getOffsetBuffer();
+      int index = rowId * ListVector.OFFSET_WIDTH;
+      int start = offsets.getInt(index);
+      int end = offsets.getInt(index + ListVector.OFFSET_WIDTH);
+      return new ColumnarArray(arrayData, start, end - start);
+    }
+  }
+
+  /**
+   * Any call to a "get" method will throw UnsupportedOperationException.
+   * <p>
+   * Accessing struct values in an ArrowColumnVector doesn't use this accessor. Instead, it uses the getStruct()
+   * method defined in the parent class. Any call to a "get" method in this class is a bug in the code.
+   */
+  private class StructAccessor extends ArrowVectorAccessor {
+    StructAccessor(StructVector vector) {
+      super(vector);
+    }
+  }
+
+  /** Decodes binary-backed decimals; values too wide for a long go through BigDecimal to avoid truncation. */
+  private class DictionaryDecimalBinaryAccessor extends ArrowVectorAccessor {
+    private final IntVector vector;
+
+    DictionaryDecimalBinaryAccessor(IntVector vector, int precision, int scale) {
+      super(vector);
+      this.vector = vector;
+    }
+
+    //TODO: still need to evaluate if this is the most efficient way
+    @Override
+    final Decimal getDecimal(int rowId, int precision, int scale) {
+      if (isNullAt(rowId)) {
+        return null;
+      }
+      Binary value = dictionary.decodeToBinary(vector.get(rowId));
+      BigInteger unscaledValue = new BigInteger(value.getBytesUnsafe());
+      if (precision <= Decimal.MAX_LONG_DIGITS()) {
+        return Decimal.apply(unscaledValue.longValue(), precision, scale);
+      }
+      return Decimal.apply(new java.math.BigDecimal(unscaledValue, scale), precision, scale);
+    }
+  }
+
+  /** Decodes decimals whose dictionary entries are unscaled long values. */
+  private class DictionaryDecimalLongAccessor extends ArrowVectorAccessor {
+    private final IntVector vector;
+
+    DictionaryDecimalLongAccessor(IntVector vector, int precision, int scale) {
+      super(vector);
+      this.vector = vector;
+    }
+
+    //TODO: still need to evaluate if this is the most efficient way
+    @Override
+    final Decimal getDecimal(int rowId, int precision, int scale) {
+      if (isNullAt(rowId)) {
+        return null;
+      }
+      long unscaledValue = dictionary.decodeToLong(vector.get(rowId));
+      return Decimal.apply(unscaledValue, precision, scale);
+    }
+  }
+
+  /** Decodes decimals whose dictionary entries are unscaled int values. */
+  private class DictionaryDecimalIntAccessor extends ArrowVectorAccessor {
+    private final IntVector vector;
+
+    DictionaryDecimalIntAccessor(IntVector vector, int precision, int scale) {
+      super(vector);
+      this.vector = vector;
+    }
+
+    @Override
+    final Decimal getDecimal(int rowId, int precision, int scale) {
+      if (isNullAt(rowId)) {
+        return null;
+      }
+      int unscaledValue = dictionary.decodeToInt(vector.get(rowId));
+      return Decimal.apply(unscaledValue, precision, scale);
+    }
+  }
+}
diff --git a/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/NullValuesColumnVector.java b/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/NullValuesColumnVector.java
new file mode 100644
index 000000000000..933fd8c00927
--- /dev/null
+++ b/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/NullValuesColumnVector.java
@@ -0,0 +1,130 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.iceberg.spark.data.vectorized;
+
+import org.apache.arrow.vector.types.pojo.ArrowType;
+import org.apache.arrow.vector.types.pojo.Field;
+import org.apache.arrow.vector.types.pojo.FieldType;
+import org.apache.iceberg.spark.arrow.ArrowUtils;
+import org.apache.spark.sql.types.Decimal;
+import org.apache.spark.sql.vectorized.ColumnVector;
+import org.apache.spark.sql.vectorized.ColumnarArray;
+import org.apache.spark.sql.vectorized.ColumnarMap;
+import org.apache.spark.unsafe.types.UTF8String;
+
+/** A {@link ColumnVector} that reports null for every row; all value getters are unsupported. */
+public class NullValuesColumnVector extends ColumnVector {
+  private static final String NULL_FIELD_NAME = "NULL_FIELD";
+  private static final Field NULL_ARROW_FIELD = new Field(
+      NULL_FIELD_NAME,
+      new FieldType(true, new ArrowType.Int(Integer.SIZE, true), null, null),
+      null);
+  private final int numNulls;
+
+  public NullValuesColumnVector(int nValues) {
+    super(ArrowUtils.instance().fromArrowField(NULL_ARROW_FIELD));
+    this.numNulls = nValues;
+  }
+
+  @Override
+  public void close() {
+    // no resources are held by this vector
+  }
+
+  @Override
+  public boolean hasNull() {
+    return true;
+  }
+
+  @Override
+  public int numNulls() {
+    return numNulls;
+  }
+
+  @Override
+  public boolean isNullAt(int rowId) {
+    return true;
+  }
+
+  @Override
+  public boolean getBoolean(int rowId) {
+    throw new UnsupportedOperationException();
+  }
+
+  @Override
+  public byte getByte(int rowId) {
+    throw new UnsupportedOperationException();
+  }
+
+  @Override
+  public short getShort(int rowId) {
+    throw new UnsupportedOperationException();
+  }
+
+  @Override
+  public int getInt(int rowId) {
+    throw new UnsupportedOperationException();
+  }
+
+  @Override
+  public long getLong(int rowId) {
+    throw new UnsupportedOperationException();
+  }
+
+  @Override
+  public float getFloat(int rowId) {
+    throw new UnsupportedOperationException();
+  }
+
+  @Override
+  public double getDouble(int rowId) {
+    throw new UnsupportedOperationException();
+  }
+
+  @Override
+  public ColumnarArray getArray(int rowId) {
+    throw new UnsupportedOperationException();
+  }
+
+  @Override
+  public ColumnarMap getMap(int ordinal) {
+    throw new UnsupportedOperationException();
+  }
+
+  @Override
+  public Decimal getDecimal(int rowId, int precision, int scale) {
+    throw new UnsupportedOperationException();
+  }
+
+  @Override
+  public UTF8String getUTF8String(int rowId) {
+    throw new UnsupportedOperationException();
+  }
+
+  @Override
+  public byte[] getBinary(int rowId) {
+    throw new UnsupportedOperationException();
+  }
+
+  @Override
+  protected ColumnVector getChild(int ordinal) {
+    throw new UnsupportedOperationException();
+  }
+}
diff --git a/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/VectorizedSparkParquetReaders.java b/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/VectorizedSparkParquetReaders.java
new file mode 100644
index 000000000000..61fe6d664fd5
--- /dev/null
+++ b/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/VectorizedSparkParquetReaders.java
@@ -0,0 +1,146 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.iceberg.spark.data.vectorized;
+
+import com.google.common.collect.ImmutableList;
+import com.google.common.collect.Lists;
+import com.google.common.collect.Maps;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+import org.apache.arrow.memory.BufferAllocator;
+import org.apache.iceberg.Schema;
+import org.apache.iceberg.arrow.vectorized.VectorizedArrowReader;
+import org.apache.iceberg.parquet.TypeWithSchemaVisitor;
+import org.apache.iceberg.parquet.VectorizedReader;
+import org.apache.iceberg.spark.arrow.ArrowUtils;
+import org.apache.iceberg.types.Types;
+import org.apache.parquet.column.ColumnDescriptor;
+import org.apache.parquet.schema.GroupType;
+import org.apache.parquet.schema.MessageType;
+import org.apache.parquet.schema.PrimitiveType;
+import org.apache.parquet.schema.Type;
+
+/**
+ * Factory for vectorized Parquet readers that return {@link ColumnarBatchReaders}.
+ */
+public class VectorizedSparkParquetReaders {
+  private VectorizedSparkParquetReaders() {
+  }
+
+  /**
+   * Visits the file schema with the expected schema and assembles a reader for batches of columns.
+   *
+   * @param tableSchema schema used to look up the Iceberg field for each Parquet column id
+   * @param expectedSchema schema whose struct drives the visit and the order of the column readers
+   * @param fileSchema Parquet schema of the file being read
+   * @param recordsPerBatch batch size passed to the readers; it is unboxed, so it must not be null
+   * @return the reader that the visitor produced for the file's message type
+   */
+  @SuppressWarnings("unchecked")
+  public static ColumnarBatchReaders buildReader(
+      Schema tableSchema,
+      Schema expectedSchema,
+      MessageType fileSchema,
+      Integer recordsPerBatch) {
+    Types.StructType expectedStruct = expectedSchema.asStruct();
+    VectorizedReaderBuilder visitor =
+        new VectorizedReaderBuilder(tableSchema, expectedSchema, fileSchema, recordsPerBatch);
+    return (ColumnarBatchReaders) TypeWithSchemaVisitor.visit(expectedStruct, fileSchema, visitor);
+  }
+
+  private static class VectorizedReaderBuilder extends TypeWithSchemaVisitor<VectorizedReader> {
+    private final MessageType parquetSchema;
+    private final Schema tableIcebergSchema;
+    private final BufferAllocator rootAllocator;
+    private final int batchSize;
+
+    VectorizedReaderBuilder(
+        Schema tableSchema,
+        Schema projectedIcebergSchema,
+        MessageType parquetSchema,
+        int bSize) {
+      this.parquetSchema = parquetSchema;
+      this.tableIcebergSchema = tableSchema;
+      this.batchSize = bSize;
+      this.rootAllocator = ArrowUtils.instance().rootAllocator()
+          .newChildAllocator("VectorizedReadBuilder", 0, Long.MAX_VALUE);
+    }
+
+    @Override
+    public VectorizedReader message(
+        Types.StructType expected, MessageType message, List<VectorizedReader> fieldReaders) {
+      return struct(expected, message.asGroupType(), fieldReaders);
+    }
+
+    @Override
+    public VectorizedReader struct(
+        Types.StructType expected, GroupType struct, List<VectorizedReader> fieldReaders) {
+      // index the readers of the file's fields by field id
+      Map<Integer, VectorizedReader> readersById = Maps.newHashMap();
+      int position = 0;
+      for (Type parquetField : struct.getFields()) {
+        readersById.put(parquetField.getId().intValue(), fieldReaders.get(position));
+        position += 1;
+      }
+
+      List<Types.NestedField> icebergFields = expected == null ? ImmutableList.of() : expected.fields();
+      List<VectorizedReader> reorderedFields = Lists.newArrayListWithExpectedSize(icebergFields.size());
+
+      // emit readers in the expected order, substituting NULL_VALUES_READER when a field has no reader
+      for (Types.NestedField field : icebergFields) {
+        VectorizedReader reader = readersById.get(field.fieldId());
+        reorderedFields.add(reader != null ? reader : VectorizedArrowReader.NULL_VALUES_READER);
+      }
+      return new ColumnarBatchReaders(reorderedFields, batchSize);
+    }
+
+    @Override
+    public VectorizedReader primitive(
+        org.apache.iceberg.types.Type.PrimitiveType expected,
+        PrimitiveType primitive) {
+      int fieldId = primitive.getId().intValue();
+      ColumnDescriptor column = parquetSchema.getColumnDescription(currentPath());
+      if (column.getMaxRepetitionLevel() > 0) {
+        // Nested types not yet supported for vectorized reads
+        return null;
+      }
+
+      // the reader is created from the column descriptor and the table schema's field with this id
+      Types.NestedField icebergField = tableIcebergSchema.findField(fieldId);
+      return new VectorizedArrowReader(
+          column, icebergField, rootAllocator, batchSize, /* setArrowValidityVector */ false);
+    }
+
+    /** Returns the names held in {@code fieldNames}, in descending-iterator order, as a column path. */
+    private String[] currentPath() {
+      String[] path = new String[fieldNames.size()];
+      int index = 0;
+      for (Iterator<String> names = fieldNames.descendingIterator(); names.hasNext(); index += 1) {
+        path[index] = names.next();
+      }
+      return path;
+    }
+
+    protected MessageType type() {
+      return parquetSchema;
+    }
+  }
+}
diff --git a/spark/src/main/java/org/apache/iceberg/spark/source/BaseTaskDataReader.java b/spark/src/main/java/org/apache/iceberg/spark/source/BaseTaskDataReader.java
new file mode 100644
index 000000000000..fff2c20dff5f
--- /dev/null
+++ b/spark/src/main/java/org/apache/iceberg/spark/source/BaseTaskDataReader.java
@@ -0,0 +1,115 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.iceberg.spark.source;
+
+import com.google.common.collect.ImmutableMap;
+import com.google.common.collect.Iterables;
+import java.io.Closeable;
+import java.io.IOException;
+import java.util.Iterator;
+import java.util.Map;
+import org.apache.iceberg.CombinedScanTask;
+import org.apache.iceberg.FileScanTask;
+import org.apache.iceberg.Schema;
+import org.apache.iceberg.common.DynMethods;
+import org.apache.iceberg.encryption.EncryptedFiles;
+import org.apache.iceberg.encryption.EncryptionManager;
+import org.apache.iceberg.io.FileIO;
+import org.apache.iceberg.io.InputFile;
+import org.apache.spark.rdd.InputFileBlockHolder;
+import org.apache.spark.sql.catalyst.InternalRow;
+import org.apache.spark.sql.catalyst.expressions.UnsafeProjection;
+
+/** Base reader that iterates over the values of each file scan task in a combined scan task. */
+@SuppressWarnings("checkstyle:VisibilityModifier")
+abstract class BaseTaskDataReader<T> implements Closeable {
+  // for some reason, the apply method can't be called from Java without reflection
+  static final DynMethods.UnboundMethod APPLY_PROJECTION = DynMethods.builder("apply")
+      .impl(UnsafeProjection.class, InternalRow.class)
+      .build();
+
+  final Iterator<FileScanTask> tasks;
+  final Schema tableSchema;
+  final Schema expectedSchema;
+  final FileIO fileIo;
+  final Map<String, InputFile> inputFiles;
+  final boolean caseSensitive;
+  final int batchSize;
+
+  // open implementations must assign currentCloseable: next() and close() close it unconditionally
+  Iterator<T> currentIterator;
+  Closeable currentCloseable = null;
+  T current = null;
+
+  BaseTaskDataReader(
+      CombinedScanTask task, Schema tableSchema, Schema expectedSchema, FileIO fileIo,
+      EncryptionManager encryptionManager, boolean caseSensitive) {
+    this(task, tableSchema, expectedSchema, fileIo, encryptionManager, caseSensitive, -1);
+  }
+
+  /** Maps each decrypted input file by its location, then opens the first file scan task. */
+  BaseTaskDataReader(
+      CombinedScanTask task, Schema tableSchema, Schema expectedSchema, FileIO fileIo,
+      EncryptionManager encryptionManager, boolean caseSensitive, int bSize) {
+    this.fileIo = fileIo;
+    this.tasks = task.files().iterator();
+    this.tableSchema = tableSchema;
+    this.expectedSchema = expectedSchema;
+    this.caseSensitive = caseSensitive;
+    this.batchSize = bSize;
+    Iterable<InputFile> decryptedFiles = encryptionManager.decrypt(Iterables.transform(
+        task.files(),
+        fileScanTask ->
+            EncryptedFiles.encryptedInput(
+                this.fileIo.newInputFile(fileScanTask.file().path().toString()),
+                fileScanTask.file().keyMetadata())));
+    ImmutableMap.Builder<String, InputFile> filesByLocation = ImmutableMap.builder();
+    for (InputFile decrypted : decryptedFiles) {
+      filesByLocation.put(decrypted.location(), decrypted);
+    }
+    this.inputFiles = filesByLocation.build();
+    // open last because the schemas, fileIo and batchSize must be set
+    this.currentIterator = open(tasks.next());
+  }
+
+  public boolean next() throws IOException {
+    while (!currentIterator.hasNext()) {
+      if (!tasks.hasNext()) {
+        return false;
+      }
+      this.currentCloseable.close();
+      this.currentIterator = open(tasks.next());
+    }
+    this.current = currentIterator.next();
+    return true;
+  }
+
+  abstract Iterator<T> open(FileScanTask task);
+
+  @Override
+  public void close() throws IOException {
+    InputFileBlockHolder.unset();
+    this.currentCloseable.close();
+    // exhaust the task iterator
+    while (tasks.hasNext()) {
+      tasks.next();
+    }
+  }
+}
diff --git a/spark/src/main/java/org/apache/iceberg/spark/source/ColumnarBatchTaskDataReader.java b/spark/src/main/java/org/apache/iceberg/spark/source/ColumnarBatchTaskDataReader.java
new file mode 100644
index 000000000000..aa470759cb43
--- /dev/null
+++ b/spark/src/main/java/org/apache/iceberg/spark/source/ColumnarBatchTaskDataReader.java
@@ -0,0 +1,96 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.iceberg.spark.source;
+
+import com.google.common.base.Preconditions;
+import java.util.Iterator;
+import org.apache.iceberg.CombinedScanTask;
+import org.apache.iceberg.FileFormat;
+import org.apache.iceberg.FileScanTask;
+import org.apache.iceberg.Schema;
+import org.apache.iceberg.encryption.EncryptionManager;
+import org.apache.iceberg.io.CloseableIterable;
+import org.apache.iceberg.io.FileIO;
+import org.apache.iceberg.io.InputFile;
+import org.apache.iceberg.parquet.Parquet;
+import org.apache.iceberg.spark.SparkSchemaUtil;
+import org.apache.iceberg.spark.data.vectorized.VectorizedSparkParquetReaders;
+import org.apache.spark.sql.sources.v2.reader.InputPartitionReader;
+import org.apache.spark.sql.types.StructType;
+import org.apache.spark.sql.vectorized.ColumnarBatch;
+
+/** Reads the Parquet files of a combined scan task as Spark {@link ColumnarBatch}es. */
+class ColumnarBatchTaskDataReader extends BaseTaskDataReader<ColumnarBatch>
+    implements InputPartitionReader<ColumnarBatch> {
+
+  ColumnarBatchTaskDataReader(
+      CombinedScanTask task, Schema tableSchema, Schema expectedSchema, FileIO fileIo,
+      EncryptionManager encryptionManager, boolean caseSensitive, int bSize) {
+    super(task, tableSchema, expectedSchema, fileIo, encryptionManager, caseSensitive, bSize);
+  }
+
+  @Override
+  public ColumnarBatch get() {
+    return current;
+  }
+
+  @Override
+  Iterator<ColumnarBatch> open(FileScanTask task) {
+    // schema needed for the projection and filtering
+    StructType sparkType = SparkSchemaUtil.convert(expectedSchema);
+    Schema requiredSchema = SparkSchemaUtil.prune(tableSchema, sparkType, task.residual(), caseSensitive);
+    // read the pruned schema only when its column count differs from the expected schema's
+    boolean hasExtraFilterColumns = requiredSchema.columns().size() != expectedSchema.columns().size();
+    return open(task, hasExtraFilterColumns ? requiredSchema : expectedSchema);
+  }
+
+  /**
+   * Opens a batch iterator over the task's split of its data file.
+   *
+   * @param task the file scan task to read
+   * @param readSchema the schema to project from the file
+   * @return an iterator of batches; the backing iterable is stored in {@code currentCloseable}
+   * @throws UnsupportedOperationException if the file format is not Parquet
+   */
+  private Iterator<ColumnarBatch> open(FileScanTask task, Schema readSchema) {
+    InputFile location = inputFiles.get(task.file().path().toString());
+    Preconditions.checkNotNull(location, "Could not find InputFile associated with FileScanTask");
+    if (task.file().format() != FileFormat.PARQUET) {
+      throw new UnsupportedOperationException(
+          "Format: " + task.file().format() + " not supported for batched reads");
+    }
+
+    CloseableIterable<ColumnarBatch> batches = Parquet.read(location)
+        .project(readSchema)
+        .split(task.start(), task.length())
+        .createBatchedReaderFunc(fileSchema -> VectorizedSparkParquetReaders.buildReader(tableSchema, readSchema,
+            fileSchema, batchSize))
+        .filter(task.residual())
+        .caseSensitive(caseSensitive)
+        .recordsPerBatch(batchSize)
+        // Spark eagerly consumes the batches so the underlying memory allocated could be reused
+        // without worrying about subsequent reads clobbering over each other. This improves
+        // read performance as every batch read doesn't have to pay the cost of allocating memory.
+        .reuseContainers()
+        .build();
+    this.currentCloseable = batches;
+    return batches.iterator();
+  }
+}
diff --git a/spark/src/main/java/org/apache/iceberg/spark/source/InternalRowTaskDataReader.java b/spark/src/main/java/org/apache/iceberg/spark/source/InternalRowTaskDataReader.java
new file mode 100644
index 000000000000..65563c16bc5c
--- /dev/null
+++ b/spark/src/main/java/org/apache/iceberg/spark/source/InternalRowTaskDataReader.java
@@ -0,0 +1,296 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.iceberg.spark.source;
+
+import com.google.common.base.Preconditions;
+import com.google.common.collect.Iterators;
+import com.google.common.collect.Lists;
+import java.nio.ByteBuffer;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Set;
+import java.util.function.Function;
+import org.apache.iceberg.CombinedScanTask;
+import org.apache.iceberg.DataFile;
+import org.apache.iceberg.DataTask;
+import org.apache.iceberg.FileScanTask;
+import org.apache.iceberg.PartitionField;
+import org.apache.iceberg.PartitionSpec;
+import org.apache.iceberg.Schema;
+import org.apache.iceberg.StructLike;
+import org.apache.iceberg.avro.Avro;
+import org.apache.iceberg.encryption.EncryptionManager;
+import org.apache.iceberg.io.CloseableIterable;
+import org.apache.iceberg.io.FileIO;
+import org.apache.iceberg.io.InputFile;
+import org.apache.iceberg.orc.ORC;
+import org.apache.iceberg.parquet.Parquet;
+import org.apache.iceberg.spark.SparkSchemaUtil;
+import org.apache.iceberg.spark.data.SparkAvroReader;
+import org.apache.iceberg.spark.data.SparkOrcReader;
+import org.apache.iceberg.spark.data.SparkParquetReaders;
+import org.apache.iceberg.types.TypeUtil;
+import org.apache.iceberg.types.Types;
+import org.apache.iceberg.util.ByteBuffers;
+import org.apache.spark.rdd.InputFileBlockHolder;
+import org.apache.spark.sql.catalyst.InternalRow;
+import org.apache.spark.sql.catalyst.expressions.Attribute;
+import org.apache.spark.sql.catalyst.expressions.AttributeReference;
+import org.apache.spark.sql.catalyst.expressions.GenericInternalRow;
+import org.apache.spark.sql.catalyst.expressions.JoinedRow;
+import org.apache.spark.sql.catalyst.expressions.UnsafeProjection;
+import org.apache.spark.sql.sources.v2.reader.InputPartitionReader;
+import org.apache.spark.sql.types.BinaryType;
+import org.apache.spark.sql.types.DataType;
+import org.apache.spark.sql.types.Decimal;
+import org.apache.spark.sql.types.DecimalType;
+import org.apache.spark.sql.types.StringType;
+import org.apache.spark.sql.types.StructField;
+import org.apache.spark.sql.types.StructType;
+import org.apache.spark.unsafe.types.UTF8String;
+import scala.collection.JavaConverters;
+
+/**
+ * Reads the rows of a combined scan task as Spark {@link InternalRow}s projected to the expected schema.
+ */
+class InternalRowTaskDataReader extends BaseTaskDataReader<InternalRow> implements InputPartitionReader<InternalRow> {
+
+  InternalRowTaskDataReader(
+      CombinedScanTask task, Schema tableSchema, Schema expectedSchema, FileIO fileIo,
+      EncryptionManager encryptionManager, boolean caseSensitive) {
+    super(task, tableSchema, expectedSchema, fileIo, encryptionManager, caseSensitive);
+  }
+
+  @Override
+  public InternalRow get() {
+    return current;
+  }
+
+  @Override
+  Iterator<InternalRow> open(FileScanTask task) {
+    DataFile file = task.file();
+
+    // update the current file for Spark's filename() function
+    InputFileBlockHolder.set(file.path().toString(), task.start(), task.length());
+
+    PartitionSpec spec = task.spec();
+    Set<Integer> idColumns = spec.identitySourceIds();
+
+    // schema needed for the projection and filtering
+    StructType sparkType = SparkSchemaUtil.convert(expectedSchema);
+    Schema requiredSchema = SparkSchemaUtil.prune(tableSchema, sparkType, task.residual(), caseSensitive);
+
+    Schema iterSchema;
+    Iterator<InternalRow> iter;
+    if (!idColumns.isEmpty()) {
+      // identity partition columns are taken from the file's partition tuple, not read from the data file
+      Schema readSchema = TypeUtil.selectNot(requiredSchema, idColumns);
+      Schema partitionSchema = TypeUtil.select(requiredSchema, idColumns);
+      PartitionRowConverter convertToRow = new PartitionRowConverter(partitionSchema, spec);
+      InternalRow partition = convertToRow.apply(file.partition());
+
+      // the right side of the joined row is fixed; the left side is replaced for every data row
+      JoinedRow joined = new JoinedRow();
+      joined.withRight(partition);
+
+      // create joined rows and project from the joined schema to the final schema
+      iterSchema = TypeUtil.join(readSchema, partitionSchema);
+      iter = Iterators.transform(open(task, readSchema), joined::withLeft);
+    } else {
+      // without partition columns, read the pruned schema only if it has extra filter columns
+      boolean hasExtraFilterColumns = requiredSchema.columns().size() != expectedSchema.columns().size();
+      iterSchema = hasExtraFilterColumns ? requiredSchema : expectedSchema;
+      iter = open(task, iterSchema);
+    }
+
+    // TODO: remove the projection by reporting the iterator's schema back to Spark
+    return Iterators.transform(
+        iter,
+        APPLY_PROJECTION.bind(projection(expectedSchema, iterSchema))::invoke);
+  }
+
+  /** Opens the task with the reader for its file format and records the iterable so it can be closed. */
+  private Iterator<InternalRow> open(FileScanTask task, Schema readSchema) {
+    CloseableIterable<InternalRow> rows;
+    //TODO: samarth can there be a data task for columnar batch counterpart?
+    if (!task.isDataTask()) {
+      InputFile location = inputFiles.get(task.file().path().toString());
+      Preconditions.checkNotNull(location, "Could not find InputFile associated with FileScanTask");
+
+      switch (task.file().format()) {
+        case PARQUET:
+          rows = newParquetIterable(location, task, readSchema);
+          break;
+        case AVRO:
+          rows = newAvroIterable(location, task, readSchema);
+          break;
+        case ORC:
+          rows = newOrcIterable(location, task, readSchema);
+          break;
+        default:
+          throw new UnsupportedOperationException(
+              "Cannot read unknown format: " + task.file().format());
+      }
+    } else {
+      rows = newDataIterable(task.asDataTask(), readSchema);
+    }
+
+    // remember the iterable so that the base reader can close it
+    this.currentCloseable = rows;
+    return rows.iterator();
+  }
+
+  /** Builds an Avro read of the task's split with container reuse enabled. */
+  private CloseableIterable<InternalRow> newAvroIterable(
+      InputFile location, FileScanTask task, Schema readSchema) {
+    return Avro.read(location)
+        .reuseContainers()
+        .project(readSchema)
+        .split(task.start(), task.length())
+        .createReaderFunc(SparkAvroReader::new)
+        .build();
+  }
+
+  /** Builds a Parquet read of the task's split, passing the task's residual filter to the reader. */
+  private CloseableIterable<InternalRow> newParquetIterable(
+      InputFile location, FileScanTask task, Schema readSchema) {
+    return Parquet.read(location)
+        .project(readSchema)
+        .split(task.start(), task.length())
+        .createReaderFunc(fileSchema -> SparkParquetReaders.buildReader(readSchema, fileSchema))
+        .filter(task.residual())
+        .caseSensitive(caseSensitive)
+        .build();
+  }
+
+  /** Builds an ORC read of the task's split. */
+  private CloseableIterable<InternalRow> newOrcIterable(
+      InputFile location, FileScanTask task, Schema readSchema) {
+    return ORC.read(location)
+        .schema(readSchema)
+        .split(task.start(), task.length())
+        .createReaderFunc(SparkOrcReader::new)
+        .caseSensitive(caseSensitive)
+        .build();
+  }
+
+  /** Wraps the rows of a data task as Spark rows and projects them from the table schema to the read schema. */
+  private CloseableIterable<InternalRow> newDataIterable(DataTask task, Schema readSchema) {
+    StructInternalRow reusedRow = new StructInternalRow(tableSchema.asStruct());
+    CloseableIterable<InternalRow> sparkRows = CloseableIterable.transform(
+        task.asDataTask().rows(), reusedRow::setStruct);
+    UnsafeProjection toReadSchema = projection(readSchema, tableSchema);
+    return CloseableIterable.transform(sparkRows, APPLY_PROJECTION.bind(toReadSchema)::invoke);
+  }
+
+  /** Creates a projection that reorders rows laid out as {@code readSchema} into {@code finalSchema}, by name. */
+  private static UnsafeProjection projection(Schema finalSchema, Schema readSchema) {
+    StructType struct = SparkSchemaUtil.convert(readSchema);
+    int numFields = struct.fields().length;
+
+    List<AttributeReference> refs = JavaConverters.seqAsJavaListConverter(struct.toAttributes()).asJava();
+    List<Attribute> attrs = Lists.newArrayListWithExpectedSize(numFields);
+    for (AttributeReference ref : refs) {
+      attrs.add(ref.toAttribute());
+    }
+
+    // select the read attributes in the order of the final schema, matching columns by name
+    List<org.apache.spark.sql.catalyst.expressions.Expression> exprs =
+        Lists.newArrayListWithExpectedSize(numFields);
+    for (Types.NestedField field : finalSchema.columns()) {
+      exprs.add(refs.get(struct.fieldIndex(field.name())));
+    }
+
+    return UnsafeProjection.create(
+        JavaConverters.asScalaBufferConverter(exprs).asScala().toSeq(),
+        JavaConverters.asScalaBufferConverter(attrs).asScala().toSeq());
+  }
+
+  /** Converts a partition tuple into a reused Spark row holding the identity-partitioned columns. */
+  private static class PartitionRowConverter implements Function<StructLike, InternalRow> {
+    private final DataType[] types;
+    private final int[] positions;
+    private final Class<?>[] javaTypes;
+    private final GenericInternalRow reusedRow;
+
+    PartitionRowConverter(Schema partitionSchema, PartitionSpec spec) {
+      StructType partitionType = SparkSchemaUtil.convert(partitionSchema);
+      StructField[] fields = partitionType.fields();
+
+      this.types = new DataType[fields.length];
+      this.positions = new int[types.length];
+      this.javaTypes = new Class<?>[types.length];
+      this.reusedRow = new GenericInternalRow(types.length);
+
+      List<PartitionField> partitionFields = spec.fields();
+      for (int rowIndex = 0; rowIndex < fields.length; rowIndex += 1) {
+        this.types[rowIndex] = fields[rowIndex].dataType();
+
+        // locate the identity partition field whose source is this column
+        int sourceId = partitionSchema.columns().get(rowIndex).fieldId();
+        for (int specIndex = 0; specIndex < partitionFields.size(); specIndex += 1) {
+          PartitionField field = partitionFields.get(specIndex);
+          if (field.sourceId() != sourceId || !"identity".equals(field.transform().toString())) {
+            continue;
+          }
+
+          positions[rowIndex] = specIndex;
+          javaTypes[rowIndex] = spec.javaClasses()[specIndex];
+          break;
+        }
+      }
+    }
+
+    @Override
+    public InternalRow apply(StructLike tuple) {
+      // copy each partition value into the reused row, converting it to Spark's representation
+      for (int i = 0; i < types.length; i += 1) {
+        Object value = tuple.get(positions[i], javaTypes[i]);
+        if (value == null) {
+          reusedRow.setNullAt(i);
+        } else {
+          reusedRow.update(i, convert(value, types[i]));
+        }
+      }
+
+      return reusedRow;
+    }
+
+    /**
+     * Converts the objects into instances used by Spark's InternalRow.
+     *
+     * @param value a data value
+     * @param type the Spark data type
+     * @return the value converted to the representation expected by Spark's InternalRow.
+     */
+    private static Object convert(Object value, DataType type) {
+      if (type instanceof StringType) {
+        return UTF8String.fromString(value.toString());
+      }
+      if (type instanceof BinaryType) {
+        return ByteBuffers.toByteArray((ByteBuffer) value);
+      }
+      if (type instanceof DecimalType) {
+        return Decimal.fromDecimal(value);
+      }
+      return value;
+    }
+  }
+}
diff --git a/spark/src/main/java/org/apache/iceberg/spark/source/Reader.java b/spark/src/main/java/org/apache/iceberg/spark/source/Reader.java
index 1f3d26e4b185..0f38df328ce3 100644
--- a/spark/src/main/java/org/apache/iceberg/spark/source/Reader.java
+++ b/spark/src/main/java/org/apache/iceberg/spark/source/Reader.java
@@ -19,15 +19,18 @@
 
 package org.apache.iceberg.spark.source;
 
+import com.google.common.base.Preconditions;
 import java.io.IOException;
 import java.io.Serializable;
 import java.util.List;
 import java.util.Locale;
 import java.util.Map;
+import java.util.Optional;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.iceberg.CombinedScanTask;
+import org.apache.iceberg.FileFormat;
 import org.apache.iceberg.FileScanTask;
 import org.apache.iceberg.Schema;
 import org.apache.iceberg.SchemaParser;
@@ -35,6 +38,7 @@
 import org.apache.iceberg.Table;
 import org.apache.iceberg.TableProperties;
 import org.apache.iceberg.TableScan;
+import org.apache.iceberg.arrow.vectorized.VectorizedArrowReader;
 import org.apache.iceberg.encryption.EncryptionManager;
 import org.apache.iceberg.exceptions.RuntimeIOException;
 import org.apache.iceberg.expressions.Expression;
@@ -59,14 +63,16 @@
 import org.apache.spark.sql.sources.v2.reader.SupportsPushDownFilters;
 import org.apache.spark.sql.sources.v2.reader.SupportsPushDownRequiredColumns;
 import org.apache.spark.sql.sources.v2.reader.SupportsReportStatistics;
+import org.apache.spark.sql.sources.v2.reader.SupportsScanColumnarBatch;
 import org.apache.spark.sql.types.DataType;
 import org.apache.spark.sql.types.StructField;
 import org.apache.spark.sql.types.StructType;
+import org.apache.spark.sql.vectorized.ColumnarBatch;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-class Reader implements DataSourceReader, SupportsPushDownFilters, SupportsPushDownRequiredColumns,
-    SupportsReportStatistics {
+class Reader implements DataSourceReader, SupportsScanColumnarBatch, SupportsPushDownFilters,
+    SupportsPushDownRequiredColumns, SupportsReportStatistics {
   private static final Logger LOG = LoggerFactory.getLogger(Reader.class);
 
   private static final Filter[] NO_FILTERS = new Filter[0];
@@ -87,14 +93,17 @@ class Reader implements DataSourceReader, SupportsPushDownFilters, SupportsPushD
   private List<Expression> filterExpressions = null;
   private Filter[] pushedFilters = NO_FILTERS;
   private final boolean localityPreferred;
+  private final int batchSize;
 
   // lazy variables
   private Schema schema = null;
   private StructType type = null; // cached because Spark accesses it multiple times
   private List<CombinedScanTask> tasks = null; // lazy cache of tasks
+  private Boolean enableBatchRead = null; // cache variable for enabling batched reads
 
-  Reader(Table table, Broadcast<FileIO> io, Broadcast<EncryptionManager> encryptionManager,
-         boolean caseSensitive, DataSourceOptions options) {
+  Reader(
+      Table table, Broadcast<FileIO> io, Broadcast<EncryptionManager> encryptionManager,
+      boolean caseSensitive, DataSourceOptions options) {
     this.table = table;
     this.snapshotId = options.get("snapshot-id").map(Long::parseLong).orElse(null);
     this.asOfTimestamp = options.get("as-of-timestamp").map(Long::parseLong).orElse(null);
@@ -145,6 +154,14 @@ class Reader implements DataSourceReader, SupportsPushDownFilters, SupportsPushD
     this.io = io;
     this.encryptionManager = encryptionManager;
     this.caseSensitive = caseSensitive;
+
+    boolean enableBatchReadsConfig =
+        options.get("iceberg.read.parquet-vectorization.enabled").map(Boolean::parseBoolean).orElse(true);
+    if (!enableBatchReadsConfig) {
+      enableBatchRead = Boolean.FALSE;
+    }
+    Optional<String> numRecordsPerBatchOpt = options.get("iceberg.read.parquet-vectorization.batch-size");
+    this.batchSize = numRecordsPerBatchOpt.map(Integer::parseInt).orElse(VectorizedArrowReader.DEFAULT_BATCH_SIZE);
   }
 
   private Schema lazySchema() {
@@ -178,6 +195,30 @@ public StructType readSchema() {
     return lazyType();
   }
 
+  /**
+   * Called in the Spark Driver to plan one {@link ColumnarBatch} input partition per combined scan task.
+   */
+  @Override
+  public List<InputPartition<ColumnarBatch>> planBatchInputPartitions() {
+    Preconditions.checkState(Boolean.TRUE.equals(enableBatchRead), "Batched reads not enabled");
+    Preconditions.checkState(batchSize > 0, "Invalid batch size");
+    String tableSchemaJson = SchemaParser.toJson(table.schema());
+    String expectedSchemaJson = SchemaParser.toJson(lazySchema());
+
+    List<InputPartition<ColumnarBatch>> partitions = Lists.newArrayList();
+    for (CombinedScanTask scanTask : tasks()) {
+      ColumnarBatchReadTask readTask = new ColumnarBatchReadTask(scanTask, tableSchemaJson, expectedSchemaJson,
+          io, encryptionManager, caseSensitive, localityPreferred, batchSize);
+      partitions.add(readTask);
+    }
+    LOG.info("Batching input partitions with {} tasks.", partitions.size());
+
+    return partitions;
+  }
+
+  /**
+   * This is called in the Spark Driver when data is to be materialized into {@link InternalRow}
+   */
   @Override
   public List<InputPartition<InternalRow>> planInputPartitions() {
     String tableSchemaString = SchemaParser.toJson(table.schema());
@@ -186,7 +227,7 @@ public List<InputPartition<InternalRow>> planInputPartitions() {
     List<InputPartition<InternalRow>> readTasks = Lists.newArrayList();
     for (CombinedScanTask task : tasks()) {
       readTasks.add(
-          new ReadTask(task, tableSchemaString, expectedSchemaString, io, encryptionManager,
+          new InternalRowReadTask(task, tableSchemaString, expectedSchemaString, io, encryptionManager,
               caseSensitive, localityPreferred));
     }
 
@@ -249,6 +290,46 @@ public Statistics estimateStatistics() {
     return new Stats(sizeInBytes, numRows);
   }
 
+  @Override
+  public boolean enableBatchRead() {
+    return lazyCheckEnableBatchRead(); // answer is computed on first use and cached in enableBatchRead
+  }
+
+  private boolean lazyCheckEnableBatchRead() {
+    if (enableBatchRead == null) {
+      // decided at most once; the constructor pre-sets FALSE when vectorization is switched off
+      this.enableBatchRead = supportsBatchRead();
+    }
+    return enableBatchRead;
+  }
+
+  // Batched reads require: only non-data tasks over Parquet files, at least one projected column,
+  // no file whose spec has identity partition source ids, and only primitive projected columns.
+  private boolean supportsBatchRead() {
+    for (CombinedScanTask scanTask : tasks()) {
+      if (scanTask.isDataTask()) {
+        return false;
+      }
+      for (FileScanTask fileTask : scanTask.files()) {
+        if (!fileTask.file().format().equals(FileFormat.PARQUET)) {
+          return false;
+        }
+      }
+    }
+    if (lazySchema().columns().isEmpty()) {
+      return false;
+    }
+    for (CombinedScanTask scanTask : tasks()) {
+      for (FileScanTask fileTask : scanTask.files()) {
+        if (!fileTask.spec().identitySourceIds().isEmpty()) {
+          return false;
+        }
+      }
+    }
+    // Enable batched reads only if all requested columns are primitive otherwise revert to row-based reads
+    return lazySchema().columns().stream().allMatch(column -> column.type().isPrimitiveType());
+  }
+
   private static void mergeIcebergHadoopConfs(
       Configuration baseConf, Map<String, String> options) {
     options.keySet().stream()
@@ -299,7 +380,7 @@ private List<CombinedScanTask> tasks() {
 
       try (CloseableIterable<CombinedScanTask> tasksIterable = scan.planTasks()) {
         this.tasks = Lists.newArrayList(tasksIterable);
-      }  catch (IOException e) {
+      } catch (IOException e) {
         throw new RuntimeIOException(e, "Failed to close table scan: %s", scan);
       }
     }
@@ -310,26 +391,27 @@ private List<CombinedScanTask> tasks() {
   @Override
   public String toString() {
     return String.format(
-        "IcebergScan(table=%s, type=%s, filters=%s, caseSensitive=%s)",
-        table, lazySchema().asStruct(), filterExpressions, caseSensitive);
+        "IcebergScan(table=%s, type=%s, filters=%s, caseSensitive=%s, batchedReads=%s)",
+        table, lazySchema().asStruct(), filterExpressions, caseSensitive, enableBatchRead());
   }
 
-  private static class ReadTask implements InputPartition<InternalRow>, Serializable {
-    private final CombinedScanTask task;
+  @SuppressWarnings("checkstyle:VisibilityModifier")
+  private abstract static class BaseReadTask<T> implements Serializable, InputPartition<T> {
+    final CombinedScanTask task;
     private final String tableSchemaString;
     private final String expectedSchemaString;
-    private final Broadcast<FileIO> io;
-    private final Broadcast<EncryptionManager> encryptionManager;
-    private final boolean caseSensitive;
+    final Broadcast<FileIO> io;
+    final Broadcast<EncryptionManager> encryptionManager;
+    final boolean caseSensitive;
     private final boolean localityPreferred;
 
     private transient Schema tableSchema = null;
     private transient Schema expectedSchema = null;
     private transient String[] preferredLocations;
 
-    private ReadTask(CombinedScanTask task, String tableSchemaString, String expectedSchemaString,
-                     Broadcast<FileIO> io, Broadcast<EncryptionManager> encryptionManager,
-                     boolean caseSensitive, boolean localityPreferred) {
+    private BaseReadTask(CombinedScanTask task, String tableSchemaString, String expectedSchemaString,
+        Broadcast<FileIO> io, Broadcast<EncryptionManager> encryptionManager,
+        boolean caseSensitive, boolean localityPreferred) {
       this.task = task;
       this.tableSchemaString = tableSchemaString;
       this.expectedSchemaString = expectedSchemaString;
@@ -340,25 +422,19 @@ private ReadTask(CombinedScanTask task, String tableSchemaString, String expecte
       this.preferredLocations = getPreferredLocations();
     }
 
-    @Override
-    public InputPartitionReader<InternalRow> createPartitionReader() {
-      return new RowDataReader(task, lazyTableSchema(), lazyExpectedSchema(), io.value(),
-        encryptionManager.value(), caseSensitive);
-    }
-
     @Override
     public String[] preferredLocations() {
       return preferredLocations;
     }
 
-    private Schema lazyTableSchema() {
+    Schema lazyTableSchema() {
       if (tableSchema == null) {
         this.tableSchema = SchemaParser.fromJson(tableSchemaString);
       }
       return tableSchema;
     }
 
-    private Schema lazyExpectedSchema() {
+    Schema lazyExpectedSchema() {
       if (expectedSchema == null) {
         this.expectedSchema = SchemaParser.fromJson(expectedSchemaString);
       }
@@ -375,6 +451,42 @@ private String[] getPreferredLocations() {
     }
   }
 
+  private static class InternalRowReadTask extends BaseReadTask<InternalRow> {
+    // Row-based input partition: all state lives in BaseReadTask, this class only picks the reader.
+    private InternalRowReadTask(
+        CombinedScanTask task, String tableSchemaString, String expectedSchemaString,
+        Broadcast<FileIO> io, Broadcast<EncryptionManager> encryptionManager,
+        boolean caseSensitive, boolean localityPreferred) {
+      super(task, tableSchemaString, expectedSchemaString, io, encryptionManager, caseSensitive, localityPreferred);
+    }
+
+    @Override
+    public InputPartitionReader<InternalRow> createPartitionReader() {
+      return new RowDataReader(task, lazyTableSchema(), lazyExpectedSchema(), io.value(),
+          encryptionManager.value(), caseSensitive);
+    }
+  }
+
+  /**
+   * An {@link InputPartition} for vectorized reads; its reader is a BatchDataReader of {@link ColumnarBatch}.
+   */
+  private static class ColumnarBatchReadTask extends BaseReadTask<ColumnarBatch> {
+    private final int batchSize;
+
+    ColumnarBatchReadTask(
+        CombinedScanTask task, String tableSchemaString, String expectedSchemaString, Broadcast<FileIO> fileIo,
+        Broadcast<EncryptionManager> encryptionManager, boolean caseSensitive, boolean localityPreferred, int size) {
+      super(task, tableSchemaString, expectedSchemaString, fileIo, encryptionManager, caseSensitive, localityPreferred);
+      this.batchSize = size;
+    }
+
+    @Override
+    public InputPartitionReader<ColumnarBatch> createPartitionReader() {
+      return new BatchDataReader(task, lazyTableSchema(), lazyExpectedSchema(), io.value(),
+          encryptionManager.value(), caseSensitive, batchSize); // batch size is handed to the reader as-is
+    }
+  }
+
   private static class StructLikeInternalRow implements StructLike {
     private final DataType[] types;
     private InternalRow row = null;
diff --git a/spark/src/test/java/org/apache/iceberg/spark/data/DictionaryData.java b/spark/src/test/java/org/apache/iceberg/spark/data/DictionaryData.java
new file mode 100644
index 000000000000..5451e55b7ff3
--- /dev/null
+++ b/spark/src/test/java/org/apache/iceberg/spark/data/DictionaryData.java
@@ -0,0 +1,239 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.iceberg.spark.data;
+
+import com.google.common.collect.Lists;
+import com.google.common.collect.Maps;
+import com.google.common.collect.Sets;
+import java.math.BigDecimal;
+import java.math.BigInteger;
+import java.nio.ByteBuffer;
+import java.util.List;
+import java.util.Map;
+import java.util.Random;
+import java.util.Set;
+import java.util.UUID;
+import java.util.function.Supplier;
+import org.apache.avro.generic.GenericData;
+import org.apache.iceberg.Schema;
+import org.apache.iceberg.avro.AvroSchemaUtil;
+import org.apache.iceberg.types.Type;
+import org.apache.iceberg.types.TypeUtil;
+import org.apache.iceberg.types.Types;
+import org.apache.spark.sql.types.Decimal;
+import org.apache.spark.unsafe.types.UTF8String;
+
+/**
+ * Generates Avro records whose primitive values are drawn from a very small set of distinct values,
+ * intended for exercising dictionary-encoded Parquet data in tests.
+ */
+public class DictionaryData {
+
+  // Upper bound on draws for a distinct map key; the key space can be as small as one value.
+  private static final int MAX_KEY_ATTEMPTS = 100;
+
+  private DictionaryData() {}
+
+  /**
+   * Generates {@code numRecords} records for {@code schema} from a generator seeded with {@code seed}.
+   *
+   * @param schema the Iceberg schema to generate records for
+   * @param numRecords the number of records to generate
+   * @param seed seed for the random source
+   * @return the generated records, in generation order
+   */
+  public static List<GenericData.Record> generateDictionaryEncodableData(Schema schema, int numRecords, long seed) {
+    List<GenericData.Record> records = Lists.newArrayListWithExpectedSize(numRecords);
+    DictionaryDataGenerator dictionaryDataGenerator = new DictionaryDataGenerator(schema, seed);
+    for (int i = 0; i < numRecords; i += 1) {
+      GenericData.Record rec = (GenericData.Record) TypeUtil.visit(schema, dictionaryDataGenerator);
+      records.add(rec);
+    }
+    return records;
+  }
+
+  /** Schema visitor that assembles one Avro record per visit from the values of generatePrimitive. */
+  private static class DictionaryDataGenerator extends TypeUtil.CustomOrderSchemaVisitor<Object> {
+    private final Map<Type, org.apache.avro.Schema> typeToSchema;
+    private final Random random;
+
+    private DictionaryDataGenerator(Schema schema, long seed) {
+      this.typeToSchema = AvroSchemaUtil.convertTypes(schema.asStruct(), "test");
+      this.random = new Random(seed);
+    }
+
+    @Override
+    public GenericData.Record schema(Schema schema, Supplier<Object> structResult) {
+      return (GenericData.Record) structResult.get();
+    }
+
+    @Override
+    public GenericData.Record struct(Types.StructType struct, Iterable<Object> fieldResults) {
+      GenericData.Record rec = new GenericData.Record(typeToSchema.get(struct));
+
+      List<Object> values = Lists.newArrayList(fieldResults);
+      for (int i = 0; i < values.size(); i += 1) {
+        rec.put(i, values.get(i));
+      }
+
+      return rec;
+    }
+
+    @Override
+    public Object field(Types.NestedField field, Supplier<Object> fieldResult) {
+      // return null 5% of the time when the value is optional
+      if (field.isOptional() && random.nextInt(20) == 1) {
+        return null;
+      }
+      return fieldResult.get();
+    }
+
+    @Override
+    public Object list(Types.ListType list, Supplier<Object> elementResult) {
+      int numElements = random.nextInt(20);
+
+      List<Object> result = Lists.newArrayListWithExpectedSize(numElements);
+      for (int i = 0; i < numElements; i += 1) {
+        // return null 5% of the time when the value is optional
+        if (list.isElementOptional() && random.nextInt(20) == 1) {
+          result.add(null);
+        } else {
+          result.add(elementResult.get());
+        }
+      }
+
+      return result;
+    }
+
+    @Override
+    public Object map(Types.MapType map, Supplier<Object> keyResult, Supplier<Object> valueResult) {
+      int numEntries = random.nextInt(20);
+
+      Map<Object, Object> result = Maps.newLinkedHashMap();
+      Set<Object> keySet = Sets.newHashSet();
+      for (int i = 0; i < numEntries; i += 1) {
+        Object key = keyResult.get();
+        // Keys come from a tiny value set, so a fresh key may not exist; retry a bounded number of
+        // times instead of looping forever, and settle for a smaller map once the keys run out.
+        for (int attempt = 1; keySet.contains(key) && attempt < MAX_KEY_ATTEMPTS; attempt += 1) {
+          key = keyResult.get();
+        }
+
+        if (!keySet.add(key)) {
+          break;
+        }
+
+        // return null 5% of the time when the value is optional
+        if (map.isValueOptional() && random.nextInt(20) == 1) {
+          result.put(key, null);
+        } else {
+          result.put(key, valueResult.get());
+        }
+      }
+
+      return result;
+    }
+
+    @Override
+    public Object primitive(Type.PrimitiveType primitive) {
+      Object result = generatePrimitive(primitive, random);
+      // For the primitives that Avro needs a different type than Spark, fix
+      // them here.
+      switch (primitive.typeId()) {
+        case STRING:
+          return ((UTF8String) result).toString();
+        case FIXED:
+          return new GenericData.Fixed(
+              typeToSchema.get(primitive),
+              (byte[]) result);
+        case BINARY:
+          return ByteBuffer.wrap((byte[]) result);
+        case UUID:
+          return UUID.nameUUIDFromBytes((byte[]) result);
+        case DECIMAL:
+          return ((Decimal) result).toJavaBigDecimal();
+        default:
+          return result;
+      }
+    }
+  }
+
+  /**
+   * Returns one of at most three distinct values for the given type, in the representation Spark uses
+   * (for example {@link UTF8String} for strings and {@link Decimal} for decimals).
+   *
+   * @param primitive the primitive type to generate a value for
+   * @param random source of randomness; exactly one {@code nextInt(3)} draw is consumed per call
+   * @return the generated value
+   * @throws IllegalArgumentException if the type has no case below
+   */
+  @SuppressWarnings("checkstyle:CyclomaticComplexity")
+  private static Object generatePrimitive(Type.PrimitiveType primitive, Random random) {
+    // Every case derives its value from this single draw, so no case can fall through to the next.
+    int choice = random.nextInt(3);
+    switch (primitive.typeId()) {
+      case BOOLEAN:
+        return true; // doesn't really matter for booleans since they are not dictionary encoded
+
+      case INTEGER:
+      case DATE:
+        return choice;
+
+      case LONG:
+      case TIME:
+      case TIMESTAMP:
+        return (long) choice;
+
+      case FLOAT:
+        return (float) choice;
+
+      case DOUBLE:
+        return (double) choice;
+
+      case STRING:
+        return UTF8String.fromString(String.valueOf(choice));
+
+      case UUID:
+        // raw bytes; DictionaryDataGenerator#primitive converts them into a java.util.UUID
+        byte[] uuidBytes = new byte[16];
+        uuidBytes[0] = (byte) choice;
+        return uuidBytes;
+
+      case FIXED:
+        byte[] fixed = new byte[((Types.FixedType) primitive).length()];
+        fixed[0] = (byte) choice;
+        return fixed;
+
+      case BINARY:
+        byte[] binary = new byte[4];
+        binary[0] = (byte) choice;
+        return binary;
+
+      case DECIMAL:
+        Types.DecimalType type = (Types.DecimalType) primitive;
+        // unscaled values are 1, 2 and 3
+        return Decimal.apply(new BigDecimal(BigInteger.valueOf(choice + 1), type.scale()));
+
+      default:
+        throw new IllegalArgumentException(
+            "Cannot generate random value for unknown type: " + primitive);
+    }
+  }
+}
diff --git a/spark/src/test/java/org/apache/iceberg/spark/data/RandomData.java b/spark/src/test/java/org/apache/iceberg/spark/data/RandomData.java
index b5f0b7153b7a..e6517619514a 100644
--- a/spark/src/test/java/org/apache/iceberg/spark/data/RandomData.java
+++ b/spark/src/test/java/org/apache/iceberg/spark/data/RandomData.java
@@ -103,6 +103,22 @@ public Record next() {
     };
   }
 
+  /**
+   * Generates {@code numRecords} records in which the first {@code fraction * numRecords} string values
+   * produced are the constant "ABC" and every other value comes from the random generator.
+   */
+  public static List<Record> generateListWithFallBackDictionaryEncodingForStrings(
+      Schema schema, int numRecords, long seed, float fraction) {
+    FallbackDictionaryEncodedDataGenerator visitor =
+        new FallbackDictionaryEncodedDataGenerator(schema, seed, numRecords, fraction);
+    List<Record> generated = Lists.newArrayListWithExpectedSize(numRecords);
+    for (int produced = 0; produced < numRecords; produced += 1) {
+      generated.add((Record) TypeUtil.visit(schema, visitor));
+    }
+
+    return generated;
+  }
+
   private static class RandomDataGenerator extends TypeUtil.CustomOrderSchemaVisitor<Object> {
     private final Map<Type, org.apache.avro.Schema> typeToSchema;
     private final Random random;
@@ -295,4 +311,181 @@ public Object primitive(Type.PrimitiveType primitive) {
       }
     }
   }
+
+  /**
+   * Random data generator whose first {@code fraction * numRecords} string values are the constant "ABC";
+   * every later string, and every non-string value, comes from the parent generator.
+   */
+  private static class FallbackDictionaryEncodedDataGenerator extends RandomDataGenerator {
+
+    private final int numRecords;
+    private final float fraction;
+    private int current;
+
+    private FallbackDictionaryEncodedDataGenerator(Schema schema, long seed, int numRecords, float fraction) {
+      super(schema, seed);
+      this.numRecords = numRecords;
+      this.fraction = fraction;
+    }
+
+    @Override
+    public Object primitive(Type.PrimitiveType primitive) {
+      if (primitive.typeId() != Type.TypeID.STRING) {
+        return super.primitive(primitive);
+      }
+      // the counter advances for every string, whether or not the constant is used
+      boolean repeatConstant = current < fraction * numRecords;
+      current++;
+      return repeatConstant ? "ABC" : super.primitive(primitive);
+    }
+  }
+
+  @SuppressWarnings("RandomModInteger")
+  private static Object generatePrimitive(Type.PrimitiveType primitive,
+                                         Random random) {
+    int choice = random.nextInt(20); // 0-19: low values select boundary cases for the numeric types below
+
+    switch (primitive.typeId()) {
+      case BOOLEAN:
+        return choice < 10;
+
+      case INTEGER:
+        switch (choice) {
+          case 1:
+            return Integer.MIN_VALUE;
+          case 2:
+            return Integer.MAX_VALUE;
+          case 3:
+            return 0;
+          default:
+            return random.nextInt();
+        }
+
+      case LONG:
+        switch (choice) {
+          case 1:
+            return Long.MIN_VALUE;
+          case 2:
+            return Long.MAX_VALUE;
+          case 3:
+            return 0L;
+          default:
+            return random.nextLong();
+        }
+
+      case FLOAT:
+        switch (choice) {
+          case 1:
+            return Float.MIN_VALUE;
+          case 2:
+            return -Float.MIN_VALUE;
+          case 3:
+            return Float.MAX_VALUE;
+          case 4:
+            return -Float.MAX_VALUE;
+          case 5:
+            return Float.NEGATIVE_INFINITY;
+          case 6:
+            return Float.POSITIVE_INFINITY;
+          case 7:
+            return 0.0F;
+          case 8:
+            return Float.NaN;
+          default:
+            return random.nextFloat();
+        }
+
+      case DOUBLE:
+        switch (choice) {
+          case 1:
+            return Double.MIN_VALUE;
+          case 2:
+            return -Double.MIN_VALUE;
+          case 3:
+            return Double.MAX_VALUE;
+          case 4:
+            return -Double.MAX_VALUE;
+          case 5:
+            return Double.NEGATIVE_INFINITY;
+          case 6:
+            return Double.POSITIVE_INFINITY;
+          case 7:
+            return 0.0D;
+          case 8:
+            return Double.NaN;
+          default:
+            return random.nextDouble();
+        }
+
+      case DATE:
+        // this will include negative values (dates before 1970-01-01)
+        return random.nextInt() % ABOUT_380_YEARS_IN_DAYS;
+
+      case TIME:
+        return (random.nextLong() & Integer.MAX_VALUE) % ONE_DAY_IN_MICROS; // mask keeps the value non-negative
+
+      case TIMESTAMP:
+        return random.nextLong() % FIFTY_YEARS_IN_MICROS;
+
+      case STRING:
+        return randomString(random);
+
+      case UUID:
+        byte[] uuidBytes = new byte[16];
+        random.nextBytes(uuidBytes);
+        // NOTE(review): nothing is hashed here; the raw bytes are returned, presumably for the caller to convert
+        return uuidBytes;
+
+      case FIXED:
+        byte[] fixed = new byte[((Types.FixedType) primitive).length()];
+        random.nextBytes(fixed);
+        return fixed;
+
+      case BINARY:
+        byte[] binary = new byte[random.nextInt(50)];
+        random.nextBytes(binary);
+        return binary;
+
+      case DECIMAL:
+        Types.DecimalType type = (Types.DecimalType) primitive;
+        BigInteger unscaled = randomUnscaled(type.precision(), random);
+        return Decimal.apply(new BigDecimal(unscaled, type.scale()));
+
+      default:
+        throw new IllegalArgumentException(
+            "Cannot generate random value for unknown type: " + primitive);
+    }
+  }
+
+  private static final long FIFTY_YEARS_IN_MICROS =
+      (50L * (365 * 3 + 366) * 24 * 60 * 60 * 1_000_000) / 4; // 3 * 365 + 366 days per 4 years, hence / 4
+  private static final int ABOUT_380_YEARS_IN_DAYS = 380 * 365; // leap days are not counted
+  private static final long ONE_DAY_IN_MICROS = 24 * 60 * 60 * 1_000_000L;
+  private static final String CHARS = // alphabet that randomString samples from
+      "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_-.!?";
+
+  // Builds a string of 0 to 49 characters, each picked at random from CHARS.
+  private static UTF8String randomString(Random random) {
+    byte[] encoded = new byte[random.nextInt(50)];
+    int position = 0;
+    while (position < encoded.length) {
+      encoded[position] = (byte) CHARS.charAt(random.nextInt(CHARS.length()));
+      position += 1;
+    }
+    return UTF8String.fromBytes(encoded);
+  }
+
+  private static final String DIGITS = "0123456789";
+
+  private static BigInteger randomUnscaled(int precision, Random random) {
+    int length = random.nextInt(precision); // number of decimal digits, in [0, precision)
+    if (length == 0) {
+      return BigInteger.ZERO; // an empty digit string cannot be parsed
+    }
+    StringBuilder sb = new StringBuilder(length);
+    for (int i = 0; i < length; i += 1) {
+      sb.append(DIGITS.charAt(random.nextInt(DIGITS.length())));
+    }
+    return new BigInteger(sb.toString()); // leading zeros are accepted and just yield a smaller value
+  }
 }
diff --git a/spark/src/test/java/org/apache/iceberg/spark/data/TestHelpers.java b/spark/src/test/java/org/apache/iceberg/spark/data/TestHelpers.java
index 433f87c75582..7b0450561d06 100644
--- a/spark/src/test/java/org/apache/iceberg/spark/data/TestHelpers.java
+++ b/spark/src/test/java/org/apache/iceberg/spark/data/TestHelpers.java
@@ -36,6 +36,7 @@
 import org.apache.avro.generic.GenericData.Record;
 import org.apache.iceberg.Schema;
 import org.apache.iceberg.relocated.com.google.common.collect.Lists;
+import org.apache.iceberg.spark.data.vectorized.IcebergArrowColumnVector;
 import org.apache.iceberg.types.Type;
 import org.apache.iceberg.types.Types;
 import org.apache.orc.storage.serde2.io.DateWritable;
@@ -53,6 +54,8 @@
 import org.apache.spark.sql.types.MapType;
 import org.apache.spark.sql.types.StructField;
 import org.apache.spark.sql.types.StructType;
+import org.apache.spark.sql.vectorized.ColumnVector;
+import org.apache.spark.sql.vectorized.ColumnarBatch;
 import org.apache.spark.unsafe.types.UTF8String;
 import org.junit.Assert;
 import scala.collection.Seq;
@@ -78,6 +81,48 @@ public static void assertEqualsSafe(Types.StructType struct, Record rec, Row row
     }
   }
 
+  /**
+   * Compares the first {@code batch.numRows()} expected records with the rows of a columnar batch,
+   * field by field; a null in the batch must correspond to a null expected value.
+   */
+  public static void assertEqualsUnsafe(Types.StructType struct, List<Record> expected, ColumnarBatch batch) {
+    List<Types.NestedField> structFields = struct.fields();
+    for (int rowIdx = 0; rowIdx < batch.numRows(); rowIdx++) {
+      Record expectedRecord = expected.get(rowIdx);
+      InternalRow row = batch.getRow(rowIdx);
+      for (int pos = 0; pos < structFields.size(); pos += 1) {
+        Type fieldType = structFields.get(pos).type();
+        Object expectedValue = expectedRecord.get(pos);
+        if (!row.isNullAt(pos)) {
+          assertEqualsUnsafe(fieldType, expectedValue, row.get(pos, convert(fieldType)));
+        } else {
+          Assert.assertTrue("Expect null at " + rowIdx, expectedValue == null);
+        }
+      }
+    }
+  }
+
+  /** Checks each batch row against its expected record and that every column is an IcebergArrowColumnVector. */
+  public static void assertArrowVectors(Types.StructType struct, List<Record> expected, ColumnarBatch batch) {
+    List<Types.NestedField> structFields = struct.fields();
+    for (int rowIdx = 0; rowIdx < batch.numRows(); rowIdx++) {
+      Record expectedRecord = expected.get(rowIdx);
+      InternalRow row = batch.getRow(rowIdx);
+      for (int pos = 0; pos < structFields.size(); pos += 1) {
+        ColumnVector column = batch.column(pos);
+        Assert.assertTrue(column instanceof IcebergArrowColumnVector);
+        Type fieldType = structFields.get(pos).type();
+        Object expectedValue = expectedRecord.get(pos);
+        if (!row.isNullAt(pos)) {
+          assertEqualsUnsafe(fieldType, expectedValue, row.get(pos, convert(fieldType)));
+        } else {
+          // a null in the batch is only valid when the expected value is null too
+          Assert.assertNull(expectedValue);
+        }
+      }
+    }
+  }
+
   private static void assertEqualsSafe(Types.ListType list, Collection<?> expected, List actual) {
     Type elementType = list.elementType();
     List<?> expectedElements = Lists.newArrayList(expected);
diff --git a/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetDictionaryEncodedVectorizedReader.java b/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetDictionaryEncodedVectorizedReader.java
new file mode 100644
index 000000000000..20b0a2f908d7
--- /dev/null
+++ b/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetDictionaryEncodedVectorizedReader.java
@@ -0,0 +1,58 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.iceberg.spark.data;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.List;
+import org.apache.avro.generic.GenericData;
+import org.apache.iceberg.Files;
+import org.apache.iceberg.Schema;
+import org.apache.iceberg.io.FileAppender;
+import org.apache.iceberg.parquet.Parquet;
+import org.apache.iceberg.types.TypeUtil;
+import org.apache.iceberg.types.Types;
+import org.junit.Assert;
+import org.junit.Assume;
+
+public class TestSparkParquetDictionaryEncodedVectorizedReader extends TestSparkParquetVectorizedReader {
+
+  @Override
+  protected void writeAndValidate(Schema schema) throws IOException {
+    // Skip (rather than fail) schemas that contain a map whose key type is not string
+    Assume.assumeTrue("Parquet Avro cannot write non-string map keys", null == TypeUtil.find(
+        schema,
+        type -> type.isMapType() && type.asMapType().keyType() != Types.StringType.get()));
+
+    List<GenericData.Record> expected = DictionaryData.generateDictionaryEncodableData(schema, 100000, 0L);
+
+    // write a test parquet file using iceberg writer
+    File testFile = temp.newFile();
+    Assert.assertTrue("Delete should succeed", testFile.delete());
+
+    try (FileAppender<GenericData.Record> writer = Parquet.write(Files.localOutput(testFile))
+        .schema(schema)
+        .named("test")
+        .build()) {
+      writer.addAll(expected);
+    }
+    assertRecordsMatch(schema, expected, testFile); // read-back and comparison live in the parent test class
+  }
+}
diff --git a/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetFallbackToDictionaryEncodingForVectorizedReader.java b/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetFallbackToDictionaryEncodingForVectorizedReader.java
new file mode 100644
index 000000000000..76cb1d877d5b
--- /dev/null
+++ b/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetFallbackToDictionaryEncodingForVectorizedReader.java
@@ -0,0 +1,59 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.iceberg.spark.data;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.List;
+import org.apache.avro.generic.GenericData;
+import org.apache.iceberg.Files;
+import org.apache.iceberg.Schema;
+import org.apache.iceberg.io.FileAppender;
+import org.apache.iceberg.parquet.Parquet;
+import org.apache.iceberg.types.TypeUtil;
+import org.apache.iceberg.types.Types;
+import org.junit.Assert;
+import org.junit.Assume;
+
+public class TestSparkParquetFallbackToDictionaryEncodingForVectorizedReader extends TestSparkParquetVectorizedReader {
+
+  @Override
+  protected void writeAndValidate(Schema schema) throws IOException {
+    // Write test data
+    Assume.assumeTrue("Parquet Avro cannot write non-string map keys", null == TypeUtil.find(
+        schema,
+        type -> type.isMapType() && type.asMapType().keyType() != Types.StringType.get()));
+
+    List<GenericData.Record> expected =
+        RandomData.generateListWithFallBackDictionaryEncodingForStrings(schema, 100000, 0L, 0.5f);
+
+    // write a test parquet file using iceberg writer
+    File testFile = temp.newFile();
+    Assert.assertTrue("Delete should succeed", testFile.delete());
+
+    try (FileAppender<GenericData.Record> writer = Parquet.write(Files.localOutput(testFile))
+        .schema(schema)
+        .named("test")
+        .build()) {
+      writer.addAll(expected);
+    }
+    assertRecordsMatch(schema, expected, testFile);
+  }
+}
diff --git a/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetVectorizedReader.java b/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetVectorizedReader.java
new file mode 100644
index 000000000000..55ccbcd8ee8d
--- /dev/null
+++ b/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetVectorizedReader.java
@@ -0,0 +1,134 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.iceberg.spark.data;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+import org.apache.avro.generic.GenericData;
+import org.apache.iceberg.Files;
+import org.apache.iceberg.Schema;
+import org.apache.iceberg.io.CloseableIterable;
+import org.apache.iceberg.io.FileAppender;
+import org.apache.iceberg.parquet.Parquet;
+import org.apache.iceberg.spark.data.vectorized.VectorizedSparkParquetReaders;
+import org.apache.iceberg.types.TypeUtil;
+import org.apache.iceberg.types.Types;
+import org.apache.spark.sql.vectorized.ColumnarBatch;
+import org.junit.Assert;
+import org.junit.Assume;
+import org.junit.Before;
+import org.junit.Test;
+
+public class TestSparkParquetVectorizedReader extends AvroDataTest {
+
+  @Before
+  public void setupArrowFlags() {
+    System.setProperty("arrow.enable_unsafe_memory_access", "true");
+    System.setProperty("arrow.enable_null_check_for_get", "false");
+  }
+
+  @Override
+  protected void writeAndValidate(Schema schema) throws IOException {
+    // Write test data
+    Assume.assumeTrue("Parquet Avro cannot write non-string map keys", null == TypeUtil.find(
+        schema,
+        type -> type.isMapType() && type.asMapType().keyType() != Types.StringType.get()));
+
+    List<GenericData.Record> expected = RandomData.generateList(schema, 100000, 0L);
+
+    // write a test parquet file using iceberg writer
+    File testFile = temp.newFile();
+    Assert.assertTrue("Delete should succeed", testFile.delete());
+
+    try (FileAppender<GenericData.Record> writer = Parquet.write(Files.localOutput(testFile))
+        .schema(schema)
+        .named("test")
+        .build()) {
+      writer.addAll(expected);
+    }
+    assertRecordsMatch(schema, expected, testFile);
+  }
+
+  void assertRecordsMatch(Schema schema, List<GenericData.Record> expected, File testFile) throws IOException {
+    try (CloseableIterable<ColumnarBatch> batchReader = Parquet.read(Files.localInput(testFile))
+        .project(schema)
+        .reuseContainers()
+        .createBatchedReaderFunc(type -> VectorizedSparkParquetReaders.buildReader(schema, schema, type, 10000))
+        .build()) {
+
+      Iterator<ColumnarBatch> batches = batchReader.iterator();
+      int numRowsRead = 0;
+      int numExpectedRead = 0;
+      int batchNum = 0;
+      while (batches.hasNext()) {
+
+        ColumnarBatch batch = batches.next();
+        numRowsRead += batch.numRows();
+
+        List<GenericData.Record> expectedBatch = new ArrayList<>(batch.numRows());
+        for (int i = numExpectedRead; i < numExpectedRead + batch.numRows(); i++) {
+          expectedBatch.add(expected.get(i));
+        }
+        TestHelpers.assertArrowVectors(schema.asStruct(), expectedBatch, batch);
+        numExpectedRead += batch.numRows();
+        batchNum++;
+      }
+      Assert.assertEquals(expected.size(), numRowsRead);
+    }
+  }
+
+  @Test
+  public void testArray() throws IOException {
+    System.out.println("Not Supported");
+  }
+
+  @Test
+  public void testArrayOfStructs() throws IOException {
+    System.out.println("Not Supported");
+  }
+
+  @Test
+  public void testMap() throws IOException {
+    System.out.println("Not Supported");
+  }
+
+  @Test
+  public void testNumericMapKey() throws IOException {
+    System.out.println("Not Supported");
+  }
+
+  @Test
+  public void testComplexMapKey() throws IOException {
+    System.out.println("Not Supported");
+  }
+
+  @Test
+  public void testMapOfStructs() throws IOException {
+    System.out.println("Not Supported");
+  }
+
+  @Test
+  public void testMixedTypes() throws IOException {
+    System.out.println("Not Supported");
+  }
+}
diff --git a/spark/src/test/java/org/apache/iceberg/spark/source/TestReadProjection.java b/spark/src/test/java/org/apache/iceberg/spark/source/TestReadProjection.java
index 8d65b64cab6d..edfa644e1115 100644
--- a/spark/src/test/java/org/apache/iceberg/spark/source/TestReadProjection.java
+++ b/spark/src/test/java/org/apache/iceberg/spark/source/TestReadProjection.java
@@ -33,6 +33,7 @@
 import org.apache.iceberg.types.Comparators;
 import org.apache.iceberg.types.Types;
 import org.junit.Assert;
+import org.junit.BeforeClass;
 import org.junit.Rule;
 import org.junit.Test;
 import org.junit.rules.TemporaryFolder;
@@ -54,6 +55,15 @@ protected abstract Record writeAndRead(String desc,
   @Rule
   public TemporaryFolder temp = new TemporaryFolder();
 
+  @BeforeClass
+  public static void setArrowFlags() {
+    // Allow unsafe memory access to avoid the costly check arrow does to check if index is within bounds
+    System.setProperty("arrow.enable_unsafe_memory_access", "true");
+    // Disable expensive null check for every get(index) call.
+    // Iceberg manages nullability checks itself instead of relying on arrow.
+    System.setProperty("arrow.enable_null_check_for_get", "false");
+  }
+
   @Test
   public void testFullProjection() throws Exception {
     Schema schema = new Schema(
diff --git a/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkDataWrite.java b/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkDataWrite.java
index a5fae085f9b9..9b9106276f3f 100644
--- a/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkDataWrite.java
+++ b/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkDataWrite.java
@@ -75,6 +75,9 @@ public static Object[][] parameters() {
   @BeforeClass
   public static void startSpark() {
     TestSparkDataWrite.spark = SparkSession.builder().master("local[2]").getOrCreate();
+    // Set arrow flags
+    System.setProperty("arrow.enable_unsafe_memory_access", "true");
+    System.setProperty("arrow.enable_null_check_for_get", "false");
   }
 
   @AfterClass
diff --git a/spark/src/test/java/org/apache/iceberg/spark/source/TestStructuredStreaming.java b/spark/src/test/java/org/apache/iceberg/spark/source/TestStructuredStreaming.java
index 72eeab5a283b..8b032b4a67dd 100644
--- a/spark/src/test/java/org/apache/iceberg/spark/source/TestStructuredStreaming.java
+++ b/spark/src/test/java/org/apache/iceberg/spark/source/TestStructuredStreaming.java
@@ -69,6 +69,9 @@ public static void startSpark() {
         .master("local[2]")
         .config("spark.sql.shuffle.partitions", 4)
         .getOrCreate();
+    // Set arrow flags
+    System.setProperty("arrow.enable_unsafe_memory_access", "true");
+    System.setProperty("arrow.enable_null_check_for_get", "false");
   }
 
   @AfterClass

From b33f6d876a0b4ebdc93e0a74af1b437eea1336bc Mon Sep 17 00:00:00 2001
From: samarthjain <samarth@apache.org>
Date: Thu, 5 Mar 2020 11:57:44 -0800
Subject: [PATCH 02/12] Minor cleanup

---
 .../VectorizedReadFloatsTwentyPercentNullBenchmark.java  | 2 +-
 ...VectorizedReadIntegersTwentyPercentNullBenchmark.java | 2 +-
 .../VectorizedReadLongsTwentyPercentNullBenchmark.java   | 2 +-
 .../VectorizedReadStringsTwentyPercentNullBenchmark.java | 2 +-
 .../java/org/apache/iceberg/spark/data/TestHelpers.java  | 8 ++++++++
 .../apache/iceberg/spark/source/TestReadProjection.java  | 9 +++------
 .../apache/iceberg/spark/source/TestSparkDataWrite.java  | 5 ++---
 .../iceberg/spark/source/TestStructuredStreaming.java    | 5 ++---
 8 files changed, 19 insertions(+), 16 deletions(-)

diff --git a/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedReadFloatsTwentyPercentNullBenchmark.java b/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedReadFloatsTwentyPercentNullBenchmark.java
index d4c1d411214f..94ac603558c6 100644
--- a/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedReadFloatsTwentyPercentNullBenchmark.java
+++ b/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedReadFloatsTwentyPercentNullBenchmark.java
@@ -33,7 +33,7 @@ public class VectorizedReadFloatsTwentyPercentNullBenchmark extends VectorizedRe
   protected void appendData() {
     for (int fileNum = 1; fileNum <= NUM_FILES; fileNum++) {
       Dataset<Row> df = spark().range(NUM_ROWS)
-          .withColumn("longCol", when(pmod(col("id"), lit(2)).equalTo(lit(0)), lit(null)).otherwise(col("id")))
+          .withColumn("longCol", when(pmod(col("id"), lit(5)).equalTo(lit(0)), lit(null)).otherwise(col("id")))
           .drop("id")
           .withColumn("floatCol", expr("CAST(longCol AS FLOAT)"));
       appendAsFile(df);
diff --git a/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedReadIntegersTwentyPercentNullBenchmark.java b/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedReadIntegersTwentyPercentNullBenchmark.java
index 61ae0cf50c60..2646ca4a00c0 100644
--- a/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedReadIntegersTwentyPercentNullBenchmark.java
+++ b/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedReadIntegersTwentyPercentNullBenchmark.java
@@ -33,7 +33,7 @@ public class VectorizedReadIntegersTwentyPercentNullBenchmark extends Vectorized
   protected void appendData() {
     for (int fileNum = 1; fileNum <= NUM_FILES; fileNum++) {
       Dataset<Row> df = spark().range(NUM_ROWS)
-          .withColumn("longCol", when(pmod(col("id"), lit(2)).equalTo(lit(0)), lit(null)).otherwise(col("id")))
+          .withColumn("longCol", when(pmod(col("id"), lit(5)).equalTo(lit(0)), lit(null)).otherwise(col("id")))
           .drop("id")
           .withColumn("intCol", expr("CAST(longCol AS INT)"));
       appendAsFile(df);
diff --git a/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedReadLongsTwentyPercentNullBenchmark.java b/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedReadLongsTwentyPercentNullBenchmark.java
index ecf9c6b21084..3544c8e675cf 100644
--- a/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedReadLongsTwentyPercentNullBenchmark.java
+++ b/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedReadLongsTwentyPercentNullBenchmark.java
@@ -33,7 +33,7 @@ public class VectorizedReadLongsTwentyPercentNullBenchmark extends VectorizedRea
   protected void appendData() {
     for (int fileNum = 1; fileNum <= NUM_FILES; fileNum++) {
       Dataset<Row> df = spark().range(NUM_ROWS)
-          .withColumn("longCol", when(pmod(col("id"), lit(2)).equalTo(lit(0)), lit(null)).otherwise(col("id")))
+          .withColumn("longCol", when(pmod(col("id"), lit(5)).equalTo(lit(0)), lit(null)).otherwise(col("id")))
           .drop("id");
 
       appendAsFile(df);
diff --git a/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedReadStringsTwentyPercentNullBenchmark.java b/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedReadStringsTwentyPercentNullBenchmark.java
index d2a4037b89f6..ef783d8e5bf7 100644
--- a/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedReadStringsTwentyPercentNullBenchmark.java
+++ b/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedReadStringsTwentyPercentNullBenchmark.java
@@ -33,7 +33,7 @@ public class VectorizedReadStringsTwentyPercentNullBenchmark extends VectorizedR
   protected void appendData() {
     for (int fileNum = 1; fileNum <= NUM_FILES; fileNum++) {
       Dataset<Row> df = spark().range(NUM_ROWS)
-          .withColumn("id", when(pmod(col("id"), lit(2)).equalTo(lit(0)), lit(null)).otherwise(col("id")))
+          .withColumn("id", when(pmod(col("id"), lit(5)).equalTo(lit(0)), lit(null)).otherwise(col("id")))
           .withColumn("stringCol", expr("CAST(longCol AS STRING)"));
 
       appendAsFile(df);
diff --git a/spark/src/test/java/org/apache/iceberg/spark/data/TestHelpers.java b/spark/src/test/java/org/apache/iceberg/spark/data/TestHelpers.java
index 7b0450561d06..741b5f96a247 100644
--- a/spark/src/test/java/org/apache/iceberg/spark/data/TestHelpers.java
+++ b/spark/src/test/java/org/apache/iceberg/spark/data/TestHelpers.java
@@ -688,4 +688,12 @@ private static void assertEquals(String context, MapType map, MapData expected,
           expectedValues.get(i, valueType), actualValues.get(i, valueType));
     }
   }
+
+  public static void setArrowFlagsForVectorizedReads() {
+    // Allow unsafe memory access to avoid the costly check arrow does to check if index is within bounds
+    System.setProperty("arrow.enable_unsafe_memory_access", "true");
+    // Disable expensive null check for every get(index) call.
+    // Iceberg manages nullability checks itself instead of relying on arrow.
+    System.setProperty("arrow.enable_null_check_for_get", "false");
+  }
 }
diff --git a/spark/src/test/java/org/apache/iceberg/spark/source/TestReadProjection.java b/spark/src/test/java/org/apache/iceberg/spark/source/TestReadProjection.java
index edfa644e1115..43d6044564d8 100644
--- a/spark/src/test/java/org/apache/iceberg/spark/source/TestReadProjection.java
+++ b/spark/src/test/java/org/apache/iceberg/spark/source/TestReadProjection.java
@@ -30,6 +30,7 @@
 import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap;
 import org.apache.iceberg.relocated.com.google.common.collect.Maps;
 import org.apache.iceberg.relocated.com.google.common.collect.Sets;
+import org.apache.iceberg.spark.data.TestHelpers;
 import org.apache.iceberg.types.Comparators;
 import org.apache.iceberg.types.Types;
 import org.junit.Assert;
@@ -56,12 +57,8 @@ protected abstract Record writeAndRead(String desc,
   public TemporaryFolder temp = new TemporaryFolder();
 
   @BeforeClass
-  public static void setArrowFlags() {
-    // Allow unsafe memory access to avoid the costly check arrow does to check if index is within bounds
-    System.setProperty("arrow.enable_unsafe_memory_access", "true");
-    // Disable expensive null check for every get(index) call.
-    // Iceberg manages nullability checks itself instead of relying on arrow.
-    System.setProperty("arrow.enable_null_check_for_get", "false");
+  public static void setup() {
+    TestHelpers.setArrowFlagsForVectorizedReads();
   }
 
   @Test
diff --git a/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkDataWrite.java b/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkDataWrite.java
index 9b9106276f3f..c721fe0c8d39 100644
--- a/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkDataWrite.java
+++ b/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkDataWrite.java
@@ -34,6 +34,7 @@
 import org.apache.iceberg.TableProperties;
 import org.apache.iceberg.hadoop.HadoopTables;
 import org.apache.iceberg.relocated.com.google.common.collect.Lists;
+import org.apache.iceberg.spark.data.TestHelpers;
 import org.apache.iceberg.types.Types;
 import org.apache.spark.sql.Dataset;
 import org.apache.spark.sql.Encoders;
@@ -75,9 +76,7 @@ public static Object[][] parameters() {
   @BeforeClass
   public static void startSpark() {
     TestSparkDataWrite.spark = SparkSession.builder().master("local[2]").getOrCreate();
-    // Set arrow flags
-    System.setProperty("arrow.enable_unsafe_memory_access", "true");
-    System.setProperty("arrow.enable_null_check_for_get", "false");
+    TestHelpers.setArrowFlagsForVectorizedReads();
   }
 
   @AfterClass
diff --git a/spark/src/test/java/org/apache/iceberg/spark/source/TestStructuredStreaming.java b/spark/src/test/java/org/apache/iceberg/spark/source/TestStructuredStreaming.java
index 8b032b4a67dd..32ab3a9aecdf 100644
--- a/spark/src/test/java/org/apache/iceberg/spark/source/TestStructuredStreaming.java
+++ b/spark/src/test/java/org/apache/iceberg/spark/source/TestStructuredStreaming.java
@@ -29,6 +29,7 @@
 import org.apache.iceberg.hadoop.HadoopTables;
 import org.apache.iceberg.relocated.com.google.common.collect.Iterables;
 import org.apache.iceberg.relocated.com.google.common.collect.Lists;
+import org.apache.iceberg.spark.data.TestHelpers;
 import org.apache.iceberg.types.Types;
 import org.apache.spark.sql.Dataset;
 import org.apache.spark.sql.Encoders;
@@ -69,9 +70,7 @@ public static void startSpark() {
         .master("local[2]")
         .config("spark.sql.shuffle.partitions", 4)
         .getOrCreate();
-    // Set arrow flags
-    System.setProperty("arrow.enable_unsafe_memory_access", "true");
-    System.setProperty("arrow.enable_null_check_for_get", "false");
+    TestHelpers.setArrowFlagsForVectorizedReads();
   }
 
   @AfterClass

From 492c0b0845722c3e9aaef9f108880cff1de672ec Mon Sep 17 00:00:00 2001
From: samarthjain <samarth@apache.org>
Date: Wed, 18 Mar 2020 13:56:17 -0700
Subject: [PATCH 03/12] Address code review comments

---
 .../arrow/vectorized/VectorHolder.java        |  51 +-
 .../vectorized/VectorizedArrowReader.java     |  49 +-
 .../parquet/VectorizedColumnIterator.java     |   8 +-
 ...dDictionaryEncodedParquetValuesReader.java |  33 +-
 .../parquet/VectorizedPageIterator.java       |  39 +-
 ...ectorizedParquetDefinitionLevelReader.java |  13 +-
 .../apache/iceberg/parquet/ParquetUtil.java   |   1 +
 .../iceberg/spark/arrow/ArrowAllocation.java  |  37 ++
 .../iceberg/spark/arrow/ArrowUtils.java       | 113 ----
 .../data/vectorized/ArrowVectorAccessor.java  |  86 +++
 .../data/vectorized/ArrowVectorAccessors.java | 495 ++++++++++++++
 .../data/vectorized/ColumnarBatchReaders.java |  31 +-
 .../vectorized/IcebergArrowColumnVector.java  | 624 +-----------------
 .../vectorized/NullValuesColumnVector.java    |  15 +-
 .../VectorizedSparkParquetReaders.java        |   8 +-
 ...taReader.java => BatchTaskDataReader.java} |   4 +-
 ...DataReader.java => RowTaskDataReader.java} |   4 +-
 .../iceberg/spark/data/DictionaryData.java    | 297 ---------
 .../apache/iceberg/spark/data/RandomData.java | 223 ++++++-
 ...quetDictionaryEncodedVectorizedReader.java |  31 +-
 ...DictionaryEncodingForVectorizedReader.java |  32 +-
 .../TestSparkParquetVectorizedReader.java     |  47 +-
 .../spark/source/TestReadProjection.java      |   2 +-
 23 files changed, 1018 insertions(+), 1225 deletions(-)
 create mode 100644 spark/src/main/java/org/apache/iceberg/spark/arrow/ArrowAllocation.java
 delete mode 100644 spark/src/main/java/org/apache/iceberg/spark/arrow/ArrowUtils.java
 create mode 100644 spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ArrowVectorAccessor.java
 create mode 100644 spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ArrowVectorAccessors.java
 rename spark/src/main/java/org/apache/iceberg/spark/source/{ColumnarBatchTaskDataReader.java => BatchTaskDataReader.java} (97%)
 rename spark/src/main/java/org/apache/iceberg/spark/source/{InternalRowTaskDataReader.java => RowTaskDataReader.java} (98%)
 delete mode 100644 spark/src/test/java/org/apache/iceberg/spark/data/DictionaryData.java

diff --git a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/VectorHolder.java b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/VectorHolder.java
index 337111097e47..9df90a4c9066 100644
--- a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/VectorHolder.java
+++ b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/VectorHolder.java
@@ -19,7 +19,9 @@
 
 package org.apache.iceberg.arrow.vectorized;
 
+import org.apache.arrow.util.Preconditions;
 import org.apache.arrow.vector.FieldVector;
+import org.apache.iceberg.types.Type;
 import org.apache.parquet.column.ColumnDescriptor;
 import org.apache.parquet.column.Dictionary;
 
@@ -31,23 +33,34 @@ public class VectorHolder {
   private final ColumnDescriptor columnDescriptor;
   private final FieldVector vector;
   private final boolean isDictionaryEncoded;
-
   private final Dictionary dictionary;
   private final NullabilityHolder nullabilityHolder;
-
-  public static final VectorHolder NULL_VECTOR_HOLDER = new VectorHolder(null, null, false, null, null);
+  private final Type icebergType;
 
   public VectorHolder(
-      ColumnDescriptor columnDescriptor,
-      FieldVector vector,
-      boolean isDictionaryEncoded,
-      Dictionary dictionary,
-      NullabilityHolder holder) {
+      ColumnDescriptor columnDescriptor, FieldVector vector, boolean isDictionaryEncoded,
+      Dictionary dictionary, NullabilityHolder holder, Type type) {
+    // All the fields except dictionary are not nullable unless it is a dummy holder
+    Preconditions.checkNotNull(columnDescriptor, "ColumnDescriptor cannot be null");
+    Preconditions.checkNotNull(vector, "Vector cannot be null");
+    Preconditions.checkNotNull(holder, "NullabilityHolder cannot be null");
+    Preconditions.checkNotNull(type, "IcebergType cannot be null");
     this.columnDescriptor = columnDescriptor;
     this.vector = vector;
     this.isDictionaryEncoded = isDictionaryEncoded;
     this.dictionary = dictionary;
     this.nullabilityHolder = holder;
+    this.icebergType = type;
+  }
+
+  // Only used for returning dummyHolder
+  private VectorHolder() {
+    columnDescriptor = null;
+    vector = null;
+    isDictionaryEncoded = false;
+    dictionary = null;
+    nullabilityHolder = null;
+    icebergType = null;
   }
 
   public ColumnDescriptor descriptor() {
@@ -69,4 +82,26 @@ public Dictionary dictionary() {
   public NullabilityHolder nullabilityHolder() {
     return nullabilityHolder;
   }
+
+  public Type icebergType() {
+    return icebergType;
+  }
+
+  public int numValues() {
+    return vector.getValueCount();
+  }
+
+  public static VectorHolder dummyHolder(int numRows) {
+    return new VectorHolder() {
+      @Override
+      public int numValues() {
+        return numRows;
+      }
+    };
+  }
+
+  public boolean isDummy() {
+    return vector == null;
+  }
+
 }
diff --git a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/VectorizedArrowReader.java b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/VectorizedArrowReader.java
index cbe3eacc9139..4fd6ccef6c7e 100644
--- a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/VectorizedArrowReader.java
+++ b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/VectorizedArrowReader.java
@@ -72,7 +72,6 @@ public class VectorizedArrowReader implements VectorizedReader<VectorHolder> {
   // before storing the values in the Arrow vector. This means even if the dictionary is present, data
   // present in the vector may not necessarily be dictionary encoded.
   private Dictionary dictionary;
-  private boolean allPagesDictEncoded;
 
   public VectorizedArrowReader(
       ColumnDescriptor desc,
@@ -102,15 +101,16 @@ private enum ReadType {
 
   @Override
   public VectorHolder read(int numValsToRead) {
+    boolean dictEncoded = vectorizedColumnIterator.producesDictionaryEncodedVector();
     if (vec == null || !reuseContainers) {
-      allocateFieldVector();
+      allocateFieldVector(dictEncoded);
       nullabilityHolder = new NullabilityHolder(batchSize);
     } else {
       vec.setValueCount(0);
       nullabilityHolder.reset();
     }
     if (vectorizedColumnIterator.hasNext()) {
-      if (allPagesDictEncoded) {
+      if (dictEncoded) {
         vectorizedColumnIterator.nextBatchDictionaryIds((IntVector) vec, nullabilityHolder);
       } else {
         switch (readType) {
@@ -157,11 +157,12 @@ public VectorHolder read(int numValsToRead) {
     }
     Preconditions.checkState(vec.getValueCount() == numValsToRead,
         "Number of values read, %s, does not equal expected, %s", vec.getValueCount(), numValsToRead);
-    return new VectorHolder(columnDescriptor, vec, allPagesDictEncoded, dictionary, nullabilityHolder);
+    return new VectorHolder(columnDescriptor, vec, dictEncoded, dictionary,
+        nullabilityHolder, icebergField.type());
   }
 
-  private void allocateFieldVector() {
-    if (allPagesDictEncoded) {
+  private void allocateFieldVector(boolean dictionaryEncodedVector) {
+    if (dictionaryEncodedVector) {
       Field field = new Field(
           icebergField.name(),
           new FieldType(icebergField.isOptional(), new ArrowType.Int(Integer.SIZE, true), null, null),
@@ -303,8 +304,9 @@ private void allocateFieldVector() {
   @Override
   public void setRowGroupInfo(PageReadStore source, Map<ColumnPath, ColumnChunkMetaData> metadata) {
     ColumnChunkMetaData chunkMetaData = metadata.get(ColumnPath.get(columnDescriptor.getPath()));
-    allPagesDictEncoded = !ParquetUtil.hasNonDictionaryPages(chunkMetaData);
-    dictionary = vectorizedColumnIterator.setRowGroupInfo(source.getPageReader(columnDescriptor), allPagesDictEncoded);
+    this.dictionary = vectorizedColumnIterator.setRowGroupInfo(
+        source.getPageReader(columnDescriptor),
+        !ParquetUtil.hasNonDictionaryPages(chunkMetaData));
   }
 
   @Override
@@ -324,16 +326,27 @@ public String toString() {
     return columnDescriptor.toString();
   }
 
-  public static final VectorizedArrowReader NULL_VALUES_READER =
-      new VectorizedArrowReader() {
-        @Override
-        public VectorHolder read(int numValsToRead) {
-          return VectorHolder.NULL_VECTOR_HOLDER;
-        }
+  public static VectorizedArrowReader nulls() {
+    return NullVectorReader.INSTANCE;
+  }
+
+  private static final class NullVectorReader extends VectorizedArrowReader {
+    private static final NullVectorReader INSTANCE = new NullVectorReader();
+
+    @Override
+    public VectorHolder read(int numValsToRead) {
+      return VectorHolder.dummyHolder(numValsToRead);
+    }
+
+    @Override
+    public void setRowGroupInfo(PageReadStore source, Map<ColumnPath, ColumnChunkMetaData> metadata) {
+    }
+
+    @Override
+    public String toString() {
+      return "NullReader";
+    }
+  }
 
-        @Override
-        public void setRowGroupInfo(PageReadStore source, Map<ColumnPath, ColumnChunkMetaData> metadata) {
-        }
-      };
 }
 
diff --git a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedColumnIterator.java b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedColumnIterator.java
index 2692cfc59747..57f55a39f589 100644
--- a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedColumnIterator.java
+++ b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedColumnIterator.java
@@ -48,8 +48,10 @@ public VectorizedColumnIterator(ColumnDescriptor desc, String writerVersion, int
   }
 
   public Dictionary setRowGroupInfo(PageReader store, boolean allPagesDictEncoded) {
-    super.setPageSource(store);
+    // setPageSource can result in a data page read. If that happens, we need
+    // to know in advance whether all the pages in the row group are dictionary encoded or not
     this.vectorizedPageIterator.setAllPagesDictEncoded(allPagesDictEncoded);
+    super.setPageSource(store);
     return dictionary;
   }
 
@@ -199,4 +201,8 @@ protected BasePageIterator pageIterator() {
     return vectorizedPageIterator;
   }
 
+  public boolean producesDictionaryEncodedVector() {
+    return vectorizedPageIterator.producesDictionaryEncodedVector();
+  }
+
 }
diff --git a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedDictionaryEncodedParquetValuesReader.java b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedDictionaryEncodedParquetValuesReader.java
index e71d61aa6f71..43d6a50e5968 100644
--- a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedDictionaryEncodedParquetValuesReader.java
+++ b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedDictionaryEncodedParquetValuesReader.java
@@ -72,7 +72,7 @@ void readBatchOfDictionaryIds(IntVector intVector, int startOffset, int numValue
   }
 
   void readBatchOfDictionaryEncodedLongs(FieldVector vector, int startOffset, int numValuesToRead, Dictionary dict,
-                                         NullabilityHolder nullabilityHolder) {
+                                         NullabilityHolder nullabilityHolder, int typeWidth) {
     int left = numValuesToRead;
     int idx = startOffset;
     while (left > 0) {
@@ -83,7 +83,7 @@ void readBatchOfDictionaryEncodedLongs(FieldVector vector, int startOffset, int
       switch (mode) {
         case RLE:
           for (int i = 0; i < numValues; i++) {
-            vector.getDataBuffer().setLong(idx, dict.decodeToLong(currentValue));
+            vector.getDataBuffer().setLong(idx * typeWidth, dict.decodeToLong(currentValue));
             if (setArrowValidityVector) {
               BitVectorHelper.setValidityBitToOne(vector.getValidityBuffer(), idx);
             } else {
@@ -95,7 +95,7 @@ void readBatchOfDictionaryEncodedLongs(FieldVector vector, int startOffset, int
         case PACKED:
           for (int i = 0; i < numValues; i++) {
             vector.getDataBuffer()
-                .setLong(idx, dict.decodeToLong(packedValuesBuffer[packedValuesBufferIdx++]));
+                .setLong(idx * typeWidth, dict.decodeToLong(packedValuesBuffer[packedValuesBufferIdx++]));
             if (setArrowValidityVector) {
               BitVectorHelper.setValidityBitToOne(vector.getValidityBuffer(), idx);
             } else {
@@ -111,7 +111,8 @@ void readBatchOfDictionaryEncodedLongs(FieldVector vector, int startOffset, int
   }
 
   void readBatchOfDictionaryEncodedTimestampMillis(FieldVector vector, int startOffset, int numValuesToRead,
-                                                   Dictionary dict, NullabilityHolder nullabilityHolder) {
+                                                   Dictionary dict, NullabilityHolder nullabilityHolder,
+      int typeWidth) {
     int left = numValuesToRead;
     int idx = startOffset;
     while (left > 0) {
@@ -122,7 +123,7 @@ void readBatchOfDictionaryEncodedTimestampMillis(FieldVector vector, int startOf
       switch (mode) {
         case RLE:
           for (int i = 0; i < numValues; i++) {
-            vector.getDataBuffer().setLong(idx, dict.decodeToLong(currentValue) * 1000);
+            vector.getDataBuffer().setLong(idx * typeWidth, dict.decodeToLong(currentValue) * 1000);
             if (setArrowValidityVector) {
               BitVectorHelper.setValidityBitToOne(vector.getValidityBuffer(), idx);
             } else {
@@ -134,7 +135,7 @@ void readBatchOfDictionaryEncodedTimestampMillis(FieldVector vector, int startOf
         case PACKED:
           for (int i = 0; i < numValues; i++) {
             vector.getDataBuffer()
-                .setLong(idx, dict.decodeToLong(packedValuesBuffer[packedValuesBufferIdx++]) * 1000);
+                .setLong(idx * typeWidth, dict.decodeToLong(packedValuesBuffer[packedValuesBufferIdx++]) * 1000);
             if (setArrowValidityVector) {
               BitVectorHelper.setValidityBitToOne(vector.getValidityBuffer(), idx);
             } else {
@@ -150,7 +151,7 @@ void readBatchOfDictionaryEncodedTimestampMillis(FieldVector vector, int startOf
   }
 
   void readBatchOfDictionaryEncodedIntegers(FieldVector vector, int startOffset, int numValuesToRead, Dictionary dict,
-                                            NullabilityHolder nullabilityHolder) {
+                                            NullabilityHolder nullabilityHolder, int typeWidth) {
     int left = numValuesToRead;
     int idx = startOffset;
     while (left > 0) {
@@ -162,7 +163,7 @@ void readBatchOfDictionaryEncodedIntegers(FieldVector vector, int startOffset, i
       switch (mode) {
         case RLE:
           for (int i = 0; i < num; i++) {
-            dataBuffer.setInt(idx, dict.decodeToInt(currentValue));
+            dataBuffer.setInt(idx * typeWidth, dict.decodeToInt(currentValue));
             if (setArrowValidityVector) {
               BitVectorHelper.setValidityBitToOne(vector.getValidityBuffer(), idx);
             } else {
@@ -173,7 +174,7 @@ void readBatchOfDictionaryEncodedIntegers(FieldVector vector, int startOffset, i
           break;
         case PACKED:
           for (int i = 0; i < num; i++) {
-            dataBuffer.setInt(idx, dict.decodeToInt(packedValuesBuffer[packedValuesBufferIdx++]));
+            dataBuffer.setInt(idx * typeWidth, dict.decodeToInt(packedValuesBuffer[packedValuesBufferIdx++]));
             if (setArrowValidityVector) {
               BitVectorHelper.setValidityBitToOne(vector.getValidityBuffer(), idx);
             } else {
@@ -189,7 +190,7 @@ void readBatchOfDictionaryEncodedIntegers(FieldVector vector, int startOffset, i
   }
 
   void readBatchOfDictionaryEncodedFloats(FieldVector vector, int startOffset, int numValuesToRead, Dictionary dict,
-                                          NullabilityHolder nullabilityHolder) {
+                                          NullabilityHolder nullabilityHolder, int typeWidth) {
     int left = numValuesToRead;
     int idx = startOffset;
     while (left > 0) {
@@ -200,7 +201,7 @@ void readBatchOfDictionaryEncodedFloats(FieldVector vector, int startOffset, int
       switch (mode) {
         case RLE:
           for (int i = 0; i < num; i++) {
-            vector.getDataBuffer().setFloat(idx, dict.decodeToFloat(currentValue));
+            vector.getDataBuffer().setFloat(idx * typeWidth, dict.decodeToFloat(currentValue));
             if (setArrowValidityVector) {
               BitVectorHelper.setValidityBitToOne(vector.getValidityBuffer(), idx);
             } else {
@@ -211,7 +212,8 @@ void readBatchOfDictionaryEncodedFloats(FieldVector vector, int startOffset, int
           break;
         case PACKED:
           for (int i = 0; i < num; i++) {
-            vector.getDataBuffer().setFloat(idx, dict.decodeToFloat(packedValuesBuffer[packedValuesBufferIdx++]));
+            vector.getDataBuffer()
+                .setFloat(idx * typeWidth, dict.decodeToFloat(packedValuesBuffer[packedValuesBufferIdx++]));
             if (setArrowValidityVector) {
               BitVectorHelper.setValidityBitToOne(vector.getValidityBuffer(), idx);
             } else {
@@ -227,7 +229,7 @@ void readBatchOfDictionaryEncodedFloats(FieldVector vector, int startOffset, int
   }
 
   void readBatchOfDictionaryEncodedDoubles(FieldVector vector, int startOffset, int numValuesToRead, Dictionary dict,
-                                           NullabilityHolder nullabilityHolder) {
+                                           NullabilityHolder nullabilityHolder, int typeWidth) {
     int left = numValuesToRead;
     int idx = startOffset;
     while (left > 0) {
@@ -238,7 +240,7 @@ void readBatchOfDictionaryEncodedDoubles(FieldVector vector, int startOffset, in
       switch (mode) {
         case RLE:
           for (int i = 0; i < num; i++) {
-            vector.getDataBuffer().setDouble(idx, dict.decodeToDouble(currentValue));
+            vector.getDataBuffer().setDouble(idx * typeWidth, dict.decodeToDouble(currentValue));
             nullabilityHolder.setNotNull(idx);
             if (setArrowValidityVector) {
               BitVectorHelper.setValidityBitToOne(vector.getValidityBuffer(), idx);
@@ -250,7 +252,8 @@ void readBatchOfDictionaryEncodedDoubles(FieldVector vector, int startOffset, in
           break;
         case PACKED:
           for (int i = 0; i < num; i++) {
-            vector.getDataBuffer().setDouble(idx, dict.decodeToDouble(packedValuesBuffer[packedValuesBufferIdx++]));
+            vector.getDataBuffer()
+                .setDouble(idx * typeWidth, dict.decodeToDouble(packedValuesBuffer[packedValuesBufferIdx++]));
             if (setArrowValidityVector) {
               BitVectorHelper.setValidityBitToOne(vector.getValidityBuffer(), idx);
             } else {
diff --git a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedPageIterator.java b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedPageIterator.java
index 7cc32e06aecf..2aa6f2c07324 100644
--- a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedPageIterator.java
+++ b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedPageIterator.java
@@ -47,12 +47,19 @@ public VectorizedPageIterator(ColumnDescriptor desc, String writerVersion, boole
     this.setArrowValidityVector = setValidityVector;
   }
 
-  private boolean eagerDecodeDictionary;
   private ValuesAsBytesReader plainValuesReader = null;
   private VectorizedDictionaryEncodedParquetValuesReader dictionaryEncodedValuesReader = null;
   private boolean allPagesDictEncoded;
   private VectorizedParquetDefinitionLevelReader vectorizedDefinitionLevelReader;
 
+  private enum DictionaryDecodeMode {
+    NONE, // plain encoding
+    LAZY, // all pages dictionary encoded and not an int type; producesDictionaryEncodedVector() returns true
+    EAGER // values are decoded through the dictionary as each batch is read
+  }
+
+  private DictionaryDecodeMode dictionaryDecodeMode; // assigned by initDataReader; null until then
+
   public void setAllPagesDictEncoded(boolean allDictEncoded) {
     this.allPagesDictEncoded = allDictEncoded;
   }
@@ -98,7 +105,7 @@ public int nextBatchIntegers(
     if (actualBatchSize <= 0) {
       return 0;
     }
-    if (eagerDecodeDictionary) {
+    if (dictionaryDecodeMode == DictionaryDecodeMode.EAGER) {
       vectorizedDefinitionLevelReader.readBatchOfDictionaryEncodedIntegers(
           vector,
           numValsInVector,
@@ -132,7 +139,7 @@ public int nextBatchLongs(
     if (actualBatchSize <= 0) {
       return 0;
     }
-    if (eagerDecodeDictionary) {
+    if (dictionaryDecodeMode == DictionaryDecodeMode.EAGER) {
       vectorizedDefinitionLevelReader.readBatchOfDictionaryEncodedLongs(
           vector,
           numValsInVector,
@@ -168,7 +175,7 @@ public int nextBatchTimestampMillis(
     if (actualBatchSize <= 0) {
       return 0;
     }
-    if (eagerDecodeDictionary) {
+    if (dictionaryDecodeMode == DictionaryDecodeMode.EAGER) {
       vectorizedDefinitionLevelReader.readBatchOfDictionaryEncodedTimestampMillis(
           vector,
           numValsInVector,
@@ -202,7 +209,7 @@ public int nextBatchFloats(
     if (actualBatchSize <= 0) {
       return 0;
     }
-    if (eagerDecodeDictionary) {
+    if (dictionaryDecodeMode == DictionaryDecodeMode.EAGER) {
       vectorizedDefinitionLevelReader.readBatchOfDictionaryEncodedFloats(
           vector,
           numValsInVector,
@@ -236,7 +243,7 @@ public int nextBatchDoubles(
     if (actualBatchSize <= 0) {
       return 0;
     }
-    if (eagerDecodeDictionary) {
+    if (dictionaryDecodeMode == DictionaryDecodeMode.EAGER) {
       vectorizedDefinitionLevelReader.readBatchOfDictionaryEncodedDoubles(
           vector,
           numValsInVector,
@@ -274,7 +281,7 @@ public int nextBatchIntLongBackedDecimal(
     if (actualBatchSize <= 0) {
       return 0;
     }
-    if (eagerDecodeDictionary) {
+    if (dictionaryDecodeMode == DictionaryDecodeMode.EAGER) {
       vectorizedDefinitionLevelReader
           .readBatchOfDictionaryEncodedIntLongBackedDecimals(
               vector,
@@ -312,7 +319,7 @@ public int nextBatchFixedLengthDecimal(
     if (actualBatchSize <= 0) {
       return 0;
     }
-    if (eagerDecodeDictionary) {
+    if (dictionaryDecodeMode == DictionaryDecodeMode.EAGER) {
       vectorizedDefinitionLevelReader.readBatchOfDictionaryEncodedFixedLengthDecimals(
           vector,
           numValsInVector,
@@ -347,7 +354,7 @@ public int nextBatchVarWidthType(
     if (actualBatchSize <= 0) {
       return 0;
     }
-    if (eagerDecodeDictionary) {
+    if (dictionaryDecodeMode == DictionaryDecodeMode.EAGER) {
       vectorizedDefinitionLevelReader.readBatchOfDictionaryEncodedVarWidth(
           vector,
           numValsInVector,
@@ -380,7 +387,7 @@ public int nextBatchFixedWidthBinary(
     if (actualBatchSize <= 0) {
       return 0;
     }
-    if (eagerDecodeDictionary) {
+    if (dictionaryDecodeMode == DictionaryDecodeMode.EAGER) {
       vectorizedDefinitionLevelReader.readBatchOfDictionaryEncodedFixedWidthBinary(
           vector,
           numValsInVector,
@@ -403,6 +410,10 @@ public int nextBatchFixedWidthBinary(
     return actualBatchSize;
   }
 
+  public boolean producesDictionaryEncodedVector() {
+    return dictionaryDecodeMode == DictionaryDecodeMode.LAZY; // false for NONE, EAGER, or if unset
+  }
+
   /**
    * Method for reading batches of booleans.
    */
@@ -426,8 +437,6 @@ public int nextBatchBoolean(
   @Override
   protected void initDataReader(Encoding dataEncoding, ByteBufferInputStream in, int valueCount) {
     ValuesReader previousReader = plainValuesReader;
-    this.eagerDecodeDictionary = dataEncoding.usesDictionary() && dictionary != null &&
-        (ParquetUtil.isIntType(desc.getPrimitiveType()) || !allPagesDictEncoded);
     if (dataEncoding.usesDictionary()) {
       if (dictionary == null) {
         throw new ParquetDecodingException(
@@ -437,12 +446,18 @@ protected void initDataReader(Encoding dataEncoding, ByteBufferInputStream in, i
         dictionaryEncodedValuesReader =
             new VectorizedDictionaryEncodedParquetValuesReader(desc.getMaxDefinitionLevel(), setArrowValidityVector);
         dictionaryEncodedValuesReader.initFromPage(valueCount, in);
+        if (ParquetUtil.isIntType(desc.getPrimitiveType()) || !allPagesDictEncoded) {
+          dictionaryDecodeMode = DictionaryDecodeMode.EAGER;
+        } else {
+          dictionaryDecodeMode = DictionaryDecodeMode.LAZY;
+        }
       } catch (IOException e) {
         throw new ParquetDecodingException("could not read page in col " + desc, e);
       }
     } else {
       plainValuesReader = new ValuesAsBytesReader();
       plainValuesReader.initFromPage(valueCount, in);
+      dictionaryDecodeMode = DictionaryDecodeMode.NONE;
     }
     if (CorruptDeltaByteArrays.requiresSequentialReads(writerVersion, dataEncoding) &&
         previousReader != null && previousReader instanceof RequiresPreviousReader) {
diff --git a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedParquetDefinitionLevelReader.java b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedParquetDefinitionLevelReader.java
index 86918f7de5d2..ae9879e56ffc 100644
--- a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedParquetDefinitionLevelReader.java
+++ b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedParquetDefinitionLevelReader.java
@@ -193,7 +193,7 @@ public void readBatchOfDictionaryEncodedLongs(
         case RLE:
           if (currentValue == maxDefLevel) {
             dictionaryEncodedValuesReader.readBatchOfDictionaryEncodedLongs(vector,
-                idx, numValues, dict, nullabilityHolder);
+                idx, numValues, dict, nullabilityHolder, typeWidth);
           } else {
             setNulls(nullabilityHolder, idx, numValues, validityBuffer);
           }
@@ -240,7 +240,7 @@ public void readBatchOfDictionaryEncodedTimestampMillis(
         case RLE:
           if (currentValue == maxDefLevel) {
             dictionaryEncodedValuesReader.readBatchOfDictionaryEncodedTimestampMillis(vector,
-                idx, numValues, dict, nullabilityHolder);
+                idx, numValues, dict, nullabilityHolder, typeWidth);
           } else {
             setNulls(nullabilityHolder, idx, numValues, validityBuffer);
           }
@@ -329,7 +329,7 @@ public void readBatchOfDictionaryEncodedIntegers(
         case RLE:
           if (currentValue == maxDefLevel) {
             dictionaryEncodedValuesReader.readBatchOfDictionaryEncodedIntegers(vector, idx,
-                num, dict, nullabilityHolder);
+                num, dict, nullabilityHolder, typeWidth);
           } else {
             setNulls(nullabilityHolder, idx, num, vector.getValidityBuffer());
           }
@@ -338,7 +338,8 @@ public void readBatchOfDictionaryEncodedIntegers(
         case PACKED:
           for (int i = 0; i < num; i++) {
             if (packedValuesBuffer[packedValuesBufferIdx++] == maxDefLevel) {
-              vector.getDataBuffer().setInt(idx, dict.decodeToInt(dictionaryEncodedValuesReader.readInteger()));
+              vector.getDataBuffer()
+                  .setInt(idx * typeWidth, dict.decodeToInt(dictionaryEncodedValuesReader.readInteger()));
               if (setArrowValidityVector) {
                 BitVectorHelper.setValidityBitToOne(vector.getValidityBuffer(), idx);
               } else {
@@ -418,7 +419,7 @@ public void readBatchOfDictionaryEncodedFloats(
         case RLE:
           if (currentValue == maxDefLevel) {
             dictionaryEncodedValuesReader.readBatchOfDictionaryEncodedFloats(vector, idx,
-                num, dict, nullabilityHolder);
+                num, dict, nullabilityHolder, typeWidth);
           } else {
             setNulls(nullabilityHolder, idx, num, validityBuffer);
           }
@@ -507,7 +508,7 @@ public void readBatchOfDictionaryEncodedDoubles(
         case RLE:
           if (currentValue == maxDefLevel) {
             dictionaryEncodedValuesReader.readBatchOfDictionaryEncodedDoubles(vector, idx,
-                num, dict, nullabilityHolder);
+                num, dict, nullabilityHolder, typeWidth);
           } else {
             setNulls(nullabilityHolder, idx, num, vector.getValidityBuffer());
           }
diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetUtil.java b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetUtil.java
index f92230c6eb1f..c4c8ebf30c12 100644
--- a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetUtil.java
+++ b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetUtil.java
@@ -280,6 +280,7 @@ public static boolean isIntType(PrimitiveType primitiveType) {
         case INT_8:
         case INT_16:
         case INT_32:
+        case DATE:
           return true;
         default:
           return false;
diff --git a/spark/src/main/java/org/apache/iceberg/spark/arrow/ArrowAllocation.java b/spark/src/main/java/org/apache/iceberg/spark/arrow/ArrowAllocation.java
new file mode 100644
index 000000000000..c1a38a7b7f11
--- /dev/null
+++ b/spark/src/main/java/org/apache/iceberg/spark/arrow/ArrowAllocation.java
@@ -0,0 +1,37 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.iceberg.spark.arrow;
+
+import org.apache.arrow.memory.RootAllocator;
+
+/**
+ * Holds the single Arrow {@link RootAllocator} instance handed out by {@link #rootAllocator()}.
+ */
+public class ArrowAllocation {
+  private static final RootAllocator ROOT_ALLOCATOR = new RootAllocator(Long.MAX_VALUE);
+
+  private ArrowAllocation() {
+  }
+
+  /** Returns the allocator created once when this class is initialized. */
+  public static RootAllocator rootAllocator() {
+    return ROOT_ALLOCATOR;
+  }
+}
diff --git a/spark/src/main/java/org/apache/iceberg/spark/arrow/ArrowUtils.java b/spark/src/main/java/org/apache/iceberg/spark/arrow/ArrowUtils.java
deleted file mode 100644
index 02fbc435dc81..000000000000
--- a/spark/src/main/java/org/apache/iceberg/spark/arrow/ArrowUtils.java
+++ /dev/null
@@ -1,113 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.iceberg.spark.arrow;
-
-import org.apache.arrow.memory.RootAllocator;
-import org.apache.arrow.vector.types.DateUnit;
-import org.apache.arrow.vector.types.FloatingPointPrecision;
-import org.apache.arrow.vector.types.TimeUnit;
-import org.apache.arrow.vector.types.pojo.ArrowType;
-import org.apache.arrow.vector.types.pojo.Field;
-import org.apache.spark.sql.types.ArrayType;
-import org.apache.spark.sql.types.DataType;
-import org.apache.spark.sql.types.DataTypes;
-import org.apache.spark.sql.types.DecimalType;
-import org.apache.spark.sql.types.Metadata;
-import org.apache.spark.sql.types.StructField;
-import org.apache.spark.sql.types.StructType;
-
-public class ArrowUtils {
-
-  private static ArrowUtils instance;
-  private RootAllocator rootAllocator;
-
-  private ArrowUtils() {
-    rootAllocator = new RootAllocator(Long.MAX_VALUE);
-  }
-
-  public static ArrowUtils instance() {
-    if (instance == null) {
-      instance = new ArrowUtils();
-    }
-    return instance;
-  }
-
-  public RootAllocator rootAllocator() {
-    return rootAllocator;
-  }
-
-  @SuppressWarnings("checkstyle:CyclomaticComplexity")
-  public DataType fromArrowType(ArrowType data) {
-
-    if (data instanceof ArrowType.Bool) {
-      return DataTypes.BooleanType;
-    } else if (data instanceof ArrowType.Int) {
-      ArrowType.Int intData = (ArrowType.Int) data;
-      if (intData.getIsSigned() && intData.getBitWidth() == 8) {
-        return DataTypes.ByteType;
-      } else if (intData.getIsSigned() && intData.getBitWidth() == 8 * 2) {
-        return DataTypes.ShortType;
-      } else if (intData.getIsSigned() && intData.getBitWidth() == 8 * 4) {
-        return DataTypes.IntegerType;
-      } else if (intData.getIsSigned() && intData.getBitWidth() == 8 * 8) {
-        return DataTypes.LongType;
-      }
-    } else if (data instanceof ArrowType.FloatingPoint) {
-      ArrowType.FloatingPoint floatData = (ArrowType.FloatingPoint) data;
-      if (floatData.getPrecision() == FloatingPointPrecision.SINGLE) {
-        return DataTypes.FloatType;
-      } else if (floatData.getPrecision() == FloatingPointPrecision.DOUBLE) {
-        return DataTypes.DoubleType;
-      }
-    } else if (data instanceof ArrowType.Utf8) {
-      return DataTypes.StringType;
-    } else if (data instanceof ArrowType.Binary) {
-      return DataTypes.BinaryType;
-    } else if (data instanceof ArrowType.Decimal) {
-      ArrowType.Decimal decimalData = (ArrowType.Decimal) data;
-      return new DecimalType(decimalData.getPrecision(), decimalData.getScale());
-    } else if (data instanceof ArrowType.Date && ((ArrowType.Date) data).getUnit() == DateUnit.DAY) {
-      return DataTypes.DateType;
-    } else if (data instanceof ArrowType.Timestamp && ((ArrowType.Timestamp) data).getUnit() == TimeUnit.MICROSECOND) {
-      return DataTypes.TimestampType;
-    }
-
-    throw new UnsupportedOperationException("Unsupported data type: " + data);
-  }
-
-  public DataType fromArrowField(Field field) {
-    ArrowType arrowType = field.getType();
-    if (arrowType instanceof ArrowType.List) {
-      Field elementField = field.getChildren().get(0);
-      DataType elementType = fromArrowField(elementField);
-      return new ArrayType(elementType, elementField.isNullable());
-    } else if (arrowType instanceof ArrowType.Struct) {
-      StructField[] fields = new StructField[field.getChildren().size()];
-      int index = 0;
-      for (Field f : field.getChildren()) {
-        DataType dt = fromArrowField(f);
-        fields[index++] = new StructField(f.getName(), dt, f.isNullable(), Metadata.empty());
-      }
-      return new StructType(fields);
-    } else {
-      return fromArrowType(arrowType);
-    }
-  }
-}
diff --git a/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ArrowVectorAccessor.java b/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ArrowVectorAccessor.java
new file mode 100644
index 000000000000..15af1e53f35e
--- /dev/null
+++ b/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ArrowVectorAccessor.java
@@ -0,0 +1,86 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.iceberg.spark.data.vectorized;
+
+import org.apache.arrow.vector.ValueVector;
+import org.apache.spark.sql.types.Decimal;
+import org.apache.spark.sql.vectorized.ArrowColumnVector;
+import org.apache.spark.sql.vectorized.ColumnarArray;
+import org.apache.spark.unsafe.types.UTF8String;
+
+/** Accessor over one Arrow vector; every typed getter throws UnsupportedOperationException by default. */
+@SuppressWarnings("checkstyle:VisibilityModifier")
+abstract class ArrowVectorAccessor {
+  private final ValueVector vector; // closed by close()
+  ArrowColumnVector[] childColumns; // may stay null; when set, each entry is closed by close()
+
+  ArrowVectorAccessor(ValueVector vector) {
+    this.vector = vector;
+  }
+
+  final void close() {
+    vector.close();
+    if (childColumns != null) {
+      for (ArrowColumnVector child : childColumns) {
+        child.close();
+      }
+    }
+  }
+
+  boolean getBoolean(int rowId) {
+    throw new UnsupportedOperationException("Unsupported type: boolean");
+  }
+
+  int getInt(int rowId) {
+    throw new UnsupportedOperationException("Unsupported type: int");
+  }
+
+  long getLong(int rowId) {
+    throw new UnsupportedOperationException("Unsupported type: long");
+  }
+
+  float getFloat(int rowId) {
+    throw new UnsupportedOperationException("Unsupported type: float");
+  }
+
+  double getDouble(int rowId) {
+    throw new UnsupportedOperationException("Unsupported type: double");
+  }
+
+  Decimal getDecimal(int rowId, int precision, int scale) {
+    throw new UnsupportedOperationException("Unsupported type: decimal");
+  }
+
+  UTF8String getUTF8String(int rowId) {
+    throw new UnsupportedOperationException("Unsupported type: UTF8String");
+  }
+
+  byte[] getBinary(int rowId) {
+    throw new UnsupportedOperationException("Unsupported type: binary");
+  }
+
+  ColumnarArray getArray(int rowId) {
+    throw new UnsupportedOperationException("Unsupported type: array");
+  }
+
+  ArrowColumnVector[] childColumns() {
+    return childColumns;
+  }
+}
diff --git a/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ArrowVectorAccessors.java b/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ArrowVectorAccessors.java
new file mode 100644
index 000000000000..ba1d3eb959e0
--- /dev/null
+++ b/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ArrowVectorAccessors.java
@@ -0,0 +1,495 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.iceberg.spark.data.vectorized;
+
+import io.netty.buffer.ArrowBuf;
+import java.math.BigInteger;
+import org.apache.arrow.vector.BigIntVector;
+import org.apache.arrow.vector.BitVector;
+import org.apache.arrow.vector.DateDayVector;
+import org.apache.arrow.vector.FieldVector;
+import org.apache.arrow.vector.FixedSizeBinaryVector;
+import org.apache.arrow.vector.Float4Vector;
+import org.apache.arrow.vector.Float8Vector;
+import org.apache.arrow.vector.IntVector;
+import org.apache.arrow.vector.TimeStampMicroTZVector;
+import org.apache.arrow.vector.VarBinaryVector;
+import org.apache.arrow.vector.complex.ListVector;
+import org.apache.arrow.vector.complex.StructVector;
+import org.apache.arrow.vector.holders.NullableVarCharHolder;
+import org.apache.iceberg.arrow.vectorized.IcebergArrowVectors;
+import org.apache.iceberg.arrow.vectorized.VectorHolder;
+import org.apache.parquet.Preconditions;
+import org.apache.parquet.column.ColumnDescriptor;
+import org.apache.parquet.column.Dictionary;
+import org.apache.parquet.io.api.Binary;
+import org.apache.parquet.schema.PrimitiveType;
+import org.apache.spark.sql.types.Decimal;
+import org.apache.spark.sql.vectorized.ArrowColumnVector;
+import org.apache.spark.sql.vectorized.ColumnarArray;
+import org.apache.spark.unsafe.types.UTF8String;
+
+public class ArrowVectorAccessors {
+
+  private ArrowVectorAccessors() {} // holder for the static factory and accessor classes; never instantiated
+
+  /**
+   * Creates the accessor that matches how the holder's Arrow vector stores its values.
+   *
+   * <p>A dictionary-encoded holder must keep its ids in an {@link IntVector}; its accessor follows the Parquet
+   * logical type when set, else the physical type. Other holders are dispatched on the class of their vector.
+   *
+   * @param holder supplies the vector together with its column descriptor and dictionary
+   * @return an accessor over the holder's vector
+   * @throws UnsupportedOperationException if there is no accessor for the holder's type or vector class
+   */
+  @SuppressWarnings("checkstyle:CyclomaticComplexity")
+  static ArrowVectorAccessor getVectorAccessor(VectorHolder holder) {
+    Dictionary dictionary = holder.dictionary();
+    boolean isVectorDictEncoded = holder.isDictionaryEncoded();
+    ColumnDescriptor desc = holder.descriptor();
+    FieldVector vector = holder.vector();
+    PrimitiveType primitive = desc.getPrimitiveType();
+    if (isVectorDictEncoded) {
+      Preconditions.checkState(vector instanceof IntVector, "Dictionary ids should be stored in IntVectors only");
+      IntVector dictIds = (IntVector) vector;
+      if (primitive.getOriginalType() == null) {
+        switch (primitive.getPrimitiveTypeName()) {
+          case FIXED_LEN_BYTE_ARRAY:
+          case BINARY:
+            return new DictionaryBinaryAccessor(dictIds, dictionary);
+          case FLOAT:
+            return new DictionaryFloatAccessor(dictIds, dictionary);
+          case INT64:
+            return new DictionaryLongAccessor(dictIds, dictionary);
+          case DOUBLE:
+            return new DictionaryDoubleAccessor(dictIds, dictionary);
+          default:
+            throw new UnsupportedOperationException("Unsupported type: " + primitive);
+        }
+      }
+      switch (primitive.getOriginalType()) {
+        case ENUM:
+        case JSON:
+        case UTF8:
+        case BSON:
+          return new DictionaryStringAccessor(dictIds, dictionary);
+        case INT_64:
+        case TIMESTAMP_MILLIS:
+        case TIMESTAMP_MICROS:
+          return new DictionaryLongAccessor(dictIds, dictionary);
+        case DECIMAL:
+          switch (primitive.getPrimitiveTypeName()) {
+            case BINARY:
+            case FIXED_LEN_BYTE_ARRAY:
+              return new DictionaryDecimalBinaryAccessor(dictIds, dictionary);
+            case INT64:
+              return new DictionaryDecimalLongAccessor(dictIds, dictionary);
+            case INT32:
+              return new DictionaryDecimalIntAccessor(dictIds, dictionary);
+            default:
+              throw new UnsupportedOperationException(
+                  "Unsupported base type for decimal: " + primitive.getPrimitiveTypeName());
+          }
+        default:
+          throw new UnsupportedOperationException("Unsupported logical type: " + primitive.getOriginalType());
+      }
+    }
+    if (vector instanceof BitVector) {
+      return new BooleanAccessor((BitVector) vector);
+    } else if (vector instanceof IntVector) {
+      return new IntAccessor((IntVector) vector);
+    } else if (vector instanceof BigIntVector) {
+      return new LongAccessor((BigIntVector) vector);
+    } else if (vector instanceof Float4Vector) {
+      return new FloatAccessor((Float4Vector) vector);
+    } else if (vector instanceof Float8Vector) {
+      return new DoubleAccessor((Float8Vector) vector);
+    } else if (vector instanceof IcebergArrowVectors.DecimalArrowVector) {
+      return new DecimalAccessor((IcebergArrowVectors.DecimalArrowVector) vector);
+    } else if (vector instanceof IcebergArrowVectors.VarcharArrowVector) {
+      return new StringAccessor((IcebergArrowVectors.VarcharArrowVector) vector);
+    } else if (vector instanceof IcebergArrowVectors.VarBinaryArrowVector) {
+      return new BinaryAccessor((IcebergArrowVectors.VarBinaryArrowVector) vector);
+    } else if (vector instanceof DateDayVector) {
+      return new DateAccessor((DateDayVector) vector);
+    } else if (vector instanceof TimeStampMicroTZVector) {
+      return new TimestampAccessor((TimeStampMicroTZVector) vector);
+    } else if (vector instanceof ListVector) {
+      return new ArrayAccessor((ListVector) vector);
+    } else if (vector instanceof StructVector) {
+      return new StructAccessor((StructVector) vector);
+    }
+    throw new UnsupportedOperationException("Unsupported type: " + primitive);
+  }
+
+  /** Reads booleans from a {@link BitVector}, treating a stored 1 as true. */
+  private static class BooleanAccessor extends ArrowVectorAccessor {
+    private final BitVector bits;
+
+    BooleanAccessor(BitVector bits) {
+      super(bits);
+      this.bits = bits;
+    }
+
+    @Override
+    final boolean getBoolean(int rowId) {
+      return bits.get(rowId) == 1;
+    }
+  }
+
+  /** Reads int values directly from an {@link IntVector}. */
+  private static class IntAccessor extends ArrowVectorAccessor {
+    private final IntVector ints;
+
+    IntAccessor(IntVector ints) {
+      super(ints);
+      this.ints = ints;
+    }
+
+    @Override
+    final int getInt(int rowId) {
+      return ints.get(rowId);
+    }
+  }
+
+  /** Reads long values directly from a {@link BigIntVector}. */
+  private static class LongAccessor extends ArrowVectorAccessor {
+    private final BigIntVector longs;
+
+    LongAccessor(BigIntVector longs) {
+      super(longs);
+      this.longs = longs;
+    }
+
+    @Override
+    final long getLong(int rowId) {
+      return longs.get(rowId);
+    }
+  }
+
+  /**
+   * Resolves long values by looking up each row's dictionary id in the Parquet dictionary.
+   */
+  private static class DictionaryLongAccessor extends DictionaryArrowVectorAccessor {
+    DictionaryLongAccessor(IntVector vector, Dictionary dictionary) {
+      // the ids vector is kept by the parent as dictionaryVector, so no second reference is stored here
+      super(vector, dictionary);
+    }
+
+    @Override
+    final long getLong(int rowId) {
+      return parquetDictionary.decodeToLong(dictionaryVector.get(rowId));
+    }
+  }
+
+  /** Reads float values directly from a {@link Float4Vector}. */
+  private static class FloatAccessor extends ArrowVectorAccessor {
+    private final Float4Vector floats;
+
+    FloatAccessor(Float4Vector floats) {
+      super(floats);
+      this.floats = floats;
+    }
+
+    @Override
+    final float getFloat(int rowId) {
+      return floats.get(rowId);
+    }
+  }
+
+  /**
+   * Resolves float values by looking up each row's dictionary id in the Parquet dictionary.
+   */
+  private static class DictionaryFloatAccessor extends DictionaryArrowVectorAccessor {
+    DictionaryFloatAccessor(IntVector vector, Dictionary dictionary) {
+      // the ids vector is kept by the parent as dictionaryVector, so no second reference is stored here
+      super(vector, dictionary);
+    }
+
+    @Override
+    final float getFloat(int rowId) {
+      return parquetDictionary.decodeToFloat(dictionaryVector.get(rowId));
+    }
+  }
+
+  /** Reads double values directly from a {@link Float8Vector}. */
+  private static class DoubleAccessor extends ArrowVectorAccessor {
+    private final Float8Vector doubles;
+
+    DoubleAccessor(Float8Vector doubles) {
+      super(doubles);
+      this.doubles = doubles;
+    }
+
+    @Override
+    final double getDouble(int rowId) {
+      return doubles.get(rowId);
+    }
+  }
+
+  /**
+   * Resolves double values by looking up each row's dictionary id in the Parquet dictionary.
+   */
+  private static class DictionaryDoubleAccessor extends DictionaryArrowVectorAccessor {
+    DictionaryDoubleAccessor(IntVector vector, Dictionary dictionary) {
+      // the ids vector is kept by the parent as dictionaryVector, so no second reference is stored here
+      super(vector, dictionary);
+    }
+
+    @Override
+    final double getDouble(int rowId) {
+      return parquetDictionary.decodeToDouble(dictionaryVector.get(rowId));
+    }
+  }
+
+  /** Reads decimals from a decimal vector, applying the precision and scale passed by the caller. */
+  private static class DecimalAccessor extends ArrowVectorAccessor {
+    private final IcebergArrowVectors.DecimalArrowVector decimals;
+
+    DecimalAccessor(IcebergArrowVectors.DecimalArrowVector decimals) {
+      super(decimals);
+      this.decimals = decimals;
+    }
+
+    @Override
+    final Decimal getDecimal(int rowId, int precision, int scale) {
+      return Decimal.apply(decimals.getObject(rowId), precision, scale);
+    }
+  }
+
+  /** Reads strings from a varchar vector as {@link UTF8String}s that address the Arrow buffer directly. */
+  private static class StringAccessor extends ArrowVectorAccessor {
+    private final IcebergArrowVectors.VarcharArrowVector strings;
+    private final NullableVarCharHolder slot = new NullableVarCharHolder();
+
+    StringAccessor(IcebergArrowVectors.VarcharArrowVector strings) {
+      super(strings);
+      this.strings = strings;
+    }
+
+    @Override
+    final UTF8String getUTF8String(int rowId) {
+      // the single holder is refilled on every call instead of allocating one per read
+      strings.get(rowId, slot);
+      if (slot.isSet == 0) {
+        return null;
+      }
+      long address = slot.buffer.memoryAddress() + slot.start;
+      int length = slot.end - slot.start;
+      // a null base object makes the string refer to the raw address rather than to a heap array
+      return UTF8String.fromAddress(null, address, length);
+    }
+  }
+
+  @SuppressWarnings("checkstyle:VisibilityModifier")
+  private abstract static class DictionaryArrowVectorAccessor extends ArrowVectorAccessor {
+    final Dictionary parquetDictionary; // dictionary that subclasses decode ids against
+    final IntVector dictionaryVector; // vector holding one dictionary id per row
+
+    private DictionaryArrowVectorAccessor(IntVector ids, Dictionary dict) {
+      super(ids);
+      this.parquetDictionary = dict;
+      this.dictionaryVector = ids;
+    }
+  }
+
+  /** Resolves strings by decoding each row's dictionary id to its binary dictionary entry. */
+  private static class DictionaryStringAccessor extends DictionaryArrowVectorAccessor {
+    DictionaryStringAccessor(IntVector vector, Dictionary dictionary) {
+      super(vector, dictionary);
+    }
+
+    @Override
+    final UTF8String getUTF8String(int rowId) {
+      Binary entry = parquetDictionary.decodeToBinary(dictionaryVector.get(rowId));
+      return UTF8String.fromBytes(entry.getBytesUnsafe());
+    }
+  }
+
+  /** Returns the bytes stored at a row of a {@link FixedSizeBinaryVector}. */
+  private static class FixedSizeBinaryAccessor extends ArrowVectorAccessor {
+    private final FixedSizeBinaryVector fixedBytes;
+
+    FixedSizeBinaryAccessor(FixedSizeBinaryVector fixedBytes) {
+      super(fixedBytes);
+      this.fixedBytes = fixedBytes;
+    }
+
+    @Override
+    final byte[] getBinary(int rowId) {
+      return fixedBytes.getObject(rowId);
+    }
+  }
+
+  /** Returns the bytes stored at a row of a {@link VarBinaryVector}. */
+  private static class BinaryAccessor extends ArrowVectorAccessor {
+    private final VarBinaryVector varBytes;
+
+    BinaryAccessor(VarBinaryVector varBytes) {
+      super(varBytes);
+      this.varBytes = varBytes;
+    }
+
+    @Override
+    final byte[] getBinary(int rowId) {
+      return varBytes.getObject(rowId);
+    }
+  }
+
+  /** Resolves binary values by decoding each row's dictionary id to its dictionary entry. */
+  private static class DictionaryBinaryAccessor extends DictionaryArrowVectorAccessor {
+    DictionaryBinaryAccessor(IntVector vector, Dictionary dictionary) {
+      super(vector, dictionary);
+    }
+
+    @Override
+    final byte[] getBinary(int rowId) {
+      Binary entry = parquetDictionary.decodeToBinary(dictionaryVector.get(rowId));
+      return entry.getBytesUnsafe();
+    }
+  }
+
+  /** Reads date values as ints from a {@link DateDayVector}. */
+  private static class DateAccessor extends ArrowVectorAccessor {
+    private final DateDayVector dates;
+
+    DateAccessor(DateDayVector dates) {
+      super(dates);
+      this.dates = dates;
+    }
+
+    @Override
+    final int getInt(int rowId) {
+      return dates.get(rowId);
+    }
+  }
+
+  /** Reads timestamp values as longs from a {@link TimeStampMicroTZVector}. */
+  private static class TimestampAccessor extends ArrowVectorAccessor {
+    private final TimeStampMicroTZVector timestamps;
+
+    TimestampAccessor(TimeStampMicroTZVector timestamps) {
+      super(timestamps);
+      this.timestamps = timestamps;
+    }
+
+    @Override
+    final long getLong(int rowId) {
+      return timestamps.get(rowId);
+    }
+  }
+
+  /** Exposes each row of a {@link ListVector} as a {@link ColumnarArray} slice of the list's data vector. */
+  private static class ArrayAccessor extends ArrowVectorAccessor {
+    private final ListVector lists;
+    private final ArrowColumnVector elements;
+
+    ArrayAccessor(ListVector lists) {
+      super(lists);
+      this.lists = lists;
+      this.elements = new ArrowColumnVector(lists.getDataVector());
+    }
+
+    @Override
+    final ColumnarArray getArray(int rowId) {
+      ArrowBuf offsetBuf = lists.getOffsetBuffer();
+      int offsetPos = rowId * ListVector.OFFSET_WIDTH;
+      int first = offsetBuf.getInt(offsetPos);
+      int length = offsetBuf.getInt(offsetPos + ListVector.OFFSET_WIDTH) - first;
+      return new ColumnarArray(elements, first, length);
+    }
+  }
+
+  /**
+   * Wraps every child of a struct vector in an {@link ArrowColumnVector}. The struct values are read through those
+   * children, which callers obtain via {@link IcebergArrowColumnVector#getChild(int)}.
+   */
+  private static class StructAccessor extends ArrowVectorAccessor {
+    StructAccessor(StructVector structVector) {
+      super(structVector);
+      childColumns = new ArrowColumnVector[structVector.size()];
+      for (int ordinal = 0; ordinal < childColumns.length; ordinal += 1) {
+        childColumns[ordinal] = new ArrowColumnVector(structVector.getVectorById(ordinal));
+      }
+    }
+  }
+
+  @SuppressWarnings("checkstyle:VisibilityModifier")
+  private abstract static class DictionaryDecimalAccessor extends DictionaryArrowVectorAccessor {
+    final Decimal[] cache; // one slot per dictionary id, sized from the dictionary's max id
+
+    private DictionaryDecimalAccessor(IntVector ids, Dictionary dict) {
+      super(ids, dict);
+      this.cache = new Decimal[dict.getMaxId() + 1];
+    }
+  }
+
+  /** Decimals backed by binary dictionary entries; values too wide for a long are decoded via BigDecimal. */
+  private static class DictionaryDecimalBinaryAccessor extends DictionaryDecimalAccessor {
+    DictionaryDecimalBinaryAccessor(IntVector vector, Dictionary dictionary) {
+      super(vector, dictionary);
+    }
+
+    @Override
+    final Decimal getDecimal(int rowId, int precision, int scale) {
+      int dictId = dictionaryVector.get(rowId);
+      if (cache[dictId] == null) {
+        BigInteger unscaled = new BigInteger(parquetDictionary.decodeToBinary(dictId).getBytesUnsafe());
+        cache[dictId] = precision <= Decimal.MAX_LONG_DIGITS() ?
+            Decimal.apply(unscaled.longValue(), precision, scale) :
+            Decimal.apply(new java.math.BigDecimal(unscaled, scale), precision, scale);
+      }
+      return cache[dictId];
+    }
+  }
+
+  /** Decimals whose unscaled value is a long dictionary entry; each id is decoded once and then cached. */
+  private static class DictionaryDecimalLongAccessor extends DictionaryDecimalAccessor {
+    DictionaryDecimalLongAccessor(IntVector vector, Dictionary dictionary) {
+      super(vector, dictionary);
+    }
+
+    @Override
+    final Decimal getDecimal(int rowId, int precision, int scale) {
+      int id = dictionaryVector.get(rowId);
+      if (cache[id] == null) {
+        cache[id] = Decimal.apply(parquetDictionary.decodeToLong(id), precision, scale);
+      }
+      return cache[id];
+    }
+  }
+
+  /** Decimals whose unscaled value is an int dictionary entry; each id is decoded once and then cached. */
+  private static class DictionaryDecimalIntAccessor extends DictionaryDecimalAccessor {
+    DictionaryDecimalIntAccessor(IntVector vector, Dictionary dictionary) {
+      super(vector, dictionary);
+    }
+
+    @Override
+    final Decimal getDecimal(int rowId, int precision, int scale) {
+      int id = dictionaryVector.get(rowId);
+      if (cache[id] == null) {
+        cache[id] = Decimal.apply(parquetDictionary.decodeToInt(id), precision, scale);
+      }
+      return cache[id];
+    }
+  }
+}
diff --git a/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ColumnarBatchReaders.java b/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ColumnarBatchReaders.java
index 478b5a9b8c50..81545a7b51ae 100644
--- a/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ColumnarBatchReaders.java
+++ b/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ColumnarBatchReaders.java
@@ -22,10 +22,10 @@
 import java.lang.reflect.Array;
 import java.util.List;
 import java.util.Map;
-import org.apache.arrow.vector.FieldVector;
 import org.apache.iceberg.arrow.vectorized.VectorHolder;
 import org.apache.iceberg.arrow.vectorized.VectorizedArrowReader;
 import org.apache.iceberg.parquet.VectorizedReader;
+import org.apache.parquet.Preconditions;
 import org.apache.parquet.column.page.PageReadStore;
 import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData;
 import org.apache.parquet.hadoop.metadata.ColumnPath;
@@ -39,9 +39,8 @@
  */
 public class ColumnarBatchReaders implements VectorizedReader<ColumnarBatch> {
   private final VectorizedArrowReader[] readers;
-  private final int batchSize;
 
-  public ColumnarBatchReaders(List<VectorizedReader> readers, int bSize) {
+  public ColumnarBatchReaders(List<VectorizedReader> readers) {
     this.readers = (VectorizedArrowReader[]) Array.newInstance(
         VectorizedArrowReader.class, readers.size());
     int idx = 0;
@@ -49,7 +48,6 @@ public ColumnarBatchReaders(List<VectorizedReader> readers, int bSize) {
       this.readers[idx] = (VectorizedArrowReader) reader;
       idx++;
     }
-    this.batchSize = bSize;
   }
 
   @Override
@@ -69,21 +67,28 @@ public void reuseContainers(boolean reuse) {
   }
 
   @Override
-  public final ColumnarBatch read(int numValsToRead) {
+  public final ColumnarBatch read(int numRowsToRead) {
+    Preconditions.checkArgument(numRowsToRead > 0, "Invalid value: " + numRowsToRead);
     ColumnVector[] arrowColumnVectors = new ColumnVector[readers.length];
-    int numRows = 0;
+    int prevNum = 0;
     for (int i = 0; i < readers.length; i += 1) {
-      VectorHolder holder = readers[i].read(numValsToRead);
-      FieldVector vector = holder.vector();
-      if (vector == null) {
-        arrowColumnVectors[i] = new NullValuesColumnVector(batchSize);
+      VectorHolder holder = readers[i].read(numRowsToRead);
+      int numRowsInVector = holder.numValues();
+      Preconditions.checkState(
+          numRowsInVector == numRowsToRead,
+          "Number of rows in the vector " + numRowsInVector + " didn't match expected " +
+              numRowsToRead);
+      if (prevNum > 0) {
+        // assert that all the vectors in the batch have the same number of rows
+        Preconditions.checkState(numRowsInVector == prevNum, "Number of rows in arrow vectors didn't match " +
+            "for " + readers[i - 1] + " and " + readers[i]);
       } else {
-        arrowColumnVectors[i] = new IcebergArrowColumnVector(holder);
-        numRows = vector.getValueCount();
+        prevNum = numRowsInVector;
       }
+      arrowColumnVectors[i] = IcebergArrowColumnVector.forHolder(holder, numRowsInVector);
     }
     ColumnarBatch batch = new ColumnarBatch(arrowColumnVectors);
-    batch.setNumRows(numRows);
+    batch.setNumRows(numRowsToRead);
     return batch;
   }
 
diff --git a/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/IcebergArrowColumnVector.java b/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/IcebergArrowColumnVector.java
index ac4002a34ec9..5f18e9d36792 100644
--- a/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/IcebergArrowColumnVector.java
+++ b/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/IcebergArrowColumnVector.java
@@ -19,33 +19,10 @@
 
 package org.apache.iceberg.spark.data.vectorized;
 
-import io.netty.buffer.ArrowBuf;
-import java.math.BigInteger;
-import org.apache.arrow.vector.BigIntVector;
-import org.apache.arrow.vector.BitVector;
-import org.apache.arrow.vector.DateDayVector;
-import org.apache.arrow.vector.FixedSizeBinaryVector;
-import org.apache.arrow.vector.Float4Vector;
-import org.apache.arrow.vector.Float8Vector;
-import org.apache.arrow.vector.IntVector;
-import org.apache.arrow.vector.SmallIntVector;
-import org.apache.arrow.vector.TimeStampMicroTZVector;
-import org.apache.arrow.vector.TinyIntVector;
-import org.apache.arrow.vector.ValueVector;
-import org.apache.arrow.vector.VarBinaryVector;
-import org.apache.arrow.vector.complex.ListVector;
-import org.apache.arrow.vector.complex.StructVector;
-import org.apache.arrow.vector.holders.NullableVarCharHolder;
-import org.apache.iceberg.arrow.vectorized.IcebergArrowVectors;
 import org.apache.iceberg.arrow.vectorized.NullabilityHolder;
 import org.apache.iceberg.arrow.vectorized.VectorHolder;
-import org.apache.iceberg.spark.arrow.ArrowUtils;
+import org.apache.iceberg.spark.SparkSchemaUtil;
 import org.apache.parquet.Preconditions;
-import org.apache.parquet.column.ColumnDescriptor;
-import org.apache.parquet.column.Dictionary;
-import org.apache.parquet.io.api.Binary;
-import org.apache.parquet.schema.DecimalMetadata;
-import org.apache.parquet.schema.PrimitiveType;
 import org.apache.spark.sql.types.Decimal;
 import org.apache.spark.sql.vectorized.ArrowColumnVector;
 import org.apache.spark.sql.vectorized.ColumnVector;
@@ -58,32 +35,19 @@
  * {@link ArrowColumnVector} The main difference is in how nullability checks are made in this class by relying on
  * {@link NullabilityHolder} instead of the validity vector in the Arrow vector.
  */
-
 public class IcebergArrowColumnVector extends ColumnVector {
 
   private final ArrowVectorAccessor accessor;
   private final NullabilityHolder nullabilityHolder;
-  private final Dictionary dictionary;
-  private final boolean isVectorDictEncoded;
-  private ArrowColumnVector[] childColumns;
 
   public IcebergArrowColumnVector(VectorHolder holder) {
-    super(ArrowUtils.instance().fromArrowField(holder.vector().getField()));
+    super(SparkSchemaUtil.convert(holder.icebergType()));
     this.nullabilityHolder = holder.nullabilityHolder();
-    this.dictionary = holder.dictionary();
-    this.isVectorDictEncoded = holder.isDictionaryEncoded();
-    this.accessor = getVectorAccessor(holder.descriptor(), holder.vector());
+    this.accessor = ArrowVectorAccessors.getVectorAccessor(holder);
   }
 
   @Override
   public void close() {
-    if (childColumns != null) {
-      for (int i = 0; i < childColumns.length; i++) {
-        childColumns[i].close();
-        childColumns[i] = null;
-      }
-      childColumns = null;
-    }
     accessor.close();
   }
 
@@ -109,12 +73,12 @@ public boolean getBoolean(int rowId) {
 
   @Override
   public byte getByte(int rowId) {
-    return accessor.getByte(rowId);
+    throw new UnsupportedOperationException("Unsupported type - byte");
   }
 
   @Override
   public short getShort(int rowId) {
-    return accessor.getShort(rowId);
+    throw new UnsupportedOperationException("Unsupported type - short");
   }
 
   @Override
@@ -147,7 +111,7 @@ public ColumnarArray getArray(int rowId) {
 
   @Override
   public ColumnarMap getMap(int rowId) {
-    throw new UnsupportedOperationException();
+    throw new UnsupportedOperationException("Unsupported type - map");
   }
 
   @Override
@@ -176,578 +140,14 @@ public byte[] getBinary(int rowId) {
 
   @Override
   public ArrowColumnVector getChild(int ordinal) {
+    ArrowColumnVector[] childColumns = accessor.childColumns();
+    Preconditions.checkArgument(childColumns != null && ordinal < childColumns.length, "Invalid call for getChild() " +
+        "with ordinal " + ordinal);
     return childColumns[ordinal];
   }
 
-  private abstract class ArrowVectorAccessor {
-
-    private final ValueVector vector;
-
-    ArrowVectorAccessor(ValueVector vector) {
-      this.vector = vector;
-    }
-
-    final boolean isNullAt(int rowId) {
-      return nullabilityHolder.isNullAt(rowId) == 1;
-    }
-
-    final void close() {
-      vector.close();
-    }
-
-    boolean getBoolean(int rowId) {
-      throw new UnsupportedOperationException();
-    }
-
-    byte getByte(int rowId) {
-      throw new UnsupportedOperationException();
-    }
-
-    short getShort(int rowId) {
-      throw new UnsupportedOperationException();
-    }
-
-    int getInt(int rowId) {
-      throw new UnsupportedOperationException();
-    }
-
-    long getLong(int rowId) {
-      throw new UnsupportedOperationException();
-    }
-
-    float getFloat(int rowId) {
-      throw new UnsupportedOperationException();
-    }
-
-    double getDouble(int rowId) {
-      throw new UnsupportedOperationException();
-    }
-
-    Decimal getDecimal(int rowId, int precision, int scale) {
-      throw new UnsupportedOperationException();
-    }
-
-    UTF8String getUTF8String(int rowId) {
-      throw new UnsupportedOperationException();
-    }
-
-    byte[] getBinary(int rowId) {
-      throw new UnsupportedOperationException();
-    }
-
-    ColumnarArray getArray(int rowId) {
-      throw new UnsupportedOperationException();
-    }
-  }
-
-  @SuppressWarnings("checkstyle:CyclomaticComplexity")
-  private ArrowVectorAccessor getVectorAccessor(ColumnDescriptor desc, ValueVector vector) {
-    PrimitiveType primitive = desc.getPrimitiveType();
-    if (isVectorDictEncoded) {
-      Preconditions.checkState(vector instanceof IntVector, "Dictionary ids should be stored in IntVectors only");
-      if (primitive.getOriginalType() != null) {
-        switch (desc.getPrimitiveType().getOriginalType()) {
-          case ENUM:
-          case JSON:
-          case UTF8:
-          case BSON:
-            return new DictionaryStringAccessor((IntVector) vector);
-          case INT_8:
-          case INT_16:
-          case INT_32:
-          case DATE:
-            return new DictionaryIntAccessor((IntVector) vector);
-          case INT_64:
-          case TIMESTAMP_MILLIS:
-          case TIMESTAMP_MICROS:
-            return new DictionaryLongAccessor((IntVector) vector);
-          case DECIMAL:
-            DecimalMetadata decimal = primitive.getDecimalMetadata();
-            switch (primitive.getPrimitiveTypeName()) {
-              case BINARY:
-              case FIXED_LEN_BYTE_ARRAY:
-                return new DictionaryDecimalBinaryAccessor(
-                    (IntVector) vector,
-                    decimal.getPrecision(),
-                    decimal.getScale());
-              case INT64:
-                return new DictionaryDecimalLongAccessor(
-                    (IntVector) vector,
-                    decimal.getPrecision(),
-                    decimal.getScale());
-              case INT32:
-                return new DictionaryDecimalIntAccessor(
-                    (IntVector) vector,
-                    decimal.getPrecision(),
-                    decimal.getScale());
-              default:
-                throw new UnsupportedOperationException(
-                    "Unsupported base type for decimal: " + primitive.getPrimitiveTypeName());
-            }
-          default:
-            throw new UnsupportedOperationException(
-                "Unsupported logical type: " + primitive.getOriginalType());
-        }
-      } else {
-        switch (primitive.getPrimitiveTypeName()) {
-          case FIXED_LEN_BYTE_ARRAY:
-          case BINARY:
-            return new DictionaryBinaryAccessor((IntVector) vector);
-          case INT32:
-            return new DictionaryIntAccessor((IntVector) vector);
-          case FLOAT:
-            return new DictionaryFloatAccessor((IntVector) vector);
-          case INT64:
-            return new DictionaryLongAccessor((IntVector) vector);
-          case DOUBLE:
-            return new DictionaryDoubleAccessor((IntVector) vector);
-          default:
-            throw new UnsupportedOperationException("Unsupported type: " + primitive);
-        }
-      }
-    } else {
-      if (vector instanceof BitVector) {
-        return new BooleanAccessor((BitVector) vector);
-      } else if (vector instanceof TinyIntVector) {
-        return new ByteAccessor((TinyIntVector) vector);
-      } else if (vector instanceof SmallIntVector) {
-        return new ShortAccessor((SmallIntVector) vector);
-      } else if (vector instanceof IntVector) {
-        return new IntAccessor((IntVector) vector);
-      } else if (vector instanceof BigIntVector) {
-        return new LongAccessor((BigIntVector) vector);
-      } else if (vector instanceof Float4Vector) {
-        return new FloatAccessor((Float4Vector) vector);
-      } else if (vector instanceof Float8Vector) {
-        return new DoubleAccessor((Float8Vector) vector);
-      } else if (vector instanceof IcebergArrowVectors.DecimalArrowVector) {
-        return new DecimalAccessor((IcebergArrowVectors.DecimalArrowVector) vector);
-      } else if (vector instanceof IcebergArrowVectors.VarcharArrowVector) {
-        return new StringAccessor((IcebergArrowVectors.VarcharArrowVector) vector);
-      } else if (vector instanceof IcebergArrowVectors.VarBinaryArrowVector) {
-        return new BinaryAccessor((IcebergArrowVectors.VarBinaryArrowVector) vector);
-      } else if (vector instanceof DateDayVector) {
-        return new DateAccessor((DateDayVector) vector);
-      } else if (vector instanceof TimeStampMicroTZVector) {
-        return new TimestampAccessor((TimeStampMicroTZVector) vector);
-      } else if (vector instanceof ListVector) {
-        ListVector listVector = (ListVector) vector;
-        return new ArrayAccessor(listVector);
-      } else if (vector instanceof StructVector) {
-        StructVector structVector = (StructVector) vector;
-        ArrowVectorAccessor structAccessor = new StructAccessor(structVector);
-        childColumns = new ArrowColumnVector[structVector.size()];
-        for (int i = 0; i < childColumns.length; ++i) {
-          childColumns[i] = new ArrowColumnVector(structVector.getVectorById(i));
-        }
-        return structAccessor;
-      }
-    }
-    throw new UnsupportedOperationException();
-  }
-
-  private class BooleanAccessor extends ArrowVectorAccessor {
-
-    private final BitVector vector;
-
-    BooleanAccessor(BitVector vector) {
-      super(vector);
-      this.vector = vector;
-    }
-
-    @Override
-    final boolean getBoolean(int rowId) {
-      return vector.get(rowId) == 1;
-    }
-  }
-
-  private class ByteAccessor extends ArrowVectorAccessor {
-
-    private final TinyIntVector vector;
-
-    ByteAccessor(TinyIntVector vector) {
-      super(vector);
-      this.vector = vector;
-    }
-
-    @Override
-    final byte getByte(int rowId) {
-      return vector.get(rowId);
-    }
-  }
-
-  private class ShortAccessor extends ArrowVectorAccessor {
-
-    private final SmallIntVector vector;
-
-    ShortAccessor(SmallIntVector vector) {
-      super(vector);
-      this.vector = vector;
-    }
-
-    @Override
-    final short getShort(int rowId) {
-      return vector.get(rowId);
-    }
-  }
-
-  private class IntAccessor extends ArrowVectorAccessor {
-
-    private final IntVector vector;
-
-    IntAccessor(IntVector vector) {
-      super(vector);
-      this.vector = vector;
-    }
-
-    @Override
-    final int getInt(int rowId) {
-      return vector.get(rowId);
-    }
-  }
-
-  private class DictionaryIntAccessor extends ArrowVectorAccessor {
-
-    private final IntVector vector;
-
-    DictionaryIntAccessor(IntVector vector) {
-      super(vector);
-      this.vector = vector;
-    }
-
-    @Override
-    final int getInt(int rowId) {
-      return dictionary.decodeToInt(vector.get(rowId));
-    }
-  }
-
-  private class LongAccessor extends ArrowVectorAccessor {
-
-    private final BigIntVector vector;
-
-    LongAccessor(BigIntVector vector) {
-      super(vector);
-      this.vector = vector;
-    }
-
-    @Override
-    final long getLong(int rowId) {
-      return vector.get(rowId);
-    }
-  }
-
-  private class DictionaryLongAccessor extends ArrowVectorAccessor {
-
-    private final IntVector vector;
-
-    DictionaryLongAccessor(IntVector vector) {
-      super(vector);
-      this.vector = vector;
-    }
-
-    @Override
-    final long getLong(int rowId) {
-      return dictionary.decodeToLong(vector.get(rowId));
-    }
-  }
-
-  private class FloatAccessor extends ArrowVectorAccessor {
-
-    private final Float4Vector vector;
-
-    FloatAccessor(Float4Vector vector) {
-      super(vector);
-      this.vector = vector;
-    }
-
-    @Override
-    final float getFloat(int rowId) {
-      return vector.get(rowId);
-    }
-  }
-
-  private class DictionaryFloatAccessor extends ArrowVectorAccessor {
-
-    private final IntVector vector;
-
-    DictionaryFloatAccessor(IntVector vector) {
-      super(vector);
-      this.vector = vector;
-    }
-
-    @Override
-    final float getFloat(int rowId) {
-      return dictionary.decodeToFloat(vector.get(rowId));
-    }
-  }
-
-  private class DoubleAccessor extends ArrowVectorAccessor {
-
-    private final Float8Vector vector;
-
-    DoubleAccessor(Float8Vector vector) {
-      super(vector);
-      this.vector = vector;
-    }
-
-    @Override
-    final double getDouble(int rowId) {
-      return vector.get(rowId);
-    }
-  }
-
-  private class DictionaryDoubleAccessor extends ArrowVectorAccessor {
-
-    private final IntVector vector;
-
-    DictionaryDoubleAccessor(IntVector vector) {
-      super(vector);
-      this.vector = vector;
-    }
-
-    @Override
-    final double getDouble(int rowId) {
-      return dictionary.decodeToDouble(vector.get(rowId));
-    }
-  }
-
-  private class DecimalAccessor extends ArrowVectorAccessor {
-
-    private final IcebergArrowVectors.DecimalArrowVector vector;
-
-    DecimalAccessor(IcebergArrowVectors.DecimalArrowVector vector) {
-      super(vector);
-      this.vector = vector;
-    }
-
-    @Override
-    final Decimal getDecimal(int rowId, int precision, int scale) {
-      if (isNullAt(rowId)) {
-        return null;
-      }
-      return Decimal.apply(vector.getObject(rowId), precision, scale);
-    }
-  }
-
-  private class StringAccessor extends ArrowVectorAccessor {
-
-    private final IcebergArrowVectors.VarcharArrowVector vector;
-    private final NullableVarCharHolder stringResult = new NullableVarCharHolder();
-
-    StringAccessor(IcebergArrowVectors.VarcharArrowVector vector) {
-      super(vector);
-      this.vector = vector;
-    }
-
-    @Override
-    final UTF8String getUTF8String(int rowId) {
-      vector.get(rowId, stringResult);
-      if (stringResult.isSet == 0) {
-        return null;
-      } else {
-        return UTF8String.fromAddress(
-            null,
-            stringResult.buffer.memoryAddress() + stringResult.start,
-            stringResult.end - stringResult.start);
-      }
-    }
-  }
-
-  private class DictionaryStringAccessor extends ArrowVectorAccessor {
-
-    private final IntVector vector;
-
-    DictionaryStringAccessor(IntVector vector) {
-      super(vector);
-      this.vector = vector;
-    }
-
-    @Override
-    final UTF8String getUTF8String(int rowId) {
-      if (isNullAt(rowId)) {
-        return null;
-      }
-      Binary binary = dictionary.decodeToBinary(vector.get(rowId));
-      return UTF8String.fromBytes(binary.getBytesUnsafe());
-    }
-  }
-
-  private class FixedSizeBinaryAccessor extends ArrowVectorAccessor {
-
-    private final FixedSizeBinaryVector vector;
-
-    FixedSizeBinaryAccessor(FixedSizeBinaryVector vector) {
-      super(vector);
-      this.vector = vector;
-    }
-
-    @Override
-    final byte[] getBinary(int rowId) {
-      return vector.getObject(rowId);
-    }
-  }
-
-  private class BinaryAccessor extends ArrowVectorAccessor {
-
-    private final VarBinaryVector vector;
-
-    BinaryAccessor(VarBinaryVector vector) {
-      super(vector);
-      this.vector = vector;
-    }
-
-    @Override
-    final byte[] getBinary(int rowId) {
-      return vector.getObject(rowId);
-    }
-  }
-
-  private class DictionaryBinaryAccessor extends ArrowVectorAccessor {
-
-    private final IntVector vector;
-
-    DictionaryBinaryAccessor(IntVector vector) {
-      super(vector);
-      this.vector = vector;
-    }
-
-    @Override
-    final byte[] getBinary(int rowId) {
-      Binary binary = dictionary.decodeToBinary(vector.get(rowId));
-      return binary.getBytesUnsafe();
-    }
-  }
-
-  private class DateAccessor extends ArrowVectorAccessor {
-
-    private final DateDayVector vector;
-
-    DateAccessor(DateDayVector vector) {
-      super(vector);
-      this.vector = vector;
-    }
-
-    @Override
-    final int getInt(int rowId) {
-      return vector.get(rowId);
-    }
-  }
-
-  private class DictionaryDateAccessor extends DictionaryIntAccessor {
-    DictionaryDateAccessor(IntVector vector) {
-      super(vector);
-    }
-  }
-
-  private class TimestampAccessor extends ArrowVectorAccessor {
-
-    private final TimeStampMicroTZVector vector;
-
-    TimestampAccessor(TimeStampMicroTZVector vector) {
-      super(vector);
-      this.vector = vector;
-    }
-
-    @Override
-    final long getLong(int rowId) {
-      return vector.get(rowId);
-    }
-  }
-
-  private class DictionaryTimestampAccessor extends DictionaryLongAccessor {
-    DictionaryTimestampAccessor(IntVector vector) {
-      super(vector);
-    }
-  }
-
-  private class ArrayAccessor extends ArrowVectorAccessor {
-
-    private final ListVector vector;
-    private final ArrowColumnVector arrayData;
-
-    ArrayAccessor(ListVector vector) {
-      super(vector);
-      this.vector = vector;
-      this.arrayData = new ArrowColumnVector(vector.getDataVector());
-    }
-
-    @Override
-    final ColumnarArray getArray(int rowId) {
-      ArrowBuf offsets = vector.getOffsetBuffer();
-      int index = rowId * ListVector.OFFSET_WIDTH;
-      int start = offsets.getInt(index);
-      int end = offsets.getInt(index + ListVector.OFFSET_WIDTH);
-      return new ColumnarArray(arrayData, start, end - start);
-    }
-  }
-
-  /**
-   * Any call to "get" method will throw UnsupportedOperationException.
-   * <p>
-   * Access struct values in a ArrowColumnVector doesn't use this vector. Instead, it uses getStruct() method defined in
-   * the parent class. Any call to "get" method in this class is a bug in the code.
-   */
-  private class StructAccessor extends ArrowVectorAccessor {
-
-    StructAccessor(StructVector vector) {
-      super(vector);
-    }
-  }
-
-  private class DictionaryDecimalBinaryAccessor extends ArrowVectorAccessor {
-    private final IntVector vector;
-
-    DictionaryDecimalBinaryAccessor(IntVector vector, int precision, int scale) {
-      super(vector);
-      this.vector = vector;
-    }
-
-    //TODO: still need to evaluate if this is the most efficient way
-    @Override
-    final Decimal getDecimal(int rowId, int precision, int scale) {
-      if (isNullAt(rowId)) {
-        return null;
-      }
-      Binary value = dictionary.decodeToBinary(vector.get(rowId));
-      BigInteger unscaledValue = new BigInteger(value.getBytesUnsafe());
-      return Decimal.apply(unscaledValue.longValue(), precision, scale);
-    }
-  }
-
-  private class DictionaryDecimalLongAccessor extends ArrowVectorAccessor {
-    private final IntVector vector;
-
-    DictionaryDecimalLongAccessor(IntVector vector, int precision, int scale) {
-      super(vector);
-      this.vector = vector;
-    }
-
-    //TODO: still need to evaluate if this is the most efficient way
-    @Override
-    final Decimal getDecimal(int rowId, int precision, int scale) {
-      if (isNullAt(rowId)) {
-        return null;
-      }
-      long unscaledValue = dictionary.decodeToLong(vector.get(rowId));
-      return Decimal.apply(unscaledValue, precision, scale);
-    }
-  }
-
-  private class DictionaryDecimalIntAccessor extends ArrowVectorAccessor {
-    private final IntVector vector;
-
-    DictionaryDecimalIntAccessor(IntVector vector, int precision, int scale) {
-      super(vector);
-      this.vector = vector;
-    }
-
-    @Override
-    final Decimal getDecimal(int rowId, int precision, int scale) {
-      if (isNullAt(rowId)) {
-        return null;
-      }
-      int unscaledValue = dictionary.decodeToInt(vector.get(rowId));
-      return Decimal.apply(unscaledValue, precision, scale);
-    }
+  static ColumnVector forHolder(VectorHolder holder, int numRows) {
+    return holder.isDummy() ? new NullValuesColumnVector(numRows) :
+        new IcebergArrowColumnVector(holder);
   }
 }
diff --git a/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/NullValuesColumnVector.java b/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/NullValuesColumnVector.java
index 933fd8c00927..8770d13ab883 100644
--- a/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/NullValuesColumnVector.java
+++ b/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/NullValuesColumnVector.java
@@ -19,10 +19,9 @@
 
 package org.apache.iceberg.spark.data.vectorized;
 
-import org.apache.arrow.vector.types.pojo.ArrowType;
-import org.apache.arrow.vector.types.pojo.Field;
-import org.apache.arrow.vector.types.pojo.FieldType;
-import org.apache.iceberg.spark.arrow.ArrowUtils;
+import org.apache.iceberg.spark.SparkSchemaUtil;
+import org.apache.iceberg.types.Type;
+import org.apache.iceberg.types.Types;
 import org.apache.spark.sql.types.Decimal;
 import org.apache.spark.sql.vectorized.ColumnVector;
 import org.apache.spark.sql.vectorized.ColumnarArray;
@@ -32,14 +31,10 @@
 public class NullValuesColumnVector extends ColumnVector {
 
   private final int numNulls;
-  private static final String NULL_FIELD_NAME = "NULL_FIELD";
-  private static final Field NULL_ARROW_FIELD = new Field(
-      NULL_FIELD_NAME,
-      new FieldType(true, new ArrowType.Int(Integer.SIZE, true), null, null),
-      null);
+  private static final Type NULL_TYPE = Types.IntegerType.get();
 
   public NullValuesColumnVector(int nValues) {
-    super(ArrowUtils.instance().fromArrowField(NULL_ARROW_FIELD));
+    super(SparkSchemaUtil.convert(NULL_TYPE));
     this.numNulls = nValues;
   }
 
diff --git a/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/VectorizedSparkParquetReaders.java b/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/VectorizedSparkParquetReaders.java
index 61fe6d664fd5..cbb1fb864175 100644
--- a/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/VectorizedSparkParquetReaders.java
+++ b/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/VectorizedSparkParquetReaders.java
@@ -30,7 +30,7 @@
 import org.apache.iceberg.arrow.vectorized.VectorizedArrowReader;
 import org.apache.iceberg.parquet.TypeWithSchemaVisitor;
 import org.apache.iceberg.parquet.VectorizedReader;
-import org.apache.iceberg.spark.arrow.ArrowUtils;
+import org.apache.iceberg.spark.arrow.ArrowAllocation;
 import org.apache.iceberg.types.Types;
 import org.apache.parquet.column.ColumnDescriptor;
 import org.apache.parquet.schema.GroupType;
@@ -68,7 +68,7 @@ private static class VectorizedReaderBuilder extends TypeWithSchemaVisitor<Vecto
       this.parquetSchema = parquetSchema;
       this.tableIcebergSchema = tableSchema;
       this.batchSize = bSize;
-      this.rootAllocator = ArrowUtils.instance().rootAllocator()
+      this.rootAllocator = ArrowAllocation.rootAllocator()
           .newChildAllocator("VectorizedReadBuilder", 0, Long.MAX_VALUE);
     }
 
@@ -105,10 +105,10 @@ public VectorizedReader struct(
         if (reader != null) {
           reorderedFields.add(reader);
         } else {
-          reorderedFields.add(VectorizedArrowReader.NULL_VALUES_READER);
+          reorderedFields.add(VectorizedArrowReader.nulls());
         }
       }
-      return new ColumnarBatchReaders(reorderedFields, batchSize);
+      return new ColumnarBatchReaders(reorderedFields);
     }
 
     @Override
diff --git a/spark/src/main/java/org/apache/iceberg/spark/source/ColumnarBatchTaskDataReader.java b/spark/src/main/java/org/apache/iceberg/spark/source/BatchTaskDataReader.java
similarity index 97%
rename from spark/src/main/java/org/apache/iceberg/spark/source/ColumnarBatchTaskDataReader.java
rename to spark/src/main/java/org/apache/iceberg/spark/source/BatchTaskDataReader.java
index aa470759cb43..187afcb0672e 100644
--- a/spark/src/main/java/org/apache/iceberg/spark/source/ColumnarBatchTaskDataReader.java
+++ b/spark/src/main/java/org/apache/iceberg/spark/source/BatchTaskDataReader.java
@@ -36,10 +36,10 @@
 import org.apache.spark.sql.types.StructType;
 import org.apache.spark.sql.vectorized.ColumnarBatch;
 
-class ColumnarBatchTaskDataReader extends BaseTaskDataReader<ColumnarBatch>
+class BatchTaskDataReader extends BaseTaskDataReader<ColumnarBatch>
     implements InputPartitionReader<ColumnarBatch> {
 
-  ColumnarBatchTaskDataReader(
+  BatchTaskDataReader(
       CombinedScanTask task, Schema tableSchema, Schema expectedSchema, FileIO fileIo,
       EncryptionManager encryptionManager, boolean caseSensitive, int bSize) {
     super(task, tableSchema, expectedSchema, fileIo, encryptionManager, caseSensitive, bSize);
diff --git a/spark/src/main/java/org/apache/iceberg/spark/source/InternalRowTaskDataReader.java b/spark/src/main/java/org/apache/iceberg/spark/source/RowTaskDataReader.java
similarity index 98%
rename from spark/src/main/java/org/apache/iceberg/spark/source/InternalRowTaskDataReader.java
rename to spark/src/main/java/org/apache/iceberg/spark/source/RowTaskDataReader.java
index 65563c16bc5c..275d7c5f3e84 100644
--- a/spark/src/main/java/org/apache/iceberg/spark/source/InternalRowTaskDataReader.java
+++ b/spark/src/main/java/org/apache/iceberg/spark/source/RowTaskDataReader.java
@@ -67,9 +67,9 @@
 import org.apache.spark.unsafe.types.UTF8String;
 import scala.collection.JavaConverters;
 
-class InternalRowTaskDataReader extends BaseTaskDataReader<InternalRow> implements InputPartitionReader<InternalRow> {
+class RowTaskDataReader extends BaseTaskDataReader<InternalRow> implements InputPartitionReader<InternalRow> {
 
-  InternalRowTaskDataReader(
+  RowTaskDataReader(
       CombinedScanTask task, Schema tableSchema, Schema expectedSchema, FileIO fileIo,
       EncryptionManager encryptionManager, boolean caseSensitive) {
     super(task, tableSchema, expectedSchema, fileIo, encryptionManager, caseSensitive);
diff --git a/spark/src/test/java/org/apache/iceberg/spark/data/DictionaryData.java b/spark/src/test/java/org/apache/iceberg/spark/data/DictionaryData.java
deleted file mode 100644
index 5451e55b7ff3..000000000000
--- a/spark/src/test/java/org/apache/iceberg/spark/data/DictionaryData.java
+++ /dev/null
@@ -1,297 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.iceberg.spark.data;
-
-import com.google.common.collect.Lists;
-import com.google.common.collect.Maps;
-import com.google.common.collect.Sets;
-import java.math.BigDecimal;
-import java.math.BigInteger;
-import java.nio.ByteBuffer;
-import java.util.List;
-import java.util.Map;
-import java.util.Random;
-import java.util.Set;
-import java.util.UUID;
-import java.util.function.Supplier;
-import org.apache.avro.generic.GenericData;
-import org.apache.iceberg.Schema;
-import org.apache.iceberg.avro.AvroSchemaUtil;
-import org.apache.iceberg.types.Type;
-import org.apache.iceberg.types.TypeUtil;
-import org.apache.iceberg.types.Types;
-import org.apache.spark.sql.types.Decimal;
-import org.apache.spark.unsafe.types.UTF8String;
-
-
-public class DictionaryData {
-
-  private DictionaryData() {}
-
-  public static List<GenericData.Record> generateDictionaryEncodableData(Schema schema, int numRecords, long seed) {
-    List<GenericData.Record> records = Lists.newArrayListWithExpectedSize(numRecords);
-    DictionaryDataGenerator dictionaryDataGenerator = new DictionaryDataGenerator(schema, seed);
-    for (int i = 0; i < numRecords; i += 1) {
-      GenericData.Record rec = (GenericData.Record) TypeUtil.visit(schema, dictionaryDataGenerator);
-      records.add(rec);
-    }
-    return records;
-  }
-
-  private static class DictionaryDataGenerator extends TypeUtil.CustomOrderSchemaVisitor<Object> {
-    private final Map<Type, org.apache.avro.Schema> typeToSchema;
-    private final Random random;
-
-    private DictionaryDataGenerator(Schema schema, long seed) {
-      this.typeToSchema = AvroSchemaUtil.convertTypes(schema.asStruct(), "test");
-      this.random = new Random(seed);
-    }
-
-    @Override
-    public GenericData.Record schema(Schema schema, Supplier<Object> structResult) {
-      return (GenericData.Record) structResult.get();
-    }
-
-    @Override
-    public GenericData.Record struct(Types.StructType struct, Iterable<Object> fieldResults) {
-      GenericData.Record rec = new GenericData.Record(typeToSchema.get(struct));
-
-      List<Object> values = Lists.newArrayList(fieldResults);
-      for (int i = 0; i < values.size(); i += 1) {
-        rec.put(i, values.get(i));
-      }
-
-      return rec;
-    }
-
-    @Override
-    public Object field(Types.NestedField field, Supplier<Object> fieldResult) {
-      // return null 5% of the time when the value is optional
-      if (field.isOptional() && random.nextInt(20) == 1) {
-        return null;
-      }
-      return fieldResult.get();
-    }
-
-    @Override
-    public Object list(Types.ListType list, Supplier<Object> elementResult) {
-      int numElements = random.nextInt(20);
-
-      List<Object> result = Lists.newArrayListWithExpectedSize(numElements);
-      for (int i = 0; i < numElements; i += 1) {
-        // return null 5% of the time when the value is optional
-        if (list.isElementOptional() && random.nextInt(20) == 1) {
-          result.add(null);
-        } else {
-          result.add(elementResult.get());
-        }
-      }
-
-      return result;
-    }
-
-    @Override
-    public Object map(Types.MapType map, Supplier<Object> keyResult, Supplier<Object> valueResult) {
-      int numEntries = random.nextInt(20);
-
-      Map<Object, Object> result = Maps.newLinkedHashMap();
-      Set<Object> keySet = Sets.newHashSet();
-      for (int i = 0; i < numEntries; i += 1) {
-        Object key = keyResult.get();
-        // ensure no collisions
-        while (keySet.contains(key)) {
-          key = keyResult.get();
-        }
-
-        keySet.add(key);
-
-        // return null 5% of the time when the value is optional
-        if (map.isValueOptional() && random.nextInt(20) == 1) {
-          result.put(key, null);
-        } else {
-          result.put(key, valueResult.get());
-        }
-      }
-
-      return result;
-    }
-
-    @Override
-    public Object primitive(Type.PrimitiveType primitive) {
-      Object result = generatePrimitive(primitive, random);
-      // For the primitives that Avro needs a different type than Spark, fix
-      // them here.
-      switch (primitive.typeId()) {
-        case STRING:
-          return ((UTF8String) result).toString();
-        case FIXED:
-          return new GenericData.Fixed(
-              typeToSchema.get(primitive),
-              (byte[]) result);
-        case BINARY:
-          return ByteBuffer.wrap((byte[]) result);
-        case UUID:
-          return UUID.nameUUIDFromBytes((byte[]) result);
-        case DECIMAL:
-          return ((Decimal) result).toJavaBigDecimal();
-        default:
-          return result;
-      }
-    }
-  }
-
-  @SuppressWarnings("checkstyle:CyclomaticComplexity")
-  private static Object generatePrimitive(
-      Type.PrimitiveType primitive,
-      Random random) {
-    // 3 choices
-    int choice = random.nextInt(3);
-    switch (primitive.typeId()) {
-      case BOOLEAN:
-        return true; // doesn't really matter for booleans since they are not dictionary encoded
-
-      case INTEGER:
-        switch (choice) {
-          case 0:
-            return 0;
-          case 1:
-            return 1;
-          case 2:
-            return 2;
-        }
-
-      case LONG:
-        switch (choice) {
-          case 0:
-            return 0L;
-          case 1:
-            return 1L;
-          case 2:
-            return 2L;
-        }
-
-      case FLOAT:
-        switch (choice) {
-          case 0:
-            return 0.0f;
-          case 1:
-            return 1.0f;
-          case 2:
-            return 2.0f;
-        }
-
-      case DOUBLE:
-        switch (choice) {
-          case 0:
-            return 0.0d;
-          case 1:
-            return 1.0d;
-          case 2:
-            return 2.0d;
-        }
-
-      case DATE:
-        switch (choice) {
-          case 0:
-            return 0;
-          case 1:
-            return 1;
-          case 2:
-            return 2;
-        }
-
-      case TIME:
-        switch (choice) {
-          case 0:
-            return 0L;
-          case 1:
-            return 1L;
-          case 2:
-            return 2L;
-        }
-
-      case TIMESTAMP:
-        switch (choice) {
-          case 0:
-            return 0L;
-          case 1:
-            return 1L;
-          case 2:
-            return 2L;
-        }
-
-      case STRING:
-        switch (choice) {
-          case 0:
-            return UTF8String.fromString("0");
-          case 1:
-            return UTF8String.fromString("1");
-          case 2:
-            return UTF8String.fromString("2");
-        }
-
-      case FIXED:
-        byte[] fixed = new byte[((Types.FixedType) primitive).length()];
-        switch (choice) {
-          case 0:
-            fixed[0] = 0;
-            return fixed;
-          case 1:
-            fixed[0] = 1;
-            return fixed;
-          case 2:
-            fixed[0] = 2;
-            return fixed;
-        }
-
-      case BINARY:
-        byte[] binary = new byte[4];
-        switch (choice) {
-          case 0:
-            binary[0] = 0;
-            return binary;
-          case 1:
-            binary[0] = 1;
-            return binary;
-          case 2:
-            binary[0] = 2;
-            return binary;
-        }
-
-      case DECIMAL:
-        Types.DecimalType type = (Types.DecimalType) primitive;
-        switch (choice) {
-          case 0:
-            BigInteger unscaled = new BigInteger("1");
-            return Decimal.apply(new BigDecimal(unscaled, type.scale()));
-          case 1:
-            unscaled = new BigInteger("2");
-            return Decimal.apply(new BigDecimal(unscaled, type.scale()));
-          case 2:
-            unscaled = new BigInteger("3");
-            return Decimal.apply(new BigDecimal(unscaled, type.scale()));
-        }
-
-      default:
-        throw new IllegalArgumentException(
-            "Cannot generate random value for unknown type: " + primitive);
-    }
-  }
-}
diff --git a/spark/src/test/java/org/apache/iceberg/spark/data/RandomData.java b/spark/src/test/java/org/apache/iceberg/spark/data/RandomData.java
index e6517619514a..fc7fcbd5d885 100644
--- a/spark/src/test/java/org/apache/iceberg/spark/data/RandomData.java
+++ b/spark/src/test/java/org/apache/iceberg/spark/data/RandomData.java
@@ -119,9 +119,21 @@ public static List<Record> generateListWithFallBackDictionaryEncodingForStrings(
     return records;
   }
 
+  public static List<GenericData.Record> generateDictionaryEncodableData(Schema schema, int numRecords, long seed) {
+    List<GenericData.Record> records = Lists.newArrayListWithExpectedSize(numRecords);
+    DictionaryEncodedDataGenerator
+        dictionaryDataGenerator = new DictionaryEncodedDataGenerator(schema, seed);
+    for (int i = 0; i < numRecords; i += 1) {
+      GenericData.Record rec = (GenericData.Record) TypeUtil.visit(schema, dictionaryDataGenerator);
+      records.add(rec);
+    }
+    return records;
+  }
+
+  @SuppressWarnings("checkstyle:VisibilityModifier")
   private static class RandomDataGenerator extends TypeUtil.CustomOrderSchemaVisitor<Object> {
     private final Map<Type, org.apache.avro.Schema> typeToSchema;
-    private final Random random;
+    final Random random;
 
     private RandomDataGenerator(Schema schema, long seed) {
       this.typeToSchema = AvroSchemaUtil.convertTypes(schema.asStruct(), "test");
@@ -202,6 +214,10 @@ public Object primitive(Type.PrimitiveType primitive) {
       Object result = RandomUtil.generatePrimitive(primitive, random);
       // For the primitives that Avro needs a different type than Spark, fix
       // them here.
+      return getPrimitive(primitive, result);
+    }
+
+    Object getPrimitive(Type.PrimitiveType primitive, Object result) {
       switch (primitive.typeId()) {
         case FIXED:
           return new GenericData.Fixed(typeToSchema.get(primitive),
@@ -312,34 +328,6 @@ public Object primitive(Type.PrimitiveType primitive) {
     }
   }
 
-  private static class FallbackDictionaryEncodedDataGenerator extends RandomDataGenerator {
-
-    private final int numRecords;
-    private final float fraction;
-    private int current;
-
-    private FallbackDictionaryEncodedDataGenerator(Schema schema, long seed, int numRecords, float fraction) {
-      super(schema, seed);
-      this.numRecords = numRecords;
-      this.fraction = fraction;
-    }
-
-    @Override
-    public Object primitive(Type.PrimitiveType primitive) {
-      switch (primitive.typeId()) {
-        case STRING:
-          if (current < fraction * numRecords) {
-            current++;
-            return "ABC";
-          } else {
-            current++;
-            return super.primitive(primitive);
-          }
-      }
-      return super.primitive(primitive);
-    }
-  }
-
   @SuppressWarnings("RandomModInteger")
   private static Object generatePrimitive(Type.PrimitiveType primitive,
                                          Random random) {
@@ -488,4 +476,181 @@ private static BigInteger randomUnscaled(int precision, Random random) {
       sb.append(DIGITS.charAt(random.nextInt(DIGITS.length())));
     }
   }
+
+  private static class DictionaryEncodedDataGenerator extends RandomDataGenerator {
+
+    private DictionaryEncodedDataGenerator(Schema schema, long seed) {
+      super(schema, seed);
+    }
+
+    @Override
+    public Object primitive(Type.PrimitiveType primitive) {
+      Object result = generateDictionaryEncodablePrimitive(primitive, random);
+      return super.getPrimitive(primitive, result);
+    }
+
+    @SuppressWarnings("checkstyle:CyclomaticComplexity")
+    private static Object generateDictionaryEncodablePrimitive(Type.PrimitiveType primitive, Random random) {
+      // 3 choices
+      int choice = random.nextInt(3);
+      switch (primitive.typeId()) {
+        case BOOLEAN:
+          return true; // doesn't really matter for booleans since they are not dictionary encoded
+
+        case INTEGER:
+          switch (choice) {
+            case 0:
+              return 0;
+            case 1:
+              return 1;
+            case 2:
+              return 2;
+          }
+
+        case LONG:
+          switch (choice) {
+            case 0:
+              return 0L;
+            case 1:
+              return 1L;
+            case 2:
+              return 2L;
+          }
+
+        case FLOAT:
+          switch (choice) {
+            case 0:
+              return 0.0f;
+            case 1:
+              return 1.0f;
+            case 2:
+              return 2.0f;
+          }
+
+        case DOUBLE:
+          switch (choice) {
+            case 0:
+              return 0.0d;
+            case 1:
+              return 1.0d;
+            case 2:
+              return 2.0d;
+          }
+
+        case DATE:
+          switch (choice) {
+            case 0:
+              return 0;
+            case 1:
+              return 1;
+            case 2:
+              return 2;
+          }
+
+        case TIME:
+          switch (choice) {
+            case 0:
+              return 0L;
+            case 1:
+              return 1L;
+            case 2:
+              return 2L;
+          }
+
+        case TIMESTAMP:
+          switch (choice) {
+            case 0:
+              return 0L;
+            case 1:
+              return 1L;
+            case 2:
+              return 2L;
+          }
+
+        case STRING:
+          switch (choice) {
+            case 0:
+              return UTF8String.fromString("0");
+            case 1:
+              return UTF8String.fromString("1");
+            case 2:
+              return UTF8String.fromString("2");
+          }
+
+        case FIXED:
+          byte[] fixed = new byte[((Types.FixedType) primitive).length()];
+          switch (choice) {
+            case 0:
+              fixed[0] = 0;
+              return fixed;
+            case 1:
+              fixed[0] = 1;
+              return fixed;
+            case 2:
+              fixed[0] = 2;
+              return fixed;
+          }
+
+        case BINARY:
+          byte[] binary = new byte[4];
+          switch (choice) {
+            case 0:
+              binary[0] = 0;
+              return binary;
+            case 1:
+              binary[0] = 1;
+              return binary;
+            case 2:
+              binary[0] = 2;
+              return binary;
+          }
+
+        case DECIMAL:
+          Types.DecimalType type = (Types.DecimalType) primitive;
+          switch (choice) {
+            case 0:
+              BigInteger unscaled = new BigInteger("1");
+              return Decimal.apply(new BigDecimal(unscaled, type.scale()));
+            case 1:
+              unscaled = new BigInteger("2");
+              return Decimal.apply(new BigDecimal(unscaled, type.scale()));
+            case 2:
+              unscaled = new BigInteger("3");
+              return Decimal.apply(new BigDecimal(unscaled, type.scale()));
+          }
+
+        default:
+          throw new IllegalArgumentException(
+              "Cannot generate random value for unknown type: " + primitive);
+      }
+    }
+  }
+
+  private static class FallbackDictionaryEncodedDataGenerator extends RandomDataGenerator {
+
+    private final int numRecords;
+    private final float fraction;
+    private int current;
+
+    private FallbackDictionaryEncodedDataGenerator(Schema schema, long seed, int numRecords, float fraction) {
+      super(schema, seed);
+      this.numRecords = numRecords;
+      this.fraction = fraction;
+    }
+
+    @Override
+    public Object primitive(Type.PrimitiveType primitive) {
+      switch (primitive.typeId()) {
+        case STRING:
+          if (current < fraction * numRecords) {
+            current++;
+            return "ABC";
+          } else {
+            current++;
+            return super.primitive(primitive);
+          }
+      }
+      return super.primitive(primitive);
+    }
+  }
 }
diff --git a/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetDictionaryEncodedVectorizedReader.java b/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetDictionaryEncodedVectorizedReader.java
index 20b0a2f908d7..235afc7ca698 100644
--- a/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetDictionaryEncodedVectorizedReader.java
+++ b/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetDictionaryEncodedVectorizedReader.java
@@ -19,40 +19,13 @@
 
 package org.apache.iceberg.spark.data;
 
-import java.io.File;
-import java.io.IOException;
 import java.util.List;
 import org.apache.avro.generic.GenericData;
-import org.apache.iceberg.Files;
 import org.apache.iceberg.Schema;
-import org.apache.iceberg.io.FileAppender;
-import org.apache.iceberg.parquet.Parquet;
-import org.apache.iceberg.types.TypeUtil;
-import org.apache.iceberg.types.Types;
-import org.junit.Assert;
-import org.junit.Assume;
 
 public class TestSparkParquetDictionaryEncodedVectorizedReader extends TestSparkParquetVectorizedReader {
-
   @Override
-  protected void writeAndValidate(Schema schema) throws IOException {
-    // Write test data
-    Assume.assumeTrue("Parquet Avro cannot write non-string map keys", null == TypeUtil.find(
-        schema,
-        type -> type.isMapType() && type.asMapType().keyType() != Types.StringType.get()));
-
-    List<GenericData.Record> expected = DictionaryData.generateDictionaryEncodableData(schema, 100000, 0L);
-
-    // write a test parquet file using iceberg writer
-    File testFile = temp.newFile();
-    Assert.assertTrue("Delete should succeed", testFile.delete());
-
-    try (FileAppender<GenericData.Record> writer = Parquet.write(Files.localOutput(testFile))
-        .schema(schema)
-        .named("test")
-        .build()) {
-      writer.addAll(expected);
-    }
-    assertRecordsMatch(schema, expected, testFile);
+  List<GenericData.Record> generateData(Schema schema) {
+    return RandomData.generateDictionaryEncodableData(schema, 100000, 0L);
   }
 }
diff --git a/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetFallbackToDictionaryEncodingForVectorizedReader.java b/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetFallbackToDictionaryEncodingForVectorizedReader.java
index 76cb1d877d5b..1b00be85ec83 100644
--- a/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetFallbackToDictionaryEncodingForVectorizedReader.java
+++ b/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetFallbackToDictionaryEncodingForVectorizedReader.java
@@ -19,41 +19,13 @@
 
 package org.apache.iceberg.spark.data;
 
-import java.io.File;
-import java.io.IOException;
 import java.util.List;
 import org.apache.avro.generic.GenericData;
-import org.apache.iceberg.Files;
 import org.apache.iceberg.Schema;
-import org.apache.iceberg.io.FileAppender;
-import org.apache.iceberg.parquet.Parquet;
-import org.apache.iceberg.types.TypeUtil;
-import org.apache.iceberg.types.Types;
-import org.junit.Assert;
-import org.junit.Assume;
 
 public class TestSparkParquetFallbackToDictionaryEncodingForVectorizedReader extends TestSparkParquetVectorizedReader {
-
   @Override
-  protected void writeAndValidate(Schema schema) throws IOException {
-    // Write test data
-    Assume.assumeTrue("Parquet Avro cannot write non-string map keys", null == TypeUtil.find(
-        schema,
-        type -> type.isMapType() && type.asMapType().keyType() != Types.StringType.get()));
-
-    List<GenericData.Record> expected =
-        RandomData.generateListWithFallBackDictionaryEncodingForStrings(schema, 100000, 0L, 0.5f);
-
-    // write a test parquet file using iceberg writer
-    File testFile = temp.newFile();
-    Assert.assertTrue("Delete should succeed", testFile.delete());
-
-    try (FileAppender<GenericData.Record> writer = Parquet.write(Files.localOutput(testFile))
-        .schema(schema)
-        .named("test")
-        .build()) {
-      writer.addAll(expected);
-    }
-    assertRecordsMatch(schema, expected, testFile);
+  public List<GenericData.Record> generateData(Schema schema) {
+    return RandomData.generateListWithFallBackDictionaryEncodingForStrings(schema, 100000, 0L, 0.5f);
   }
 }
diff --git a/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetVectorizedReader.java b/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetVectorizedReader.java
index 55ccbcd8ee8d..146f173f0be3 100644
--- a/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetVectorizedReader.java
+++ b/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetVectorizedReader.java
@@ -36,15 +36,15 @@
 import org.apache.spark.sql.vectorized.ColumnarBatch;
 import org.junit.Assert;
 import org.junit.Assume;
-import org.junit.Before;
+import org.junit.BeforeClass;
+import org.junit.Ignore;
 import org.junit.Test;
 
 public class TestSparkParquetVectorizedReader extends AvroDataTest {
 
-  @Before
-  public void setupArrowFlags() {
-    System.setProperty("arrow.enable_unsafe_memory_access", "true");
-    System.setProperty("arrow.enable_null_check_for_get", "false");
+  @BeforeClass
+  public static void beforeClass() {
+    TestHelpers.setArrowFlagsForVectorizedReads();
   }
 
   @Override
@@ -54,7 +54,7 @@ protected void writeAndValidate(Schema schema) throws IOException {
         schema,
         type -> type.isMapType() && type.asMapType().keyType() != Types.StringType.get()));
 
-    List<GenericData.Record> expected = RandomData.generateList(schema, 100000, 0L);
+    List<GenericData.Record> expected = generateData(schema);
 
     // write a test parquet file using iceberg writer
     File testFile = temp.newFile();
@@ -69,6 +69,10 @@ protected void writeAndValidate(Schema schema) throws IOException {
     assertRecordsMatch(schema, expected, testFile);
   }
 
+  List<GenericData.Record> generateData(Schema schema) { // hook: subclasses override this to supply other data
+    return RandomData.generateList(schema, 100000, 0L);
+  }
+
   void assertRecordsMatch(Schema schema, List<GenericData.Record> expected, File testFile) throws IOException {
     try (CloseableIterable<ColumnarBatch> batchReader = Parquet.read(Files.localInput(testFile))
         .project(schema)
@@ -79,56 +83,53 @@ void assertRecordsMatch(Schema schema, List<GenericData.Record> expected, File t
       Iterator<ColumnarBatch> batches = batchReader.iterator();
       int numRowsRead = 0;
       int numExpectedRead = 0;
-      int batchNum = 0;
       while (batches.hasNext()) {
-
         ColumnarBatch batch = batches.next();
         numRowsRead += batch.numRows();
-
         List<GenericData.Record> expectedBatch = new ArrayList<>(batch.numRows());
         for (int i = numExpectedRead; i < numExpectedRead + batch.numRows(); i++) {
           expectedBatch.add(expected.get(i));
         }
         TestHelpers.assertArrowVectors(schema.asStruct(), expectedBatch, batch);
         numExpectedRead += batch.numRows();
-        batchNum++;
       }
       Assert.assertEquals(expected.size(), numRowsRead);
     }
   }
 
   @Test
-  public void testArray() throws IOException {
-    System.out.println("Not Supported");
+  @Ignore
+  public void testArray() {
   }
 
   @Test
-  public void testArrayOfStructs() throws IOException {
+  @Ignore
+  public void testArrayOfStructs() {
     System.out.println("Not Supported");
   }
 
   @Test
-  public void testMap() throws IOException {
-    System.out.println("Not Supported");
+  @Ignore
+  public void testMap() {
   }
 
   @Test
-  public void testNumericMapKey() throws IOException {
-    System.out.println("Not Supported");
+  @Ignore
+  public void testNumericMapKey() {
   }
 
   @Test
-  public void testComplexMapKey() throws IOException {
-    System.out.println("Not Supported");
+  @Ignore
+  public void testComplexMapKey() {
   }
 
   @Test
-  public void testMapOfStructs() throws IOException {
-    System.out.println("Not Supported");
+  @Ignore
+  public void testMapOfStructs() {
   }
 
   @Test
-  public void testMixedTypes() throws IOException {
-    System.out.println("Not Supported");
+  @Ignore
+  public void testMixedTypes() {
   }
 }
diff --git a/spark/src/test/java/org/apache/iceberg/spark/source/TestReadProjection.java b/spark/src/test/java/org/apache/iceberg/spark/source/TestReadProjection.java
index 43d6044564d8..13bdd79a1cbd 100644
--- a/spark/src/test/java/org/apache/iceberg/spark/source/TestReadProjection.java
+++ b/spark/src/test/java/org/apache/iceberg/spark/source/TestReadProjection.java
@@ -57,7 +57,7 @@ protected abstract Record writeAndRead(String desc,
   public TemporaryFolder temp = new TemporaryFolder();
 
   @BeforeClass
-  public static void setup() {
+  public static void beforeClass() {
     TestHelpers.setArrowFlagsForVectorizedReads();
   }
 

From dc93427018ae3a7256ed20c203466885d61713fb Mon Sep 17 00:00:00 2001
From: samarthjain <samarth@apache.org>
Date: Wed, 18 Mar 2020 14:00:20 -0700
Subject: [PATCH 04/12] Remove benchmarks from the code review

---
 .../VectorizedDictionaryEncodedBenchmark.java |  28 ---
 ...rizedDictionaryEncodedFloatsBenchmark.java |  62 -----
 ...zedDictionaryEncodedIntegersBenchmark.java |  62 -----
 ...orizedDictionaryEncodedLongsBenchmark.java |  62 -----
 ...izedDictionaryEncodedStringsBenchmark.java | 101 --------
 ...llbackToPlainEncodingStringsBenchmark.java |  93 --------
 .../VectorizedIcebergSourceBenchmark.java     | 223 ------------------
 .../VectorizedReadFloatsBenchmark.java        |  60 -----
 ...dReadFloatsTwentyPercentNullBenchmark.java |  42 ----
 ...torizedReadIntBackedDecimalsBenchmark.java |  59 -----
 .../VectorizedReadIntegersBenchmark.java      |  59 -----
 ...eadIntegersTwentyPercentNullBenchmark.java |  42 ----
 .../VectorizedReadLongsBenchmark.java         |  57 -----
 ...edReadLongsTwentyPercentNullBenchmark.java |  42 ----
 .../VectorizedReadPrimitivesBenchmark.java    |  79 -------
 .../VectorizedReadStringsBenchmark.java       |  60 -----
 ...ReadStringsTwentyPercentNullBenchmark.java |  42 ----
 17 files changed, 1173 deletions(-)
 delete mode 100644 spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedDictionaryEncodedBenchmark.java
 delete mode 100644 spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedDictionaryEncodedFloatsBenchmark.java
 delete mode 100644 spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedDictionaryEncodedIntegersBenchmark.java
 delete mode 100644 spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedDictionaryEncodedLongsBenchmark.java
 delete mode 100644 spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedDictionaryEncodedStringsBenchmark.java
 delete mode 100644 spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedFallbackToPlainEncodingStringsBenchmark.java
 delete mode 100644 spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedIcebergSourceBenchmark.java
 delete mode 100644 spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedReadFloatsBenchmark.java
 delete mode 100644 spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedReadFloatsTwentyPercentNullBenchmark.java
 delete mode 100644 spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedReadIntBackedDecimalsBenchmark.java
 delete mode 100644 spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedReadIntegersBenchmark.java
 delete mode 100644 spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedReadIntegersTwentyPercentNullBenchmark.java
 delete mode 100644 spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedReadLongsBenchmark.java
 delete mode 100644 spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedReadLongsTwentyPercentNullBenchmark.java
 delete mode 100644 spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedReadPrimitivesBenchmark.java
 delete mode 100644 spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedReadStringsBenchmark.java
 delete mode 100644 spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedReadStringsTwentyPercentNullBenchmark.java

diff --git a/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedDictionaryEncodedBenchmark.java b/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedDictionaryEncodedBenchmark.java
deleted file mode 100644
index 3c7b72ac29ed..000000000000
--- a/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedDictionaryEncodedBenchmark.java
+++ /dev/null
@@ -1,28 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.iceberg.spark.source.parquet.vectorized;
-
-public abstract class VectorizedDictionaryEncodedBenchmark extends VectorizedIcebergSourceBenchmark {
-
-  @Override
-  protected void setupSpark() {
-    setupSpark(true);
-  }
-}
diff --git a/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedDictionaryEncodedFloatsBenchmark.java b/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedDictionaryEncodedFloatsBenchmark.java
deleted file mode 100644
index 22555ef0bac4..000000000000
--- a/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedDictionaryEncodedFloatsBenchmark.java
+++ /dev/null
@@ -1,62 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.iceberg.spark.source.parquet.vectorized;
-
-import com.google.common.collect.Maps;
-import java.util.Map;
-import org.apache.iceberg.PartitionSpec;
-import org.apache.iceberg.Schema;
-import org.apache.iceberg.Table;
-import org.apache.iceberg.TableProperties;
-import org.apache.iceberg.hadoop.HadoopTables;
-import org.apache.iceberg.types.Types;
-import org.apache.spark.sql.Dataset;
-import org.apache.spark.sql.Row;
-
-import static org.apache.iceberg.types.Types.NestedField.optional;
-import static org.apache.spark.sql.functions.col;
-import static org.apache.spark.sql.functions.lit;
-import static org.apache.spark.sql.functions.pmod;
-import static org.apache.spark.sql.functions.when;
-
-public class VectorizedDictionaryEncodedFloatsBenchmark extends VectorizedDictionaryEncodedBenchmark {
-  @Override
-  protected final Table initTable() {
-    Schema schema = new Schema(
-        optional(1, "longCol", Types.LongType.get()),
-        optional(2, "floatCol", Types.FloatType.get()));
-    PartitionSpec partitionSpec = PartitionSpec.unpartitioned();
-    HadoopTables tables = new HadoopTables(hadoopConf());
-    Map<String, String> properties = Maps.newHashMap();
-    properties.put(TableProperties.METADATA_COMPRESSION, "gzip");
-    return tables.create(schema, partitionSpec, properties, newTableLocation());
-  }
-
-  @Override
-  protected void appendData() {
-    for (int fileNum = 1; fileNum <= NUM_FILES; fileNum++) {
-      Dataset<Row> df = spark().range(NUM_ROWS)
-          .withColumnRenamed("id", "longCol")
-          .drop("id")
-          .withColumn("floatCol", when(pmod(col("longCol"), lit(2)).equalTo(lit(0)), lit(0.0f)).otherwise(lit(1.0f)));
-      appendAsFile(df);
-    }
-  }
-}
diff --git a/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedDictionaryEncodedIntegersBenchmark.java b/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedDictionaryEncodedIntegersBenchmark.java
deleted file mode 100644
index 6095b472bcb0..000000000000
--- a/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedDictionaryEncodedIntegersBenchmark.java
+++ /dev/null
@@ -1,62 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.iceberg.spark.source.parquet.vectorized;
-
-import com.google.common.collect.Maps;
-import java.util.Map;
-import org.apache.iceberg.PartitionSpec;
-import org.apache.iceberg.Schema;
-import org.apache.iceberg.Table;
-import org.apache.iceberg.TableProperties;
-import org.apache.iceberg.hadoop.HadoopTables;
-import org.apache.iceberg.types.Types;
-import org.apache.spark.sql.Dataset;
-import org.apache.spark.sql.Row;
-
-import static org.apache.iceberg.types.Types.NestedField.optional;
-import static org.apache.spark.sql.functions.col;
-import static org.apache.spark.sql.functions.lit;
-import static org.apache.spark.sql.functions.pmod;
-import static org.apache.spark.sql.functions.when;
-
-public class VectorizedDictionaryEncodedIntegersBenchmark extends VectorizedDictionaryEncodedBenchmark {
-  @Override
-  protected final Table initTable() {
-    Schema schema = new Schema(
-        optional(1, "longCol", Types.LongType.get()),
-        optional(2, "intCol", Types.IntegerType.get()));
-    PartitionSpec partitionSpec = PartitionSpec.unpartitioned();
-    HadoopTables tables = new HadoopTables(hadoopConf());
-    Map<String, String> properties = Maps.newHashMap();
-    properties.put(TableProperties.METADATA_COMPRESSION, "gzip");
-    return tables.create(schema, partitionSpec, properties, newTableLocation());
-  }
-
-  @Override
-  protected void appendData() {
-    for (int fileNum = 1; fileNum <= NUM_FILES; fileNum++) {
-      Dataset<Row> df = spark().range(NUM_ROWS)
-          .withColumnRenamed("id", "longCol")
-          .drop("id")
-          .withColumn("intCol", when(pmod(col("longCol"), lit(2)).equalTo(lit(0)), lit(0)).otherwise(lit(1)));
-      appendAsFile(df);
-    }
-  }
-}
diff --git a/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedDictionaryEncodedLongsBenchmark.java b/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedDictionaryEncodedLongsBenchmark.java
deleted file mode 100644
index 20b0d5f7f952..000000000000
--- a/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedDictionaryEncodedLongsBenchmark.java
+++ /dev/null
@@ -1,62 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.iceberg.spark.source.parquet.vectorized;
-
-import com.google.common.collect.Maps;
-import java.util.Map;
-import org.apache.iceberg.PartitionSpec;
-import org.apache.iceberg.Schema;
-import org.apache.iceberg.Table;
-import org.apache.iceberg.TableProperties;
-import org.apache.iceberg.hadoop.HadoopTables;
-import org.apache.iceberg.types.Types;
-import org.apache.spark.sql.Dataset;
-import org.apache.spark.sql.Row;
-
-import static org.apache.iceberg.types.Types.NestedField.optional;
-import static org.apache.spark.sql.functions.col;
-import static org.apache.spark.sql.functions.lit;
-import static org.apache.spark.sql.functions.pmod;
-import static org.apache.spark.sql.functions.when;
-
-public class VectorizedDictionaryEncodedLongsBenchmark extends VectorizedDictionaryEncodedBenchmark {
-  @Override
-  protected final Table initTable() {
-    Schema schema = new Schema(
-        optional(1, "longCol", Types.LongType.get()),
-        optional(2, "longCol2", Types.LongType.get()));
-    PartitionSpec partitionSpec = PartitionSpec.unpartitioned();
-    HadoopTables tables = new HadoopTables(hadoopConf());
-    Map<String, String> properties = Maps.newHashMap();
-    properties.put(TableProperties.METADATA_COMPRESSION, "gzip");
-    return tables.create(schema, partitionSpec, properties, newTableLocation());
-  }
-
-  @Override
-  protected void appendData() {
-    for (int fileNum = 1; fileNum <= NUM_FILES; fileNum++) {
-      Dataset<Row> df = spark().range(NUM_ROWS)
-          .withColumnRenamed("id", "longCol")
-          .drop("id")
-          .withColumn("longCol2", when(pmod(col("longCol"), lit(2)).equalTo(lit(0L)), lit(0)).otherwise(lit(1L)));
-      appendAsFile(df);
-    }
-  }
-}
diff --git a/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedDictionaryEncodedStringsBenchmark.java b/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedDictionaryEncodedStringsBenchmark.java
deleted file mode 100644
index 30cc01ec4d3d..000000000000
--- a/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedDictionaryEncodedStringsBenchmark.java
+++ /dev/null
@@ -1,101 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.iceberg.spark.source.parquet.vectorized;
-
-import com.google.common.collect.Maps;
-import java.util.Map;
-import org.apache.iceberg.PartitionSpec;
-import org.apache.iceberg.Schema;
-import org.apache.iceberg.Table;
-import org.apache.iceberg.TableProperties;
-import org.apache.iceberg.hadoop.HadoopTables;
-import org.apache.iceberg.types.Types;
-import org.apache.spark.sql.Dataset;
-import org.apache.spark.sql.Row;
-
-import static org.apache.iceberg.types.Types.NestedField.optional;
-import static org.apache.spark.sql.functions.col;
-import static org.apache.spark.sql.functions.lit;
-import static org.apache.spark.sql.functions.pmod;
-import static org.apache.spark.sql.functions.when;
-
-public class VectorizedDictionaryEncodedStringsBenchmark extends VectorizedDictionaryEncodedBenchmark {
-  @Override
-  protected final Table initTable() {
-    Schema schema = new Schema(
-        optional(1, "longCol", Types.LongType.get()), optional(2, "stringCol", Types.StringType.get()));
-    PartitionSpec partitionSpec = PartitionSpec.unpartitioned();
-    HadoopTables tables = new HadoopTables(hadoopConf());
-    Map<String, String> properties = Maps.newHashMap();
-    properties.put(TableProperties.METADATA_COMPRESSION, "gzip");
-    return tables.create(schema, partitionSpec, properties, newTableLocation());
-  }
-
-  @Override
-  protected void appendData() {
-    for (int fileNum = 1; fileNum <= NUM_FILES; fileNum++) {
-      Dataset<Row> df = spark().range(NUM_ROWS)
-          .withColumn(
-              "longCol",
-              when(pmod(col("id"), lit(9))
-                  .equalTo(lit(0)), lit(0L))
-                  //.when(expr("id > NUM_ROWS/2"), lit(UUID.randomUUID().toString()))
-                  .when(pmod(col("id"), lit(9))
-                      .equalTo(lit(1)), lit(1L))
-                  .when(pmod(col("id"), lit(9))
-                      .equalTo(lit(2)), lit(2L))
-                  .when(pmod(col("id"), lit(9))
-                      .equalTo(lit(3)), lit(3L))
-                  .when(pmod(col("id"), lit(9))
-                      .equalTo(lit(4)), lit(4L))
-                  .when(pmod(col("id"), lit(9))
-                      .equalTo(lit(5)), lit(5L))
-                  .when(pmod(col("id"), lit(9))
-                      .equalTo(lit(6)), lit(6L))
-                  .when(pmod(col("id"), lit(9))
-                      .equalTo(lit(7)), lit(7L))
-                  .when(pmod(col("id"), lit(9))
-                      .equalTo(lit(8)), lit(8L))
-                  .otherwise(lit(2L)))
-          .drop("id")
-          .withColumn(
-              "stringCol",
-              when(col("longCol")
-                  .equalTo(lit(1L)), lit("1"))
-                  .when(col("longCol")
-                      .equalTo(lit(2L)), lit("2"))
-                  .when(col("longCol")
-                      .equalTo(lit(3L)), lit("3"))
-                  .when(col("longCol")
-                      .equalTo(lit(4L)), lit("4"))
-                  .when(col("longCol")
-                      .equalTo(lit(5L)), lit("5"))
-                  .when(col("longCol")
-                      .equalTo(lit(6L)), lit("6"))
-                  .when(col("longCol")
-                      .equalTo(lit(7L)), lit("7"))
-                  .when(col("longCol")
-                      .equalTo(lit(8L)), lit("8"))
-                  .when(col("longCol")
-                      .equalTo(lit(9L)), lit("9")));
-      appendAsFile(df);
-    }
-  }
-}
diff --git a/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedFallbackToPlainEncodingStringsBenchmark.java b/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedFallbackToPlainEncodingStringsBenchmark.java
deleted file mode 100644
index 6660df8823e0..000000000000
--- a/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedFallbackToPlainEncodingStringsBenchmark.java
+++ /dev/null
@@ -1,93 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.iceberg.spark.source.parquet.vectorized;
-
-import com.google.common.collect.Maps;
-import java.util.Map;
-import java.util.UUID;
-import org.apache.iceberg.PartitionSpec;
-import org.apache.iceberg.Schema;
-import org.apache.iceberg.Table;
-import org.apache.iceberg.TableProperties;
-import org.apache.iceberg.hadoop.HadoopTables;
-import org.apache.iceberg.types.Types;
-import org.apache.spark.sql.Dataset;
-import org.apache.spark.sql.Row;
-
-import static org.apache.iceberg.types.Types.NestedField.optional;
-import static org.apache.spark.sql.functions.col;
-import static org.apache.spark.sql.functions.expr;
-import static org.apache.spark.sql.functions.lit;
-import static org.apache.spark.sql.functions.pmod;
-import static org.apache.spark.sql.functions.when;
-
-public class VectorizedFallbackToPlainEncodingStringsBenchmark extends VectorizedDictionaryEncodedBenchmark {
-  @Override
-  protected final Table initTable() {
-    Schema schema = new Schema(
-        optional(1, "longCol", Types.LongType.get()), optional(2, "stringCol", Types.StringType.get()));
-    PartitionSpec partitionSpec = PartitionSpec.unpartitioned();
-    HadoopTables tables = new HadoopTables(hadoopConf());
-    Map<String, String> properties = Maps.newHashMap();
-    properties.put(TableProperties.METADATA_COMPRESSION, "gzip");
-    return tables.create(schema, partitionSpec, properties, newTableLocation());
-  }
-
-  @Override
-  protected void appendData() {
-    for (int fileNum = 1; fileNum <= NUM_FILES; fileNum++) {
-      Dataset<Row> df = spark().range(NUM_ROWS)
-          .withColumn(
-              "longCol",
-              when(expr("id > 10000000/2"), lit(3L))
-                  .when(pmod(col("id"), lit(9))
-                      .equalTo(lit(0)), lit(1L))
-                  .when(pmod(col("id"), lit(9))
-                      .equalTo(lit(1)), lit(1L))
-                  .when(pmod(col("id"), lit(9))
-                      .equalTo(lit(2)), lit(1L))
-                  .when(pmod(col("id"), lit(9))
-                      .equalTo(lit(3)), lit(1L))
-                  .when(pmod(col("id"), lit(9))
-                      .equalTo(lit(4)), lit(1L))
-                  .when(pmod(col("id"), lit(9))
-                      .equalTo(lit(5)), lit(2L))
-                  .when(pmod(col("id"), lit(9))
-                      .equalTo(lit(6)), lit(2L))
-                  .when(pmod(col("id"), lit(9))
-                      .equalTo(lit(7)), lit(2L))
-                  .when(pmod(col("id"), lit(9))
-                      .equalTo(lit(8)), lit(2L))
-                  .otherwise(lit(2L)))
-          .drop("id")
-          .withColumn(
-              "stringCol",
-              when(col("longCol")
-                  .equalTo(lit(1L)), lit("1"))
-                  .when(col("longCol")
-                      .equalTo(lit(2L)), lit("2"))
-                  .when(col("longCol")
-                      .equalTo(lit(3L)), lit(UUID.randomUUID().toString()))
-                  .otherwise(lit(UUID.randomUUID().toString())));
-      appendAsFile(df);
-    }
-  }
-}
-
diff --git a/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedIcebergSourceBenchmark.java b/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedIcebergSourceBenchmark.java
deleted file mode 100644
index a5b0c81783f5..000000000000
--- a/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedIcebergSourceBenchmark.java
+++ /dev/null
@@ -1,223 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.iceberg.spark.source.parquet.vectorized;
-
-import com.google.common.collect.Maps;
-import java.io.IOException;
-import java.util.Map;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.iceberg.spark.source.IcebergSourceBenchmark;
-import org.apache.spark.sql.Dataset;
-import org.apache.spark.sql.Row;
-import org.apache.spark.sql.internal.SQLConf;
-import org.openjdk.jmh.annotations.Benchmark;
-import org.openjdk.jmh.annotations.Setup;
-import org.openjdk.jmh.annotations.TearDown;
-import org.openjdk.jmh.annotations.Threads;
-
-import static org.apache.iceberg.TableProperties.SPLIT_OPEN_FILE_COST;
-
-/**
- * Parent class of the benchmarks that compare performance of performance of reading Parquet data with a flat schema
- * using vectorized Iceberg read path and the built-in file source in Spark.
- * <p>
- * To run all the the benchmarks that extend this class:
- * <code>
- * ./gradlew :iceberg-spark:jmh -PjmhIncludeRegex=VectorizedRead*Benchmark
- * -PjmhOutputPath=benchmark/iceberg-source-flat-parquet-data-read-benchmark-result.txt
- * </code>
- */
-
-public abstract class VectorizedIcebergSourceBenchmark extends IcebergSourceBenchmark {
-  static final int NUM_FILES = 10;
-  static final int NUM_ROWS = 10000000;
-
-  @Setup
-  public void setupBenchmark() {
-    setupSpark();
-    appendData();
-    // Allow unsafe memory access to avoid the costly check arrow does to check if index is within bounds
-    System.setProperty("arrow.enable_unsafe_memory_access", "true");
-    // Disable expensive null check for every get(index) call.
-    // Iceberg manages nullability checks itself instead of relying on arrow.
-    System.setProperty("arrow.enable_null_check_for_get", "false");
-  }
-
-  @TearDown
-  public void tearDownBenchmark() throws IOException {
-    tearDownSpark();
-    cleanupFiles();
-  }
-
-  @Override
-  protected Configuration initHadoopConf() {
-    return new Configuration();
-  }
-
-  protected abstract void appendData();
-
-  @Benchmark
-  @Threads(1)
-  public void readIcebergVectorized100() {
-    Map<String, String> tableProperties = Maps.newHashMap();
-    tableProperties.put(SPLIT_OPEN_FILE_COST, Integer.toString(128 * 1024 * 1024));
-    withTableProperties(tableProperties, () -> {
-      String tableLocation = table().location();
-      Dataset<Row> df = spark().read().format("iceberg")
-          .option("iceberg.read.numrecordsperbatch", "100")
-          .load(tableLocation);
-      materialize(df);
-    });
-  }
-
-  @Benchmark
-  @Threads(1)
-  public void readIcebergVectorized1k() {
-    Map<String, String> tableProperties = Maps.newHashMap();
-    tableProperties.put(SPLIT_OPEN_FILE_COST, Integer.toString(128 * 1024 * 1024));
-    withTableProperties(tableProperties, () -> {
-      String tableLocation = table().location();
-      Dataset<Row> df = spark().read().format("iceberg")
-          .option("iceberg.read.numrecordsperbatch", "1000")
-          .load(tableLocation);
-      materialize(df);
-    });
-  }
-
-  @Benchmark
-  @Threads(1)
-  public void readFileSourceIcebergVectorized5k() {
-    Map<String, String> tableProperties = Maps.newHashMap();
-    tableProperties.put(SPLIT_OPEN_FILE_COST, Integer.toString(128 * 1024 * 1024));
-    withTableProperties(tableProperties, () -> {
-      String tableLocation = table().location();
-      Dataset<Row> df = spark().read().format("iceberg")
-          .option("iceberg.read.numrecordsperbatch", "5000")
-          .load(tableLocation);
-      materialize(df);
-    });
-  }
-
-  @Benchmark
-  @Threads(1)
-  public void readIcebergVectorized10k() {
-    Map<String, String> tableProperties = Maps.newHashMap();
-    tableProperties.put(SPLIT_OPEN_FILE_COST, Integer.toString(128 * 1024 * 1024));
-    withTableProperties(tableProperties, () -> {
-      String tableLocation = table().location();
-      Dataset<Row> df = spark().read().format("iceberg")
-          .option("iceberg.read.numrecordsperbatch", "10000")
-          .load(tableLocation);
-      materialize(df);
-    });
-  }
-
-  @Benchmark
-  @Threads(1)
-  public void readFileSourceVectorized5k() {
-    Map<String, String> conf = Maps.newHashMap();
-    conf.put(SQLConf.PARQUET_VECTORIZED_READER_ENABLED().key(), "true");
-    conf.put(SQLConf.PARQUET_VECTORIZED_READER_BATCH_SIZE().key(), "5000");
-    conf.put(SQLConf.FILES_OPEN_COST_IN_BYTES().key(), Integer.toString(128 * 1024 * 1024));
-    conf.put(SQLConf.COLUMN_VECTOR_OFFHEAP_ENABLED().key(), "true");
-    withSQLConf(conf, () -> {
-      Dataset<Row> df = spark().read().parquet(dataLocation());
-      materialize(df);
-    });
-  }
-
-  @Benchmark
-  @Threads(1)
-  public void readFileSourceNonVectorized() {
-    Map<String, String> conf = Maps.newHashMap();
-    conf.put(SQLConf.PARQUET_VECTORIZED_READER_ENABLED().key(), "false");
-    conf.put(SQLConf.FILES_OPEN_COST_IN_BYTES().key(), Integer.toString(128 * 1024 * 1024));
-    withSQLConf(conf, () -> {
-      Dataset<Row> df = spark().read().parquet(dataLocation());
-      materialize(df);
-    });
-  }
-
-  @Benchmark
-  @Threads(1)
-  public void readWithProjectionIcebergVectorized1k() {
-    Map<String, String> tableProperties = Maps.newHashMap();
-    tableProperties.put(SPLIT_OPEN_FILE_COST, Integer.toString(128 * 1024 * 1024));
-    withTableProperties(tableProperties, () -> {
-      String tableLocation = table().location();
-      Dataset<Row> df = spark().read().format("iceberg")
-          .option("iceberg.read.numrecordsperbatch", "1000")
-          .load(tableLocation).select("longCol");
-      materialize(df);
-    });
-  }
-
-  @Benchmark
-  @Threads(1)
-  public void readWithProjectionIcebergVectorized5k() {
-    Map<String, String> tableProperties = Maps.newHashMap();
-    tableProperties.put(SPLIT_OPEN_FILE_COST, Integer.toString(128 * 1024 * 1024));
-    withTableProperties(tableProperties, () -> {
-      String tableLocation = table().location();
-      Dataset<Row> df = spark().read().format("iceberg")
-          .option("iceberg.read.numrecordsperbatch", "5000")
-          .load(tableLocation).select("longCol");
-      materialize(df);
-    });
-  }
-
-  @Benchmark
-  @Threads(1)
-  public void readWithProjectionIcebergVectorized10k() {
-    Map<String, String> tableProperties = Maps.newHashMap();
-    tableProperties.put(SPLIT_OPEN_FILE_COST, Integer.toString(128 * 1024 * 1024));
-    withTableProperties(tableProperties, () -> {
-      String tableLocation = table().location();
-      Dataset<Row> df = spark().read().format("iceberg")
-          .option("iceberg.read.numrecordsperbatch", "10000")
-          .load(tableLocation).select("longCol");
-      materialize(df);
-    });
-  }
-
-  @Benchmark
-  @Threads(1)
-  public void readWithProjectionFileSourceVectorized() {
-    Map<String, String> conf = Maps.newHashMap();
-    conf.put(SQLConf.PARQUET_VECTORIZED_READER_ENABLED().key(), "true");
-    conf.put(SQLConf.FILES_OPEN_COST_IN_BYTES().key(), Integer.toString(128 * 1024 * 1024));
-    withSQLConf(conf, () -> {
-      Dataset<Row> df = spark().read().parquet(dataLocation()).select("longCol");
-      materialize(df);
-    });
-  }
-
-  @Benchmark
-  @Threads(1)
-  public void readWithProjectionFileSourceNonVectorized() {
-    Map<String, String> conf = Maps.newHashMap();
-    conf.put(SQLConf.PARQUET_VECTORIZED_READER_ENABLED().key(), "false");
-    conf.put(SQLConf.FILES_OPEN_COST_IN_BYTES().key(), Integer.toString(128 * 1024 * 1024));
-    withSQLConf(conf, () -> {
-      Dataset<Row> df = spark().read().parquet(dataLocation()).select("longCol");
-      materialize(df);
-    });
-  }
-}
diff --git a/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedReadFloatsBenchmark.java b/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedReadFloatsBenchmark.java
deleted file mode 100644
index 15111514a11c..000000000000
--- a/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedReadFloatsBenchmark.java
+++ /dev/null
@@ -1,60 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.iceberg.spark.source.parquet.vectorized;
-
-import com.google.common.collect.Maps;
-import java.util.Map;
-import org.apache.iceberg.PartitionSpec;
-import org.apache.iceberg.Schema;
-import org.apache.iceberg.Table;
-import org.apache.iceberg.TableProperties;
-import org.apache.iceberg.hadoop.HadoopTables;
-import org.apache.iceberg.types.Types;
-import org.apache.spark.sql.Dataset;
-import org.apache.spark.sql.Row;
-
-import static org.apache.iceberg.types.Types.NestedField.optional;
-import static org.apache.spark.sql.functions.expr;
-
-public class VectorizedReadFloatsBenchmark extends VectorizedIcebergSourceBenchmark {
-
-  @Override
-  protected final Table initTable() {
-    Schema schema = new Schema(
-        optional(1, "longCol", Types.LongType.get()),
-        optional(2, "floatCol", Types.FloatType.get()));
-    PartitionSpec partitionSpec = PartitionSpec.unpartitioned();
-    HadoopTables tables = new HadoopTables(hadoopConf());
-    Map<String, String> properties = Maps.newHashMap();
-    properties.put(TableProperties.METADATA_COMPRESSION, "gzip");
-    properties.put(TableProperties.PARQUET_DICT_SIZE_BYTES, "1");
-    return tables.create(schema, partitionSpec, properties, newTableLocation());
-  }
-
-  @Override
-  protected void appendData() {
-    for (int fileNum = 1; fileNum <= NUM_FILES; fileNum++) {
-      Dataset<Row> df = spark().range(NUM_ROWS)
-          .withColumnRenamed("id", "longCol")
-          .withColumn("floatCol", expr("CAST(longCol AS FLOAT)"));
-      appendAsFile(df);
-    }
-  }
-}
diff --git a/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedReadFloatsTwentyPercentNullBenchmark.java b/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedReadFloatsTwentyPercentNullBenchmark.java
deleted file mode 100644
index 94ac603558c6..000000000000
--- a/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedReadFloatsTwentyPercentNullBenchmark.java
+++ /dev/null
@@ -1,42 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.iceberg.spark.source.parquet.vectorized;
-
-import org.apache.spark.sql.Dataset;
-import org.apache.spark.sql.Row;
-
-import static org.apache.spark.sql.functions.col;
-import static org.apache.spark.sql.functions.expr;
-import static org.apache.spark.sql.functions.lit;
-import static org.apache.spark.sql.functions.pmod;
-import static org.apache.spark.sql.functions.when;
-
-public class VectorizedReadFloatsTwentyPercentNullBenchmark extends VectorizedReadFloatsBenchmark {
-  @Override
-  protected void appendData() {
-    for (int fileNum = 1; fileNum <= NUM_FILES; fileNum++) {
-      Dataset<Row> df = spark().range(NUM_ROWS)
-          .withColumn("longCol", when(pmod(col("id"), lit(5)).equalTo(lit(0)), lit(null)).otherwise(col("id")))
-          .drop("id")
-          .withColumn("floatCol", expr("CAST(longCol AS FLOAT)"));
-      appendAsFile(df);
-    }
-  }
-}
diff --git a/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedReadIntBackedDecimalsBenchmark.java b/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedReadIntBackedDecimalsBenchmark.java
deleted file mode 100644
index 2fcab1615977..000000000000
--- a/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedReadIntBackedDecimalsBenchmark.java
+++ /dev/null
@@ -1,59 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.iceberg.spark.source.parquet.vectorized;
-
-import com.google.common.collect.Maps;
-import java.util.Map;
-import org.apache.iceberg.PartitionSpec;
-import org.apache.iceberg.Schema;
-import org.apache.iceberg.Table;
-import org.apache.iceberg.TableProperties;
-import org.apache.iceberg.hadoop.HadoopTables;
-import org.apache.iceberg.types.Types;
-import org.apache.spark.sql.Dataset;
-import org.apache.spark.sql.Row;
-
-import static org.apache.iceberg.types.Types.NestedField.optional;
-import static org.apache.spark.sql.functions.expr;
-
-public class VectorizedReadIntBackedDecimalsBenchmark extends VectorizedIcebergSourceBenchmark {
-  @Override
-  protected final Table initTable() {
-    Schema schema = new Schema(
-        optional(1, "longCol", Types.LongType.get()),
-        optional(2, "decimalCol", Types.DecimalType.of(9, 0)));
-    PartitionSpec partitionSpec = PartitionSpec.unpartitioned();
-    HadoopTables tables = new HadoopTables(hadoopConf());
-    Map<String, String> properties = Maps.newHashMap();
-    properties.put(TableProperties.METADATA_COMPRESSION, "gzip");
-    properties.put(TableProperties.PARQUET_DICT_SIZE_BYTES, "1");
-    return tables.create(schema, partitionSpec, properties, newTableLocation());
-  }
-
-  @Override
-  protected void appendData() {
-    for (int fileNum = 1; fileNum <= NUM_FILES; fileNum++) {
-      Dataset<Row> df = spark().range(NUM_ROWS)
-          .withColumnRenamed("id", "longCol")
-          .withColumn("decimalCol", expr("CAST(longCol AS DECIMAL(9, 0))"));
-      appendAsFile(df);
-    }
-  }
-}
diff --git a/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedReadIntegersBenchmark.java b/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedReadIntegersBenchmark.java
deleted file mode 100644
index df8848663ea7..000000000000
--- a/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedReadIntegersBenchmark.java
+++ /dev/null
@@ -1,59 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.iceberg.spark.source.parquet.vectorized;
-
-import com.google.common.collect.Maps;
-import java.util.Map;
-import org.apache.iceberg.PartitionSpec;
-import org.apache.iceberg.Schema;
-import org.apache.iceberg.Table;
-import org.apache.iceberg.TableProperties;
-import org.apache.iceberg.hadoop.HadoopTables;
-import org.apache.iceberg.types.Types;
-import org.apache.spark.sql.Dataset;
-import org.apache.spark.sql.Row;
-
-import static org.apache.iceberg.types.Types.NestedField.optional;
-import static org.apache.spark.sql.functions.expr;
-
-public class VectorizedReadIntegersBenchmark extends VectorizedIcebergSourceBenchmark {
-  @Override
-  protected final Table initTable() {
-    Schema schema = new Schema(
-        optional(1, "longCol", Types.LongType.get()),
-        optional(2, "intCol", Types.IntegerType.get()));
-    PartitionSpec partitionSpec = PartitionSpec.unpartitioned();
-    HadoopTables tables = new HadoopTables(hadoopConf());
-    Map<String, String> properties = Maps.newHashMap();
-    properties.put(TableProperties.METADATA_COMPRESSION, "gzip");
-    properties.put(TableProperties.PARQUET_DICT_SIZE_BYTES, "1");
-    return tables.create(schema, partitionSpec, properties, newTableLocation());
-  }
-
-  @Override
-  protected void appendData() {
-    for (int fileNum = 1; fileNum <= NUM_FILES; fileNum++) {
-      Dataset<Row> df = spark().range(NUM_ROWS)
-          .withColumnRenamed("id", "longCol")
-          .withColumn("intCol", expr("CAST(longCol AS INT)"));
-      appendAsFile(df);
-    }
-  }
-}
diff --git a/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedReadIntegersTwentyPercentNullBenchmark.java b/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedReadIntegersTwentyPercentNullBenchmark.java
deleted file mode 100644
index 2646ca4a00c0..000000000000
--- a/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedReadIntegersTwentyPercentNullBenchmark.java
+++ /dev/null
@@ -1,42 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.iceberg.spark.source.parquet.vectorized;
-
-import org.apache.spark.sql.Dataset;
-import org.apache.spark.sql.Row;
-
-import static org.apache.spark.sql.functions.col;
-import static org.apache.spark.sql.functions.expr;
-import static org.apache.spark.sql.functions.lit;
-import static org.apache.spark.sql.functions.pmod;
-import static org.apache.spark.sql.functions.when;
-
-public class VectorizedReadIntegersTwentyPercentNullBenchmark extends VectorizedReadIntegersBenchmark {
-  @Override
-  protected void appendData() {
-    for (int fileNum = 1; fileNum <= NUM_FILES; fileNum++) {
-      Dataset<Row> df = spark().range(NUM_ROWS)
-          .withColumn("longCol", when(pmod(col("id"), lit(5)).equalTo(lit(0)), lit(null)).otherwise(col("id")))
-          .drop("id")
-          .withColumn("intCol", expr("CAST(longCol AS INT)"));
-      appendAsFile(df);
-    }
-  }
-}
diff --git a/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedReadLongsBenchmark.java b/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedReadLongsBenchmark.java
deleted file mode 100644
index cdac8ef05c52..000000000000
--- a/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedReadLongsBenchmark.java
+++ /dev/null
@@ -1,57 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.iceberg.spark.source.parquet.vectorized;
-
-import com.google.common.collect.Maps;
-import java.util.Map;
-import org.apache.iceberg.PartitionSpec;
-import org.apache.iceberg.Schema;
-import org.apache.iceberg.Table;
-import org.apache.iceberg.TableProperties;
-import org.apache.iceberg.hadoop.HadoopTables;
-import org.apache.iceberg.types.Types;
-import org.apache.spark.sql.Dataset;
-import org.apache.spark.sql.Row;
-
-import static org.apache.iceberg.types.Types.NestedField.optional;
-
-public class VectorizedReadLongsBenchmark extends VectorizedIcebergSourceBenchmark {
-
-  @Override
-  protected final Table initTable() {
-    Schema schema = new Schema(
-        optional(1, "longCol", Types.LongType.get()));
-    PartitionSpec partitionSpec = PartitionSpec.unpartitioned();
-    HadoopTables tables = new HadoopTables(hadoopConf());
-    Map<String, String> properties = Maps.newHashMap();
-    properties.put(TableProperties.METADATA_COMPRESSION, "gzip");
-    properties.put(TableProperties.PARQUET_DICT_SIZE_BYTES, "1");
-    return tables.create(schema, partitionSpec, properties, newTableLocation());
-  }
-
-  @Override
-  protected void appendData() {
-    for (int fileNum = 1; fileNum <= NUM_FILES; fileNum++) {
-      Dataset<Row> df = spark().range(NUM_ROWS)
-          .withColumnRenamed("id", "longCol");
-      appendAsFile(df);
-    }
-  }
-}
diff --git a/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedReadLongsTwentyPercentNullBenchmark.java b/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedReadLongsTwentyPercentNullBenchmark.java
deleted file mode 100644
index 3544c8e675cf..000000000000
--- a/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedReadLongsTwentyPercentNullBenchmark.java
+++ /dev/null
@@ -1,42 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.iceberg.spark.source.parquet.vectorized;
-
-import org.apache.spark.sql.Dataset;
-import org.apache.spark.sql.Row;
-
-import static org.apache.spark.sql.functions.col;
-import static org.apache.spark.sql.functions.lit;
-import static org.apache.spark.sql.functions.pmod;
-import static org.apache.spark.sql.functions.when;
-
-public class VectorizedReadLongsTwentyPercentNullBenchmark extends VectorizedReadLongsBenchmark {
-
-  @Override
-  protected void appendData() {
-    for (int fileNum = 1; fileNum <= NUM_FILES; fileNum++) {
-      Dataset<Row> df = spark().range(NUM_ROWS)
-          .withColumn("longCol", when(pmod(col("id"), lit(5)).equalTo(lit(0)), lit(null)).otherwise(col("id")))
-          .drop("id");
-
-      appendAsFile(df);
-    }
-  }
-}
diff --git a/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedReadPrimitivesBenchmark.java b/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedReadPrimitivesBenchmark.java
deleted file mode 100644
index 3619f584fa81..000000000000
--- a/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedReadPrimitivesBenchmark.java
+++ /dev/null
@@ -1,79 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.iceberg.spark.source.parquet.vectorized;
-
-import com.google.common.collect.Maps;
-import java.util.Map;
-import org.apache.iceberg.PartitionSpec;
-import org.apache.iceberg.Schema;
-import org.apache.iceberg.Table;
-import org.apache.iceberg.TableProperties;
-import org.apache.iceberg.hadoop.HadoopTables;
-import org.apache.iceberg.types.Types;
-import org.apache.spark.sql.Dataset;
-import org.apache.spark.sql.Row;
-
-import static org.apache.iceberg.types.Types.NestedField.optional;
-import static org.apache.spark.sql.functions.col;
-import static org.apache.spark.sql.functions.current_date;
-import static org.apache.spark.sql.functions.date_add;
-import static org.apache.spark.sql.functions.expr;
-import static org.apache.spark.sql.functions.lit;
-import static org.apache.spark.sql.functions.pmod;
-import static org.apache.spark.sql.functions.when;
-
-public class VectorizedReadPrimitivesBenchmark extends VectorizedIcebergSourceBenchmark {
-
-  @Override
-  protected final Table initTable() {
-    Schema schema = new Schema(
-        optional(1, "longCol", Types.LongType.get()),
-        optional(2, "intCol", Types.LongType.get()),
-        optional(3, "floatCol", Types.LongType.get()),
-        optional(4, "doubleCol", Types.LongType.get()),
-        optional(5, "decimalCol", Types.DecimalType.of(20, 5)),
-        optional(6, "dateCol", Types.DateType.get()),
-        optional(7, "timestampCol", Types.TimestampType.withZone()),
-        optional(8, "stringCol", Types.StringType.get()));
-    PartitionSpec partitionSpec = PartitionSpec.unpartitioned();
-    HadoopTables tables = new HadoopTables(hadoopConf());
-    Map<String, String> properties = Maps.newHashMap();
-    properties.put(TableProperties.METADATA_COMPRESSION, "gzip");
-    properties.put(TableProperties.PARQUET_DICT_SIZE_BYTES, "1");
-    return tables.create(schema, partitionSpec, properties, newTableLocation());
-  }
-
-  @Override
-  protected void appendData() {
-    for (int fileNum = 1; fileNum <= NUM_FILES; fileNum++) {
-      Dataset<Row> df = spark().range(NUM_ROWS)
-          .withColumn("longCol", when(pmod(col("id"), lit(2)).equalTo(lit(0)), lit(null)).otherwise(col("id")))
-          .drop("id")
-          .withColumn("intCol", expr("CAST(longCol AS BIGINT)"))
-          .withColumn("floatCol", expr("CAST(longCol AS BIGINT)"))
-          .withColumn("doubleCol", expr("CAST(longCol AS BIGINT)"))
-          .withColumn("decimalCol", expr("CAST(longCol AS DECIMAL(20, 5))"))
-          .withColumn("dateCol", date_add(current_date(), fileNum))
-          .withColumn("timestampCol", expr("TO_TIMESTAMP(dateCol)"))
-          .withColumn("stringCol", expr("CAST(longCol AS STRING)"));
-      appendAsFile(df);
-    }
-  }
-}
diff --git a/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedReadStringsBenchmark.java b/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedReadStringsBenchmark.java
deleted file mode 100644
index 94c95f93e64f..000000000000
--- a/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedReadStringsBenchmark.java
+++ /dev/null
@@ -1,60 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.iceberg.spark.source.parquet.vectorized;
-
-import com.google.common.collect.Maps;
-import java.util.Map;
-import org.apache.iceberg.PartitionSpec;
-import org.apache.iceberg.Schema;
-import org.apache.iceberg.Table;
-import org.apache.iceberg.TableProperties;
-import org.apache.iceberg.hadoop.HadoopTables;
-import org.apache.iceberg.types.Types;
-import org.apache.spark.sql.Dataset;
-import org.apache.spark.sql.Row;
-
-import static org.apache.iceberg.types.Types.NestedField.optional;
-import static org.apache.spark.sql.functions.expr;
-
-public class VectorizedReadStringsBenchmark extends VectorizedIcebergSourceBenchmark {
-  @Override
-  protected final Table initTable() {
-    Schema schema = new Schema(
-        optional(1, "longCol", Types.LongType.get()),
-        optional(2, "stringCol", Types.StringType.get()));
-    PartitionSpec partitionSpec = PartitionSpec.unpartitioned();
-    HadoopTables tables = new HadoopTables(hadoopConf());
-    Map<String, String> properties = Maps.newHashMap();
-    properties.put(TableProperties.METADATA_COMPRESSION, "gzip");
-    properties.put(TableProperties.PARQUET_DICT_SIZE_BYTES, "1");
-    return tables.create(schema, partitionSpec, properties, newTableLocation());
-  }
-
-  @Override
-  protected void appendData() {
-    for (int fileNum = 1; fileNum <= NUM_FILES; fileNum++) {
-      Dataset<Row> df = spark().range(NUM_ROWS)
-          .withColumnRenamed("id", "longCol")
-          .withColumn("stringCol", expr("CAST(longCol AS STRING)"));
-
-      appendAsFile(df);
-    }
-  }
-}
diff --git a/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedReadStringsTwentyPercentNullBenchmark.java b/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedReadStringsTwentyPercentNullBenchmark.java
deleted file mode 100644
index ef783d8e5bf7..000000000000
--- a/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedReadStringsTwentyPercentNullBenchmark.java
+++ /dev/null
@@ -1,42 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.iceberg.spark.source.parquet.vectorized;
-
-import org.apache.spark.sql.Dataset;
-import org.apache.spark.sql.Row;
-
-import static org.apache.spark.sql.functions.col;
-import static org.apache.spark.sql.functions.expr;
-import static org.apache.spark.sql.functions.lit;
-import static org.apache.spark.sql.functions.pmod;
-import static org.apache.spark.sql.functions.when;
-
-public class VectorizedReadStringsTwentyPercentNullBenchmark extends VectorizedReadStringsBenchmark {
-  @Override
-  protected void appendData() {
-    for (int fileNum = 1; fileNum <= NUM_FILES; fileNum++) {
-      Dataset<Row> df = spark().range(NUM_ROWS)
-          .withColumn("id", when(pmod(col("id"), lit(5)).equalTo(lit(0)), lit(null)).otherwise(col("id")))
-          .withColumn("stringCol", expr("CAST(longCol AS STRING)"));
-
-      appendAsFile(df);
-    }
-  }
-}

From 4760f790b60b456b214dd9db1d8c94a594bb7995 Mon Sep 17 00:00:00 2001
From: samarthjain <samarth@apache.org>
Date: Mon, 23 Mar 2020 12:21:11 -0700
Subject: [PATCH 05/12] Merge reader changes from master. Rebase branch to
 master

---
 .../arrow/vectorized/VectorHolder.java        |   2 +-
 .../spark/source/BaseTaskDataReader.java      | 115 -------
 ...skDataReader.java => BatchDataReader.java} |  28 +-
 .../spark/source/RowTaskDataReader.java       | 296 ------------------
 4 files changed, 15 insertions(+), 426 deletions(-)
 delete mode 100644 spark/src/main/java/org/apache/iceberg/spark/source/BaseTaskDataReader.java
 rename spark/src/main/java/org/apache/iceberg/spark/source/{BatchTaskDataReader.java => BatchDataReader.java} (85%)
 delete mode 100644 spark/src/main/java/org/apache/iceberg/spark/source/RowTaskDataReader.java

diff --git a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/VectorHolder.java b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/VectorHolder.java
index 9df90a4c9066..b938d3845c19 100644
--- a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/VectorHolder.java
+++ b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/VectorHolder.java
@@ -53,7 +53,7 @@ public VectorHolder(
     this.icebergType = type;
   }
 
-  // Only used for returning dummyHolder
+  // Only used for returning dummy holder
   private VectorHolder() {
     columnDescriptor = null;
     vector = null;
diff --git a/spark/src/main/java/org/apache/iceberg/spark/source/BaseTaskDataReader.java b/spark/src/main/java/org/apache/iceberg/spark/source/BaseTaskDataReader.java
deleted file mode 100644
index fff2c20dff5f..000000000000
--- a/spark/src/main/java/org/apache/iceberg/spark/source/BaseTaskDataReader.java
+++ /dev/null
@@ -1,115 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.iceberg.spark.source;
-
-import com.google.common.collect.ImmutableMap;
-import com.google.common.collect.Iterables;
-import java.io.Closeable;
-import java.io.IOException;
-import java.util.Iterator;
-import java.util.Map;
-import org.apache.iceberg.CombinedScanTask;
-import org.apache.iceberg.FileScanTask;
-import org.apache.iceberg.Schema;
-import org.apache.iceberg.common.DynMethods;
-import org.apache.iceberg.encryption.EncryptedFiles;
-import org.apache.iceberg.encryption.EncryptionManager;
-import org.apache.iceberg.io.FileIO;
-import org.apache.iceberg.io.InputFile;
-import org.apache.spark.rdd.InputFileBlockHolder;
-import org.apache.spark.sql.catalyst.InternalRow;
-import org.apache.spark.sql.catalyst.expressions.UnsafeProjection;
-
-@SuppressWarnings("checkstyle:VisibilityModifier")
-abstract class BaseTaskDataReader<T> implements Closeable {
-  // for some reason, the apply method can't be called from Java without reflection
-  static final DynMethods.UnboundMethod APPLY_PROJECTION = DynMethods.builder("apply")
-      .impl(UnsafeProjection.class, InternalRow.class)
-      .build();
-
-  final Iterator<FileScanTask> tasks;
-  final Schema tableSchema;
-  final Schema expectedSchema;
-  final FileIO fileIo;
-  final Map<String, InputFile> inputFiles;
-  final boolean caseSensitive;
-
-  Iterator<T> currentIterator;
-  Closeable currentCloseable = null;
-  T current = null;
-  final int batchSize;
-
-  BaseTaskDataReader(
-      CombinedScanTask task, Schema tableSchema, Schema expectedSchema, FileIO fileIo,
-      EncryptionManager encryptionManager, boolean caseSensitive) {
-    this(task, tableSchema, expectedSchema, fileIo, encryptionManager, caseSensitive, -1);
-  }
-
-  BaseTaskDataReader(
-      CombinedScanTask task, Schema tableSchema, Schema expectedSchema, FileIO fileIo,
-      EncryptionManager encryptionManager, boolean caseSensitive, int bSize) {
-    this.fileIo = fileIo;
-    this.tasks = task.files().iterator();
-    this.tableSchema = tableSchema;
-    this.expectedSchema = expectedSchema;
-    Iterable<InputFile> decryptedFiles = encryptionManager.decrypt(Iterables.transform(
-        task.files(),
-        fileScanTask ->
-            EncryptedFiles.encryptedInput(
-                this.fileIo.newInputFile(fileScanTask.file().path().toString()),
-                fileScanTask.file().keyMetadata())));
-    ImmutableMap.Builder<String, InputFile> inputFileBuilder = ImmutableMap.builder();
-    decryptedFiles.forEach(decrypted -> inputFileBuilder.put(decrypted.location(), decrypted));
-    this.inputFiles = inputFileBuilder.build();
-    this.caseSensitive = caseSensitive;
-    this.batchSize = bSize;
-    // open last because the schemas, fileIo and batchSize must be set
-    this.currentIterator = open(tasks.next());
-  }
-
-  public boolean next() throws IOException {
-    while (true) {
-      if (currentIterator.hasNext()) {
-        this.current = currentIterator.next();
-        return true;
-      } else if (tasks.hasNext()) {
-        this.currentCloseable.close();
-        this.currentIterator = open(tasks.next());
-      } else {
-        return false;
-      }
-    }
-  }
-
-  abstract Iterator<T> open(FileScanTask task);
-
-  @Override
-  public void close() throws IOException {
-    InputFileBlockHolder.unset();
-
-    // close the current iterator
-    this.currentCloseable.close();
-
-    // exhaust the task iterator
-    while (tasks.hasNext()) {
-      tasks.next();
-    }
-  }
-}
diff --git a/spark/src/main/java/org/apache/iceberg/spark/source/BatchTaskDataReader.java b/spark/src/main/java/org/apache/iceberg/spark/source/BatchDataReader.java
similarity index 85%
rename from spark/src/main/java/org/apache/iceberg/spark/source/BatchTaskDataReader.java
rename to spark/src/main/java/org/apache/iceberg/spark/source/BatchDataReader.java
index 187afcb0672e..0814f819a39f 100644
--- a/spark/src/main/java/org/apache/iceberg/spark/source/BatchTaskDataReader.java
+++ b/spark/src/main/java/org/apache/iceberg/spark/source/BatchDataReader.java
@@ -32,22 +32,23 @@
 import org.apache.iceberg.parquet.Parquet;
 import org.apache.iceberg.spark.SparkSchemaUtil;
 import org.apache.iceberg.spark.data.vectorized.VectorizedSparkParquetReaders;
-import org.apache.spark.sql.sources.v2.reader.InputPartitionReader;
 import org.apache.spark.sql.types.StructType;
 import org.apache.spark.sql.vectorized.ColumnarBatch;
 
-class BatchTaskDataReader extends BaseTaskDataReader<ColumnarBatch>
-    implements InputPartitionReader<ColumnarBatch> {
+class BatchDataReader extends BaseDataReader<ColumnarBatch> {
+  private final Schema tableSchema;
+  private final Schema expectedSchema;
+  private final boolean caseSensitive;
+  private final int batchSize;
 
-  BatchTaskDataReader(
+  BatchDataReader(
       CombinedScanTask task, Schema tableSchema, Schema expectedSchema, FileIO fileIo,
-      EncryptionManager encryptionManager, boolean caseSensitive, int bSize) {
-    super(task, tableSchema, expectedSchema, fileIo, encryptionManager, caseSensitive, bSize);
-  }
-
-  @Override
-  public ColumnarBatch get() {
-    return current;
+      EncryptionManager encryptionManager, boolean caseSensitive, int size) {
+    super(task, fileIo, encryptionManager);
+    this.tableSchema = tableSchema;
+    this.expectedSchema = expectedSchema;
+    this.caseSensitive = caseSensitive;
+    this.batchSize = size;
   }
 
   @Override
@@ -69,7 +70,7 @@ Iterator<ColumnarBatch> open(FileScanTask task) {
 
   private Iterator<ColumnarBatch> open(FileScanTask task, Schema readSchema) {
     CloseableIterable<ColumnarBatch> iter;
-    InputFile location = inputFiles.get(task.file().path().toString());
+    InputFile location = getInputFile(task);
     Preconditions.checkNotNull(location, "Could not find InputFile associated with FileScanTask");
     if (task.file().format() == FileFormat.PARQUET) {
       iter = Parquet.read(location)
@@ -80,7 +81,7 @@ private Iterator<ColumnarBatch> open(FileScanTask task, Schema readSchema) {
           .filter(task.residual())
           .caseSensitive(caseSensitive)
           .recordsPerBatch(batchSize)
-          // Spark eagerly consumes the batches so the underlying memory allocated could be reused
+          // Spark eagerly consumes the batches. So the underlying memory allocated could be reused
           // without worrying about subsequent reads clobbering over each other. This improves
           // read performance as every batch read doesn't have to pay the cost of allocating memory.
           .reuseContainers()
@@ -92,5 +93,4 @@ private Iterator<ColumnarBatch> open(FileScanTask task, Schema readSchema) {
     this.currentCloseable = iter;
     return iter.iterator();
   }
-
 }
diff --git a/spark/src/main/java/org/apache/iceberg/spark/source/RowTaskDataReader.java b/spark/src/main/java/org/apache/iceberg/spark/source/RowTaskDataReader.java
deleted file mode 100644
index 275d7c5f3e84..000000000000
--- a/spark/src/main/java/org/apache/iceberg/spark/source/RowTaskDataReader.java
+++ /dev/null
@@ -1,296 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.iceberg.spark.source;
-
-import com.google.common.base.Preconditions;
-import com.google.common.collect.Iterators;
-import com.google.common.collect.Lists;
-import java.nio.ByteBuffer;
-import java.util.Iterator;
-import java.util.List;
-import java.util.Set;
-import java.util.function.Function;
-import org.apache.iceberg.CombinedScanTask;
-import org.apache.iceberg.DataFile;
-import org.apache.iceberg.DataTask;
-import org.apache.iceberg.FileScanTask;
-import org.apache.iceberg.PartitionField;
-import org.apache.iceberg.PartitionSpec;
-import org.apache.iceberg.Schema;
-import org.apache.iceberg.StructLike;
-import org.apache.iceberg.avro.Avro;
-import org.apache.iceberg.encryption.EncryptionManager;
-import org.apache.iceberg.io.CloseableIterable;
-import org.apache.iceberg.io.FileIO;
-import org.apache.iceberg.io.InputFile;
-import org.apache.iceberg.orc.ORC;
-import org.apache.iceberg.parquet.Parquet;
-import org.apache.iceberg.spark.SparkSchemaUtil;
-import org.apache.iceberg.spark.data.SparkAvroReader;
-import org.apache.iceberg.spark.data.SparkOrcReader;
-import org.apache.iceberg.spark.data.SparkParquetReaders;
-import org.apache.iceberg.types.TypeUtil;
-import org.apache.iceberg.types.Types;
-import org.apache.iceberg.util.ByteBuffers;
-import org.apache.spark.rdd.InputFileBlockHolder;
-import org.apache.spark.sql.catalyst.InternalRow;
-import org.apache.spark.sql.catalyst.expressions.Attribute;
-import org.apache.spark.sql.catalyst.expressions.AttributeReference;
-import org.apache.spark.sql.catalyst.expressions.GenericInternalRow;
-import org.apache.spark.sql.catalyst.expressions.JoinedRow;
-import org.apache.spark.sql.catalyst.expressions.UnsafeProjection;
-import org.apache.spark.sql.sources.v2.reader.InputPartitionReader;
-import org.apache.spark.sql.types.BinaryType;
-import org.apache.spark.sql.types.DataType;
-import org.apache.spark.sql.types.Decimal;
-import org.apache.spark.sql.types.DecimalType;
-import org.apache.spark.sql.types.StringType;
-import org.apache.spark.sql.types.StructField;
-import org.apache.spark.sql.types.StructType;
-import org.apache.spark.unsafe.types.UTF8String;
-import scala.collection.JavaConverters;
-
-class RowTaskDataReader extends BaseTaskDataReader<InternalRow> implements InputPartitionReader<InternalRow> {
-
-  RowTaskDataReader(
-      CombinedScanTask task, Schema tableSchema, Schema expectedSchema, FileIO fileIo,
-      EncryptionManager encryptionManager, boolean caseSensitive) {
-    super(task, tableSchema, expectedSchema, fileIo, encryptionManager, caseSensitive);
-  }
-
-  @Override
-  public InternalRow get() {
-    return current;
-  }
-
-  @Override
-  Iterator<InternalRow> open(FileScanTask task) {
-    DataFile file = task.file();
-
-    // update the current file for Spark's filename() function
-    InputFileBlockHolder.set(file.path().toString(), task.start(), task.length());
-
-    // schema or rows returned by readers
-    Schema finalSchema = expectedSchema;
-    PartitionSpec spec = task.spec();
-    Set<Integer> idColumns = spec.identitySourceIds();
-
-    // schema needed for the projection and filtering
-    StructType sparkType = SparkSchemaUtil.convert(finalSchema);
-    Schema requiredSchema = SparkSchemaUtil.prune(tableSchema, sparkType, task.residual(), caseSensitive);
-    boolean hasJoinedPartitionColumns = !idColumns.isEmpty();
-    boolean hasExtraFilterColumns = requiredSchema.columns().size() != finalSchema.columns().size();
-
-    Schema iterSchema;
-    Iterator<InternalRow> iter;
-
-    if (hasJoinedPartitionColumns) {
-      // schema used to read data files
-      Schema readSchema = TypeUtil.selectNot(requiredSchema, idColumns);
-      Schema partitionSchema = TypeUtil.select(requiredSchema, idColumns);
-      PartitionRowConverter convertToRow = new PartitionRowConverter(partitionSchema, spec);
-      JoinedRow joined = new JoinedRow();
-
-      InternalRow partition = convertToRow.apply(file.partition());
-      joined.withRight(partition);
-
-      // create joined rows and project from the joined schema to the final schema
-      iterSchema = TypeUtil.join(readSchema, partitionSchema);
-      iter = Iterators.transform(open(task, readSchema), joined::withLeft);
-    } else if (hasExtraFilterColumns) {
-      // add projection to the final schema
-      iterSchema = requiredSchema;
-      iter = open(task, requiredSchema);
-    } else {
-      // return the base iterator
-      iterSchema = finalSchema;
-      iter = open(task, finalSchema);
-    }
-
-    // TODO: remove the projection by reporting the iterator's schema back to Spark
-    return Iterators.transform(
-        iter,
-        APPLY_PROJECTION.bind(projection(finalSchema, iterSchema))::invoke);
-  }
-
-  private Iterator<InternalRow> open(FileScanTask task, Schema readSchema) {
-    CloseableIterable<InternalRow> iter;
-    //TODO: samarth can there be a data task for columnar batch counterpart?
-    if (task.isDataTask()) {
-      iter = newDataIterable(task.asDataTask(), readSchema);
-    } else {
-      InputFile location = inputFiles.get(task.file().path().toString());
-      Preconditions.checkNotNull(location, "Could not find InputFile associated with FileScanTask");
-
-      switch (task.file().format()) {
-        case PARQUET:
-          iter = newParquetIterable(location, task, readSchema);
-          break;
-
-        case AVRO:
-          iter = newAvroIterable(location, task, readSchema);
-          break;
-
-        case ORC:
-          iter = newOrcIterable(location, task, readSchema);
-          break;
-
-        default:
-          throw new UnsupportedOperationException(
-              "Cannot read unknown format: " + task.file().format());
-      }
-    }
-
-    this.currentCloseable = iter;
-
-    return iter.iterator();
-  }
-
-  private CloseableIterable<InternalRow> newAvroIterable(
-      InputFile location,
-      FileScanTask task,
-      Schema readSchema) {
-    return Avro.read(location)
-        .reuseContainers()
-        .project(readSchema)
-        .split(task.start(), task.length())
-        .createReaderFunc(SparkAvroReader::new)
-        .build();
-  }
-
-  private CloseableIterable<InternalRow> newParquetIterable(
-      InputFile location,
-      FileScanTask task,
-      Schema readSchema) {
-    return Parquet.read(location)
-        .project(readSchema)
-        .split(task.start(), task.length())
-        .createReaderFunc(fileSchema -> SparkParquetReaders.buildReader(readSchema, fileSchema))
-        .filter(task.residual())
-        .caseSensitive(caseSensitive)
-        .build();
-  }
-
-  private CloseableIterable<InternalRow> newOrcIterable(
-      InputFile location,
-      FileScanTask task,
-      Schema readSchema) {
-    return ORC.read(location)
-        .schema(readSchema)
-        .split(task.start(), task.length())
-        .createReaderFunc(SparkOrcReader::new)
-        .caseSensitive(caseSensitive)
-        .build();
-  }
-
-  private CloseableIterable<InternalRow> newDataIterable(DataTask task, Schema readSchema) {
-    StructInternalRow row = new StructInternalRow(tableSchema.asStruct());
-    CloseableIterable<InternalRow> asSparkRows = CloseableIterable.transform(
-        task.asDataTask().rows(), row::setStruct);
-    return CloseableIterable.transform(
-        asSparkRows, APPLY_PROJECTION.bind(projection(readSchema, tableSchema))::invoke);
-  }
-
-  private static UnsafeProjection projection(Schema finalSchema, Schema readSchema) {
-    StructType struct = SparkSchemaUtil.convert(readSchema);
-
-    List<AttributeReference> refs = JavaConverters.seqAsJavaListConverter(struct.toAttributes()).asJava();
-    List<Attribute> attrs = Lists.newArrayListWithExpectedSize(struct.fields().length);
-    List<org.apache.spark.sql.catalyst.expressions.Expression> exprs =
-        Lists.newArrayListWithExpectedSize(struct.fields().length);
-
-    for (AttributeReference ref : refs) {
-      attrs.add(ref.toAttribute());
-    }
-
-    for (Types.NestedField field : finalSchema.columns()) {
-      int indexInReadSchema = struct.fieldIndex(field.name());
-      exprs.add(refs.get(indexInReadSchema));
-    }
-
-    return UnsafeProjection.create(
-        JavaConverters.asScalaBufferConverter(exprs).asScala().toSeq(),
-        JavaConverters.asScalaBufferConverter(attrs).asScala().toSeq());
-  }
-
-  private static class PartitionRowConverter implements Function<StructLike, InternalRow> {
-    private final DataType[] types;
-    private final int[] positions;
-    private final Class<?>[] javaTypes;
-    private final GenericInternalRow reusedRow;
-
-    PartitionRowConverter(Schema partitionSchema, PartitionSpec spec) {
-      StructType partitionType = SparkSchemaUtil.convert(partitionSchema);
-      StructField[] fields = partitionType.fields();
-
-      this.types = new DataType[fields.length];
-      this.positions = new int[types.length];
-      this.javaTypes = new Class<?>[types.length];
-      this.reusedRow = new GenericInternalRow(types.length);
-
-      List<PartitionField> partitionFields = spec.fields();
-      for (int rowIndex = 0; rowIndex < fields.length; rowIndex += 1) {
-        this.types[rowIndex] = fields[rowIndex].dataType();
-
-        int sourceId = partitionSchema.columns().get(rowIndex).fieldId();
-        for (int specIndex = 0; specIndex < partitionFields.size(); specIndex += 1) {
-          PartitionField field = spec.fields().get(specIndex);
-          if (field.sourceId() == sourceId && "identity".equals(field.transform().toString())) {
-            positions[rowIndex] = specIndex;
-            javaTypes[rowIndex] = spec.javaClasses()[specIndex];
-            break;
-          }
-        }
-      }
-    }
-
-    @Override
-    public InternalRow apply(StructLike tuple) {
-      for (int i = 0; i < types.length; i += 1) {
-        Object value = tuple.get(positions[i], javaTypes[i]);
-        if (value != null) {
-          reusedRow.update(i, convert(value, types[i]));
-        } else {
-          reusedRow.setNullAt(i);
-        }
-      }
-
-      return reusedRow;
-    }
-
-    /**
-     * Converts the objects into instances used by Spark's InternalRow.
-     *
-     * @param value a data value
-     * @param type the Spark data type
-     * @return the value converted to the representation expected by Spark's InternalRow.
-     */
-    private static Object convert(Object value, DataType type) {
-      if (type instanceof StringType) {
-        return UTF8String.fromString(value.toString());
-      } else if (type instanceof BinaryType) {
-        return ByteBuffers.toByteArray((ByteBuffer) value);
-      } else if (type instanceof DecimalType) {
-        return Decimal.fromDecimal(value);
-      }
-      return value;
-    }
-  }
-}

From 41064aa505d80537e358f19029178d85bb22014e Mon Sep 17 00:00:00 2001
From: samarthjain <samarth@apache.org>
Date: Wed, 6 May 2020 22:51:13 -0700
Subject: [PATCH 06/12] Address code review comments

---
 .../iceberg}/arrow/ArrowAllocation.java       |   2 +-
 ...dDictionaryEncodedParquetValuesReader.java |  43 ++-
 ...ectorizedParquetDefinitionLevelReader.java |  13 +-
 build.gradle                                  |   5 +
 .../data/vectorized/ArrowVectorAccessor.java  |  23 +-
 .../data/vectorized/ArrowVectorAccessors.java | 318 +++++++++---------
 ...hReaders.java => ColumnarBatchReader.java} |  40 +--
 .../vectorized/IcebergArrowColumnVector.java  |  14 +-
 .../VectorizedSparkParquetReaders.java        |  63 ++--
 .../iceberg/spark/source/BatchDataReader.java |   2 +-
 .../apache/iceberg/spark/source/Reader.java   |  30 +-
 .../iceberg/spark/data/AvroDataTest.java      |  16 +-
 .../apache/iceberg/spark/data/RandomData.java |  75 ++++-
 ...DictionaryEncodingForVectorizedReader.java |  18 +-
 .../TestSparkParquetVectorizedReader.java     |  21 +-
 15 files changed, 366 insertions(+), 317 deletions(-)
 rename {spark/src/main/java/org/apache/iceberg/spark => arrow/src/main/java/org/apache/iceberg}/arrow/ArrowAllocation.java (96%)
 rename spark/src/main/java/org/apache/iceberg/spark/data/vectorized/{ColumnarBatchReaders.java => ColumnarBatchReader.java} (68%)

diff --git a/spark/src/main/java/org/apache/iceberg/spark/arrow/ArrowAllocation.java b/arrow/src/main/java/org/apache/iceberg/arrow/ArrowAllocation.java
similarity index 96%
rename from spark/src/main/java/org/apache/iceberg/spark/arrow/ArrowAllocation.java
rename to arrow/src/main/java/org/apache/iceberg/arrow/ArrowAllocation.java
index c1a38a7b7f11..49882ce90690 100644
--- a/spark/src/main/java/org/apache/iceberg/spark/arrow/ArrowAllocation.java
+++ b/arrow/src/main/java/org/apache/iceberg/arrow/ArrowAllocation.java
@@ -17,7 +17,7 @@
  * under the License.
  */
 
-package org.apache.iceberg.spark.arrow;
+package org.apache.iceberg.arrow;
 
 import org.apache.arrow.memory.RootAllocator;
 
diff --git a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedDictionaryEncodedParquetValuesReader.java b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedDictionaryEncodedParquetValuesReader.java
index 43d6a50e5968..dab8e4c853ca 100644
--- a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedDictionaryEncodedParquetValuesReader.java
+++ b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedDictionaryEncodedParquetValuesReader.java
@@ -19,7 +19,6 @@
 
 package org.apache.iceberg.arrow.vectorized.parquet;
 
-import io.netty.buffer.ArrowBuf;
 import java.nio.ByteBuffer;
 import org.apache.arrow.vector.BaseVariableWidthVector;
 import org.apache.arrow.vector.BitVectorHelper;
@@ -110,9 +109,9 @@ void readBatchOfDictionaryEncodedLongs(FieldVector vector, int startOffset, int
     }
   }
 
-  void readBatchOfDictionaryEncodedTimestampMillis(FieldVector vector, int startOffset, int numValuesToRead,
-                                                   Dictionary dict, NullabilityHolder nullabilityHolder,
-      int typeWidth) {
+  void readBatchOfDictionaryEncodedTimestampMillis(
+      FieldVector vector, int startOffset, int numValuesToRead,
+      Dictionary dict, NullabilityHolder nullabilityHolder, int typeWidth) {
     int left = numValuesToRead;
     int idx = startOffset;
     while (left > 0) {
@@ -159,11 +158,10 @@ void readBatchOfDictionaryEncodedIntegers(FieldVector vector, int startOffset, i
         this.readNextGroup();
       }
       int num = Math.min(left, this.currentCount);
-      ArrowBuf dataBuffer = vector.getDataBuffer();
       switch (mode) {
         case RLE:
           for (int i = 0; i < num; i++) {
-            dataBuffer.setInt(idx * typeWidth, dict.decodeToInt(currentValue));
+            vector.getDataBuffer().setInt(idx * typeWidth, dict.decodeToInt(currentValue));
             if (setArrowValidityVector) {
               BitVectorHelper.setValidityBitToOne(vector.getValidityBuffer(), idx);
             } else {
@@ -174,7 +172,8 @@ void readBatchOfDictionaryEncodedIntegers(FieldVector vector, int startOffset, i
           break;
         case PACKED:
           for (int i = 0; i < num; i++) {
-            dataBuffer.setInt(idx * typeWidth, dict.decodeToInt(packedValuesBuffer[packedValuesBufferIdx++]));
+            vector.getDataBuffer()
+                .setInt(idx * typeWidth, dict.decodeToInt(packedValuesBuffer[packedValuesBufferIdx++]));
             if (setArrowValidityVector) {
               BitVectorHelper.setValidityBitToOne(vector.getValidityBuffer(), idx);
             } else {
@@ -282,27 +281,14 @@ void readBatchOfDictionaryEncodedFixedWidthBinary(FieldVector vector, int typeWi
         case RLE:
           for (int i = 0; i < num; i++) {
             ByteBuffer buffer = dict.decodeToBinary(currentValue).toByteBuffer();
-            vector.getDataBuffer().setBytes(idx * typeWidth, buffer.array(),
-                buffer.position() + buffer.arrayOffset(), buffer.limit() - buffer.position());
-            if (setArrowValidityVector) {
-              BitVectorHelper.setValidityBitToOne(vector.getValidityBuffer(), idx);
-            } else {
-              nullabilityHolder.setNotNull(idx);
-            }
+            setFixedWidthBinary(vector, typeWidth, nullabilityHolder, idx, buffer);
             idx++;
           }
           break;
         case PACKED:
           for (int i = 0; i < num; i++) {
             ByteBuffer buffer = dict.decodeToBinary(packedValuesBuffer[packedValuesBufferIdx++]).toByteBuffer();
-            vector.getDataBuffer()
-                .setBytes(idx * typeWidth, buffer.array(),
-                    buffer.position() + buffer.arrayOffset(), buffer.limit() - buffer.position());
-            if (setArrowValidityVector) {
-              BitVectorHelper.setValidityBitToOne(vector.getValidityBuffer(), idx);
-            } else {
-              nullabilityHolder.setNotNull(idx);
-            }
+            setFixedWidthBinary(vector, typeWidth, nullabilityHolder, idx, buffer);
             idx++;
           }
           break;
@@ -312,6 +298,19 @@ void readBatchOfDictionaryEncodedFixedWidthBinary(FieldVector vector, int typeWi
     }
   }
 
+  private void setFixedWidthBinary(
+      FieldVector vector, int typeWidth, NullabilityHolder nullabilityHolder,
+      int idx, ByteBuffer buffer) {
+    vector.getDataBuffer()
+        .setBytes(idx * typeWidth, buffer.array(),
+            buffer.position() + buffer.arrayOffset(), buffer.limit() - buffer.position());
+    if (setArrowValidityVector) {
+      BitVectorHelper.setValidityBitToOne(vector.getValidityBuffer(), idx);
+    } else {
+      nullabilityHolder.setNotNull(idx);
+    }
+  }
+
   void readBatchOfDictionaryEncodedFixedLengthDecimals(FieldVector vector, int typeWidth, int startOffset,
                                                        int numValuesToRead, Dictionary dict,
                                                        NullabilityHolder nullabilityHolder) {
diff --git a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedParquetDefinitionLevelReader.java b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedParquetDefinitionLevelReader.java
index ae9879e56ffc..bbaaaa38ba7f 100644
--- a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedParquetDefinitionLevelReader.java
+++ b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedParquetDefinitionLevelReader.java
@@ -63,7 +63,7 @@ public void readBatchOfDictionaryIds(
         case PACKED:
           for (int i = 0; i < numValues; i++) {
             if (packedValuesBuffer[packedValuesBufferIdx++] == maxDefLevel) {
-              vector.set(idx, dictionaryEncodedValuesReader.readInteger());
+              vector.getDataBuffer().setInt(idx * IntVector.TYPE_WIDTH, dictionaryEncodedValuesReader.readInteger());
               if (setArrowValidityVector) {
                 BitVectorHelper.setValidityBitToOne(vector.getValidityBuffer(), idx);
               } else {
@@ -202,7 +202,8 @@ public void readBatchOfDictionaryEncodedLongs(
         case PACKED:
           for (int i = 0; i < numValues; i++) {
             if (packedValuesBuffer[packedValuesBufferIdx++] == maxDefLevel) {
-              vector.getDataBuffer().setLong(idx, dict.decodeToLong(dictionaryEncodedValuesReader.readInteger()));
+              vector.getDataBuffer().setLong(idx * typeWidth,
+                  dict.decodeToLong(dictionaryEncodedValuesReader.readInteger()));
               if (setArrowValidityVector) {
                 BitVectorHelper.setValidityBitToOne(vector.getValidityBuffer(), idx);
               } else {
@@ -249,7 +250,7 @@ public void readBatchOfDictionaryEncodedTimestampMillis(
         case PACKED:
           for (int i = 0; i < numValues; i++) {
             if (packedValuesBuffer[packedValuesBufferIdx++] == maxDefLevel) {
-              vector.getDataBuffer().setLong(idx,
+              vector.getDataBuffer().setLong(idx * typeWidth,
                   dict.decodeToLong(dictionaryEncodedValuesReader.readInteger()) * 1000);
               if (setArrowValidityVector) {
                 BitVectorHelper.setValidityBitToOne(vector.getValidityBuffer(), idx);
@@ -428,7 +429,8 @@ public void readBatchOfDictionaryEncodedFloats(
         case PACKED:
           for (int i = 0; i < num; i++) {
             if (packedValuesBuffer[packedValuesBufferIdx++] == maxDefLevel) {
-              vector.getDataBuffer().setFloat(idx, dict.decodeToFloat(dictionaryEncodedValuesReader.readInteger()));
+              vector.getDataBuffer()
+                  .setFloat(idx * typeWidth, dict.decodeToFloat(dictionaryEncodedValuesReader.readInteger()));
               if (setArrowValidityVector) {
                 BitVectorHelper.setValidityBitToOne(vector.getValidityBuffer(), idx);
               } else {
@@ -517,7 +519,8 @@ public void readBatchOfDictionaryEncodedDoubles(
         case PACKED:
           for (int i = 0; i < num; i++) {
             if (packedValuesBuffer[packedValuesBufferIdx++] == maxDefLevel) {
-              vector.getDataBuffer().setDouble(idx, dict.decodeToDouble(dictionaryEncodedValuesReader.readInteger()));
+              vector.getDataBuffer()
+                  .setDouble(idx * typeWidth, dict.decodeToDouble(dictionaryEncodedValuesReader.readInteger()));
               if (setArrowValidityVector) {
                 BitVectorHelper.setValidityBitToOne(vector.getValidityBuffer(), idx);
               } else {
diff --git a/build.gradle b/build.gradle
index ae33b5f130f1..b7d8aec9f573 100644
--- a/build.gradle
+++ b/build.gradle
@@ -456,6 +456,11 @@ project(':iceberg-spark3') {
     testCompile project(path: ':iceberg-hive', configuration: 'testArtifacts')
     testCompile project(path: ':iceberg-api', configuration: 'testArtifacts')
   }
+  test {
+    // For vectorized reads: let Arrow skip bounds checks and null checks on get() calls
+    systemProperty("arrow.enable_unsafe_memory_access", "true")
+    systemProperty("arrow.enable_null_check_for_get", "false")
+  }
 }
 
 project(':iceberg-pig') {
diff --git a/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ArrowVectorAccessor.java b/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ArrowVectorAccessor.java
index 15af1e53f35e..244856a34270 100644
--- a/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ArrowVectorAccessor.java
+++ b/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ArrowVectorAccessor.java
@@ -29,19 +29,26 @@
 abstract class ArrowVectorAccessor {
 
   private final ValueVector vector;
-  ArrowColumnVector[] childColumns;
+  private final ArrowColumnVector[] childColumns;
 
   ArrowVectorAccessor(ValueVector vector) {
     this.vector = vector;
+    this.childColumns = null;
+  }
+
+  ArrowVectorAccessor(ValueVector vector, ArrowColumnVector[] children) {
+    this.vector = vector;
+    this.childColumns = children;
   }
 
   final void close() {
-    vector.close();
     if (childColumns != null) {
-      for (int i = 0; i < childColumns.length; i++) {
-        childColumns[i].close();
+      for (ArrowColumnVector column : childColumns) {
+        // Closing an ArrowColumnVector is expected to not throw any exception
+        column.close();
       }
     }
+    vector.close();
   }
 
   boolean getBoolean(int rowId) {
@@ -80,7 +87,18 @@ ColumnarArray getArray(int rowId) {
     throw new UnsupportedOperationException("Unsupported type: array");
   }
 
-  ArrowColumnVector[] childColumns() {
-    return childColumns;
+  /**
+   * Returns the child column at {@code pos}; fails like the other unsupported getters when there are no children.
+   */
+  ArrowColumnVector childColumn(int pos) {
+    if (childColumns == null) {
+      throw new UnsupportedOperationException("Unsupported type: struct");
+    }
+    return childColumns[pos];
+  }
+
+  /** Returns the Arrow vector wrapped by this accessor. */
+  ValueVector getVector() {
+    return vector;
   }
 }
diff --git a/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ArrowVectorAccessors.java b/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ArrowVectorAccessors.java
index ba1d3eb959e0..5a215b2cb0b7 100644
--- a/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ArrowVectorAccessors.java
+++ b/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ArrowVectorAccessors.java
@@ -21,11 +21,11 @@
 
 import io.netty.buffer.ArrowBuf;
 import java.math.BigInteger;
+import java.util.stream.IntStream;
 import org.apache.arrow.vector.BigIntVector;
 import org.apache.arrow.vector.BitVector;
 import org.apache.arrow.vector.DateDayVector;
 import org.apache.arrow.vector.FieldVector;
-import org.apache.arrow.vector.FixedSizeBinaryVector;
 import org.apache.arrow.vector.Float4Vector;
 import org.apache.arrow.vector.Float8Vector;
 import org.apache.arrow.vector.IntVector;
@@ -39,18 +39,17 @@
 import org.apache.parquet.Preconditions;
 import org.apache.parquet.column.ColumnDescriptor;
 import org.apache.parquet.column.Dictionary;
-import org.apache.parquet.io.api.Binary;
 import org.apache.parquet.schema.PrimitiveType;
 import org.apache.spark.sql.types.Decimal;
 import org.apache.spark.sql.vectorized.ArrowColumnVector;
 import org.apache.spark.sql.vectorized.ColumnarArray;
 import org.apache.spark.unsafe.types.UTF8String;
+import org.jetbrains.annotations.NotNull;
 
 public class ArrowVectorAccessors {
 
   private ArrowVectorAccessors() {}
 
-  @SuppressWarnings("checkstyle:CyclomaticComplexity")
   static ArrowVectorAccessor getVectorAccessor(VectorHolder holder) {
     Dictionary dictionary = holder.dictionary();
     boolean isVectorDictEncoded = holder.isDictionaryEncoded();
@@ -58,86 +57,100 @@ static ArrowVectorAccessor getVectorAccessor(VectorHolder holder) {
     FieldVector vector = holder.vector();
     PrimitiveType primitive = desc.getPrimitiveType();
     if (isVectorDictEncoded) {
-      Preconditions.checkState(vector instanceof IntVector, "Dictionary ids should be stored in IntVectors only");
-      if (primitive.getOriginalType() != null) {
-        switch (desc.getPrimitiveType().getOriginalType()) {
-          case ENUM:
-          case JSON:
-          case UTF8:
-          case BSON:
-            return new DictionaryStringAccessor((IntVector) vector, dictionary);
-          case INT_64:
-          case TIMESTAMP_MILLIS:
-          case TIMESTAMP_MICROS:
-            return new DictionaryLongAccessor((IntVector) vector, dictionary);
-          case DECIMAL:
-            switch (primitive.getPrimitiveTypeName()) {
-              case BINARY:
-              case FIXED_LEN_BYTE_ARRAY:
-                return new DictionaryDecimalBinaryAccessor(
-                    (IntVector) vector,
-                    dictionary);
-              case INT64:
-                return new DictionaryDecimalLongAccessor(
-                    (IntVector) vector,
-                    dictionary);
-              case INT32:
-                return new DictionaryDecimalIntAccessor(
-                    (IntVector) vector,
-                    dictionary);
-              default:
-                throw new UnsupportedOperationException(
-                    "Unsupported base type for decimal: " + primitive.getPrimitiveTypeName());
-            }
-          default:
-            throw new UnsupportedOperationException(
-                "Unsupported logical type: " + primitive.getOriginalType());
-        }
-      } else {
-        switch (primitive.getPrimitiveTypeName()) {
-          case FIXED_LEN_BYTE_ARRAY:
-          case BINARY:
-            return new DictionaryBinaryAccessor((IntVector) vector, dictionary);
-          case FLOAT:
-            return new DictionaryFloatAccessor((IntVector) vector, dictionary);
-          case INT64:
-            return new DictionaryLongAccessor((IntVector) vector, dictionary);
-          case DOUBLE:
-            return new DictionaryDoubleAccessor((IntVector) vector, dictionary);
-          default:
-            throw new UnsupportedOperationException("Unsupported type: " + primitive);
-        }
+      return getDictionaryVectorAccessor(dictionary, desc, vector, primitive);
+    } else {
+      return getPlainVectorAccessor(vector);
+    }
+  }
+
+  @NotNull
+  private static ArrowVectorAccessor getDictionaryVectorAccessor(
+      Dictionary dictionary,
+      ColumnDescriptor desc,
+      FieldVector vector, PrimitiveType primitive) {
+    Preconditions.checkState(vector instanceof IntVector, "Dictionary ids should be stored in IntVectors only");
+    if (primitive.getOriginalType() != null) {
+      switch (desc.getPrimitiveType().getOriginalType()) {
+        case ENUM:
+        case JSON:
+        case UTF8:
+        case BSON:
+          return new DictionaryStringAccessor((IntVector) vector, dictionary);
+        case INT_64:
+        case TIMESTAMP_MILLIS:
+        case TIMESTAMP_MICROS:
+          return new DictionaryLongAccessor((IntVector) vector, dictionary);
+        case DECIMAL:
+          switch (primitive.getPrimitiveTypeName()) {
+            case BINARY:
+            case FIXED_LEN_BYTE_ARRAY:
+              return new DictionaryDecimalBinaryAccessor(
+                  (IntVector) vector,
+                  dictionary);
+            case INT64:
+              return new DictionaryDecimalLongAccessor(
+                  (IntVector) vector,
+                  dictionary);
+            case INT32:
+              return new DictionaryDecimalIntAccessor(
+                  (IntVector) vector,
+                  dictionary);
+            default:
+              throw new UnsupportedOperationException(
+                  "Unsupported base type for decimal: " + primitive.getPrimitiveTypeName());
+          }
+        default:
+          throw new UnsupportedOperationException(
+              "Unsupported logical type: " + primitive.getOriginalType());
       }
     } else {
-      if (vector instanceof BitVector) {
-        return new BooleanAccessor((BitVector) vector);
-      } else if (vector instanceof IntVector) {
-        return new IntAccessor((IntVector) vector);
-      } else if (vector instanceof BigIntVector) {
-        return new LongAccessor((BigIntVector) vector);
-      } else if (vector instanceof Float4Vector) {
-        return new FloatAccessor((Float4Vector) vector);
-      } else if (vector instanceof Float8Vector) {
-        return new DoubleAccessor((Float8Vector) vector);
-      } else if (vector instanceof IcebergArrowVectors.DecimalArrowVector) {
-        return new DecimalAccessor((IcebergArrowVectors.DecimalArrowVector) vector);
-      } else if (vector instanceof IcebergArrowVectors.VarcharArrowVector) {
-        return new StringAccessor((IcebergArrowVectors.VarcharArrowVector) vector);
-      } else if (vector instanceof IcebergArrowVectors.VarBinaryArrowVector) {
-        return new BinaryAccessor((IcebergArrowVectors.VarBinaryArrowVector) vector);
-      } else if (vector instanceof DateDayVector) {
-        return new DateAccessor((DateDayVector) vector);
-      } else if (vector instanceof TimeStampMicroTZVector) {
-        return new TimestampAccessor((TimeStampMicroTZVector) vector);
-      } else if (vector instanceof ListVector) {
-        ListVector listVector = (ListVector) vector;
-        return new ArrayAccessor(listVector);
-      } else if (vector instanceof StructVector) {
-        StructVector structVector = (StructVector) vector;
-        return new StructAccessor(structVector);
+      switch (primitive.getPrimitiveTypeName()) {
+        case FIXED_LEN_BYTE_ARRAY:
+        case BINARY:
+          return new DictionaryBinaryAccessor((IntVector) vector, dictionary);
+        case FLOAT:
+          return new DictionaryFloatAccessor((IntVector) vector, dictionary);
+        case INT64:
+          return new DictionaryLongAccessor((IntVector) vector, dictionary);
+        case DOUBLE:
+          return new DictionaryDoubleAccessor((IntVector) vector, dictionary);
+        default:
+          throw new UnsupportedOperationException("Unsupported type: " + primitive);
       }
     }
-    throw new UnsupportedOperationException("Unsupported type: " + primitive);
+  }
+
+  @NotNull
+  @SuppressWarnings("checkstyle:CyclomaticComplexity")
+  private static ArrowVectorAccessor getPlainVectorAccessor(FieldVector vector) {
+    if (vector instanceof BitVector) {
+      return new BooleanAccessor((BitVector) vector);
+    } else if (vector instanceof IntVector) {
+      return new IntAccessor((IntVector) vector);
+    } else if (vector instanceof BigIntVector) {
+      return new LongAccessor((BigIntVector) vector);
+    } else if (vector instanceof Float4Vector) {
+      return new FloatAccessor((Float4Vector) vector);
+    } else if (vector instanceof Float8Vector) {
+      return new DoubleAccessor((Float8Vector) vector);
+    } else if (vector instanceof IcebergArrowVectors.DecimalArrowVector) {
+      return new DecimalAccessor((IcebergArrowVectors.DecimalArrowVector) vector);
+    } else if (vector instanceof IcebergArrowVectors.VarcharArrowVector) {
+      return new StringAccessor((IcebergArrowVectors.VarcharArrowVector) vector);
+    } else if (vector instanceof IcebergArrowVectors.VarBinaryArrowVector) {
+      return new BinaryAccessor((IcebergArrowVectors.VarBinaryArrowVector) vector);
+    } else if (vector instanceof DateDayVector) {
+      return new DateAccessor((DateDayVector) vector);
+    } else if (vector instanceof TimeStampMicroTZVector) {
+      return new TimestampAccessor((TimeStampMicroTZVector) vector);
+    } else if (vector instanceof ListVector) {
+      ListVector listVector = (ListVector) vector;
+      return new ArrayAccessor(listVector);
+    } else if (vector instanceof StructVector) {
+      StructVector structVector = (StructVector) vector;
+      return new StructAccessor(structVector);
+    }
+    throw new UnsupportedOperationException("Unsupported vector: " + vector.getClass());
   }
 
   private static class BooleanAccessor extends ArrowVectorAccessor {
@@ -185,18 +198,20 @@ final long getLong(int rowId) {
     }
   }
 
-  private static class DictionaryLongAccessor extends DictionaryArrowVectorAccessor {
+  private static class DictionaryLongAccessor extends ArrowVectorAccessor {
 
-    private final IntVector vector;
+    private final Dictionary parquetDictionary;
+    private final IntVector offsetVector;
 
     DictionaryLongAccessor(IntVector vector, Dictionary dictionary) {
-      super(vector, dictionary);
-      this.vector = vector;
+      super(vector);
+      this.offsetVector = vector;
+      this.parquetDictionary = dictionary;
     }
 
     @Override
     final long getLong(int rowId) {
-      return parquetDictionary.decodeToLong(vector.get(rowId));
+      return parquetDictionary.decodeToLong(offsetVector.get(rowId));
     }
   }
 
@@ -215,18 +230,20 @@ final float getFloat(int rowId) {
     }
   }
 
-  private static class DictionaryFloatAccessor extends DictionaryArrowVectorAccessor {
+  private static class DictionaryFloatAccessor extends ArrowVectorAccessor {
 
-    private final IntVector vector;
+    private final IntVector offsetVector;
+    private final Dictionary parquetDictionary;
 
     DictionaryFloatAccessor(IntVector vector, Dictionary dictionary) {
-      super(vector, dictionary);
-      this.vector = vector;
+      super(vector);
+      this.parquetDictionary = dictionary;
+      this.offsetVector = vector;
     }
 
     @Override
     final float getFloat(int rowId) {
-      return parquetDictionary.decodeToFloat(vector.get(rowId));
+      return parquetDictionary.decodeToFloat(offsetVector.get(rowId));
     }
   }
 
@@ -245,33 +262,21 @@ final double getDouble(int rowId) {
     }
   }
 
-  private static class DictionaryDoubleAccessor extends DictionaryArrowVectorAccessor {
-
+  private static class DictionaryDoubleAccessor extends ArrowVectorAccessor {
     private final IntVector vector;
+    private final double[] decodedDictionary;
 
     DictionaryDoubleAccessor(IntVector vector, Dictionary dictionary) {
-      super(vector, dictionary);
-      this.vector = vector;
-    }
-
-    @Override
-    final double getDouble(int rowId) {
-      return parquetDictionary.decodeToDouble(vector.get(rowId));
-    }
-  }
-
-  private static class DecimalAccessor extends ArrowVectorAccessor {
-
-    private final IcebergArrowVectors.DecimalArrowVector vector;
-
-    DecimalAccessor(IcebergArrowVectors.DecimalArrowVector vector) {
       super(vector);
       this.vector = vector;
+      this.decodedDictionary = IntStream.rangeClosed(0, dictionary.getMaxId())
+          .mapToDouble(dictionary::decodeToDouble)
+          .toArray();
     }
 
     @Override
-    final Decimal getDecimal(int rowId, int precision, int scale) {
-      return Decimal.apply(vector.getObject(rowId), precision, scale);
+    final double getDouble(int rowId) {
+      return decodedDictionary[vector.get(rowId)];
     }
   }
 
@@ -299,43 +304,23 @@ final UTF8String getUTF8String(int rowId) {
     }
   }
 
-  @SuppressWarnings("checkstyle:VisibilityModifier")
-  private abstract static class DictionaryArrowVectorAccessor extends ArrowVectorAccessor {
-    final Dictionary parquetDictionary;
-    final IntVector dictionaryVector;
-
-    private DictionaryArrowVectorAccessor(IntVector vector, Dictionary dictionary) {
-      super(vector);
-      this.dictionaryVector = vector;
-      this.parquetDictionary = dictionary;
-    }
-  }
-
-  private static class DictionaryStringAccessor extends DictionaryArrowVectorAccessor {
+  private static class DictionaryStringAccessor extends ArrowVectorAccessor {
+    private final UTF8String[] decodedDictionary;
+    private final IntVector offsetVector;
 
     DictionaryStringAccessor(IntVector vector, Dictionary dictionary) {
-      super(vector, dictionary);
-    }
-
-    @Override
-    final UTF8String getUTF8String(int rowId) {
-      Binary binary = parquetDictionary.decodeToBinary(dictionaryVector.get(rowId));
-      return UTF8String.fromBytes(binary.getBytesUnsafe());
-    }
-  }
-
-  private static class FixedSizeBinaryAccessor extends ArrowVectorAccessor {
-
-    private final FixedSizeBinaryVector vector;
-
-    FixedSizeBinaryAccessor(FixedSizeBinaryVector vector) {
       super(vector);
-      this.vector = vector;
+      this.offsetVector = vector;
+      this.decodedDictionary = IntStream.rangeClosed(0, dictionary.getMaxId())
+          .mapToObj(dictionary::decodeToBinary)
+          .map(binary -> UTF8String.fromBytes(binary.getBytes()))
+          .toArray(UTF8String[]::new);
     }
 
     @Override
-    final byte[] getBinary(int rowId) {
-      return vector.getObject(rowId);
+    final UTF8String getUTF8String(int rowId) {
+      int offset = offsetVector.get(rowId);
+      return decodedDictionary[offset];
     }
   }
 
@@ -354,16 +339,23 @@ final byte[] getBinary(int rowId) {
     }
   }
 
-  private static class DictionaryBinaryAccessor extends DictionaryArrowVectorAccessor {
+  private static class DictionaryBinaryAccessor extends ArrowVectorAccessor {
+    private final IntVector offsetVector;
+    private final byte[][] decodedDictionary;
 
     DictionaryBinaryAccessor(IntVector vector, Dictionary dictionary) {
-      super(vector, dictionary);
+      super(vector);
+      this.offsetVector = vector;
+      this.decodedDictionary = IntStream.rangeClosed(0, dictionary.getMaxId())
+          .mapToObj(dictionary::decodeToBinary)
+          .map(binary -> binary.getBytes())
+          .toArray(byte[][]::new);
     }
 
     @Override
     final byte[] getBinary(int rowId) {
-      Binary binary = parquetDictionary.decodeToBinary(dictionaryVector.get(rowId));
-      return binary.getBytesUnsafe();
+      int offset = offsetVector.get(rowId);
+      return decodedDictionary[offset];
     }
   }
 
@@ -424,20 +416,38 @@ final ColumnarArray getArray(int rowId) {
    */
   private static class StructAccessor extends ArrowVectorAccessor {
     StructAccessor(StructVector structVector) {
-      super(structVector);
-      childColumns = new ArrowColumnVector[structVector.size()];
-      for (int i = 0; i < childColumns.length; ++i) {
-        childColumns[i] = new ArrowColumnVector(structVector.getVectorById(i));
-      }
+      super(structVector, IntStream.range(0, structVector.size())
+          .mapToObj(structVector::getVectorById)
+          .map(ArrowColumnVector::new)
+          .toArray(ArrowColumnVector[]::new));
+    }
+  }
+
+  private static class DecimalAccessor extends ArrowVectorAccessor {
+
+    private final IcebergArrowVectors.DecimalArrowVector vector;
+
+    DecimalAccessor(IcebergArrowVectors.DecimalArrowVector vector) {
+      super(vector);
+      this.vector = vector;
+    }
+
+    @Override
+    final Decimal getDecimal(int rowId, int precision, int scale) {
+      return Decimal.apply(vector.getObject(rowId), precision, scale);
     }
   }
 
   @SuppressWarnings("checkstyle:VisibilityModifier")
-  private abstract static class DictionaryDecimalAccessor extends DictionaryArrowVectorAccessor {
+  private abstract static class DictionaryDecimalAccessor extends ArrowVectorAccessor {
     final Decimal[] cache;
+    final Dictionary parquetDictionary;
+    final IntVector offsetVector;
 
     private DictionaryDecimalAccessor(IntVector vector, Dictionary dictionary) {
-      super(vector, dictionary);
+      super(vector);
+      this.offsetVector = vector;
+      this.parquetDictionary = dictionary;
       this.cache = new Decimal[dictionary.getMaxId() + 1];
     }
   }
@@ -450,10 +460,10 @@ private static class DictionaryDecimalBinaryAccessor extends DictionaryDecimalAc
 
     @Override
     final Decimal getDecimal(int rowId, int precision, int scale) {
-      int dictId = dictionaryVector.get(rowId);
+      int dictId = offsetVector.get(rowId);
       if (cache[dictId] == null) {
         cache[dictId] = Decimal.apply(
-            new BigInteger(parquetDictionary.decodeToBinary(dictId).getBytesUnsafe()).longValue(),
+            new BigInteger(parquetDictionary.decodeToBinary(dictId).getBytes()).longValue(),
             precision,
             scale);
       }
@@ -469,7 +479,7 @@ private static class DictionaryDecimalLongAccessor extends DictionaryDecimalAcce
 
     @Override
     final Decimal getDecimal(int rowId, int precision, int scale) {
-      int dictId = dictionaryVector.get(rowId);
+      int dictId = offsetVector.get(rowId);
       if (cache[dictId] == null) {
         cache[dictId] = Decimal.apply(parquetDictionary.decodeToLong(dictId), precision, scale);
       }
@@ -485,7 +495,7 @@ private static class DictionaryDecimalIntAccessor extends DictionaryDecimalAcces
 
     @Override
     final Decimal getDecimal(int rowId, int precision, int scale) {
-      int dictId = dictionaryVector.get(rowId);
+      int dictId = offsetVector.get(rowId);
       if (cache[dictId] == null) {
         cache[dictId] = Decimal.apply(parquetDictionary.decodeToInt(dictId), precision, scale);
       }
diff --git a/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ColumnarBatchReaders.java b/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ColumnarBatchReader.java
similarity index 68%
rename from spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ColumnarBatchReaders.java
rename to spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ColumnarBatchReader.java
index 81545a7b51ae..dd6d4096dc44 100644
--- a/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ColumnarBatchReaders.java
+++ b/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ColumnarBatchReader.java
@@ -19,7 +19,6 @@
 
 package org.apache.iceberg.spark.data.vectorized;
 
-import java.lang.reflect.Array;
 import java.util.List;
 import java.util.Map;
 import org.apache.iceberg.arrow.vectorized.VectorHolder;
@@ -37,54 +36,42 @@
  * {@link ColumnarBatch} returned is created by passing in the Arrow vectors populated via delegated read calls to
  * {@linkplain VectorizedArrowReader VectorReader(s)}.
  */
-public class ColumnarBatchReaders implements VectorizedReader<ColumnarBatch> {
+public class ColumnarBatchReader implements VectorizedReader<ColumnarBatch> {
   private final VectorizedArrowReader[] readers;
 
-  public ColumnarBatchReaders(List<VectorizedReader> readers) {
-    this.readers = (VectorizedArrowReader[]) Array.newInstance(
-        VectorizedArrowReader.class, readers.size());
-    int idx = 0;
-    for (VectorizedReader reader : readers) {
-      this.readers[idx] = (VectorizedArrowReader) reader;
-      idx++;
-    }
+  public ColumnarBatchReader(List<VectorizedReader<?>> readers) {
+    this.readers = readers.stream()
+        .map(VectorizedArrowReader.class::cast)
+        .toArray(VectorizedArrowReader[]::new);
   }
 
   @Override
   public final void setRowGroupInfo(PageReadStore pageStore, Map<ColumnPath, ColumnChunkMetaData> metaData) {
-    for (int i = 0; i < readers.length; i += 1) {
-      if (readers[i] != null) {
-        readers[i].setRowGroupInfo(pageStore, metaData);
+    for (VectorizedArrowReader reader : readers) {
+      if (reader != null) {
+        reader.setRowGroupInfo(pageStore, metaData);
       }
     }
   }
 
   @Override
   public void reuseContainers(boolean reuse) {
-    for (VectorizedReader reader : readers) {
+    for (VectorizedReader<?> reader : readers) {
       reader.reuseContainers(reuse);
     }
   }
 
   @Override
   public final ColumnarBatch read(int numRowsToRead) {
-    Preconditions.checkArgument(numRowsToRead > 0, "Invalid value: " + numRowsToRead);
+    Preconditions.checkArgument(numRowsToRead > 0, "Invalid number of rows to read: %s", numRowsToRead);
     ColumnVector[] arrowColumnVectors = new ColumnVector[readers.length];
-    int prevNum = 0;
     for (int i = 0; i < readers.length; i += 1) {
       VectorHolder holder = readers[i].read(numRowsToRead);
       int numRowsInVector = holder.numValues();
       Preconditions.checkState(
           numRowsInVector == numRowsToRead,
-          "Number of rows in the vector " + numRowsInVector + " didn't match expected " +
-              numRowsToRead);
-      if (prevNum > 0) {
-        // assert that all the vectors in the batch have the same number of rows
-        Preconditions.checkState(numRowsInVector == prevNum, "Number of rows in arrow vectors didn't match " +
-            "for " + readers[i - 1] + " and " + readers[i]);
-      } else {
-        prevNum = numRowsInVector;
-      }
+          "Number of rows in the vector %s didn't match expected %s ", numRowsInVector,
+          numRowsToRead);
       arrowColumnVectors[i] = IcebergArrowColumnVector.forHolder(holder, numRowsInVector);
     }
     ColumnarBatch batch = new ColumnarBatch(arrowColumnVectors);
@@ -94,8 +81,9 @@ public final ColumnarBatch read(int numRowsToRead) {
 
   @Override
   public void close() {
-    for (VectorizedReader reader : readers) {
+    for (VectorizedReader<?> reader : readers) {
       reader.close();
     }
   }
+
 }
diff --git a/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/IcebergArrowColumnVector.java b/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/IcebergArrowColumnVector.java
index 5f18e9d36792..92f7c8ed8c92 100644
--- a/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/IcebergArrowColumnVector.java
+++ b/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/IcebergArrowColumnVector.java
@@ -19,10 +19,10 @@
 
 package org.apache.iceberg.spark.data.vectorized;
 
+import org.apache.arrow.vector.NullCheckingForGet;
 import org.apache.iceberg.arrow.vectorized.NullabilityHolder;
 import org.apache.iceberg.arrow.vectorized.VectorHolder;
 import org.apache.iceberg.spark.SparkSchemaUtil;
-import org.apache.parquet.Preconditions;
 import org.apache.spark.sql.types.Decimal;
 import org.apache.spark.sql.vectorized.ArrowColumnVector;
 import org.apache.spark.sql.vectorized.ColumnVector;
@@ -39,6 +39,7 @@ public class IcebergArrowColumnVector extends ColumnVector {
 
   private final ArrowVectorAccessor accessor;
   private final NullabilityHolder nullabilityHolder;
+  private static final boolean USE_VECTOR_VALIDITY_BUFFER = NullCheckingForGet.NULL_CHECKING_ENABLED;
 
   public IcebergArrowColumnVector(VectorHolder holder) {
     super(SparkSchemaUtil.convert(holder.icebergType()));
@@ -53,17 +54,17 @@ public void close() {
 
   @Override
   public boolean hasNull() {
-    return nullabilityHolder.hasNulls();
+    return USE_VECTOR_VALIDITY_BUFFER ? accessor.getVector().getNullCount() > 0 : nullabilityHolder.hasNulls();
   }
 
   @Override
   public int numNulls() {
-    return nullabilityHolder.numNulls();
+    return USE_VECTOR_VALIDITY_BUFFER ? accessor.getVector().getNullCount() : nullabilityHolder.numNulls();
   }
 
   @Override
   public boolean isNullAt(int rowId) {
-    return nullabilityHolder.isNullAt(rowId) == 1;
+    return USE_VECTOR_VALIDITY_BUFFER ? accessor.getVector().isNull(rowId) : nullabilityHolder.isNullAt(rowId) == 1;
   }
 
   @Override
@@ -140,10 +141,7 @@ public byte[] getBinary(int rowId) {
 
   @Override
   public ArrowColumnVector getChild(int ordinal) {
-    ArrowColumnVector[] childColumns = accessor.childColumns();
-    Preconditions.checkArgument(childColumns != null && ordinal < childColumns.length, "Invalid call for getChild() " +
-        "with ordinal " + ordinal);
-    return childColumns[ordinal];
+    return accessor.childColumn(ordinal);
   }
 
   static ColumnVector forHolder(VectorHolder holder, int numRows) {
diff --git a/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/VectorizedSparkParquetReaders.java b/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/VectorizedSparkParquetReaders.java
index cbb1fb864175..b506557c283f 100644
--- a/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/VectorizedSparkParquetReaders.java
+++ b/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/VectorizedSparkParquetReaders.java
@@ -22,15 +22,16 @@
 import com.google.common.collect.ImmutableList;
 import com.google.common.collect.Lists;
 import com.google.common.collect.Maps;
-import java.util.Iterator;
 import java.util.List;
 import java.util.Map;
+import java.util.stream.IntStream;
 import org.apache.arrow.memory.BufferAllocator;
+import org.apache.arrow.vector.NullCheckingForGet;
 import org.apache.iceberg.Schema;
+import org.apache.iceberg.arrow.ArrowAllocation;
 import org.apache.iceberg.arrow.vectorized.VectorizedArrowReader;
 import org.apache.iceberg.parquet.TypeWithSchemaVisitor;
 import org.apache.iceberg.parquet.VectorizedReader;
-import org.apache.iceberg.spark.arrow.ArrowAllocation;
 import org.apache.iceberg.types.Types;
 import org.apache.parquet.column.ColumnDescriptor;
 import org.apache.parquet.schema.GroupType;
@@ -44,29 +45,27 @@ private VectorizedSparkParquetReaders() {
   }
 
   @SuppressWarnings("unchecked")
-  public static ColumnarBatchReaders buildReader(
-      Schema tableSchema,
+  public static ColumnarBatchReader buildReader(
       Schema expectedSchema,
       MessageType fileSchema,
       Integer recordsPerBatch) {
-    return (ColumnarBatchReaders)
+    return (ColumnarBatchReader)
         TypeWithSchemaVisitor.visit(expectedSchema.asStruct(), fileSchema,
-            new VectorizedReaderBuilder(tableSchema, expectedSchema, fileSchema, recordsPerBatch));
+            new VectorizedReaderBuilder(expectedSchema, fileSchema, recordsPerBatch));
   }
 
-  private static class VectorizedReaderBuilder extends TypeWithSchemaVisitor<VectorizedReader> {
+  private static class VectorizedReaderBuilder extends TypeWithSchemaVisitor<VectorizedReader<?>> {
     private final MessageType parquetSchema;
-    private final Schema tableIcebergSchema;
+    private final Schema icebergSchema;
     private final BufferAllocator rootAllocator;
     private final int batchSize;
 
     VectorizedReaderBuilder(
-        Schema tableSchema,
-        Schema projectedIcebergSchema,
+        Schema expectedSchema,
         MessageType parquetSchema,
         int bSize) {
       this.parquetSchema = parquetSchema;
-      this.tableIcebergSchema = tableSchema;
+      this.icebergSchema = expectedSchema;
       this.batchSize = bSize;
       this.rootAllocator = ArrowAllocation.rootAllocator()
           .newChildAllocator("VectorizedReadBuilder", 0, Long.MAX_VALUE);
@@ -75,44 +74,41 @@ private static class VectorizedReaderBuilder extends TypeWithSchemaVisitor<Vecto
     @Override
     public VectorizedReader message(
             Types.StructType expected, MessageType message,
-            List<VectorizedReader> fieldReaders) {
+            List<VectorizedReader<?>> fieldReaders) {
       return struct(expected, message.asGroupType(), fieldReaders);
     }
 
     @Override
-    public VectorizedReader struct(
+    public VectorizedReader<?> struct(
             Types.StructType expected, GroupType struct,
-            List<VectorizedReader> fieldReaders) {
+            List<VectorizedReader<?>> fieldReaders) {
 
-      Map<Integer, VectorizedReader> readersById = Maps.newHashMap();
+      Map<Integer, VectorizedReader<?>> readersById = Maps.newHashMap();
       List<Type> fields = struct.getFields();
 
-      for (int i = 0; i < fields.size(); i += 1) {
-        Type fieldType = fields.get(i);
-        int id = fieldType.getId().intValue();
-        readersById.put(id, fieldReaders.get(i));
-      }
+      IntStream.range(0, fields.size())
+          .forEach(pos -> readersById.put(fields.get(pos).getId().intValue(), fieldReaders.get(pos)));
 
       List<Types.NestedField> icebergFields = expected != null ?
           expected.fields() : ImmutableList.of();
 
-      List<VectorizedReader> reorderedFields = Lists.newArrayListWithExpectedSize(
+      List<VectorizedReader<?>> reorderedFields = Lists.newArrayListWithExpectedSize(
           icebergFields.size());
 
       for (Types.NestedField field : icebergFields) {
         int id = field.fieldId();
-        VectorizedReader reader = readersById.get(id);
+        VectorizedReader<?> reader = readersById.get(id);
         if (reader != null) {
           reorderedFields.add(reader);
         } else {
           reorderedFields.add(VectorizedArrowReader.nulls());
         }
       }
-      return new ColumnarBatchReaders(reorderedFields);
+      return new ColumnarBatchReader(reorderedFields);
     }
 
     @Override
-    public VectorizedReader primitive(
+    public VectorizedReader<?> primitive(
         org.apache.iceberg.types.Type.PrimitiveType expected,
         PrimitiveType primitive) {
 
@@ -123,20 +119,13 @@ public VectorizedReader primitive(
       if (desc.getMaxRepetitionLevel() > 0) {
         return null;
       }
-      Types.NestedField icebergField = tableIcebergSchema.findField(parquetFieldId);
-      return new VectorizedArrowReader(desc, icebergField, rootAllocator,
-          batchSize, /* setArrowValidityVector */ false);
-    }
-
-    private String[] currentPath() {
-      String[] path = new String[fieldNames.size()];
-      if (!fieldNames.isEmpty()) {
-        Iterator<String> iter = fieldNames.descendingIterator();
-        for (int i = 0; iter.hasNext(); i += 1) {
-          path[i] = iter.next();
-        }
+      Types.NestedField icebergField = icebergSchema.findField(parquetFieldId);
+      if (icebergField == null) {
+        return null;
       }
-      return path;
+      // Set the validity buffer if null checking is enabled in arrow
+      return new VectorizedArrowReader(desc, icebergField, rootAllocator,
+          batchSize, /* setArrowValidityVector */ NullCheckingForGet.NULL_CHECKING_ENABLED);
     }
 
     protected MessageType type() {
diff --git a/spark/src/main/java/org/apache/iceberg/spark/source/BatchDataReader.java b/spark/src/main/java/org/apache/iceberg/spark/source/BatchDataReader.java
index 0814f819a39f..f14f1c54284f 100644
--- a/spark/src/main/java/org/apache/iceberg/spark/source/BatchDataReader.java
+++ b/spark/src/main/java/org/apache/iceberg/spark/source/BatchDataReader.java
@@ -76,7 +76,7 @@ private Iterator<ColumnarBatch> open(FileScanTask task, Schema readSchema) {
       iter = Parquet.read(location)
           .project(readSchema)
           .split(task.start(), task.length())
-          .createBatchedReaderFunc(fileSchema -> VectorizedSparkParquetReaders.buildReader(tableSchema, readSchema,
+          .createBatchedReaderFunc(fileSchema -> VectorizedSparkParquetReaders.buildReader(readSchema,
               fileSchema, batchSize))
           .filter(task.residual())
           .caseSensitive(caseSensitive)
diff --git a/spark/src/main/java/org/apache/iceberg/spark/source/Reader.java b/spark/src/main/java/org/apache/iceberg/spark/source/Reader.java
index 0f38df328ce3..32ee2833ced7 100644
--- a/spark/src/main/java/org/apache/iceberg/spark/source/Reader.java
+++ b/spark/src/main/java/org/apache/iceberg/spark/source/Reader.java
@@ -200,7 +200,7 @@ public StructType readSchema() {
    */
   @Override
   public List<InputPartition<ColumnarBatch>> planBatchInputPartitions() {
-    Preconditions.checkState(enableBatchRead != null && enableBatchRead, "Batched reads not enabled");
+    Preconditions.checkState(enableBatchRead(), "Batched reads not enabled");
     Preconditions.checkState(batchSize > 0, "Invalid batch size");
     String tableSchemaString = SchemaParser.toJson(table.schema());
     String expectedSchemaString = SchemaParser.toJson(lazySchema());
@@ -303,29 +303,17 @@ private boolean lazyCheckEnableBatchRead() {
                   .stream()
                   .allMatch(fileScanTask -> fileScanTask.file().format().equals(
                       FileFormat.PARQUET)));
-      if (!allParquetFileScanTasks) {
-        this.enableBatchRead = false;
-        return false;
-      }
 
-      int numColumns = lazySchema().columns().size();
-      if (numColumns == 0) {
-        this.enableBatchRead = false;
-        return false;
-      }
+      boolean atLeastOneColumn = lazySchema().columns().size() > 0;
 
-      boolean projectIdentityPartitionColumn =
-          tasks().stream()
-              .anyMatch(combinedScanTask -> combinedScanTask.files()
-                  .stream()
-                  .anyMatch(fileScanTask -> !fileScanTask.spec().identitySourceIds().isEmpty()));
-      if (projectIdentityPartitionColumn) {
-        this.enableBatchRead = false;
-        return false;
-      }
+      boolean hasNoIdentityProjections = tasks().stream()
+          .allMatch(combinedScanTask -> combinedScanTask.files()
+              .stream()
+              .allMatch(fileScanTask -> fileScanTask.spec().identitySourceIds().isEmpty()));
+
+      boolean onlyPrimitives = lazySchema().columns().stream().allMatch(c -> c.type().isPrimitiveType());
 
-      // Enable batched reads only if all requested columns are primitive otherwise revert to row-based reads
-      this.enableBatchRead = lazySchema().columns().stream().allMatch(c -> c.type().isPrimitiveType());
+      this.enableBatchRead = allParquetFileScanTasks && atLeastOneColumn && hasNoIdentityProjections && onlyPrimitives;
     }
     return enableBatchRead;
   }
diff --git a/spark/src/test/java/org/apache/iceberg/spark/data/AvroDataTest.java b/spark/src/test/java/org/apache/iceberg/spark/data/AvroDataTest.java
index 57e61efd6afa..bf72e3da5ae8 100644
--- a/spark/src/test/java/org/apache/iceberg/spark/data/AvroDataTest.java
+++ b/spark/src/test/java/org/apache/iceberg/spark/data/AvroDataTest.java
@@ -44,18 +44,18 @@ public abstract class AvroDataTest {
       optional(101, "data", Types.StringType.get()),
       required(102, "b", Types.BooleanType.get()),
       optional(103, "i", Types.IntegerType.get()),
-      required(104, "l", LongType.get()),
+      optional(104, "l", LongType.get()),
       optional(105, "f", Types.FloatType.get()),
-      required(106, "d", Types.DoubleType.get()),
+      optional(106, "d", Types.DoubleType.get()),
       optional(107, "date", Types.DateType.get()),
-      required(108, "ts", Types.TimestampType.withZone()),
-      required(110, "s", Types.StringType.get()),
+      optional(108, "ts", Types.TimestampType.withZone()),
+      optional(110, "s", Types.StringType.get()),
       //required(111, "uuid", Types.UUIDType.get()),
-      required(112, "fixed", Types.FixedType.ofLength(7)),
+      optional(112, "fixed", Types.FixedType.ofLength(7)),
       optional(113, "bytes", Types.BinaryType.get()),
-      required(114, "dec_9_0", Types.DecimalType.of(9, 0)),
-      required(115, "dec_11_2", Types.DecimalType.of(11, 2)),
-      required(116, "dec_38_10", Types.DecimalType.of(38, 10)) // spark's maximum precision
+      optional(114, "dec_9_0", Types.DecimalType.of(9, 0)),
+      optional(115, "dec_11_2", Types.DecimalType.of(11, 2)),
+      optional(116, "dec_38_10", Types.DecimalType.of(38, 10)) // spark's maximum precision
   );
 
   @Rule
diff --git a/spark/src/test/java/org/apache/iceberg/spark/data/RandomData.java b/spark/src/test/java/org/apache/iceberg/spark/data/RandomData.java
index fc7fcbd5d885..22ec0a2df3da 100644
--- a/spark/src/test/java/org/apache/iceberg/spark/data/RandomData.java
+++ b/spark/src/test/java/org/apache/iceberg/spark/data/RandomData.java
@@ -103,13 +103,13 @@ public Record next() {
     };
   }
 
-  public static List<Record> generateListWithFallBackDictionaryEncodingForStrings(
-          Schema schema,
-          int numRecords,
-          long seed,
-          float fraction) {
+  public static List<Record> generateListWithFallBackDictionaryEncoding(
+      Schema schema,
+      int numRecords,
+      long seed,
+      float fraction) {
     FallbackDictionaryEncodedDataGenerator generator =
-            new FallbackDictionaryEncodedDataGenerator(schema, seed, numRecords, fraction);
+        new FallbackDictionaryEncodedDataGenerator(schema, seed, numRecords, fraction);
     List<Record> records = Lists.newArrayListWithExpectedSize(numRecords);
     for (int i = 0; i < numRecords; i += 1) {
       Record rec = (Record) TypeUtil.visit(schema, generator);
@@ -627,30 +627,75 @@ private static Object generateDictionaryEncodablePrimitive(Type.PrimitiveType pr
   }
 
   private static class FallbackDictionaryEncodedDataGenerator extends RandomDataGenerator {
-
-    private final int numRecords;
+    private final long numValues;
     private final float fraction;
     private int current;
 
     private FallbackDictionaryEncodedDataGenerator(Schema schema, long seed, int numRecords, float fraction) {
       super(schema, seed);
-      this.numRecords = numRecords;
+      // for now, vectorized reads are only supported for primitive types
+      this.numValues =
+          numRecords * schema.columns().stream().filter(nestedField -> nestedField.type().isPrimitiveType()).count();
       this.fraction = fraction;
     }
 
     @Override
     public Object primitive(Type.PrimitiveType primitive) {
+      current++;
+      boolean dictionaryEncodable = current < fraction * numValues;
+      Object result;
       switch (primitive.typeId()) {
         case STRING:
-          if (current < fraction * numRecords) {
-            current++;
-            return "ABC";
+          result = dictionaryEncodable ? UTF8String.fromString("ABC") : randomString(random);
+          break;
+        case BOOLEAN:
+          result = true; // doesn't really matter for booleans since they are not dictionary encoded
+          break;
+        case INTEGER:
+        case DATE:
+          result = dictionaryEncodable ? 1 : random.nextInt();
+          break;
+        case LONG:
+        case TIME:
+        case TIMESTAMP:
+          result = dictionaryEncodable ? 1L : random.nextLong();
+          break;
+        case FLOAT:
+          result = dictionaryEncodable ? 1.0f : random.nextFloat();
+          break;
+        case DOUBLE:
+          result = dictionaryEncodable ? 1.0d : random.nextDouble();
+          break;
+        case FIXED:
+          byte[] fixed = new byte[((Types.FixedType) primitive).length()];
+          if (dictionaryEncodable) {
+            fixed[0] = 1;
           } else {
-            current++;
-            return super.primitive(primitive);
+            random.nextBytes(fixed);
           }
+          result = fixed;
+          break;
+        case BINARY:
+          byte[] binary;
+          if (dictionaryEncodable) {
+            binary = new byte[1];
+            binary[0] = 1;
+          } else {
+            binary = new byte[random.nextInt(50)];
+            random.nextBytes(binary);
+          }
+          result = binary;
+          break;
+        case DECIMAL:
+          Types.DecimalType type = (Types.DecimalType) primitive;
+          BigInteger unscaled = dictionaryEncodable ? new BigInteger("1") : randomUnscaled(type.precision(), random);
+          result = Decimal.apply(new BigDecimal(unscaled, type.scale()));
+          break;
+        default:
+          throw new IllegalArgumentException(
+              "Cannot generate value for unknown type: " + primitive);
       }
-      return super.primitive(primitive);
+      return super.getPrimitive(primitive, result);
     }
   }
 }
diff --git a/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetFallbackToDictionaryEncodingForVectorizedReader.java b/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetFallbackToDictionaryEncodingForVectorizedReader.java
index 1b00be85ec83..169ff876516f 100644
--- a/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetFallbackToDictionaryEncodingForVectorizedReader.java
+++ b/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetFallbackToDictionaryEncodingForVectorizedReader.java
@@ -19,13 +19,29 @@
 
 package org.apache.iceberg.spark.data;
 
+import java.io.File;
+import java.io.IOException;
 import java.util.List;
 import org.apache.avro.generic.GenericData;
+import org.apache.iceberg.Files;
 import org.apache.iceberg.Schema;
+import org.apache.iceberg.TableProperties;
+import org.apache.iceberg.io.FileAppender;
+import org.apache.iceberg.parquet.Parquet;
 
 public class TestSparkParquetFallbackToDictionaryEncodingForVectorizedReader extends TestSparkParquetVectorizedReader {
   @Override
   public List<GenericData.Record> generateData(Schema schema) {
-    return RandomData.generateListWithFallBackDictionaryEncodingForStrings(schema, 100000, 0L, 0.5f);
+    return RandomData.generateListWithFallBackDictionaryEncoding(schema, 200000, 0L, 0.05f);
   }
+
+  @Override
+  FileAppender<GenericData.Record> getParquetWriter(Schema schema, File testFile) throws IOException {
+    return Parquet.write(Files.localOutput(testFile))
+        .schema(schema)
+        .named("test")
+        .set(TableProperties.PARQUET_DICT_SIZE_BYTES, "512000")
+        .build();
+  }
+
 }
diff --git a/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetVectorizedReader.java b/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetVectorizedReader.java
index 146f173f0be3..3e3789214e19 100644
--- a/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetVectorizedReader.java
+++ b/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetVectorizedReader.java
@@ -36,17 +36,11 @@
 import org.apache.spark.sql.vectorized.ColumnarBatch;
 import org.junit.Assert;
 import org.junit.Assume;
-import org.junit.BeforeClass;
 import org.junit.Ignore;
 import org.junit.Test;
 
 public class TestSparkParquetVectorizedReader extends AvroDataTest {
 
-  @BeforeClass
-  public static void beforeClass() {
-    TestHelpers.setArrowFlagsForVectorizedReads();
-  }
-
   @Override
   protected void writeAndValidate(Schema schema) throws IOException {
     // Write test data
@@ -60,15 +54,19 @@ protected void writeAndValidate(Schema schema) throws IOException {
     File testFile = temp.newFile();
     Assert.assertTrue("Delete should succeed", testFile.delete());
 
-    try (FileAppender<GenericData.Record> writer = Parquet.write(Files.localOutput(testFile))
-        .schema(schema)
-        .named("test")
-        .build()) {
+    try (FileAppender<GenericData.Record> writer = getParquetWriter(schema, testFile)) {
       writer.addAll(expected);
     }
     assertRecordsMatch(schema, expected, testFile);
   }
 
+  FileAppender<GenericData.Record> getParquetWriter(Schema schema, File testFile) throws IOException {
+    return Parquet.write(Files.localOutput(testFile))
+        .schema(schema)
+        .named("test")
+        .build();
+  }
+
   List<GenericData.Record> generateData(Schema schema) {
     return RandomData.generateList(schema, 100000, 0L);
   }
@@ -77,7 +75,7 @@ void assertRecordsMatch(Schema schema, List<GenericData.Record> expected, File t
     try (CloseableIterable<ColumnarBatch> batchReader = Parquet.read(Files.localInput(testFile))
         .project(schema)
         .reuseContainers()
-        .createBatchedReaderFunc(type -> VectorizedSparkParquetReaders.buildReader(schema, schema, type, 10000))
+        .createBatchedReaderFunc(type -> VectorizedSparkParquetReaders.buildReader(schema, type, 10000))
         .build()) {
 
       Iterator<ColumnarBatch> batches = batchReader.iterator();
@@ -105,7 +103,6 @@ public void testArray() {
   @Test
   @Ignore
   public void testArrayOfStructs() {
-    System.out.println("Not Supported");
   }
 
   @Test

From c5347bf7af610c7af3ac418261f4a182cd1441cc Mon Sep 17 00:00:00 2001
From: samarthjain <samarth@apache.org>
Date: Tue, 26 May 2020 15:38:26 -0700
Subject: [PATCH 07/12] Code review comments

---
 build.gradle                                  |  3 ++
 .../data/vectorized/ArrowVectorAccessor.java  | 10 +++---
 .../data/vectorized/ArrowVectorAccessors.java | 25 ++++++++-------
 .../VectorizedSparkParquetReaders.java        |  3 +-
 .../iceberg/spark/source/BatchDataReader.java | 31 +++----------------
 .../apache/iceberg/spark/source/Reader.java   |  2 +-
 .../apache/iceberg/spark/data/RandomData.java | 10 +++---
 .../iceberg/spark/data/TestHelpers.java       |  8 -----
 ...quetDictionaryEncodedVectorizedReads.java} |  5 +--
 ...llbackToPlainEncodingVectorizedReads.java} |  7 +++--
 .../TestParquetVectorizedReads.java}          |  7 +++--
 .../spark/source/TestReadProjection.java      |  7 -----
 .../spark/source/TestSparkDataWrite.java      |  2 --
 .../spark/source/TestStructuredStreaming.java |  2 --
 14 files changed, 45 insertions(+), 77 deletions(-)
 rename spark/src/test/java/org/apache/iceberg/spark/data/{TestSparkParquetDictionaryEncodedVectorizedReader.java => parquet/vectorized/TestParquetDictionaryEncodedVectorizedReads.java} (84%)
 rename spark/src/test/java/org/apache/iceberg/spark/data/{TestSparkParquetFallbackToDictionaryEncodingForVectorizedReader.java => parquet/vectorized/TestParquetDictionaryFallbackToPlainEncodingVectorizedReads.java} (82%)
 rename spark/src/test/java/org/apache/iceberg/spark/data/{TestSparkParquetVectorizedReader.java => parquet/vectorized/TestParquetVectorizedReads.java} (93%)

diff --git a/build.gradle b/build.gradle
index b7d8aec9f573..b62213e96166 100644
--- a/build.gradle
+++ b/build.gradle
@@ -458,7 +458,10 @@ project(':iceberg-spark3') {
   }
   test {
     // For vectorized reads
+    // Allow unsafe memory access to skip Arrow's costly check that every index is within bounds
     systemProperty("arrow.enable_unsafe_memory_access", "true")
+    // Disable the expensive null check Arrow performs on every get(index) call.
+    // Iceberg manages nullability checks itself instead of relying on Arrow.
     systemProperty("arrow.enable_null_check_for_get", "false")
   }
 }
diff --git a/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ArrowVectorAccessor.java b/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ArrowVectorAccessor.java
index 244856a34270..688ff6c8b78b 100644
--- a/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ArrowVectorAccessor.java
+++ b/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ArrowVectorAccessor.java
@@ -33,7 +33,7 @@ abstract class ArrowVectorAccessor {
 
   ArrowVectorAccessor(ValueVector vector) {
     this.vector = vector;
-    this.childColumns = null;
+    this.childColumns = new ArrowColumnVector[0];
   }
 
   ArrowVectorAccessor(ValueVector vector, ArrowColumnVector[] children) {
@@ -42,11 +42,9 @@ abstract class ArrowVectorAccessor {
   }
 
   final void close() {
-    if (childColumns != null) {
-      for (ArrowColumnVector column : childColumns) {
-        // Closing an ArrowColumnVector is expected to not throw any exception
-        column.close();
-      }
+    for (ArrowColumnVector column : childColumns) {
+      // Closing an ArrowColumnVector is expected to not throw any exception
+      column.close();
     }
     vector.close();
   }
diff --git a/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ArrowVectorAccessors.java b/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ArrowVectorAccessors.java
index 5a215b2cb0b7..34cb63ea5c3e 100644
--- a/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ArrowVectorAccessors.java
+++ b/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ArrowVectorAccessors.java
@@ -199,19 +199,20 @@ final long getLong(int rowId) {
   }
 
   private static class DictionaryLongAccessor extends ArrowVectorAccessor {
-
-    private final Dictionary parquetDictionary;
     private final IntVector offsetVector;
+    private final long[] decodedDictionary;
 
     DictionaryLongAccessor(IntVector vector, Dictionary dictionary) {
       super(vector);
       this.offsetVector = vector;
-      this.parquetDictionary = dictionary;
+      this.decodedDictionary = IntStream.rangeClosed(0, dictionary.getMaxId())
+          .mapToLong(dictionary::decodeToLong)
+          .toArray();
     }
 
     @Override
     final long getLong(int rowId) {
-      return parquetDictionary.decodeToLong(offsetVector.get(rowId));
+      return decodedDictionary[offsetVector.get(rowId)];
     }
   }
 
@@ -231,19 +232,21 @@ final float getFloat(int rowId) {
   }
 
   private static class DictionaryFloatAccessor extends ArrowVectorAccessor {
-
     private final IntVector offsetVector;
-    private final Dictionary parquetDictionary;
+    private final float[] decodedDictionary;
 
     DictionaryFloatAccessor(IntVector vector, Dictionary dictionary) {
       super(vector);
-      this.parquetDictionary = dictionary;
       this.offsetVector = vector;
+      this.decodedDictionary = new float[dictionary.getMaxId() + 1];
+      for (int i = 0; i <= dictionary.getMaxId(); i++) {
+        decodedDictionary[i] = dictionary.decodeToFloat(i);
+      }
     }
 
     @Override
     final float getFloat(int rowId) {
-      return parquetDictionary.decodeToFloat(offsetVector.get(rowId));
+      return decodedDictionary[offsetVector.get(rowId)];
     }
   }
 
@@ -263,12 +266,12 @@ final double getDouble(int rowId) {
   }
 
   private static class DictionaryDoubleAccessor extends ArrowVectorAccessor {
-    private final IntVector vector;
+    private final IntVector offsetVector;
     private final double[] decodedDictionary;
 
     DictionaryDoubleAccessor(IntVector vector, Dictionary dictionary) {
       super(vector);
-      this.vector = vector;
+      this.offsetVector = vector;
       this.decodedDictionary = IntStream.rangeClosed(0, dictionary.getMaxId())
           .mapToDouble(dictionary::decodeToDouble)
           .toArray();
@@ -276,7 +279,7 @@ private static class DictionaryDoubleAccessor extends ArrowVectorAccessor {
 
     @Override
     final double getDouble(int rowId) {
-      return decodedDictionary[vector.get(rowId)];
+      return decodedDictionary[offsetVector.get(rowId)];
     }
   }
 
diff --git a/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/VectorizedSparkParquetReaders.java b/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/VectorizedSparkParquetReaders.java
index b506557c283f..812d1fa3518e 100644
--- a/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/VectorizedSparkParquetReaders.java
+++ b/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/VectorizedSparkParquetReaders.java
@@ -44,7 +44,6 @@ public class VectorizedSparkParquetReaders {
   private VectorizedSparkParquetReaders() {
   }
 
-  @SuppressWarnings("unchecked")
   public static ColumnarBatchReader buildReader(
       Schema expectedSchema,
       MessageType fileSchema,
@@ -72,7 +71,7 @@ private static class VectorizedReaderBuilder extends TypeWithSchemaVisitor<Vecto
     }
 
     @Override
-    public VectorizedReader message(
+    public VectorizedReader<?> message(
             Types.StructType expected, MessageType message,
             List<VectorizedReader<?>> fieldReaders) {
       return struct(expected, message.asGroupType(), fieldReaders);
diff --git a/spark/src/main/java/org/apache/iceberg/spark/source/BatchDataReader.java b/spark/src/main/java/org/apache/iceberg/spark/source/BatchDataReader.java
index f14f1c54284f..d1cc2b13f014 100644
--- a/spark/src/main/java/org/apache/iceberg/spark/source/BatchDataReader.java
+++ b/spark/src/main/java/org/apache/iceberg/spark/source/BatchDataReader.java
@@ -20,63 +20,43 @@
 package org.apache.iceberg.spark.source;
 
 import com.google.common.base.Preconditions;
-import java.util.Iterator;
 import org.apache.iceberg.CombinedScanTask;
 import org.apache.iceberg.FileFormat;
 import org.apache.iceberg.FileScanTask;
 import org.apache.iceberg.Schema;
 import org.apache.iceberg.encryption.EncryptionManager;
 import org.apache.iceberg.io.CloseableIterable;
+import org.apache.iceberg.io.CloseableIterator;
 import org.apache.iceberg.io.FileIO;
 import org.apache.iceberg.io.InputFile;
 import org.apache.iceberg.parquet.Parquet;
-import org.apache.iceberg.spark.SparkSchemaUtil;
 import org.apache.iceberg.spark.data.vectorized.VectorizedSparkParquetReaders;
-import org.apache.spark.sql.types.StructType;
 import org.apache.spark.sql.vectorized.ColumnarBatch;
 
 class BatchDataReader extends BaseDataReader<ColumnarBatch> {
-  private final Schema tableSchema;
   private final Schema expectedSchema;
   private final boolean caseSensitive;
   private final int batchSize;
 
   BatchDataReader(
-      CombinedScanTask task, Schema tableSchema, Schema expectedSchema, FileIO fileIo,
+      CombinedScanTask task, Schema expectedSchema, FileIO fileIo,
       EncryptionManager encryptionManager, boolean caseSensitive, int size) {
     super(task, fileIo, encryptionManager);
-    this.tableSchema = tableSchema;
     this.expectedSchema = expectedSchema;
     this.caseSensitive = caseSensitive;
     this.batchSize = size;
   }
 
   @Override
-  Iterator<ColumnarBatch> open(FileScanTask task) {
-    // schema or rows returned by readers
-    Schema finalSchema = expectedSchema;
-    // schema needed for the projection and filtering
-    StructType sparkType = SparkSchemaUtil.convert(finalSchema);
-    Schema requiredSchema = SparkSchemaUtil.prune(tableSchema, sparkType, task.residual(), caseSensitive);
-    boolean hasExtraFilterColumns = requiredSchema.columns().size() != finalSchema.columns().size();
-    Iterator<ColumnarBatch> iter;
-    if (hasExtraFilterColumns) {
-      iter = open(task, requiredSchema);
-    } else {
-      iter = open(task, finalSchema);
-    }
-    return iter;
-  }
-
-  private Iterator<ColumnarBatch> open(FileScanTask task, Schema readSchema) {
+  CloseableIterator<ColumnarBatch> open(FileScanTask task) {
     CloseableIterable<ColumnarBatch> iter;
     InputFile location = getInputFile(task);
     Preconditions.checkNotNull(location, "Could not find InputFile associated with FileScanTask");
     if (task.file().format() == FileFormat.PARQUET) {
       iter = Parquet.read(location)
-          .project(readSchema)
+          .project(expectedSchema)
           .split(task.start(), task.length())
-          .createBatchedReaderFunc(fileSchema -> VectorizedSparkParquetReaders.buildReader(readSchema,
+          .createBatchedReaderFunc(fileSchema -> VectorizedSparkParquetReaders.buildReader(expectedSchema,
               fileSchema, batchSize))
           .filter(task.residual())
           .caseSensitive(caseSensitive)
@@ -90,7 +70,6 @@ private Iterator<ColumnarBatch> open(FileScanTask task, Schema readSchema) {
       throw new UnsupportedOperationException(
           "Format: " + task.file().format() + " not supported for batched reads");
     }
-    this.currentCloseable = iter;
     return iter.iterator();
   }
 }
diff --git a/spark/src/main/java/org/apache/iceberg/spark/source/Reader.java b/spark/src/main/java/org/apache/iceberg/spark/source/Reader.java
index 32ee2833ced7..43f21f463f38 100644
--- a/spark/src/main/java/org/apache/iceberg/spark/source/Reader.java
+++ b/spark/src/main/java/org/apache/iceberg/spark/source/Reader.java
@@ -470,7 +470,7 @@ private static class ColumnarBatchReadTask extends BaseReadTask<ColumnarBatch> {
 
     @Override
     public InputPartitionReader<ColumnarBatch> createPartitionReader() {
-      return new BatchDataReader(task, lazyTableSchema(), lazyExpectedSchema(), io.value(),
+      return new BatchDataReader(task, lazyExpectedSchema(), io.value(),
           encryptionManager.value(), caseSensitive, batchSize);
     }
   }
diff --git a/spark/src/test/java/org/apache/iceberg/spark/data/RandomData.java b/spark/src/test/java/org/apache/iceberg/spark/data/RandomData.java
index 22ec0a2df3da..0e1e1dcdd362 100644
--- a/spark/src/test/java/org/apache/iceberg/spark/data/RandomData.java
+++ b/spark/src/test/java/org/apache/iceberg/spark/data/RandomData.java
@@ -103,13 +103,13 @@ public Record next() {
     };
   }
 
-  public static List<Record> generateListWithFallBackDictionaryEncoding(
+  public static List<Record> generateListWithDictionaryFallbackToPlainEncoding(
       Schema schema,
       int numRecords,
       long seed,
       float fraction) {
-    FallbackDictionaryEncodedDataGenerator generator =
-        new FallbackDictionaryEncodedDataGenerator(schema, seed, numRecords, fraction);
+    DictionaryFallbackToPlainEncodingDataGenerator generator =
+        new DictionaryFallbackToPlainEncodingDataGenerator(schema, seed, numRecords, fraction);
     List<Record> records = Lists.newArrayListWithExpectedSize(numRecords);
     for (int i = 0; i < numRecords; i += 1) {
       Record rec = (Record) TypeUtil.visit(schema, generator);
@@ -626,12 +626,12 @@ private static Object generateDictionaryEncodablePrimitive(Type.PrimitiveType pr
     }
   }
 
-  private static class FallbackDictionaryEncodedDataGenerator extends RandomDataGenerator {
+  private static class DictionaryFallbackToPlainEncodingDataGenerator extends RandomDataGenerator {
     private final long numValues;
     private final float fraction;
     private int current;
 
-    private FallbackDictionaryEncodedDataGenerator(Schema schema, long seed, int numRecords, float fraction) {
+    private DictionaryFallbackToPlainEncodingDataGenerator(Schema schema, long seed, int numRecords, float fraction) {
       super(schema, seed);
       // for now, vectorized reads are only supported for primitive types
       this.numValues =
diff --git a/spark/src/test/java/org/apache/iceberg/spark/data/TestHelpers.java b/spark/src/test/java/org/apache/iceberg/spark/data/TestHelpers.java
index 741b5f96a247..7b0450561d06 100644
--- a/spark/src/test/java/org/apache/iceberg/spark/data/TestHelpers.java
+++ b/spark/src/test/java/org/apache/iceberg/spark/data/TestHelpers.java
@@ -688,12 +688,4 @@ private static void assertEquals(String context, MapType map, MapData expected,
           expectedValues.get(i, valueType), actualValues.get(i, valueType));
     }
   }
-
-  public static void setArrowFlagsForVectorizedReads() {
-    // Allow unsafe memory access to avoid the costly check arrow does to check if index is within bounds
-    System.setProperty("arrow.enable_unsafe_memory_access", "true");
-    // Disable expensive null check for every get(index) call.
-    // Iceberg manages nullability checks itself instead of relying on arrow.
-    System.setProperty("arrow.enable_null_check_for_get", "false");
-  }
 }
diff --git a/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetDictionaryEncodedVectorizedReader.java b/spark/src/test/java/org/apache/iceberg/spark/data/parquet/vectorized/TestParquetDictionaryEncodedVectorizedReads.java
similarity index 84%
rename from spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetDictionaryEncodedVectorizedReader.java
rename to spark/src/test/java/org/apache/iceberg/spark/data/parquet/vectorized/TestParquetDictionaryEncodedVectorizedReads.java
index 235afc7ca698..b158ab964e3a 100644
--- a/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetDictionaryEncodedVectorizedReader.java
+++ b/spark/src/test/java/org/apache/iceberg/spark/data/parquet/vectorized/TestParquetDictionaryEncodedVectorizedReads.java
@@ -17,13 +17,14 @@
  * under the License.
  */
 
-package org.apache.iceberg.spark.data;
+package org.apache.iceberg.spark.data.parquet.vectorized;
 
 import java.util.List;
 import org.apache.avro.generic.GenericData;
 import org.apache.iceberg.Schema;
+import org.apache.iceberg.spark.data.RandomData;
 
-public class TestSparkParquetDictionaryEncodedVectorizedReader extends TestSparkParquetVectorizedReader {
+public class TestParquetDictionaryEncodedVectorizedReads extends TestParquetVectorizedReads {
   @Override
   List<GenericData.Record> generateData(Schema schema) {
     return RandomData.generateDictionaryEncodableData(schema, 100000, 0L);
diff --git a/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetFallbackToDictionaryEncodingForVectorizedReader.java b/spark/src/test/java/org/apache/iceberg/spark/data/parquet/vectorized/TestParquetDictionaryFallbackToPlainEncodingVectorizedReads.java
similarity index 82%
rename from spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetFallbackToDictionaryEncodingForVectorizedReader.java
rename to spark/src/test/java/org/apache/iceberg/spark/data/parquet/vectorized/TestParquetDictionaryFallbackToPlainEncodingVectorizedReads.java
index 169ff876516f..620e87b50a6d 100644
--- a/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetFallbackToDictionaryEncodingForVectorizedReader.java
+++ b/spark/src/test/java/org/apache/iceberg/spark/data/parquet/vectorized/TestParquetDictionaryFallbackToPlainEncodingVectorizedReads.java
@@ -17,7 +17,7 @@
  * under the License.
  */
 
-package org.apache.iceberg.spark.data;
+package org.apache.iceberg.spark.data.parquet.vectorized;
 
 import java.io.File;
 import java.io.IOException;
@@ -28,11 +28,12 @@
 import org.apache.iceberg.TableProperties;
 import org.apache.iceberg.io.FileAppender;
 import org.apache.iceberg.parquet.Parquet;
+import org.apache.iceberg.spark.data.RandomData;
 
-public class TestSparkParquetFallbackToDictionaryEncodingForVectorizedReader extends TestSparkParquetVectorizedReader {
+public class TestParquetDictionaryFallbackToPlainEncodingVectorizedReads extends TestParquetVectorizedReads {
   @Override
   public List<GenericData.Record> generateData(Schema schema) {
-    return RandomData.generateListWithFallBackDictionaryEncoding(schema, 200000, 0L, 0.05f);
+    return RandomData.generateListWithDictionaryFallbackToPlainEncoding(schema, 200000, 0L, 0.05f);
   }
 
   @Override
diff --git a/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetVectorizedReader.java b/spark/src/test/java/org/apache/iceberg/spark/data/parquet/vectorized/TestParquetVectorizedReads.java
similarity index 93%
rename from spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetVectorizedReader.java
rename to spark/src/test/java/org/apache/iceberg/spark/data/parquet/vectorized/TestParquetVectorizedReads.java
index 3e3789214e19..812137101a9e 100644
--- a/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetVectorizedReader.java
+++ b/spark/src/test/java/org/apache/iceberg/spark/data/parquet/vectorized/TestParquetVectorizedReads.java
@@ -17,7 +17,7 @@
  * under the License.
  */
 
-package org.apache.iceberg.spark.data;
+package org.apache.iceberg.spark.data.parquet.vectorized;
 
 import java.io.File;
 import java.io.IOException;
@@ -30,6 +30,9 @@
 import org.apache.iceberg.io.CloseableIterable;
 import org.apache.iceberg.io.FileAppender;
 import org.apache.iceberg.parquet.Parquet;
+import org.apache.iceberg.spark.data.AvroDataTest;
+import org.apache.iceberg.spark.data.RandomData;
+import org.apache.iceberg.spark.data.TestHelpers;
 import org.apache.iceberg.spark.data.vectorized.VectorizedSparkParquetReaders;
 import org.apache.iceberg.types.TypeUtil;
 import org.apache.iceberg.types.Types;
@@ -39,7 +42,7 @@
 import org.junit.Ignore;
 import org.junit.Test;
 
-public class TestSparkParquetVectorizedReader extends AvroDataTest {
+public class TestParquetVectorizedReads extends AvroDataTest {
 
   @Override
   protected void writeAndValidate(Schema schema) throws IOException {
diff --git a/spark/src/test/java/org/apache/iceberg/spark/source/TestReadProjection.java b/spark/src/test/java/org/apache/iceberg/spark/source/TestReadProjection.java
index 13bdd79a1cbd..8d65b64cab6d 100644
--- a/spark/src/test/java/org/apache/iceberg/spark/source/TestReadProjection.java
+++ b/spark/src/test/java/org/apache/iceberg/spark/source/TestReadProjection.java
@@ -30,11 +30,9 @@
 import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap;
 import org.apache.iceberg.relocated.com.google.common.collect.Maps;
 import org.apache.iceberg.relocated.com.google.common.collect.Sets;
-import org.apache.iceberg.spark.data.TestHelpers;
 import org.apache.iceberg.types.Comparators;
 import org.apache.iceberg.types.Types;
 import org.junit.Assert;
-import org.junit.BeforeClass;
 import org.junit.Rule;
 import org.junit.Test;
 import org.junit.rules.TemporaryFolder;
@@ -56,11 +54,6 @@ protected abstract Record writeAndRead(String desc,
   @Rule
   public TemporaryFolder temp = new TemporaryFolder();
 
-  @BeforeClass
-  public static void beforeClass() {
-    TestHelpers.setArrowFlagsForVectorizedReads();
-  }
-
   @Test
   public void testFullProjection() throws Exception {
     Schema schema = new Schema(
diff --git a/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkDataWrite.java b/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkDataWrite.java
index c721fe0c8d39..a5fae085f9b9 100644
--- a/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkDataWrite.java
+++ b/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkDataWrite.java
@@ -34,7 +34,6 @@
 import org.apache.iceberg.TableProperties;
 import org.apache.iceberg.hadoop.HadoopTables;
 import org.apache.iceberg.relocated.com.google.common.collect.Lists;
-import org.apache.iceberg.spark.data.TestHelpers;
 import org.apache.iceberg.types.Types;
 import org.apache.spark.sql.Dataset;
 import org.apache.spark.sql.Encoders;
@@ -76,7 +75,6 @@ public static Object[][] parameters() {
   @BeforeClass
   public static void startSpark() {
     TestSparkDataWrite.spark = SparkSession.builder().master("local[2]").getOrCreate();
-    TestHelpers.setArrowFlagsForVectorizedReads();
   }
 
   @AfterClass
diff --git a/spark/src/test/java/org/apache/iceberg/spark/source/TestStructuredStreaming.java b/spark/src/test/java/org/apache/iceberg/spark/source/TestStructuredStreaming.java
index 32ab3a9aecdf..72eeab5a283b 100644
--- a/spark/src/test/java/org/apache/iceberg/spark/source/TestStructuredStreaming.java
+++ b/spark/src/test/java/org/apache/iceberg/spark/source/TestStructuredStreaming.java
@@ -29,7 +29,6 @@
 import org.apache.iceberg.hadoop.HadoopTables;
 import org.apache.iceberg.relocated.com.google.common.collect.Iterables;
 import org.apache.iceberg.relocated.com.google.common.collect.Lists;
-import org.apache.iceberg.spark.data.TestHelpers;
 import org.apache.iceberg.types.Types;
 import org.apache.spark.sql.Dataset;
 import org.apache.spark.sql.Encoders;
@@ -70,7 +69,6 @@ public static void startSpark() {
         .master("local[2]")
         .config("spark.sql.shuffle.partitions", 4)
         .getOrCreate();
-    TestHelpers.setArrowFlagsForVectorizedReads();
   }
 
   @AfterClass

From b7b68f5153dcdd55b57fadecf010cbcfce084652 Mon Sep 17 00:00:00 2001
From: Ryan Blue <blue@apache.org>
Date: Wed, 3 Jun 2020 16:14:47 -0700
Subject: [PATCH 08/12] Some changes from the review.

---
 .../apache/iceberg/spark/source/Reader.java   |  81 ++---
 .../iceberg/spark/data/AvroDataTest.java      |  20 +-
 .../apache/iceberg/spark/data/RandomData.java | 303 ++++--------------
 .../iceberg/spark/data/TestHelpers.java       |  45 +--
 ...rquetDictionaryEncodedVectorizedReads.java |   5 +-
 ...allbackToPlainEncodingVectorizedReads.java |   5 +-
 .../TestParquetVectorizedReads.java           |  25 +-
 7 files changed, 145 insertions(+), 339 deletions(-)

diff --git a/spark/src/main/java/org/apache/iceberg/spark/source/Reader.java b/spark/src/main/java/org/apache/iceberg/spark/source/Reader.java
index 43f21f463f38..d73180c44ecb 100644
--- a/spark/src/main/java/org/apache/iceberg/spark/source/Reader.java
+++ b/spark/src/main/java/org/apache/iceberg/spark/source/Reader.java
@@ -207,9 +207,9 @@ public List<InputPartition<ColumnarBatch>> planBatchInputPartitions() {
 
     List<InputPartition<ColumnarBatch>> readTasks = Lists.newArrayList();
     for (CombinedScanTask task : tasks()) {
-      readTasks.add(
-          new ColumnarBatchReadTask(task, tableSchemaString, expectedSchemaString,
-              io, encryptionManager, caseSensitive, localityPreferred, batchSize));
+      readTasks.add(new ReadTask<>(
+          task, tableSchemaString, expectedSchemaString, io, encryptionManager, caseSensitive, localityPreferred,
+          new BatchReaderFactory(batchSize)));
     }
     LOG.info("Batching input partitions with {} tasks.", readTasks.size());
 
@@ -226,9 +226,9 @@ public List<InputPartition<InternalRow>> planInputPartitions() {
 
     List<InputPartition<InternalRow>> readTasks = Lists.newArrayList();
     for (CombinedScanTask task : tasks()) {
-      readTasks.add(
-          new InternalRowReadTask(task, tableSchemaString, expectedSchemaString, io, encryptionManager,
-              caseSensitive, localityPreferred));
+      readTasks.add(new ReadTask<>(
+          task, tableSchemaString, expectedSchemaString, io, encryptionManager, caseSensitive, localityPreferred,
+          InternalRowReaderFactory.INSTANCE));
     }
 
     return readTasks;
@@ -383,23 +383,23 @@ public String toString() {
         table, lazySchema().asStruct(), filterExpressions, caseSensitive, enableBatchRead());
   }
 
-  @SuppressWarnings("checkstyle:VisibilityModifier")
-  private abstract static class BaseReadTask<T> implements Serializable, InputPartition<T> {
-    final CombinedScanTask task;
+  private static class ReadTask<T> implements Serializable, InputPartition<T> {
+    private final CombinedScanTask task;
     private final String tableSchemaString;
     private final String expectedSchemaString;
-    final Broadcast<FileIO> io;
-    final Broadcast<EncryptionManager> encryptionManager;
-    final boolean caseSensitive;
+    private final Broadcast<FileIO> io;
+    private final Broadcast<EncryptionManager> encryptionManager;
+    private final boolean caseSensitive;
     private final boolean localityPreferred;
+    private final ReaderFactory<T> readerFactory;
 
     private transient Schema tableSchema = null;
     private transient Schema expectedSchema = null;
     private transient String[] preferredLocations;
 
-    private BaseReadTask(CombinedScanTask task, String tableSchemaString, String expectedSchemaString,
-        Broadcast<FileIO> io, Broadcast<EncryptionManager> encryptionManager,
-        boolean caseSensitive, boolean localityPreferred) {
+    private ReadTask(CombinedScanTask task, String tableSchemaString, String expectedSchemaString,
+                     Broadcast<FileIO> io, Broadcast<EncryptionManager> encryptionManager,
+                     boolean caseSensitive, boolean localityPreferred, ReaderFactory<T> readerFactory) {
       this.task = task;
       this.tableSchemaString = tableSchemaString;
       this.expectedSchemaString = expectedSchemaString;
@@ -408,6 +408,13 @@ private BaseReadTask(CombinedScanTask task, String tableSchemaString, String exp
       this.caseSensitive = caseSensitive;
       this.localityPreferred = localityPreferred;
       this.preferredLocations = getPreferredLocations();
+      this.readerFactory = readerFactory;
+    }
+
+    @Override
+    public InputPartitionReader<T> createPartitionReader() {
+      return readerFactory.create(task, lazyTableSchema(), lazyExpectedSchema(), io.value(),
+          encryptionManager.value(), caseSensitive);
     }
 
     @Override
@@ -415,14 +422,14 @@ public String[] preferredLocations() {
       return preferredLocations;
     }
 
-    Schema lazyTableSchema() {
+    private Schema lazyTableSchema() {
       if (tableSchema == null) {
         this.tableSchema = SchemaParser.fromJson(tableSchemaString);
       }
       return tableSchema;
     }
 
-    Schema lazyExpectedSchema() {
+    private Schema lazyExpectedSchema() {
       if (expectedSchema == null) {
         this.expectedSchema = SchemaParser.fromJson(expectedSchemaString);
       }
@@ -439,39 +446,37 @@ private String[] getPreferredLocations() {
     }
   }
 
-  private static class InternalRowReadTask extends BaseReadTask<InternalRow> {
+  private interface ReaderFactory<T> extends Serializable { // ReadTask serializes this as a non-transient field
+    InputPartitionReader<T> create(CombinedScanTask task, Schema tableSchema, Schema expectedSchema, FileIO io,
+                                   EncryptionManager encryptionManager, boolean caseSensitive);
+  }
 
-    private InternalRowReadTask(
-        CombinedScanTask task, String tableSchemaString, String expectedSchemaString,
-        Broadcast<FileIO> io, Broadcast<EncryptionManager> encryptionManager,
-        boolean caseSensitive, boolean localityPreferred) {
-      super(task, tableSchemaString, expectedSchemaString, io, encryptionManager, caseSensitive, localityPreferred);
+  private static class InternalRowReaderFactory implements ReaderFactory<InternalRow> {
+    private static final InternalRowReaderFactory INSTANCE = new InternalRowReaderFactory();
+
+    private InternalRowReaderFactory() {
     }
 
     @Override
-    public InputPartitionReader<InternalRow> createPartitionReader() {
-      return new RowDataReader(task, lazyTableSchema(), lazyExpectedSchema(), io.value(),
-          encryptionManager.value(), caseSensitive);
+    public InputPartitionReader<InternalRow> create(CombinedScanTask task, Schema tableSchema, Schema expectedSchema,
+                                                    FileIO io, EncryptionManager encryptionManager,
+                                                    boolean caseSensitive) {
+      return new RowDataReader(task, tableSchema, expectedSchema, io, encryptionManager, caseSensitive);
     }
   }
 
-  /**
-   * Organizes input data into [InputPartition]s for Vectorized [ColumnarBatch] reads
-   */
-  private static class ColumnarBatchReadTask extends BaseReadTask<ColumnarBatch> {
+  private static class BatchReaderFactory implements ReaderFactory<ColumnarBatch> {
     private final int batchSize;
 
-    ColumnarBatchReadTask(
-        CombinedScanTask task, String tableSchemaString, String expectedSchemaString, Broadcast<FileIO> fileIo,
-        Broadcast<EncryptionManager> encryptionManager, boolean caseSensitive, boolean localityPreferred, int size) {
-      super(task, tableSchemaString, expectedSchemaString, fileIo, encryptionManager, caseSensitive, localityPreferred);
-      this.batchSize = size;
+    BatchReaderFactory(int batchSize) {
+      this.batchSize = batchSize;
     }
 
     @Override
-    public InputPartitionReader<ColumnarBatch> createPartitionReader() {
-      return new BatchDataReader(task, lazyExpectedSchema(), io.value(),
-          encryptionManager.value(), caseSensitive, batchSize);
+    public InputPartitionReader<ColumnarBatch> create(CombinedScanTask task, Schema tableSchema, Schema expectedSchema,
+                                                    FileIO io, EncryptionManager encryptionManager,
+                                                    boolean caseSensitive) {
+      return new BatchDataReader(task, expectedSchema, io, encryptionManager, caseSensitive, batchSize);
     }
   }
 
diff --git a/spark/src/test/java/org/apache/iceberg/spark/data/AvroDataTest.java b/spark/src/test/java/org/apache/iceberg/spark/data/AvroDataTest.java
index bf72e3da5ae8..5e396c2a8d60 100644
--- a/spark/src/test/java/org/apache/iceberg/spark/data/AvroDataTest.java
+++ b/spark/src/test/java/org/apache/iceberg/spark/data/AvroDataTest.java
@@ -22,6 +22,7 @@
 import java.io.IOException;
 import java.util.concurrent.atomic.AtomicInteger;
 import org.apache.iceberg.Schema;
+import org.apache.iceberg.relocated.com.google.common.collect.Lists;
 import org.apache.iceberg.types.TypeUtil;
 import org.apache.iceberg.types.Types;
 import org.apache.iceberg.types.Types.ListType;
@@ -54,7 +55,7 @@ public abstract class AvroDataTest {
       optional(112, "fixed", Types.FixedType.ofLength(7)),
       optional(113, "bytes", Types.BinaryType.get()),
       optional(114, "dec_9_0", Types.DecimalType.of(9, 0)),
-      optional(115, "dec_11_2", Types.DecimalType.of(11, 2)),
+      required(115, "dec_11_2", Types.DecimalType.of(11, 2)),
       optional(116, "dec_38_10", Types.DecimalType.of(38, 10)) // spark's maximum precision
   );
 
@@ -66,6 +67,23 @@ public void testSimpleStruct() throws IOException {
     writeAndValidate(TypeUtil.assignIncreasingFreshIds(new Schema(SUPPORTED_PRIMITIVES.fields())));
   }
 
+  @Test
+  public void testStructWithRequiredFields() throws IOException {
+    writeAndValidate(TypeUtil.assignIncreasingFreshIds(new Schema(
+        Lists.transform(SUPPORTED_PRIMITIVES.fields(), Types.NestedField::asRequired))));
+  }
+
+  @Test
+  public void testStructWithOptionalFields() throws IOException {
+    writeAndValidate(TypeUtil.assignIncreasingFreshIds(new Schema(
+        Lists.transform(SUPPORTED_PRIMITIVES.fields(), Types.NestedField::asOptional))));
+  }
+
+  @Test
+  public void testNestedStruct() throws IOException {
+    writeAndValidate(TypeUtil.assignIncreasingFreshIds(new Schema(required(1, "struct", SUPPORTED_PRIMITIVES))));
+  }
+
   @Test
   public void testArray() throws IOException {
     Schema schema = new Schema(
diff --git a/spark/src/test/java/org/apache/iceberg/spark/data/RandomData.java b/spark/src/test/java/org/apache/iceberg/spark/data/RandomData.java
index 0e1e1dcdd362..01363fa40c97 100644
--- a/spark/src/test/java/org/apache/iceberg/spark/data/RandomData.java
+++ b/spark/src/test/java/org/apache/iceberg/spark/data/RandomData.java
@@ -21,6 +21,7 @@
 
 import java.math.BigDecimal;
 import java.nio.ByteBuffer;
+import java.util.Arrays;
 import java.util.Iterator;
 import java.util.List;
 import java.util.Map;
@@ -83,9 +84,22 @@ public InternalRow next() {
   }
 
   public static Iterable<Record> generate(Schema schema, int numRecords, long seed) {
+    return newIterable(() -> new RandomDataGenerator(schema, seed), schema, numRecords);
+  }
+
+  public static Iterable<Record> generateFallbackData(Schema schema, int numRecords, long seed, long numDictRecords) {
+    return newIterable(() -> new FallbackDataGenerator(schema, seed, numDictRecords), schema, numRecords);
+  }
+
+  public static Iterable<GenericData.Record> generateDictionaryEncodableData(Schema schema, int numRecords, long seed) {
+    return newIterable(() -> new DictionaryEncodedDataGenerator(schema, seed), schema, numRecords);
+  }
+
+  private static Iterable<Record> newIterable(Supplier<RandomDataGenerator> newGenerator,
+                                              Schema schema, int numRecords) {
     return () -> new Iterator<Record>() {
-      private RandomDataGenerator generator = new RandomDataGenerator(schema, seed);
       private int count = 0;
+      private RandomDataGenerator generator = newGenerator.get();
 
       @Override
       public boolean hasNext() {
@@ -103,37 +117,9 @@ public Record next() {
     };
   }
 
-  public static List<Record> generateListWithDictionaryFallbackToPlainEncoding(
-      Schema schema,
-      int numRecords,
-      long seed,
-      float fraction) {
-    DictionaryFallbackToPlainEncodingDataGenerator generator =
-        new DictionaryFallbackToPlainEncodingDataGenerator(schema, seed, numRecords, fraction);
-    List<Record> records = Lists.newArrayListWithExpectedSize(numRecords);
-    for (int i = 0; i < numRecords; i += 1) {
-      Record rec = (Record) TypeUtil.visit(schema, generator);
-      records.add(rec);
-    }
-
-    return records;
-  }
-
-  public static List<GenericData.Record> generateDictionaryEncodableData(Schema schema, int numRecords, long seed) {
-    List<GenericData.Record> records = Lists.newArrayListWithExpectedSize(numRecords);
-    DictionaryEncodedDataGenerator
-        dictionaryDataGenerator = new DictionaryEncodedDataGenerator(schema, seed);
-    for (int i = 0; i < numRecords; i += 1) {
-      GenericData.Record rec = (GenericData.Record) TypeUtil.visit(schema, dictionaryDataGenerator);
-      records.add(rec);
-    }
-    return records;
-  }
-
-  @SuppressWarnings("checkstyle:VisibilityModifier")
   private static class RandomDataGenerator extends TypeUtil.CustomOrderSchemaVisitor<Object> {
     private final Map<Type, org.apache.avro.Schema> typeToSchema;
-    final Random random;
+    private final Random random;
 
     private RandomDataGenerator(Schema schema, long seed) {
       this.typeToSchema = AvroSchemaUtil.convertTypes(schema.asStruct(), "test");
@@ -214,10 +200,6 @@ public Object primitive(Type.PrimitiveType primitive) {
       Object result = RandomUtil.generatePrimitive(primitive, random);
       // For the primitives that Avro needs a different type than Spark, fix
       // them here.
-      return getPrimitive(primitive, result);
-    }
-
-    Object getPrimitive(Type.PrimitiveType primitive, Object result) {
       switch (primitive.typeId()) {
         case FIXED:
           return new GenericData.Fixed(typeToSchema.get(primitive),
@@ -328,6 +310,42 @@ public Object primitive(Type.PrimitiveType primitive) {
     }
   }
 
+  // Returns a low-cardinality value for the primitive type: one of three values chosen by random.nextInt(3),
+  // except booleans, which are always true. Throws IllegalArgumentException for any other type id.
+  private static Object generateDictionaryEncodablePrimitive(Type.PrimitiveType primitive, Random random) {
+    int choice = random.nextInt(3);
+    switch (primitive.typeId()) {
+      case BOOLEAN:
+        return true; // doesn't really matter for booleans since they are not dictionary encoded
+      case DATE:
+      case INTEGER:
+        return choice;
+      case TIME:
+      case TIMESTAMP:
+      case LONG:
+        return (long) choice;
+      case FLOAT:
+        return (float) choice;
+      case DOUBLE:
+        return (double) choice;
+      case STRING:
+        return UTF8String.fromString(Integer.toString(choice));
+      case BINARY:
+        byte[] bytes = new byte[choice + 1];
+        Arrays.fill(bytes, (byte) choice);
+        return bytes;
+      case FIXED:
+        byte[] fixedBytes = new byte[((Types.FixedType) primitive).length()];
+        Arrays.fill(fixedBytes, (byte) choice);
+        return fixedBytes;
+      case DECIMAL:
+        int scale = ((Types.DecimalType) primitive).scale();
+        return Decimal.apply(new BigDecimal(BigInteger.valueOf(choice + 1), scale));
+      default:
+        throw new IllegalArgumentException("Cannot generate random value for unknown type: " + primitive);
+    }
+  }
+
   @SuppressWarnings("RandomModInteger")
   private static Object generatePrimitive(Type.PrimitiveType primitive,
                                          Random random) {
@@ -478,224 +496,33 @@ private static BigInteger randomUnscaled(int precision, Random random) {
   }
 
   private static class DictionaryEncodedDataGenerator extends RandomDataGenerator {
-
     private DictionaryEncodedDataGenerator(Schema schema, long seed) {
       super(schema, seed);
     }
 
     @Override
-    public Object primitive(Type.PrimitiveType primitive) {
-      Object result = generateDictionaryEncodablePrimitive(primitive, random);
-      return super.getPrimitive(primitive, result);
-    }
-
-    @SuppressWarnings("checkstyle:CyclomaticComplexity")
-    private static Object generateDictionaryEncodablePrimitive(Type.PrimitiveType primitive, Random random) {
-      // 3 choices
-      int choice = random.nextInt(3);
-      switch (primitive.typeId()) {
-        case BOOLEAN:
-          return true; // doesn't really matter for booleans since they are not dictionary encoded
-
-        case INTEGER:
-          switch (choice) {
-            case 0:
-              return 0;
-            case 1:
-              return 1;
-            case 2:
-              return 2;
-          }
-
-        case LONG:
-          switch (choice) {
-            case 0:
-              return 0L;
-            case 1:
-              return 1L;
-            case 2:
-              return 2L;
-          }
-
-        case FLOAT:
-          switch (choice) {
-            case 0:
-              return 0.0f;
-            case 1:
-              return 1.0f;
-            case 2:
-              return 2.0f;
-          }
-
-        case DOUBLE:
-          switch (choice) {
-            case 0:
-              return 0.0d;
-            case 1:
-              return 1.0d;
-            case 2:
-              return 2.0d;
-          }
-
-        case DATE:
-          switch (choice) {
-            case 0:
-              return 0;
-            case 1:
-              return 1;
-            case 2:
-              return 2;
-          }
-
-        case TIME:
-          switch (choice) {
-            case 0:
-              return 0L;
-            case 1:
-              return 1L;
-            case 2:
-              return 2L;
-          }
-
-        case TIMESTAMP:
-          switch (choice) {
-            case 0:
-              return 0L;
-            case 1:
-              return 1L;
-            case 2:
-              return 2L;
-          }
-
-        case STRING:
-          switch (choice) {
-            case 0:
-              return UTF8String.fromString("0");
-            case 1:
-              return UTF8String.fromString("1");
-            case 2:
-              return UTF8String.fromString("2");
-          }
-
-        case FIXED:
-          byte[] fixed = new byte[((Types.FixedType) primitive).length()];
-          switch (choice) {
-            case 0:
-              fixed[0] = 0;
-              return fixed;
-            case 1:
-              fixed[0] = 1;
-              return fixed;
-            case 2:
-              fixed[0] = 2;
-              return fixed;
-          }
-
-        case BINARY:
-          byte[] binary = new byte[4];
-          switch (choice) {
-            case 0:
-              binary[0] = 0;
-              return binary;
-            case 1:
-              binary[0] = 1;
-              return binary;
-            case 2:
-              binary[0] = 2;
-              return binary;
-          }
-
-        case DECIMAL:
-          Types.DecimalType type = (Types.DecimalType) primitive;
-          switch (choice) {
-            case 0:
-              BigInteger unscaled = new BigInteger("1");
-              return Decimal.apply(new BigDecimal(unscaled, type.scale()));
-            case 1:
-              unscaled = new BigInteger("2");
-              return Decimal.apply(new BigDecimal(unscaled, type.scale()));
-            case 2:
-              unscaled = new BigInteger("3");
-              return Decimal.apply(new BigDecimal(unscaled, type.scale()));
-          }
-
-        default:
-          throw new IllegalArgumentException(
-              "Cannot generate random value for unknown type: " + primitive);
-      }
+    protected Object randomValue(Type.PrimitiveType primitive, Random random) {
+      return generateDictionaryEncodablePrimitive(primitive, random);
     }
   }
 
-  private static class DictionaryFallbackToPlainEncodingDataGenerator extends RandomDataGenerator {
-    private final long numValues;
-    private final float fraction;
-    private int current;
+  private static class FallbackDataGenerator extends RandomDataGenerator {
+    private final long dictionaryEncodedRows;
+    private long rowCount = 0;
 
-    private DictionaryFallbackToPlainEncodingDataGenerator(Schema schema, long seed, int numRecords, float fraction) {
+    private FallbackDataGenerator(Schema schema, long seed, long numDictionaryEncoded) {
       super(schema, seed);
-      // for now, vectorized reads are only supported for primitive types
-      this.numValues =
-          numRecords * schema.columns().stream().filter(nestedField -> nestedField.type().isPrimitiveType()).count();
-      this.fraction = fraction;
+      this.dictionaryEncodedRows = numDictionaryEncoded;
     }
 
     @Override
-    public Object primitive(Type.PrimitiveType primitive) {
-      current++;
-      boolean dictionaryEncodable = current < fraction * numValues;
-      Object result;
-      switch (primitive.typeId()) {
-        case STRING:
-          result = dictionaryEncodable ? UTF8String.fromString("ABC") : randomString(random);
-          break;
-        case BOOLEAN:
-          result = true; // doesn't really matter for booleans since they are not dictionary encoded
-          break;
-        case INTEGER:
-        case DATE:
-          result = dictionaryEncodable ? 1 : random.nextInt();
-          break;
-        case LONG:
-        case TIME:
-        case TIMESTAMP:
-          result = dictionaryEncodable ? 1L : random.nextLong();
-          break;
-        case FLOAT:
-          result = dictionaryEncodable ? 1.0f : random.nextFloat();
-          break;
-        case DOUBLE:
-          result = dictionaryEncodable ? 1.0d : random.nextDouble();
-          break;
-        case FIXED:
-          byte[] fixed = new byte[((Types.FixedType) primitive).length()];
-          if (dictionaryEncodable) {
-            fixed[0] = 1;
-          } else {
-            random.nextBytes(fixed);
-          }
-          result = fixed;
-          break;
-        case BINARY:
-          byte[] binary;
-          if (dictionaryEncodable) {
-            binary = new byte[1];
-            binary[0] = 1;
-          } else {
-            binary = new byte[random.nextInt(50)];
-            random.nextBytes(binary);
-          }
-          result = binary;
-          break;
-        case DECIMAL:
-          Types.DecimalType type = (Types.DecimalType) primitive;
-          BigInteger unscaled = dictionaryEncodable ? new BigInteger("1") : randomUnscaled(type.precision(), random);
-          result = Decimal.apply(new BigDecimal(unscaled, type.scale()));
-          break;
-        default:
-          throw new IllegalArgumentException(
-              "Cannot generate value for unknown type: " + primitive);
+    protected Object randomValue(Type.PrimitiveType primitive, Random rand) {
+      this.rowCount += 1;
+      if (rowCount > dictionaryEncodedRows) {
+        return generatePrimitive(primitive, rand);
+      } else {
+        return generateDictionaryEncodablePrimitive(primitive, rand);
       }
-      return super.getPrimitive(primitive, result);
     }
   }
 }
diff --git a/spark/src/test/java/org/apache/iceberg/spark/data/TestHelpers.java b/spark/src/test/java/org/apache/iceberg/spark/data/TestHelpers.java
index 7b0450561d06..743865da3aba 100644
--- a/spark/src/test/java/org/apache/iceberg/spark/data/TestHelpers.java
+++ b/spark/src/test/java/org/apache/iceberg/spark/data/TestHelpers.java
@@ -29,6 +29,7 @@
 import java.time.temporal.ChronoUnit;
 import java.util.Collection;
 import java.util.Date;
+import java.util.Iterator;
 import java.util.List;
 import java.util.Map;
 import java.util.UUID;
@@ -36,7 +37,6 @@
 import org.apache.avro.generic.GenericData.Record;
 import org.apache.iceberg.Schema;
 import org.apache.iceberg.relocated.com.google.common.collect.Lists;
-import org.apache.iceberg.spark.data.vectorized.IcebergArrowColumnVector;
 import org.apache.iceberg.types.Type;
 import org.apache.iceberg.types.Types;
 import org.apache.orc.storage.serde2.io.DateWritable;
@@ -54,7 +54,6 @@
 import org.apache.spark.sql.types.MapType;
 import org.apache.spark.sql.types.StructField;
 import org.apache.spark.sql.types.StructType;
-import org.apache.spark.sql.vectorized.ColumnVector;
 import org.apache.spark.sql.vectorized.ColumnarBatch;
 import org.apache.spark.unsafe.types.UTF8String;
 import org.junit.Assert;
@@ -81,45 +80,9 @@ public static void assertEqualsSafe(Types.StructType struct, Record rec, Row row
     }
   }
 
-  public static void assertEqualsUnsafe(Types.StructType struct, List<Record> expected, ColumnarBatch batch) {
-    List<Types.NestedField> fields = struct.fields();
+  public static void assertEqualsBatch(Types.StructType struct, Iterator<Record> expected, ColumnarBatch batch) {
     for (int r = 0; r < batch.numRows(); r++) {
-
-      Record expRec = expected.get(r);
-      InternalRow actualRow = batch.getRow(r);
-
-      for (int i = 0; i < fields.size(); i += 1) {
-
-        Type fieldType = fields.get(i).type();
-        Object expectedValue = expRec.get(i);
-        if (actualRow.isNullAt(i)) {
-          Assert.assertTrue("Expect null at " + r, expectedValue == null);
-        } else {
-          Object actualValue = actualRow.get(i, convert(fieldType));
-          assertEqualsUnsafe(fieldType, expectedValue, actualValue);
-        }
-      }
-    }
-  }
-
-  public static void assertArrowVectors(Types.StructType struct, List<Record> expected,
-                                        ColumnarBatch batch) {
-    List<Types.NestedField> fields = struct.fields();
-    for (int r = 0; r < batch.numRows(); r++) {
-      Record expRec = expected.get(r);
-      InternalRow actualRow = batch.getRow(r);
-      for (int i = 0; i < fields.size(); i += 1) {
-        ColumnVector vector = batch.column(i);
-        Assert.assertTrue(vector instanceof IcebergArrowColumnVector);
-        Type fieldType = fields.get(i).type();
-        Object expectedValue = expRec.get(i);
-        if (actualRow.isNullAt(i)) {
-          Assert.assertNull(expectedValue);
-        } else {
-          Object actualValue = actualRow.get(i, convert(fieldType));
-          assertEqualsUnsafe(fieldType, expectedValue, actualValue);
-        }
-      }
+      assertEqualsUnsafe(struct, expected.next(), batch.getRow(r));
     }
   }
 
@@ -244,7 +207,7 @@ public static void assertEqualsUnsafe(Types.StructType struct, Record rec, Inter
       Type fieldType = fields.get(i).type();
 
       Object expectedValue = rec.get(i);
-      Object actualValue = row.get(i, convert(fieldType));
+      Object actualValue = row.isNullAt(i) ? null : row.get(i, convert(fieldType));
 
       assertEqualsUnsafe(fieldType, expectedValue, actualValue);
     }
diff --git a/spark/src/test/java/org/apache/iceberg/spark/data/parquet/vectorized/TestParquetDictionaryEncodedVectorizedReads.java b/spark/src/test/java/org/apache/iceberg/spark/data/parquet/vectorized/TestParquetDictionaryEncodedVectorizedReads.java
index b158ab964e3a..f7ecf4d1157f 100644
--- a/spark/src/test/java/org/apache/iceberg/spark/data/parquet/vectorized/TestParquetDictionaryEncodedVectorizedReads.java
+++ b/spark/src/test/java/org/apache/iceberg/spark/data/parquet/vectorized/TestParquetDictionaryEncodedVectorizedReads.java
@@ -19,14 +19,13 @@
 
 package org.apache.iceberg.spark.data.parquet.vectorized;
 
-import java.util.List;
 import org.apache.avro.generic.GenericData;
 import org.apache.iceberg.Schema;
 import org.apache.iceberg.spark.data.RandomData;
 
 public class TestParquetDictionaryEncodedVectorizedReads extends TestParquetVectorizedReads {
   @Override
-  List<GenericData.Record> generateData(Schema schema) {
-    return RandomData.generateDictionaryEncodableData(schema, 100000, 0L);
+  Iterable<GenericData.Record> generateData(int numRows, Schema schema) {
+    return RandomData.generateDictionaryEncodableData(schema, numRows, 0L);
   }
 }
diff --git a/spark/src/test/java/org/apache/iceberg/spark/data/parquet/vectorized/TestParquetDictionaryFallbackToPlainEncodingVectorizedReads.java b/spark/src/test/java/org/apache/iceberg/spark/data/parquet/vectorized/TestParquetDictionaryFallbackToPlainEncodingVectorizedReads.java
index 620e87b50a6d..ea7c4a3f6d90 100644
--- a/spark/src/test/java/org/apache/iceberg/spark/data/parquet/vectorized/TestParquetDictionaryFallbackToPlainEncodingVectorizedReads.java
+++ b/spark/src/test/java/org/apache/iceberg/spark/data/parquet/vectorized/TestParquetDictionaryFallbackToPlainEncodingVectorizedReads.java
@@ -21,7 +21,6 @@
 
 import java.io.File;
 import java.io.IOException;
-import java.util.List;
 import org.apache.avro.generic.GenericData;
 import org.apache.iceberg.Files;
 import org.apache.iceberg.Schema;
@@ -32,8 +31,8 @@
 
 public class TestParquetDictionaryFallbackToPlainEncodingVectorizedReads extends TestParquetVectorizedReads {
   @Override
-  public List<GenericData.Record> generateData(Schema schema) {
-    return RandomData.generateListWithDictionaryFallbackToPlainEncoding(schema, 200000, 0L, 0.05f);
+  public Iterable<GenericData.Record> generateData(int numRows, Schema schema) {
+    return RandomData.generateFallbackData(schema, numRows, 0L, numRows / 20);
   }
 
   @Override
diff --git a/spark/src/test/java/org/apache/iceberg/spark/data/parquet/vectorized/TestParquetVectorizedReads.java b/spark/src/test/java/org/apache/iceberg/spark/data/parquet/vectorized/TestParquetVectorizedReads.java
index 812137101a9e..9280addc15ea 100644
--- a/spark/src/test/java/org/apache/iceberg/spark/data/parquet/vectorized/TestParquetVectorizedReads.java
+++ b/spark/src/test/java/org/apache/iceberg/spark/data/parquet/vectorized/TestParquetVectorizedReads.java
@@ -21,9 +21,7 @@
 
 import java.io.File;
 import java.io.IOException;
-import java.util.ArrayList;
 import java.util.Iterator;
-import java.util.List;
 import org.apache.avro.generic.GenericData;
 import org.apache.iceberg.Files;
 import org.apache.iceberg.Schema;
@@ -43,6 +41,7 @@
 import org.junit.Test;
 
 public class TestParquetVectorizedReads extends AvroDataTest {
+  private static final int NUM_ROWS = 100_000_000;
 
   @Override
   protected void writeAndValidate(Schema schema) throws IOException {
@@ -51,7 +50,7 @@ protected void writeAndValidate(Schema schema) throws IOException {
         schema,
         type -> type.isMapType() && type.asMapType().keyType() != Types.StringType.get()));
 
-    List<GenericData.Record> expected = generateData(schema);
+    Iterable<GenericData.Record> expected = generateData(NUM_ROWS, schema);
 
     // write a test parquet file using iceberg writer
     File testFile = temp.newFile();
@@ -60,7 +59,7 @@ protected void writeAndValidate(Schema schema) throws IOException {
     try (FileAppender<GenericData.Record> writer = getParquetWriter(schema, testFile)) {
       writer.addAll(expected);
     }
-    assertRecordsMatch(schema, expected, testFile);
+    assertRecordsMatch(schema, NUM_ROWS, expected, testFile);
   }
 
   FileAppender<GenericData.Record> getParquetWriter(Schema schema, File testFile) throws IOException {
@@ -70,31 +69,27 @@ FileAppender<GenericData.Record> getParquetWriter(Schema schema, File testFile)
         .build();
   }
 
-  List<GenericData.Record> generateData(Schema schema) {
-    return RandomData.generateList(schema, 100000, 0L);
+  Iterable<GenericData.Record> generateData(int numRows, Schema schema) {
+    return RandomData.generate(schema, numRows, 0L);
   }
 
-  void assertRecordsMatch(Schema schema, List<GenericData.Record> expected, File testFile) throws IOException {
+  void assertRecordsMatch(Schema schema, int expectedSize, Iterable<GenericData.Record> expected, File testFile) throws IOException {
     try (CloseableIterable<ColumnarBatch> batchReader = Parquet.read(Files.localInput(testFile))
         .project(schema)
         .reuseContainers()
+        .recordsPerBatch(10000)
         .createBatchedReaderFunc(type -> VectorizedSparkParquetReaders.buildReader(schema, type, 10000))
         .build()) {
 
+      Iterator<GenericData.Record> expectedIter = expected.iterator();
       Iterator<ColumnarBatch> batches = batchReader.iterator();
       int numRowsRead = 0;
-      int numExpectedRead = 0;
       while (batches.hasNext()) {
         ColumnarBatch batch = batches.next();
         numRowsRead += batch.numRows();
-        List<GenericData.Record> expectedBatch = new ArrayList<>(batch.numRows());
-        for (int i = numExpectedRead; i < numExpectedRead + batch.numRows(); i++) {
-          expectedBatch.add(expected.get(i));
-        }
-        TestHelpers.assertArrowVectors(schema.asStruct(), expectedBatch, batch);
-        numExpectedRead += batch.numRows();
+        TestHelpers.assertEqualsBatch(schema.asStruct(), expectedIter, batch);
       }
-      Assert.assertEquals(expected.size(), numRowsRead);
+      Assert.assertEquals(expectedSize, numRowsRead);
     }
   }
 

From 58f2cf79ea49e89cf217c773a0591c4de6795428 Mon Sep 17 00:00:00 2001
From: Ryan Blue <blue@apache.org>
Date: Wed, 3 Jun 2020 16:45:57 -0700
Subject: [PATCH 09/12] Enable projection tests for vectorized Parquet.

---
 .../org/apache/iceberg/TableProperties.java   |  6 ++++
 .../VectorizedSparkParquetReaders.java        |  4 ---
 .../apache/iceberg/spark/source/Reader.java   | 32 ++++++++-----------
 .../TestParquetVectorizedReads.java           |  2 +-
 .../spark/source/TestSparkReadProjection.java | 23 ++++++++-----
 5 files changed, 36 insertions(+), 31 deletions(-)

diff --git a/core/src/main/java/org/apache/iceberg/TableProperties.java b/core/src/main/java/org/apache/iceberg/TableProperties.java
index f2ad3e5451b4..39d066893fe1 100644
--- a/core/src/main/java/org/apache/iceberg/TableProperties.java
+++ b/core/src/main/java/org/apache/iceberg/TableProperties.java
@@ -77,6 +77,12 @@ private TableProperties() {}
   public static final String SPLIT_OPEN_FILE_COST = "read.split.open-file-cost";
   public static final long SPLIT_OPEN_FILE_COST_DEFAULT = 4 * 1024 * 1024; // 4MB
 
+  public static final String PARQUET_VECTORIZATION_ENABLED = "read.parquet.vectorization.enabled";
+  public static final boolean PARQUET_VECTORIZATION_ENABLED_DEFAULT = false;
+
+  public static final String PARQUET_BATCH_SIZE = "read.parquet.vectorization.batch-size";
+  public static final int PARQUET_BATCH_SIZE_DEFAULT = 5000;
+
   public static final String OBJECT_STORE_ENABLED = "write.object-storage.enabled";
   public static final boolean OBJECT_STORE_ENABLED_DEFAULT = false;
 
diff --git a/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/VectorizedSparkParquetReaders.java b/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/VectorizedSparkParquetReaders.java
index 812d1fa3518e..a3dbf865b36d 100644
--- a/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/VectorizedSparkParquetReaders.java
+++ b/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/VectorizedSparkParquetReaders.java
@@ -126,9 +126,5 @@ public VectorizedReader<?> primitive(
       return new VectorizedArrowReader(desc, icebergField, rootAllocator,
           batchSize, /* setArrowValidityVector */ NullCheckingForGet.NULL_CHECKING_ENABLED);
     }
-
-    protected MessageType type() {
-      return parquetSchema;
-    }
   }
 }
diff --git a/spark/src/main/java/org/apache/iceberg/spark/source/Reader.java b/spark/src/main/java/org/apache/iceberg/spark/source/Reader.java
index d73180c44ecb..328312d11285 100644
--- a/spark/src/main/java/org/apache/iceberg/spark/source/Reader.java
+++ b/spark/src/main/java/org/apache/iceberg/spark/source/Reader.java
@@ -25,7 +25,6 @@
 import java.util.List;
 import java.util.Locale;
 import java.util.Map;
-import java.util.Optional;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
@@ -38,7 +37,6 @@
 import org.apache.iceberg.Table;
 import org.apache.iceberg.TableProperties;
 import org.apache.iceberg.TableScan;
-import org.apache.iceberg.arrow.vectorized.VectorizedArrowReader;
 import org.apache.iceberg.encryption.EncryptionManager;
 import org.apache.iceberg.exceptions.RuntimeIOException;
 import org.apache.iceberg.expressions.Expression;
@@ -51,6 +49,7 @@
 import org.apache.iceberg.relocated.com.google.common.collect.Lists;
 import org.apache.iceberg.spark.SparkFilters;
 import org.apache.iceberg.spark.SparkSchemaUtil;
+import org.apache.iceberg.util.PropertyUtil;
 import org.apache.spark.broadcast.Broadcast;
 import org.apache.spark.sql.SparkSession;
 import org.apache.spark.sql.catalyst.InternalRow;
@@ -93,13 +92,14 @@ class Reader implements DataSourceReader, SupportsScanColumnarBatch, SupportsPus
   private List<Expression> filterExpressions = null;
   private Filter[] pushedFilters = NO_FILTERS;
   private final boolean localityPreferred;
+  private final boolean batchReadsEnabled;
   private final int batchSize;
 
   // lazy variables
   private Schema schema = null;
   private StructType type = null; // cached because Spark accesses it multiple times
   private List<CombinedScanTask> tasks = null; // lazy cache of tasks
-  private Boolean enableBatchRead = null; // cache variable for enabling batched reads
+  private Boolean readUsingBatch = null;
 
   Reader(
       Table table, Broadcast<FileIO> io, Broadcast<EncryptionManager> encryptionManager,
@@ -155,13 +155,12 @@ class Reader implements DataSourceReader, SupportsScanColumnarBatch, SupportsPus
     this.encryptionManager = encryptionManager;
     this.caseSensitive = caseSensitive;
 
-    boolean enableBatchReadsConfig =
-        options.get("iceberg.read.parquet-vectorization.enabled").map(Boolean::parseBoolean).orElse(true);
-    if (!enableBatchReadsConfig) {
-      enableBatchRead = Boolean.FALSE;
-    }
-    Optional<String> numRecordsPerBatchOpt = options.get("iceberg.read.parquet-vectorization.batch-size");
-    this.batchSize = numRecordsPerBatchOpt.map(Integer::parseInt).orElse(VectorizedArrowReader.DEFAULT_BATCH_SIZE);
+    this.batchReadsEnabled = options.get("vectorization-enabled").map(Boolean::parseBoolean).orElse(
+        PropertyUtil.propertyAsBoolean(table.properties(),
+            TableProperties.PARQUET_VECTORIZATION_ENABLED, TableProperties.PARQUET_VECTORIZATION_ENABLED_DEFAULT));
+    this.batchSize = options.get("batch-size").map(Integer::parseInt).orElse(
+        PropertyUtil.propertyAsInt(table.properties(),
+          TableProperties.PARQUET_BATCH_SIZE, TableProperties.PARQUET_BATCH_SIZE_DEFAULT));
   }
 
   private Schema lazySchema() {
@@ -292,11 +291,7 @@ public Statistics estimateStatistics() {
 
   @Override
   public boolean enableBatchRead() {
-    return lazyCheckEnableBatchRead();
-  }
-
-  private boolean lazyCheckEnableBatchRead() {
-    if (enableBatchRead == null) {
+    if (readUsingBatch == null) {
       boolean allParquetFileScanTasks =
           tasks().stream()
               .allMatch(combinedScanTask -> !combinedScanTask.isDataTask() && combinedScanTask.files()
@@ -313,9 +308,10 @@ private boolean lazyCheckEnableBatchRead() {
 
       boolean onlyPrimitives = lazySchema().columns().stream().allMatch(c -> c.type().isPrimitiveType());
 
-      this.enableBatchRead = allParquetFileScanTasks && atLeastOneColumn && hasNoIdentityProjections && onlyPrimitives;
+      this.readUsingBatch = batchReadsEnabled && allParquetFileScanTasks && atLeastOneColumn &&
+          hasNoIdentityProjections && onlyPrimitives;
     }
-    return enableBatchRead;
+    return readUsingBatch;
   }
 
   private static void mergeIcebergHadoopConfs(
@@ -446,7 +442,7 @@ private String[] getPreferredLocations() {
     }
   }
 
-  private interface ReaderFactory<T> {
+  private interface ReaderFactory<T> extends Serializable {
     InputPartitionReader<T> create(CombinedScanTask task, Schema tableSchema, Schema expectedSchema, FileIO io,
                                    EncryptionManager encryptionManager, boolean caseSensitive);
   }
diff --git a/spark/src/test/java/org/apache/iceberg/spark/data/parquet/vectorized/TestParquetVectorizedReads.java b/spark/src/test/java/org/apache/iceberg/spark/data/parquet/vectorized/TestParquetVectorizedReads.java
index 9280addc15ea..32600900a351 100644
--- a/spark/src/test/java/org/apache/iceberg/spark/data/parquet/vectorized/TestParquetVectorizedReads.java
+++ b/spark/src/test/java/org/apache/iceberg/spark/data/parquet/vectorized/TestParquetVectorizedReads.java
@@ -41,7 +41,7 @@
 import org.junit.Test;
 
 public class TestParquetVectorizedReads extends AvroDataTest {
-  private static final int NUM_ROWS = 100_000_000;
+  private static final int NUM_ROWS = 200_000;
 
   @Override
   protected void writeAndValidate(Schema schema) throws IOException {
diff --git a/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkReadProjection.java b/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkReadProjection.java
index 41b00918e18c..8bb951818258 100644
--- a/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkReadProjection.java
+++ b/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkReadProjection.java
@@ -31,6 +31,7 @@
 import org.apache.iceberg.PartitionSpec;
 import org.apache.iceberg.Schema;
 import org.apache.iceberg.Table;
+import org.apache.iceberg.TableProperties;
 import org.apache.iceberg.avro.Avro;
 import org.apache.iceberg.data.Record;
 import org.apache.iceberg.data.avro.DataWriter;
@@ -66,14 +67,20 @@ public class TestSparkReadProjection extends TestReadProjection {
   @Parameterized.Parameters
   public static Object[][] parameters() {
     return new Object[][] {
-        new Object[] { "parquet" },
-        new Object[] { "avro" },
-        new Object[] { "orc" }
+        new Object[] { "parquet", false },
+        new Object[] { "parquet", true },
+        new Object[] { "avro", false },
+        new Object[] { "orc", false }
     };
   }
 
-  public TestSparkReadProjection(String format) {
+  private final FileFormat format;
+  private final boolean vectorized;
+
+  public TestSparkReadProjection(String format, boolean vectorized) {
     super(format);
+    this.format = FileFormat.valueOf(format.toUpperCase(Locale.ROOT));
+    this.vectorized = vectorized;
   }
 
   @BeforeClass
@@ -96,9 +103,7 @@ protected Record writeAndRead(String desc, Schema writeSchema, Schema readSchema
     File dataFolder = new File(location, "data");
     Assert.assertTrue("mkdirs should succeed", dataFolder.mkdirs());
 
-    FileFormat fileFormat = FileFormat.valueOf(format.toUpperCase(Locale.ENGLISH));
-
-    File testFile = new File(dataFolder, fileFormat.addExtension(UUID.randomUUID().toString()));
+    File testFile = new File(dataFolder, format.addExtension(UUID.randomUUID().toString()));
 
     Table table = TestTables.create(location, desc, writeSchema, PartitionSpec.unpartitioned());
     try {
@@ -106,7 +111,7 @@ protected Record writeAndRead(String desc, Schema writeSchema, Schema readSchema
       // When tables are created, the column ids are reassigned.
       Schema tableSchema = table.schema();
 
-      switch (fileFormat) {
+      switch (format) {
         case AVRO:
           try (FileAppender<Record> writer = Avro.write(localOutput(testFile))
               .createWriterFunc(DataWriter::create)
@@ -143,6 +148,8 @@ protected Record writeAndRead(String desc, Schema writeSchema, Schema readSchema
 
       table.newAppend().appendFile(file).commit();
 
+      table.updateProperties().set(TableProperties.PARQUET_VECTORIZATION_ENABLED, String.valueOf(vectorized)).commit();
+
       // rewrite the read schema for the table's reassigned ids
       Map<Integer, Integer> idMapping = Maps.newHashMap();
       for (int id : allIds(writeSchema)) {

From edecc2a4c3887fc04b5ce54ad4818b971de5850c Mon Sep 17 00:00:00 2001
From: Ryan Blue <blue@apache.org>
Date: Wed, 3 Jun 2020 16:54:52 -0700
Subject: [PATCH 10/12] Revert changes to AvroDataTest.

---
 .../apache/iceberg/spark/data/AvroDataTest.java    | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/spark/src/test/java/org/apache/iceberg/spark/data/AvroDataTest.java b/spark/src/test/java/org/apache/iceberg/spark/data/AvroDataTest.java
index 5e396c2a8d60..966a0e656dd3 100644
--- a/spark/src/test/java/org/apache/iceberg/spark/data/AvroDataTest.java
+++ b/spark/src/test/java/org/apache/iceberg/spark/data/AvroDataTest.java
@@ -45,18 +45,18 @@ public abstract class AvroDataTest {
       optional(101, "data", Types.StringType.get()),
       required(102, "b", Types.BooleanType.get()),
       optional(103, "i", Types.IntegerType.get()),
-      optional(104, "l", LongType.get()),
+      required(104, "l", LongType.get()),
       optional(105, "f", Types.FloatType.get()),
-      optional(106, "d", Types.DoubleType.get()),
+      required(106, "d", Types.DoubleType.get()),
       optional(107, "date", Types.DateType.get()),
-      optional(108, "ts", Types.TimestampType.withZone()),
-      optional(110, "s", Types.StringType.get()),
+      required(108, "ts", Types.TimestampType.withZone()),
+      required(110, "s", Types.StringType.get()),
       //required(111, "uuid", Types.UUIDType.get()),
-      optional(112, "fixed", Types.FixedType.ofLength(7)),
+      required(112, "fixed", Types.FixedType.ofLength(7)),
       optional(113, "bytes", Types.BinaryType.get()),
-      optional(114, "dec_9_0", Types.DecimalType.of(9, 0)),
+      required(114, "dec_9_0", Types.DecimalType.of(9, 0)),
       required(115, "dec_11_2", Types.DecimalType.of(11, 2)),
-      optional(116, "dec_38_10", Types.DecimalType.of(38, 10)) // spark's maximum precision
+      required(116, "dec_38_10", Types.DecimalType.of(38, 10)) // spark's maximum precision
   );
 
   @Rule

From f094507acfc77715803606729c38a690407ecf10 Mon Sep 17 00:00:00 2001
From: Ryan Blue <blue@apache.org>
Date: Wed, 3 Jun 2020 17:00:14 -0700
Subject: [PATCH 11/12] Run TestParquetScan tests with vectorization enabled.

---
 .../iceberg/spark/source/TestParquetScan.java | 25 ++++++++++++++++---
 1 file changed, 22 insertions(+), 3 deletions(-)

diff --git a/spark/src/test/java/org/apache/iceberg/spark/source/TestParquetScan.java b/spark/src/test/java/org/apache/iceberg/spark/source/TestParquetScan.java
index fcecb17c5b4d..f171fdb5766c 100644
--- a/spark/src/test/java/org/apache/iceberg/spark/source/TestParquetScan.java
+++ b/spark/src/test/java/org/apache/iceberg/spark/source/TestParquetScan.java
@@ -31,6 +31,7 @@
 import org.apache.iceberg.PartitionSpec;
 import org.apache.iceberg.Schema;
 import org.apache.iceberg.Table;
+import org.apache.iceberg.TableProperties;
 import org.apache.iceberg.hadoop.HadoopTables;
 import org.apache.iceberg.io.FileAppender;
 import org.apache.iceberg.parquet.Parquet;
@@ -48,15 +49,15 @@
 import org.junit.BeforeClass;
 import org.junit.Rule;
 import org.junit.rules.TemporaryFolder;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
 
 import static org.apache.iceberg.Files.localOutput;
 
+@RunWith(Parameterized.class)
 public class TestParquetScan extends AvroDataTest {
   private static final Configuration CONF = new Configuration();
 
-  @Rule
-  public TemporaryFolder temp = new TemporaryFolder();
-
   private static SparkSession spark = null;
 
   @BeforeClass
@@ -71,6 +72,23 @@ public static void stopSpark() {
     currentSpark.stop();
   }
 
+  @Rule
+  public TemporaryFolder temp = new TemporaryFolder();
+
+  @Parameterized.Parameters
+  public static Object[][] parameters() {
+    return new Object[][] {
+        new Object[] { false },
+        new Object[] { true },
+    };
+  }
+
+  private final boolean vectorized;
+
+  public TestParquetScan(boolean vectorized) {
+    this.vectorized = vectorized;
+  }
+
   @Override
   protected void writeAndValidate(Schema schema) throws IOException {
     Assume.assumeTrue("Cannot handle non-string map keys in parquet-avro",
@@ -108,6 +126,7 @@ protected void writeAndValidate(Schema schema) throws IOException {
         .build();
 
     table.newAppend().appendFile(file).commit();
+    table.updateProperties().set(TableProperties.PARQUET_VECTORIZATION_ENABLED, String.valueOf(vectorized)).commit();
 
     Dataset<Row> df = spark.read()
         .format("iceberg")

From 24161c76e6aa54394c00739dbf31084ccdbf550f Mon Sep 17 00:00:00 2001
From: samarthjain <samarth@apache.org>
Date: Fri, 12 Jun 2020 01:07:01 -0700
Subject: [PATCH 12/12] Cleanup and address code review comments.

Summary of changes:
1) Added the following new test cases:
   - Test for the code path where optional values are mostly null
   - Test for the case where containers are not reused for every batch
   - Test verifying that Arrow's validity vector is set correctly when setArrowValidityVector = true
2) The container-reuse logic is now similar to that of the row-based read path
3) We now always set the nullability holder. The Arrow validity vector is set only for the purpose of supplying complete Arrow vectors when requested to do so.
---
 .../arrow/vectorized/IcebergArrowVectors.java |  33 +--
 .../vectorized/VectorizedArrowReader.java     |  88 ++++----
 .../parquet/VectorizedColumnIterator.java     |  10 +-
 ...dDictionaryEncodedParquetValuesReader.java |  77 ++-----
 ...ectorizedParquetDefinitionLevelReader.java |  54 ++---
 build.gradle                                  |  15 ++
 .../parquet/VectorizedParquetReader.java      |  17 +-
 .../iceberg/parquet/VectorizedReader.java     |  16 +-
 .../data/vectorized/ArrowVectorAccessor.java  |   4 +-
 .../data/vectorized/ArrowVectorAccessors.java |   6 +-
 .../data/vectorized/ColumnarBatchReader.java  |  43 +++-
 .../vectorized/IcebergArrowColumnVector.java  |  12 +-
 .../VectorizedSparkParquetReaders.java        |  41 ++--
 .../iceberg/spark/source/BatchDataReader.java |   7 +-
 .../apache/iceberg/spark/source/Reader.java   |   5 +-
 .../apache/iceberg/spark/data/RandomData.java | 208 ++++--------------
 .../iceberg/spark/data/TestHelpers.java       |  25 ++-
 .../spark/data/TestParquetAvroReader.java     |   2 +-
 .../spark/data/TestParquetAvroWriter.java     |   2 +-
 .../spark/data/TestSparkParquetWriter.java    |   2 +-
 ...rquetDictionaryEncodedVectorizedReads.java |  15 +-
 ...allbackToPlainEncodingVectorizedReads.java |  27 ++-
 .../TestParquetVectorizedReads.java           |  92 ++++++--
 23 files changed, 381 insertions(+), 420 deletions(-)

diff --git a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/IcebergArrowVectors.java b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/IcebergArrowVectors.java
index d6fa260a58f6..a82fa57e1e43 100644
--- a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/IcebergArrowVectors.java
+++ b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/IcebergArrowVectors.java
@@ -21,8 +21,8 @@
 
 import org.apache.arrow.memory.BufferAllocator;
 import org.apache.arrow.vector.DecimalVector;
-import org.apache.arrow.vector.VarBinaryVector;
 import org.apache.arrow.vector.VarCharVector;
+import org.apache.arrow.vector.holders.NullableVarCharHolder;
 
 /**
  * The general way of getting a value at an index in the Arrow vector
@@ -64,38 +64,9 @@ public void setNullabilityHolder(NullabilityHolder nullabilityHolder) {
     }
   }
 
-  /**
-   * Extension of Arrow's @{@link VarBinaryVector}. The whole reason of having this implementation is to override the
-   * expensive {@link VarBinaryVector#isSet(int)} method.
-   */
-  public static class VarBinaryArrowVector extends VarBinaryVector {
-    private NullabilityHolder nullabilityHolder;
-
-    public VarBinaryArrowVector(
-        String name,
-        BufferAllocator allocator) {
-      super(name, allocator);
-    }
-
-    /**
-     * Same as {@link #isNull(int)}.
-     *
-     * @param index position of element
-     * @return 1 if element at given index is not null, 0 otherwise
-     */
-    @Override
-    public int isSet(int index) {
-      return nullabilityHolder.isNullAt(index) ^ 1;
-    }
-
-    public void setNullabilityHolder(NullabilityHolder nullabilityHolder) {
-      this.nullabilityHolder = nullabilityHolder;
-    }
-  }
-
   /**
    * Extension of Arrow's @{@link VarCharVector}. The reason of having this implementation is to override the expensive
-   * {@link VarCharVector#isSet(int)} method.
+   * {@link VarCharVector#isSet(int)} method called by {@link VarCharVector#get(int, NullableVarCharHolder)}
    */
   public static class VarcharArrowVector extends VarCharVector {
 
diff --git a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/VectorizedArrowReader.java b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/VectorizedArrowReader.java
index 4fd6ccef6c7e..dbde001b9764 100644
--- a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/VectorizedArrowReader.java
+++ b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/VectorizedArrowReader.java
@@ -48,9 +48,8 @@
 import org.apache.parquet.schema.PrimitiveType;
 
 /**
- * {@link VectorizedReader VectorReader(s)} that read in a batch of values into Arrow vectors.
- * It also takes care of allocating the right kind of Arrow vectors depending on the corresponding
- * Iceberg/Parquet data types.
+ * {@link VectorizedReader VectorReader(s)} that read in a batch of values into Arrow vectors. It also takes care of
+ * allocating the right kind of Arrow vectors depending on the corresponding Iceberg/Parquet data types.
  */
 public class VectorizedArrowReader implements VectorizedReader<VectorHolder> {
   public static final int DEFAULT_BATCH_SIZE = 5000;
@@ -58,14 +57,14 @@ public class VectorizedArrowReader implements VectorizedReader<VectorHolder> {
   private static final int AVERAGE_VARIABLE_WIDTH_RECORD_SIZE = 10;
 
   private final ColumnDescriptor columnDescriptor;
-  private final int batchSize;
   private final VectorizedColumnIterator vectorizedColumnIterator;
   private final Types.NestedField icebergField;
   private final BufferAllocator rootAlloc;
+
+  private int batchSize;
   private FieldVector vec;
   private Integer typeWidth;
   private ReadType readType;
-  private boolean reuseContainers = true;
   private NullabilityHolder nullabilityHolder;
 
   // In cases when Parquet employs fall back to plain encoding, we eagerly decode the dictionary encoded pages
@@ -77,13 +76,11 @@ public VectorizedArrowReader(
       ColumnDescriptor desc,
       Types.NestedField icebergField,
       BufferAllocator ra,
-      int batchSize,
       boolean setArrowValidityVector) {
     this.icebergField = icebergField;
-    this.batchSize = (batchSize == 0) ? DEFAULT_BATCH_SIZE : batchSize;
     this.columnDescriptor = desc;
     this.rootAlloc = ra;
-    this.vectorizedColumnIterator = new VectorizedColumnIterator(desc, "", batchSize, setArrowValidityVector);
+    this.vectorizedColumnIterator = new VectorizedColumnIterator(desc, "", setArrowValidityVector);
   }
 
   private VectorizedArrowReader() {
@@ -95,20 +92,35 @@ private VectorizedArrowReader() {
   }
 
   private enum ReadType {
-    FIXED_LENGTH_DECIMAL, INT_LONG_BACKED_DECIMAL, VARCHAR, VARBINARY, FIXED_WIDTH_BINARY,
-    BOOLEAN, INT, LONG, FLOAT, DOUBLE, TIMESTAMP_MILLIS
+    FIXED_LENGTH_DECIMAL,
+    INT_LONG_BACKED_DECIMAL,
+    VARCHAR,
+    VARBINARY,
+    FIXED_WIDTH_BINARY,
+    BOOLEAN,
+    INT,
+    LONG,
+    FLOAT,
+    DOUBLE,
+    TIMESTAMP_MILLIS
   }
 
   @Override
-  public VectorHolder read(int numValsToRead) {
-    boolean dictEncoded = vectorizedColumnIterator.producesDictionaryEncodedVector();
-    if (vec == null || !reuseContainers) {
-      allocateFieldVector(dictEncoded);
+  public void setBatchSize(int batchSize) {
+    this.batchSize = (batchSize == 0) ? DEFAULT_BATCH_SIZE : batchSize;
+    this.vectorizedColumnIterator.setBatchSize(batchSize);
+  }
+
+  @Override
+  public VectorHolder read(VectorHolder reuse, int numValsToRead) {
+    if (reuse == null) {
+      allocateFieldVector(this.vectorizedColumnIterator.producesDictionaryEncodedVector());
       nullabilityHolder = new NullabilityHolder(batchSize);
     } else {
       vec.setValueCount(0);
       nullabilityHolder.reset();
     }
+    boolean dictEncoded = vectorizedColumnIterator.producesDictionaryEncodedVector();
     if (vectorizedColumnIterator.hasNext()) {
       if (dictEncoded) {
         vectorizedColumnIterator.nextBatchDictionaryIds((IntVector) vec, nullabilityHolder);
@@ -123,7 +135,6 @@ public VectorHolder read(int numValsToRead) {
             vectorizedColumnIterator.nextBatchIntLongBackedDecimal(vec, typeWidth, nullabilityHolder);
             break;
           case VARBINARY:
-            ((IcebergArrowVectors.VarBinaryArrowVector) vec).setNullabilityHolder(nullabilityHolder);
             vectorizedColumnIterator.nextBatchVarWidthType(vec, nullabilityHolder);
             break;
           case VARCHAR:
@@ -131,7 +142,6 @@ public VectorHolder read(int numValsToRead) {
             vectorizedColumnIterator.nextBatchVarWidthType(vec, nullabilityHolder);
             break;
           case FIXED_WIDTH_BINARY:
-            ((IcebergArrowVectors.VarBinaryArrowVector) vec).setNullabilityHolder(nullabilityHolder);
             vectorizedColumnIterator.nextBatchFixedWidthBinary(vec, typeWidth, nullabilityHolder);
             break;
           case BOOLEAN:
@@ -183,7 +193,7 @@ private void allocateFieldVector(boolean dictionaryEncodedVector) {
             //TODO: Possibly use the uncompressed page size info to set the initial capacity
             vec.setInitialCapacity(batchSize * AVERAGE_VARIABLE_WIDTH_RECORD_SIZE);
             vec.allocateNewSafe();
-            this.readType =  ReadType.VARCHAR;
+            this.readType = ReadType.VARCHAR;
             this.typeWidth = UNKNOWN_WIDTH;
             break;
           case INT_8:
@@ -191,31 +201,31 @@ private void allocateFieldVector(boolean dictionaryEncodedVector) {
           case INT_32:
             this.vec = arrowField.createVector(rootAlloc);
             ((IntVector) vec).allocateNew(batchSize);
-            this.readType =  ReadType.INT;
+            this.readType = ReadType.INT;
             this.typeWidth = (int) IntVector.TYPE_WIDTH;
             break;
           case DATE:
             this.vec = arrowField.createVector(rootAlloc);
             ((DateDayVector) vec).allocateNew(batchSize);
-            this.readType =  ReadType.INT;
+            this.readType = ReadType.INT;
             this.typeWidth = (int) IntVector.TYPE_WIDTH;
             break;
           case INT_64:
             this.vec = arrowField.createVector(rootAlloc);
             ((BigIntVector) vec).allocateNew(batchSize);
-            this.readType =  ReadType.LONG;
+            this.readType = ReadType.LONG;
             this.typeWidth = (int) BigIntVector.TYPE_WIDTH;
             break;
           case TIMESTAMP_MILLIS:
             this.vec = arrowField.createVector(rootAlloc);
             ((BigIntVector) vec).allocateNew(batchSize);
-            this.readType =  ReadType.TIMESTAMP_MILLIS;
+            this.readType = ReadType.TIMESTAMP_MILLIS;
             this.typeWidth = (int) BigIntVector.TYPE_WIDTH;
             break;
           case TIMESTAMP_MICROS:
             this.vec = arrowField.createVector(rootAlloc);
             ((TimeStampMicroTZVector) vec).allocateNew(batchSize);
-            this.readType =  ReadType.LONG;
+            this.readType = ReadType.LONG;
             this.typeWidth = (int) BigIntVector.TYPE_WIDTH;
             break;
           case DECIMAL:
@@ -226,15 +236,15 @@ private void allocateFieldVector(boolean dictionaryEncodedVector) {
             switch (primitive.getPrimitiveTypeName()) {
               case BINARY:
               case FIXED_LEN_BYTE_ARRAY:
-                this.readType =  ReadType.FIXED_LENGTH_DECIMAL;
+                this.readType = ReadType.FIXED_LENGTH_DECIMAL;
                 this.typeWidth = primitive.getTypeLength();
                 break;
               case INT64:
-                this.readType =  ReadType.INT_LONG_BACKED_DECIMAL;
+                this.readType = ReadType.INT_LONG_BACKED_DECIMAL;
                 this.typeWidth = (int) BigIntVector.TYPE_WIDTH;
                 break;
               case INT32:
-                this.readType =  ReadType.INT_LONG_BACKED_DECIMAL;
+                this.readType = ReadType.INT_LONG_BACKED_DECIMAL;
                 this.typeWidth = (int) IntVector.TYPE_WIDTH;
                 break;
               default:
@@ -250,48 +260,48 @@ private void allocateFieldVector(boolean dictionaryEncodedVector) {
         switch (primitive.getPrimitiveTypeName()) {
           case FIXED_LEN_BYTE_ARRAY:
             int len = ((Types.FixedType) icebergField.type()).length();
-            this.vec = new IcebergArrowVectors.VarBinaryArrowVector(icebergField.name(), rootAlloc);
+            this.vec = arrowField.createVector(rootAlloc);
             vec.setInitialCapacity(batchSize * len);
             vec.allocateNew();
-            this.readType =  ReadType.FIXED_WIDTH_BINARY;
+            this.readType = ReadType.FIXED_WIDTH_BINARY;
             this.typeWidth = len;
             break;
           case BINARY:
-            this.vec = new IcebergArrowVectors.VarBinaryArrowVector(icebergField.name(), rootAlloc);
+            this.vec = arrowField.createVector(rootAlloc);
             //TODO: Possibly use the uncompressed page size info to set the initial capacity
             vec.setInitialCapacity(batchSize * AVERAGE_VARIABLE_WIDTH_RECORD_SIZE);
             vec.allocateNewSafe();
-            this.readType =  ReadType.VARBINARY;
+            this.readType = ReadType.VARBINARY;
             this.typeWidth = UNKNOWN_WIDTH;
             break;
           case INT32:
             this.vec = arrowField.createVector(rootAlloc);
             ((IntVector) vec).allocateNew(batchSize);
-            this.readType =  ReadType.INT;
+            this.readType = ReadType.INT;
             this.typeWidth = (int) IntVector.TYPE_WIDTH;
             break;
           case FLOAT:
             this.vec = arrowField.createVector(rootAlloc);
             ((Float4Vector) vec).allocateNew(batchSize);
-            this.readType =  ReadType.FLOAT;
+            this.readType = ReadType.FLOAT;
             this.typeWidth = (int) Float4Vector.TYPE_WIDTH;
             break;
           case BOOLEAN:
             this.vec = arrowField.createVector(rootAlloc);
             ((BitVector) vec).allocateNew(batchSize);
-            this.readType =  ReadType.BOOLEAN;
+            this.readType = ReadType.BOOLEAN;
             this.typeWidth = UNKNOWN_WIDTH;
             break;
           case INT64:
             this.vec = arrowField.createVector(rootAlloc);
             ((BigIntVector) vec).allocateNew(batchSize);
-            this.readType =  ReadType.LONG;
+            this.readType = ReadType.LONG;
             this.typeWidth = (int) BigIntVector.TYPE_WIDTH;
             break;
           case DOUBLE:
             this.vec = arrowField.createVector(rootAlloc);
             ((Float8Vector) vec).allocateNew(batchSize);
-            this.readType =  ReadType.DOUBLE;
+            this.readType = ReadType.DOUBLE;
             this.typeWidth = (int) Float8Vector.TYPE_WIDTH;
             break;
           default:
@@ -309,11 +319,6 @@ public void setRowGroupInfo(PageReadStore source, Map<ColumnPath, ColumnChunkMet
         !ParquetUtil.hasNonDictionaryPages(chunkMetaData));
   }
 
-  @Override
-  public void reuseContainers(boolean reuse) {
-    this.reuseContainers = reuse;
-  }
-
   @Override
   public void close() {
     if (vec != null) {
@@ -334,7 +339,7 @@ private static final class NullVectorReader extends VectorizedArrowReader {
     private static final NullVectorReader INSTANCE = new NullVectorReader();
 
     @Override
-    public VectorHolder read(int numValsToRead) {
+    public VectorHolder read(VectorHolder reuse, int numValsToRead) {
       return VectorHolder.dummyHolder(numValsToRead);
     }
 
@@ -346,6 +351,9 @@ public void setRowGroupInfo(PageReadStore source, Map<ColumnPath, ColumnChunkMet
     public String toString() {
       return "NullReader";
     }
+
+    @Override
+    public void setBatchSize(int batchSize) {} // no-op: read() sizes its dummy holder from numValsToRead alone
   }
 
 }
diff --git a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedColumnIterator.java b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedColumnIterator.java
index 57f55a39f589..cb9d27890a4a 100644
--- a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedColumnIterator.java
+++ b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedColumnIterator.java
@@ -36,17 +36,19 @@
 public class VectorizedColumnIterator extends BaseColumnIterator {
 
   private final VectorizedPageIterator vectorizedPageIterator;
-  private final int batchSize;
+  private int batchSize;
 
-  public VectorizedColumnIterator(ColumnDescriptor desc, String writerVersion, int batchSize,
-                                  boolean setArrowValidityVector) {
+  public VectorizedColumnIterator(ColumnDescriptor desc, String writerVersion, boolean setArrowValidityVector) {
     super(desc);
     Preconditions.checkArgument(desc.getMaxRepetitionLevel() == 0,
         "Only non-nested columns are supported for vectorized reads");
-    this.batchSize = batchSize;
     this.vectorizedPageIterator = new VectorizedPageIterator(desc, writerVersion, setArrowValidityVector);
   }
 
+  public void setBatchSize(int batchSize) {
+    this.batchSize = batchSize;
+  }
+
   public Dictionary setRowGroupInfo(PageReader store, boolean allPagesDictEncoded) {
     // setPageSource can result in a data page read. If that happens, we need
     // to know in advance whether all the pages in the row group are dictionary encoded or not
diff --git a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedDictionaryEncodedParquetValuesReader.java b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedDictionaryEncodedParquetValuesReader.java
index dab8e4c853ca..52e389ece40b 100644
--- a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedDictionaryEncodedParquetValuesReader.java
+++ b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedDictionaryEncodedParquetValuesReader.java
@@ -52,15 +52,14 @@ void readBatchOfDictionaryIds(IntVector intVector, int startOffset, int numValue
         case RLE:
           for (int i = 0; i < numValues; i++) {
             intVector.set(idx, currentValue);
-            nullabilityHolder.setNotNull(idx);
+            setNotNull(intVector, nullabilityHolder, idx);
             idx++;
           }
           break;
         case PACKED:
           for (int i = 0; i < numValues; i++) {
-            intVector.set(idx, packedValuesBuffer[packedValuesBufferIdx]);
-            nullabilityHolder.setNotNull(idx);
-            packedValuesBufferIdx++;
+            intVector.set(idx, packedValuesBuffer[packedValuesBufferIdx++]);
+            setNotNull(intVector, nullabilityHolder, idx);
             idx++;
           }
           break;
@@ -83,11 +82,7 @@ void readBatchOfDictionaryEncodedLongs(FieldVector vector, int startOffset, int
         case RLE:
           for (int i = 0; i < numValues; i++) {
             vector.getDataBuffer().setLong(idx * typeWidth, dict.decodeToLong(currentValue));
-            if (setArrowValidityVector) {
-              BitVectorHelper.setValidityBitToOne(vector.getValidityBuffer(), idx);
-            } else {
-              nullabilityHolder.setNotNull(idx);
-            }
+            setNotNull(vector, nullabilityHolder, idx);
             idx++;
           }
           break;
@@ -95,11 +90,7 @@ void readBatchOfDictionaryEncodedLongs(FieldVector vector, int startOffset, int
           for (int i = 0; i < numValues; i++) {
             vector.getDataBuffer()
                 .setLong(idx * typeWidth, dict.decodeToLong(packedValuesBuffer[packedValuesBufferIdx++]));
-            if (setArrowValidityVector) {
-              BitVectorHelper.setValidityBitToOne(vector.getValidityBuffer(), idx);
-            } else {
-              nullabilityHolder.setNotNull(idx);
-            }
+            setNotNull(vector, nullabilityHolder, idx);
             idx++;
           }
           break;
@@ -123,11 +114,7 @@ void readBatchOfDictionaryEncodedTimestampMillis(
         case RLE:
           for (int i = 0; i < numValues; i++) {
             vector.getDataBuffer().setLong(idx * typeWidth, dict.decodeToLong(currentValue) * 1000);
-            if (setArrowValidityVector) {
-              BitVectorHelper.setValidityBitToOne(vector.getValidityBuffer(), idx);
-            } else {
-              nullabilityHolder.setNotNull(idx);
-            }
+            setNotNull(vector, nullabilityHolder, idx);
             idx++;
           }
           break;
@@ -135,11 +122,7 @@ void readBatchOfDictionaryEncodedTimestampMillis(
           for (int i = 0; i < numValues; i++) {
             vector.getDataBuffer()
                 .setLong(idx * typeWidth, dict.decodeToLong(packedValuesBuffer[packedValuesBufferIdx++]) * 1000);
-            if (setArrowValidityVector) {
-              BitVectorHelper.setValidityBitToOne(vector.getValidityBuffer(), idx);
-            } else {
-              nullabilityHolder.setNotNull(idx);
-            }
+            setNotNull(vector, nullabilityHolder, idx);
             idx++;
           }
           break;
@@ -162,11 +145,7 @@ void readBatchOfDictionaryEncodedIntegers(FieldVector vector, int startOffset, i
         case RLE:
           for (int i = 0; i < num; i++) {
             vector.getDataBuffer().setInt(idx * typeWidth, dict.decodeToInt(currentValue));
-            if (setArrowValidityVector) {
-              BitVectorHelper.setValidityBitToOne(vector.getValidityBuffer(), idx);
-            } else {
-              nullabilityHolder.setNotNull(idx);
-            }
+            setNotNull(vector, nullabilityHolder, idx);
             idx++;
           }
           break;
@@ -174,11 +153,7 @@ void readBatchOfDictionaryEncodedIntegers(FieldVector vector, int startOffset, i
           for (int i = 0; i < num; i++) {
             vector.getDataBuffer()
                 .setInt(idx * typeWidth, dict.decodeToInt(packedValuesBuffer[packedValuesBufferIdx++]));
-            if (setArrowValidityVector) {
-              BitVectorHelper.setValidityBitToOne(vector.getValidityBuffer(), idx);
-            } else {
-              nullabilityHolder.setNotNull(idx);
-            }
+            setNotNull(vector, nullabilityHolder, idx);
             idx++;
           }
           break;
@@ -201,11 +176,7 @@ void readBatchOfDictionaryEncodedFloats(FieldVector vector, int startOffset, int
         case RLE:
           for (int i = 0; i < num; i++) {
             vector.getDataBuffer().setFloat(idx * typeWidth, dict.decodeToFloat(currentValue));
-            if (setArrowValidityVector) {
-              BitVectorHelper.setValidityBitToOne(vector.getValidityBuffer(), idx);
-            } else {
-              nullabilityHolder.setNotNull(idx);
-            }
+            setNotNull(vector, nullabilityHolder, idx);
             idx++;
           }
           break;
@@ -213,11 +184,7 @@ void readBatchOfDictionaryEncodedFloats(FieldVector vector, int startOffset, int
           for (int i = 0; i < num; i++) {
             vector.getDataBuffer()
                 .setFloat(idx * typeWidth, dict.decodeToFloat(packedValuesBuffer[packedValuesBufferIdx++]));
-            if (setArrowValidityVector) {
-              BitVectorHelper.setValidityBitToOne(vector.getValidityBuffer(), idx);
-            } else {
-              nullabilityHolder.setNotNull(idx);
-            }
+            setNotNull(vector, nullabilityHolder, idx);
             idx++;
           }
           break;
@@ -240,12 +207,7 @@ void readBatchOfDictionaryEncodedDoubles(FieldVector vector, int startOffset, in
         case RLE:
           for (int i = 0; i < num; i++) {
             vector.getDataBuffer().setDouble(idx * typeWidth, dict.decodeToDouble(currentValue));
-            nullabilityHolder.setNotNull(idx);
-            if (setArrowValidityVector) {
-              BitVectorHelper.setValidityBitToOne(vector.getValidityBuffer(), idx);
-            } else {
-              nullabilityHolder.setNotNull(idx);
-            }
+            setNotNull(vector, nullabilityHolder, idx);
             idx++;
           }
           break;
@@ -253,11 +215,7 @@ void readBatchOfDictionaryEncodedDoubles(FieldVector vector, int startOffset, in
           for (int i = 0; i < num; i++) {
             vector.getDataBuffer()
                 .setDouble(idx * typeWidth, dict.decodeToDouble(packedValuesBuffer[packedValuesBufferIdx++]));
-            if (setArrowValidityVector) {
-              BitVectorHelper.setValidityBitToOne(vector.getValidityBuffer(), idx);
-            } else {
-              nullabilityHolder.setNotNull(idx);
-            }
+            setNotNull(vector, nullabilityHolder, idx);
             idx++;
           }
           break;
@@ -304,10 +262,13 @@ private void setFixedWidthBinary(
     vector.getDataBuffer()
         .setBytes(idx * typeWidth, buffer.array(),
             buffer.position() + buffer.arrayOffset(), buffer.limit() - buffer.position());
+    setNotNull(vector, nullabilityHolder, idx);
+  }
+
+  private void setNotNull(FieldVector vector, NullabilityHolder nullabilityHolder, int idx) {
+    nullabilityHolder.setNotNull(idx);
     if (setArrowValidityVector) {
       BitVectorHelper.setValidityBitToOne(vector.getValidityBuffer(), idx);
-    } else {
-      nullabilityHolder.setNotNull(idx);
     }
   }
 
@@ -407,7 +368,7 @@ void readBatchOfDictionaryEncodedIntLongBackedDecimals(FieldVector vector, int t
             ((DecimalVector) vector).set(
                 idx,
                 typeWidth == Integer.BYTES ?
-                    dict.decodeToInt(currentValue)
+                    dict.decodeToInt(packedValuesBuffer[packedValuesBufferIdx++])
                     : dict.decodeToLong(packedValuesBuffer[packedValuesBufferIdx++]));
             nullabilityHolder.setNotNull(idx);
             idx++;
diff --git a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedParquetDefinitionLevelReader.java b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedParquetDefinitionLevelReader.java
index bbaaaa38ba7f..8a263483f89d 100644
--- a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedParquetDefinitionLevelReader.java
+++ b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedParquetDefinitionLevelReader.java
@@ -64,10 +64,9 @@ public void readBatchOfDictionaryIds(
           for (int i = 0; i < numValues; i++) {
             if (packedValuesBuffer[packedValuesBufferIdx++] == maxDefLevel) {
               vector.getDataBuffer().setInt(idx * IntVector.TYPE_WIDTH, dictionaryEncodedValuesReader.readInteger());
+              nullabilityHolder.setNotNull(idx);
               if (setArrowValidityVector) {
                 BitVectorHelper.setValidityBitToOne(vector.getValidityBuffer(), idx);
-              } else {
-                nullabilityHolder.setNotNull(idx);
               }
             } else {
               setNull(nullabilityHolder, idx, vector.getValidityBuffer());
@@ -106,10 +105,9 @@ public void readBatchOfLongs(
           for (int i = 0; i < numValues; ++i) {
             if (packedValuesBuffer[packedValuesBufferIdx++] == maxDefLevel) {
               vector.getDataBuffer().setLong(bufferIdx * typeWidth, valuesReader.readLong());
+              nullabilityHolder.setNotNull(bufferIdx);
               if (setArrowValidityVector) {
                 BitVectorHelper.setValidityBitToOne(vector.getValidityBuffer(), bufferIdx);
-              } else {
-                nullabilityHolder.setNotNull(bufferIdx);
               }
             } else {
               setNull(nullabilityHolder, bufferIdx, vector.getValidityBuffer());
@@ -140,12 +138,11 @@ public void readBatchOfTimestampMillis(final FieldVector vector, final int start
             for (int i = 0; i < numValues; i++) {
               vector.getDataBuffer().setLong(bufferIdx * typeWidth, valuesReader.readLong() * 1000);
             }
+            nullabilityHolder.setNotNulls(bufferIdx, numValues);
             if (setArrowValidityVector) {
               for (int i = 0; i < numValues; i++) {
                 BitVectorHelper.setValidityBitToOne(validityBuffer, bufferIdx + i);
               }
-            } else {
-              nullabilityHolder.setNotNulls(bufferIdx, numValues);
             }
           } else {
             setNulls(nullabilityHolder, bufferIdx, numValues, validityBuffer);
@@ -156,10 +153,9 @@ public void readBatchOfTimestampMillis(final FieldVector vector, final int start
           for (int i = 0; i < numValues; i++) {
             if (packedValuesBuffer[packedValuesBufferIdx++] == maxDefLevel) {
               vector.getDataBuffer().setLong(bufferIdx * typeWidth, valuesReader.readLong() * 1000);
+              nullabilityHolder.setNotNull(bufferIdx);
               if (setArrowValidityVector) {
                 BitVectorHelper.setValidityBitToOne(vector.getValidityBuffer(), bufferIdx);
-              } else {
-                nullabilityHolder.setNotNull(bufferIdx);
               }
             } else {
               setNull(nullabilityHolder, bufferIdx, vector.getValidityBuffer());
@@ -204,10 +200,9 @@ public void readBatchOfDictionaryEncodedLongs(
             if (packedValuesBuffer[packedValuesBufferIdx++] == maxDefLevel) {
               vector.getDataBuffer().setLong(idx * typeWidth,
                   dict.decodeToLong(dictionaryEncodedValuesReader.readInteger()));
+              nullabilityHolder.setNotNull(idx);
               if (setArrowValidityVector) {
                 BitVectorHelper.setValidityBitToOne(vector.getValidityBuffer(), idx);
-              } else {
-                nullabilityHolder.setNotNull(idx);
               }
             } else {
               setNull(nullabilityHolder, idx, validityBuffer);
@@ -252,10 +247,9 @@ public void readBatchOfDictionaryEncodedTimestampMillis(
             if (packedValuesBuffer[packedValuesBufferIdx++] == maxDefLevel) {
               vector.getDataBuffer().setLong(idx * typeWidth,
                   dict.decodeToLong(dictionaryEncodedValuesReader.readInteger()) * 1000);
+              nullabilityHolder.setNotNull(idx);
               if (setArrowValidityVector) {
                 BitVectorHelper.setValidityBitToOne(vector.getValidityBuffer(), idx);
-              } else {
-                nullabilityHolder.setNotNull(idx);
               }
             } else {
               setNull(nullabilityHolder, idx, validityBuffer);
@@ -294,10 +288,9 @@ public void readBatchOfIntegers(final FieldVector vector, final int startOffset,
           for (int i = 0; i < num; ++i) {
             if (packedValuesBuffer[packedValuesBufferIdx++] == maxDefLevel) {
               vector.getDataBuffer().setInt(bufferIdx * typeWidth, valuesReader.readInteger());
+              nullabilityHolder.setNotNull(bufferIdx);
               if (setArrowValidityVector) {
                 BitVectorHelper.setValidityBitToOne(vector.getValidityBuffer(), bufferIdx);
-              } else {
-                nullabilityHolder.setNotNull(bufferIdx);
               }
             } else {
               setNull(nullabilityHolder, bufferIdx, vector.getValidityBuffer());
@@ -341,10 +334,9 @@ public void readBatchOfDictionaryEncodedIntegers(
             if (packedValuesBuffer[packedValuesBufferIdx++] == maxDefLevel) {
               vector.getDataBuffer()
                   .setInt(idx * typeWidth, dict.decodeToInt(dictionaryEncodedValuesReader.readInteger()));
+              nullabilityHolder.setNotNull(idx);
               if (setArrowValidityVector) {
                 BitVectorHelper.setValidityBitToOne(vector.getValidityBuffer(), idx);
-              } else {
-                nullabilityHolder.setNotNull(idx);
               }
             } else {
               setNull(nullabilityHolder, idx, vector.getValidityBuffer());
@@ -383,10 +375,9 @@ public void readBatchOfFloats(final FieldVector vector, final int startOffset, f
           for (int i = 0; i < num; ++i) {
             if (packedValuesBuffer[packedValuesBufferIdx++] == maxDefLevel) {
               vector.getDataBuffer().setFloat(bufferIdx * typeWidth, valuesReader.readFloat());
+              nullabilityHolder.setNotNull(bufferIdx);
               if (setArrowValidityVector) {
                 BitVectorHelper.setValidityBitToOne(vector.getValidityBuffer(), bufferIdx);
-              } else {
-                nullabilityHolder.setNotNull(bufferIdx);
               }
             } else {
               setNull(nullabilityHolder, bufferIdx, vector.getValidityBuffer());
@@ -431,10 +422,9 @@ public void readBatchOfDictionaryEncodedFloats(
             if (packedValuesBuffer[packedValuesBufferIdx++] == maxDefLevel) {
               vector.getDataBuffer()
                   .setFloat(idx * typeWidth, dict.decodeToFloat(dictionaryEncodedValuesReader.readInteger()));
+              nullabilityHolder.setNotNull(idx);
               if (setArrowValidityVector) {
                 BitVectorHelper.setValidityBitToOne(vector.getValidityBuffer(), idx);
-              } else {
-                nullabilityHolder.setNotNull(idx);
               }
             } else {
               setNull(nullabilityHolder, idx, validityBuffer);
@@ -474,10 +464,9 @@ public void readBatchOfDoubles(
           for (int i = 0; i < num; ++i) {
             if (packedValuesBuffer[packedValuesBufferIdx++] == maxDefLevel) {
               vector.getDataBuffer().setDouble(bufferIdx * typeWidth, valuesReader.readDouble());
+              nullabilityHolder.setNotNull(bufferIdx);
               if (setArrowValidityVector) {
                 BitVectorHelper.setValidityBitToOne(vector.getValidityBuffer(),  bufferIdx);
-              } else {
-                nullabilityHolder.setNotNull(bufferIdx);
               }
             } else {
               setNull(nullabilityHolder, bufferIdx, vector.getValidityBuffer());
@@ -521,10 +510,9 @@ public void readBatchOfDictionaryEncodedDoubles(
             if (packedValuesBuffer[packedValuesBufferIdx++] == maxDefLevel) {
               vector.getDataBuffer()
                   .setDouble(idx * typeWidth, dict.decodeToDouble(dictionaryEncodedValuesReader.readInteger()));
+              nullabilityHolder.setNotNull(idx);
               if (setArrowValidityVector) {
                 BitVectorHelper.setValidityBitToOne(vector.getValidityBuffer(), idx);
-              } else {
-                nullabilityHolder.setNotNull(idx);
               }
             } else {
               setNull(nullabilityHolder, idx, vector.getValidityBuffer());
@@ -608,10 +596,9 @@ public void readBatchOfDictionaryEncodedFixedWidthBinary(
               ByteBuffer buffer = dict.decodeToBinary(dictionaryEncodedValuesReader.readInteger()).toByteBuffer();
               vector.getDataBuffer().setBytes(idx * typeWidth, buffer.array(),
                   buffer.position() + buffer.arrayOffset(), buffer.limit() - buffer.position());
+              nullabilityHolder.setNotNull(idx);
               if (setArrowValidityVector) {
                 BitVectorHelper.setValidityBitToOne(vector.getValidityBuffer(), idx);
-              } else {
-                nullabilityHolder.setNotNull(idx);
               }
             } else {
               setNull(nullabilityHolder, idx, vector.getValidityBuffer());
@@ -768,10 +755,9 @@ private void setVarWidthBinaryValue(FieldVector vector, ValuesAsBytesReader valu
         buffer.limit() - buffer.position());
     // Similarly, we need to get the latest reference to the validity buffer as well
     // since reallocation changes reference of the validity buffers as well.
+    nullabilityHolder.setNotNull(bufferIdx);
     if (setArrowValidityVector) {
       BitVectorHelper.setValidityBitToOne(vector.getValidityBuffer(), bufferIdx);
-    } else {
-      nullabilityHolder.setNotNull(bufferIdx);
     }
   }
 
@@ -862,10 +848,9 @@ private void setIntLongBackedDecimal(FieldVector vector, int typeWidth, Nullabil
                                        ValuesAsBytesReader valuesReader, int bufferIdx, byte[] byteArray) {
     valuesReader.getBuffer(typeWidth).get(byteArray, 0, typeWidth);
     vector.getDataBuffer().setBytes(bufferIdx * DecimalVector.TYPE_WIDTH, byteArray);
+    nullabilityHolder.setNotNull(bufferIdx);
     if (setArrowValidityVector) {
       BitVectorHelper.setValidityBitToOne(vector.getValidityBuffer(), bufferIdx);
-    } else {
-      nullabilityHolder.setNotNull(bufferIdx);
     }
   }
 
@@ -976,12 +961,11 @@ private void setNextNValuesInVector(
     if (currentValue == maxDefLevel) {
       ByteBuffer buffer = valuesReader.getBuffer(numValues * typeWidth);
       vector.getDataBuffer().setBytes(bufferIdx * typeWidth, buffer);
+      nullabilityHolder.setNotNulls(bufferIdx, numValues);
       if (setArrowValidityVector) {
         for (int i = 0; i < numValues; i++) {
           BitVectorHelper.setValidityBitToOne(validityBuffer, bufferIdx + i);
         }
-      } else {
-        nullabilityHolder.setNotNulls(bufferIdx, numValues);
       }
     } else {
       setNulls(nullabilityHolder, bufferIdx, numValues, validityBuffer);
@@ -989,20 +973,18 @@ private void setNextNValuesInVector(
   }
 
   private void setNull(NullabilityHolder nullabilityHolder, int bufferIdx, ArrowBuf validityBuffer) {
+    nullabilityHolder.setNull(bufferIdx);
     if (setArrowValidityVector) {
       BitVectorHelper.setValidityBit(validityBuffer, bufferIdx, 0);
-    } else {
-      nullabilityHolder.setNull(bufferIdx);
     }
   }
 
   private void setNulls(NullabilityHolder nullabilityHolder, int idx, int numValues, ArrowBuf validityBuffer) {
+    nullabilityHolder.setNulls(idx, numValues);
     if (setArrowValidityVector) {
       for (int i = 0; i < numValues; i++) {
         BitVectorHelper.setValidityBit(validityBuffer, idx + i, 0);
       }
-    } else {
-      nullabilityHolder.setNulls(idx, numValues);
     }
   }
 
diff --git a/build.gradle b/build.gradle
index b62213e96166..f76c719fced3 100644
--- a/build.gradle
+++ b/build.gradle
@@ -429,6 +429,18 @@ project(':iceberg-spark') {
       exclude group: 'org.apache.avro', module: 'avro'
     }
   }
+
+  test {
+    // For vectorized reads
+    // Allow unsafe memory access to avoid the costly check arrow does to check if index is within bounds
+    systemProperty("arrow.enable_unsafe_memory_access", "true")
+    // Disable expensive null check for every get(index) call.
+    // Iceberg manages nullability checks itself instead of relying on arrow.
+    systemProperty("arrow.enable_null_check_for_get", "false")
+
+    // Vectorized reads need more memory
+    maxHeapSize '2500m'
+  }
 }
 
 project(':iceberg-spark3') {
@@ -463,6 +475,9 @@ project(':iceberg-spark3') {
     // Disable expensive null check for every get(index) call.
     // Iceberg manages nullability checks itself instead of relying on arrow.
     systemProperty("arrow.enable_null_check_for_get", "false")
+
+    // Vectorized reads need more memory
+    maxHeapSize '2500m'
   }
 }
 
diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/VectorizedParquetReader.java b/parquet/src/main/java/org/apache/iceberg/parquet/VectorizedParquetReader.java
index c3f87eee8296..6cb9da574caa 100644
--- a/parquet/src/main/java/org/apache/iceberg/parquet/VectorizedParquetReader.java
+++ b/parquet/src/main/java/org/apache/iceberg/parquet/VectorizedParquetReader.java
@@ -90,6 +90,7 @@ private static class FileIterator<T> implements CloseableIterator<T> {
     private final long totalValues;
     private final int batchSize;
     private final List<Map<ColumnPath, ColumnChunkMetaData>> columnChunkMetadata;
+    private final boolean reuseContainers;
     private int nextRowGroup = 0;
     private long nextRowGroupStart = 0;
     private long valuesRead = 0;
@@ -98,13 +99,15 @@ private static class FileIterator<T> implements CloseableIterator<T> {
     FileIterator(ReadConf conf) {
       this.reader = conf.reader();
       this.shouldSkip = conf.shouldSkip();
-      this.model = conf.vectorizedModel();
       this.totalValues = conf.totalValues();
-      this.model.reuseContainers(conf.reuseContainers());
+      this.reuseContainers = conf.reuseContainers();
+      this.model = conf.vectorizedModel();
       this.batchSize = conf.batchSize();
+      this.model.setBatchSize(this.batchSize);
       this.columnChunkMetadata = conf.columnChunkMetadataForRowGroups();
     }
 
+
     @Override
     public boolean hasNext() {
       return valuesRead < totalValues;
@@ -118,10 +121,16 @@ public T next() {
       if (valuesRead >= nextRowGroupStart) {
         advance();
       }
-      long numValuesToRead = Math.min(nextRowGroupStart - valuesRead, batchSize);
+
       // batchSize is an integer, so casting to integer is safe
-      this.last = model.read((int) numValuesToRead);
+      int numValuesToRead = (int) Math.min(nextRowGroupStart - valuesRead, batchSize);
+      if (reuseContainers) {
+        this.last = model.read(last, numValuesToRead);
+      } else {
+        this.last = model.read(null, numValuesToRead);
+      }
       valuesRead += numValuesToRead;
+
       return last;
     }
 
diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/VectorizedReader.java b/parquet/src/main/java/org/apache/iceberg/parquet/VectorizedReader.java
index 3eb3303d4eef..25c16f09bfb3 100644
--- a/parquet/src/main/java/org/apache/iceberg/parquet/VectorizedReader.java
+++ b/parquet/src/main/java/org/apache/iceberg/parquet/VectorizedReader.java
@@ -31,25 +31,23 @@ public interface VectorizedReader<T> {
 
   /**
    * Reads a batch of type @param &lt;T&gt; and of size numRows
+   *
+   * @param reuse   container for the last batch to be reused for next batch
    * @param numRows number of rows to read
    * @return batch of records of type @param &lt;T&gt;
    */
-  T read(int numRows);
+  T read(T reuse, int numRows);
+
+  void setBatchSize(int batchSize);
 
   /**
-   *
-   * @param pages row group information for all the columns
+   * @param pages    row group information for all the columns
    * @param metadata map of {@link ColumnPath} -&gt; {@link ColumnChunkMetaData} for the row group
    */
   void setRowGroupInfo(PageReadStore pages, Map<ColumnPath, ColumnChunkMetaData> metadata);
 
   /**
-   * Set up the reader to reuse the underlying containers used for storing batches
-   */
-  void reuseContainers(boolean reuse);
-
-  /**
-   * Release any resources allocated
+   * Release any resources allocated.
    */
   void close();
 }
diff --git a/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ArrowVectorAccessor.java b/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ArrowVectorAccessor.java
index 688ff6c8b78b..c9c9959c9e95 100644
--- a/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ArrowVectorAccessor.java
+++ b/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ArrowVectorAccessor.java
@@ -26,7 +26,7 @@
 import org.apache.spark.unsafe.types.UTF8String;
 
 @SuppressWarnings("checkstyle:VisibilityModifier")
-abstract class ArrowVectorAccessor {
+public abstract class ArrowVectorAccessor {
 
   private final ValueVector vector;
   private final ArrowColumnVector[] childColumns;
@@ -89,7 +89,7 @@ ArrowColumnVector childColumn(int pos) {
     return childColumns[pos];
   }
 
-  ValueVector getVector() {
+  public ValueVector getVector() {
     return vector;
   }
 }
diff --git a/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ArrowVectorAccessors.java b/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ArrowVectorAccessors.java
index 34cb63ea5c3e..74732a3e4192 100644
--- a/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ArrowVectorAccessors.java
+++ b/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ArrowVectorAccessors.java
@@ -137,8 +137,8 @@ private static ArrowVectorAccessor getPlainVectorAccessor(FieldVector vector) {
       return new DecimalAccessor((IcebergArrowVectors.DecimalArrowVector) vector);
     } else if (vector instanceof IcebergArrowVectors.VarcharArrowVector) {
       return new StringAccessor((IcebergArrowVectors.VarcharArrowVector) vector);
-    } else if (vector instanceof IcebergArrowVectors.VarBinaryArrowVector) {
-      return new BinaryAccessor((IcebergArrowVectors.VarBinaryArrowVector) vector);
+    } else if (vector instanceof VarBinaryVector) {
+      return new BinaryAccessor((VarBinaryVector) vector);
     } else if (vector instanceof DateDayVector) {
       return new DateAccessor((DateDayVector) vector);
     } else if (vector instanceof TimeStampMicroTZVector) {
@@ -338,7 +338,7 @@ private static class BinaryAccessor extends ArrowVectorAccessor {
 
     @Override
     final byte[] getBinary(int rowId) {
-      return vector.getObject(rowId);
+      return vector.get(rowId);
     }
   }
 
diff --git a/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ColumnarBatchReader.java b/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ColumnarBatchReader.java
index dd6d4096dc44..c76321ecd61d 100644
--- a/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ColumnarBatchReader.java
+++ b/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ColumnarBatchReader.java
@@ -38,11 +38,13 @@
  */
 public class ColumnarBatchReader implements VectorizedReader<ColumnarBatch> {
   private final VectorizedArrowReader[] readers;
+  private final VectorHolder[] vectorHolders;
 
   public ColumnarBatchReader(List<VectorizedReader<?>> readers) {
     this.readers = readers.stream()
         .map(VectorizedArrowReader.class::cast)
         .toArray(VectorizedArrowReader[]::new);
+    this.vectorHolders = new VectorHolder[readers.size()];
   }
 
   @Override
@@ -55,30 +57,41 @@ public final void setRowGroupInfo(PageReadStore pageStore, Map<ColumnPath, Colum
   }
 
   @Override
-  public void reuseContainers(boolean reuse) {
-    for (VectorizedReader<?> reader : readers) {
-      reader.reuseContainers(reuse);
-    }
-  }
-
-  @Override
-  public final ColumnarBatch read(int numRowsToRead) {
+  public final ColumnarBatch read(ColumnarBatch reuse, int numRowsToRead) {
     Preconditions.checkArgument(numRowsToRead > 0, "Invalid number of rows to read: %s", numRowsToRead);
     ColumnVector[] arrowColumnVectors = new ColumnVector[readers.length];
+
+    if (reuse == null) {
+      closeVectors();
+    }
+
     for (int i = 0; i < readers.length; i += 1) {
-      VectorHolder holder = readers[i].read(numRowsToRead);
-      int numRowsInVector = holder.numValues();
+      vectorHolders[i] = readers[i].read(vectorHolders[i], numRowsToRead);
+      int numRowsInVector = vectorHolders[i].numValues();
       Preconditions.checkState(
           numRowsInVector == numRowsToRead,
           "Number of rows in the vector %s didn't match expected %s ", numRowsInVector,
           numRowsToRead);
-      arrowColumnVectors[i] = IcebergArrowColumnVector.forHolder(holder, numRowsInVector);
+      arrowColumnVectors[i] =
+          IcebergArrowColumnVector.forHolder(vectorHolders[i], numRowsInVector);
     }
     ColumnarBatch batch = new ColumnarBatch(arrowColumnVectors);
     batch.setNumRows(numRowsToRead);
     return batch;
   }
 
+  /** Closes the vector (when present) of every cached holder and clears each holder slot. */
+  private void closeVectors() {
+    for (int pos = 0; pos < vectorHolders.length; pos += 1) {
+      VectorHolder holder = vectorHolders[pos];
+      // Release any resources used by the vector
+      if (holder != null && holder.vector() != null) {
+        holder.vector().close();
+      }
+      vectorHolders[pos] = null;
+    }
+  }
+
   @Override
   public void close() {
     for (VectorizedReader<?> reader : readers) {
@@ -86,4 +99,12 @@ public void close() {
     }
   }
 
+  @Override
+  public void setBatchSize(int batchSize) {
+    for (int pos = 0; pos < readers.length; pos += 1) {
+      if (readers[pos] != null) { // slots without a reader are skipped
+        readers[pos].setBatchSize(batchSize);
+      }
+    }
+  }
 }
diff --git a/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/IcebergArrowColumnVector.java b/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/IcebergArrowColumnVector.java
index 92f7c8ed8c92..9d10cd935512 100644
--- a/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/IcebergArrowColumnVector.java
+++ b/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/IcebergArrowColumnVector.java
@@ -19,7 +19,6 @@
 
 package org.apache.iceberg.spark.data.vectorized;
 
-import org.apache.arrow.vector.NullCheckingForGet;
 import org.apache.iceberg.arrow.vectorized.NullabilityHolder;
 import org.apache.iceberg.arrow.vectorized.VectorHolder;
 import org.apache.iceberg.spark.SparkSchemaUtil;
@@ -39,7 +38,6 @@ public class IcebergArrowColumnVector extends ColumnVector {
 
   private final ArrowVectorAccessor accessor;
   private final NullabilityHolder nullabilityHolder;
-  private static final boolean USE_VECTOR_VALIDITY_BUFFER = NullCheckingForGet.NULL_CHECKING_ENABLED;
 
   public IcebergArrowColumnVector(VectorHolder holder) {
     super(SparkSchemaUtil.convert(holder.icebergType()));
@@ -54,17 +52,17 @@ public void close() {
 
   @Override
   public boolean hasNull() {
-    return USE_VECTOR_VALIDITY_BUFFER ? accessor.getVector().getNullCount() > 0 : nullabilityHolder.hasNulls();
+    return nullabilityHolder.hasNulls();
   }
 
   @Override
   public int numNulls() {
-    return USE_VECTOR_VALIDITY_BUFFER ? accessor.getVector().getNullCount() : nullabilityHolder.numNulls();
+    return nullabilityHolder.numNulls();
   }
 
   @Override
   public boolean isNullAt(int rowId) {
-    return USE_VECTOR_VALIDITY_BUFFER ? accessor.getVector().isNull(rowId) : nullabilityHolder.isNullAt(rowId) == 1;
+    return nullabilityHolder.isNullAt(rowId) == 1;
   }
 
   @Override
@@ -148,4 +146,8 @@ static ColumnVector forHolder(VectorHolder holder, int numRows) {
     return holder.isDummy() ? new NullValuesColumnVector(numRows) :
         new IcebergArrowColumnVector(holder);
   }
+
+  public ArrowVectorAccessor vectorAccessor() {
+    return accessor;
+  }
 }
diff --git a/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/VectorizedSparkParquetReaders.java b/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/VectorizedSparkParquetReaders.java
index a3dbf865b36d..01cbe6f286ad 100644
--- a/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/VectorizedSparkParquetReaders.java
+++ b/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/VectorizedSparkParquetReaders.java
@@ -19,19 +19,18 @@
 
 package org.apache.iceberg.spark.data.vectorized;
 
-import com.google.common.collect.ImmutableList;
-import com.google.common.collect.Lists;
-import com.google.common.collect.Maps;
 import java.util.List;
 import java.util.Map;
 import java.util.stream.IntStream;
 import org.apache.arrow.memory.BufferAllocator;
-import org.apache.arrow.vector.NullCheckingForGet;
 import org.apache.iceberg.Schema;
 import org.apache.iceberg.arrow.ArrowAllocation;
 import org.apache.iceberg.arrow.vectorized.VectorizedArrowReader;
 import org.apache.iceberg.parquet.TypeWithSchemaVisitor;
 import org.apache.iceberg.parquet.VectorizedReader;
+import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList;
+import org.apache.iceberg.relocated.com.google.common.collect.Lists;
+import org.apache.iceberg.relocated.com.google.common.collect.Maps;
 import org.apache.iceberg.types.Types;
 import org.apache.parquet.column.ColumnDescriptor;
 import org.apache.parquet.schema.GroupType;
@@ -47,43 +46,36 @@ private VectorizedSparkParquetReaders() {
   public static ColumnarBatchReader buildReader(
       Schema expectedSchema,
       MessageType fileSchema,
-      Integer recordsPerBatch) {
+      boolean setArrowValidityVector) {
     return (ColumnarBatchReader)
         TypeWithSchemaVisitor.visit(expectedSchema.asStruct(), fileSchema,
-            new VectorizedReaderBuilder(expectedSchema, fileSchema, recordsPerBatch));
+            new VectorizedReaderBuilder(expectedSchema, fileSchema, setArrowValidityVector));
   }
 
   private static class VectorizedReaderBuilder extends TypeWithSchemaVisitor<VectorizedReader<?>> {
     private final MessageType parquetSchema;
     private final Schema icebergSchema;
     private final BufferAllocator rootAllocator;
-    private final int batchSize;
+    private final boolean setArrowValidityVector;
 
     VectorizedReaderBuilder(
         Schema expectedSchema,
         MessageType parquetSchema,
-        int bSize) {
+        boolean setArrowValidityVector) {
       this.parquetSchema = parquetSchema;
       this.icebergSchema = expectedSchema;
-      this.batchSize = bSize;
       this.rootAllocator = ArrowAllocation.rootAllocator()
           .newChildAllocator("VectorizedReadBuilder", 0, Long.MAX_VALUE);
+      this.setArrowValidityVector = setArrowValidityVector;
     }
 
     @Override
     public VectorizedReader<?> message(
             Types.StructType expected, MessageType message,
             List<VectorizedReader<?>> fieldReaders) {
-      return struct(expected, message.asGroupType(), fieldReaders);
-    }
-
-    @Override
-    public VectorizedReader<?> struct(
-            Types.StructType expected, GroupType struct,
-            List<VectorizedReader<?>> fieldReaders) {
-
+      GroupType groupType = message.asGroupType();
       Map<Integer, VectorizedReader<?>> readersById = Maps.newHashMap();
-      List<Type> fields = struct.getFields();
+      List<Type> fields = groupType.getFields();
 
       IntStream.range(0, fields.size())
           .forEach(pos -> readersById.put(fields.get(pos).getId().intValue(), fieldReaders.get(pos)));
@@ -106,6 +98,16 @@ public VectorizedReader<?> struct(
       return new ColumnarBatchReader(reorderedFields);
     }
 
+    @Override
+    public VectorizedReader<?> struct(
+        Types.StructType expected, GroupType groupType,
+        List<VectorizedReader<?>> fieldReaders) {
+      if (expected != null) {
+        throw new UnsupportedOperationException("Vectorized reads are not supported yet for struct fields");
+      }
+      return null;
+    }
+
     @Override
     public VectorizedReader<?> primitive(
         org.apache.iceberg.types.Type.PrimitiveType expected,
@@ -123,8 +125,7 @@ public VectorizedReader<?> primitive(
         return null;
       }
       // Set the validity buffer if null checking is enabled in arrow
-      return new VectorizedArrowReader(desc, icebergField, rootAllocator,
-          batchSize, /* setArrowValidityVector */ NullCheckingForGet.NULL_CHECKING_ENABLED);
+      return new VectorizedArrowReader(desc, icebergField, rootAllocator, setArrowValidityVector);
     }
   }
 }
diff --git a/spark/src/main/java/org/apache/iceberg/spark/source/BatchDataReader.java b/spark/src/main/java/org/apache/iceberg/spark/source/BatchDataReader.java
index d1cc2b13f014..eeb3ad559858 100644
--- a/spark/src/main/java/org/apache/iceberg/spark/source/BatchDataReader.java
+++ b/spark/src/main/java/org/apache/iceberg/spark/source/BatchDataReader.java
@@ -19,7 +19,7 @@
 
 package org.apache.iceberg.spark.source;
 
-import com.google.common.base.Preconditions;
+import org.apache.arrow.vector.NullCheckingForGet;
 import org.apache.iceberg.CombinedScanTask;
 import org.apache.iceberg.FileFormat;
 import org.apache.iceberg.FileScanTask;
@@ -30,6 +30,7 @@
 import org.apache.iceberg.io.FileIO;
 import org.apache.iceberg.io.InputFile;
 import org.apache.iceberg.parquet.Parquet;
+import org.apache.iceberg.relocated.com.google.common.base.Preconditions;
 import org.apache.iceberg.spark.data.vectorized.VectorizedSparkParquetReaders;
 import org.apache.spark.sql.vectorized.ColumnarBatch;
 
@@ -57,10 +58,10 @@ CloseableIterator<ColumnarBatch> open(FileScanTask task) {
           .project(expectedSchema)
           .split(task.start(), task.length())
           .createBatchedReaderFunc(fileSchema -> VectorizedSparkParquetReaders.buildReader(expectedSchema,
-              fileSchema, batchSize))
+              fileSchema, /* setArrowValidityVector */ NullCheckingForGet.NULL_CHECKING_ENABLED))
+          .recordsPerBatch(batchSize)
           .filter(task.residual())
           .caseSensitive(caseSensitive)
-          .recordsPerBatch(batchSize)
           // Spark eagerly consumes the batches. So the underlying memory allocated could be reused
           // without worrying about subsequent reads clobbering over each other. This improves
           // read performance as every batch read doesn't have to pay the cost of allocating memory.
diff --git a/spark/src/main/java/org/apache/iceberg/spark/source/Reader.java b/spark/src/main/java/org/apache/iceberg/spark/source/Reader.java
index 328312d11285..d205c22a77f6 100644
--- a/spark/src/main/java/org/apache/iceberg/spark/source/Reader.java
+++ b/spark/src/main/java/org/apache/iceberg/spark/source/Reader.java
@@ -19,7 +19,6 @@
 
 package org.apache.iceberg.spark.source;
 
-import com.google.common.base.Preconditions;
 import java.io.IOException;
 import java.io.Serializable;
 import java.util.List;
@@ -45,6 +44,7 @@
 import org.apache.iceberg.hadoop.Util;
 import org.apache.iceberg.io.CloseableIterable;
 import org.apache.iceberg.io.FileIO;
+import org.apache.iceberg.relocated.com.google.common.base.Preconditions;
 import org.apache.iceberg.relocated.com.google.common.collect.ImmutableSet;
 import org.apache.iceberg.relocated.com.google.common.collect.Lists;
 import org.apache.iceberg.spark.SparkFilters;
@@ -101,8 +101,7 @@ class Reader implements DataSourceReader, SupportsScanColumnarBatch, SupportsPus
   private List<CombinedScanTask> tasks = null; // lazy cache of tasks
   private Boolean readUsingBatch = null;
 
-  Reader(
-      Table table, Broadcast<FileIO> io, Broadcast<EncryptionManager> encryptionManager,
+  Reader(Table table, Broadcast<FileIO> io, Broadcast<EncryptionManager> encryptionManager,
       boolean caseSensitive, DataSourceOptions options) {
     this.table = table;
     this.snapshotId = options.get("snapshot-id").map(Long::parseLong).orElse(null);
diff --git a/spark/src/test/java/org/apache/iceberg/spark/data/RandomData.java b/spark/src/test/java/org/apache/iceberg/spark/data/RandomData.java
index 01363fa40c97..f99c0fccb89c 100644
--- a/spark/src/test/java/org/apache/iceberg/spark/data/RandomData.java
+++ b/spark/src/test/java/org/apache/iceberg/spark/data/RandomData.java
@@ -20,6 +20,7 @@
 package org.apache.iceberg.spark.data;
 
 import java.math.BigDecimal;
+import java.math.BigInteger;
 import java.nio.ByteBuffer;
 import java.util.Arrays;
 import java.util.Iterator;
@@ -34,6 +35,7 @@
 import org.apache.avro.generic.GenericData.Record;
 import org.apache.iceberg.Schema;
 import org.apache.iceberg.avro.AvroSchemaUtil;
+import org.apache.iceberg.relocated.com.google.common.base.Preconditions;
 import org.apache.iceberg.relocated.com.google.common.collect.Lists;
 import org.apache.iceberg.relocated.com.google.common.collect.Maps;
 import org.apache.iceberg.relocated.com.google.common.collect.Sets;
@@ -50,10 +52,13 @@
 
 public class RandomData {
 
+  // Default fraction (0.0 to 1.0) of optional values that are generated as null
+  public static final float DEFAULT_NULL_PERCENTAGE = 0.05f;
+
   private RandomData() {}
 
   public static List<Record> generateList(Schema schema, int numRecords, long seed) {
-    RandomDataGenerator generator = new RandomDataGenerator(schema, seed);
+    RandomDataGenerator generator = new RandomDataGenerator(schema, seed, DEFAULT_NULL_PERCENTAGE);
     List<Record> records = Lists.newArrayListWithExpectedSize(numRecords);
     for (int i = 0; i < numRecords; i += 1) {
       records.add((Record) TypeUtil.visit(schema, generator));
@@ -84,15 +89,20 @@ public InternalRow next() {
   }
 
   public static Iterable<Record> generate(Schema schema, int numRecords, long seed) {
-    return newIterable(() -> new RandomDataGenerator(schema, seed), schema, numRecords);
+    return newIterable(() -> new RandomDataGenerator(schema, seed, DEFAULT_NULL_PERCENTAGE), schema, numRecords);
+  }
+
+  public static Iterable<Record> generate(Schema schema, int numRecords, long seed, float nullPercentage) {
+    return newIterable(() -> new RandomDataGenerator(schema, seed, nullPercentage), schema, numRecords);
   }
 
   public static Iterable<Record> generateFallbackData(Schema schema, int numRecords, long seed, long numDictRecords) {
     return newIterable(() -> new FallbackDataGenerator(schema, seed, numDictRecords), schema, numRecords);
   }
 
-  public static Iterable<GenericData.Record> generateDictionaryEncodableData(Schema schema, int numRecords, long seed) {
-    return newIterable(() -> new DictionaryEncodedDataGenerator(schema, seed), schema, numRecords);
+  public static Iterable<GenericData.Record> generateDictionaryEncodableData(
+      Schema schema, int numRecords, long seed, float nullPercentage) {
+    return newIterable(() -> new DictionaryEncodedDataGenerator(schema, seed, nullPercentage), schema, numRecords);
   }
 
   private static Iterable<Record> newIterable(Supplier<RandomDataGenerator> newGenerator,
@@ -120,8 +130,14 @@ public Record next() {
   private static class RandomDataGenerator extends TypeUtil.CustomOrderSchemaVisitor<Object> {
     private final Map<Type, org.apache.avro.Schema> typeToSchema;
     private final Random random;
-
-    private RandomDataGenerator(Schema schema, long seed) {
+    // Fraction (0.0 to 1.0) of optional values that are generated as null
+    private final float nullPercentage;
+
+    private RandomDataGenerator(Schema schema, long seed, float nullPercentage) {
+      Preconditions.checkArgument(
+          0.0f <= nullPercentage && nullPercentage <= 1.0f, // both bounds are inclusive
+          "Percentage needs to be in the range [0.0, 1.0], got: %s", nullPercentage);
+      this.nullPercentage = nullPercentage;
       this.typeToSchema = AvroSchemaUtil.convertTypes(schema.asStruct(), "test");
       this.random = new Random(seed);
     }
@@ -145,21 +161,23 @@ public Record struct(Types.StructType struct, Iterable<Object> fieldResults) {
 
     @Override
     public Object field(Types.NestedField field, Supplier<Object> fieldResult) {
-      // return null 5% of the time when the value is optional
-      if (field.isOptional() && random.nextInt(20) == 1) {
+      if (field.isOptional() && isNull()) {
         return null;
       }
       return fieldResult.get();
     }
 
+    private boolean isNull() {
+      return random.nextFloat() < nullPercentage;
+    }
+
     @Override
     public Object list(Types.ListType list, Supplier<Object> elementResult) {
       int numElements = random.nextInt(20);
 
       List<Object> result = Lists.newArrayListWithExpectedSize(numElements);
       for (int i = 0; i < numElements; i += 1) {
-        // return null 5% of the time when the value is optional
-        if (list.isElementOptional() && random.nextInt(20) == 1) {
+        if (list.isElementOptional() && isNull()) {
           result.add(null);
         } else {
           result.add(elementResult.get());
@@ -184,8 +202,7 @@ public Object map(Types.MapType map, Supplier<Object> keyResult, Supplier<Object
 
         keySet.add(key);
 
-        // return null 5% of the time when the value is optional
-        if (map.isValueOptional() && random.nextInt(20) == 1) {
+        if (map.isValueOptional() && isNull()) {
           result.put(key, null);
         } else {
           result.put(key, valueResult.get());
@@ -197,7 +214,7 @@ public Object map(Types.MapType map, Supplier<Object> keyResult, Supplier<Object
 
     @Override
     public Object primitive(Type.PrimitiveType primitive) {
-      Object result = RandomUtil.generatePrimitive(primitive, random);
+      Object result = randomValue(primitive, random);
       // For the primitives that Avro needs a different type than Spark, fix
       // them here.
       switch (primitive.typeId()) {
@@ -212,6 +229,10 @@ public Object primitive(Type.PrimitiveType primitive) {
           return result;
       }
     }
+
+    protected Object randomValue(Type.PrimitiveType primitive, Random rand) {
+      return RandomUtil.generatePrimitive(primitive, rand); // draw from the Random the caller passed in
+    }
   }
 
   private static class SparkRandomDataGenerator extends TypeUtil.CustomOrderSchemaVisitor<Object> {
@@ -327,7 +348,7 @@ private static Object generateDictionaryEncodablePrimitive(Type.PrimitiveType pr
       case TIMESTAMP:
         return (long) value;
       case STRING:
-        return UTF8String.fromString(String.valueOf(value));
+        return String.valueOf(value);
       case FIXED:
         byte[] fixed = new byte[((Types.FixedType) primitive).length()];
         Arrays.fill(fixed, (byte) value);
@@ -339,165 +360,16 @@ private static Object generateDictionaryEncodablePrimitive(Type.PrimitiveType pr
       case DECIMAL:
         Types.DecimalType type = (Types.DecimalType) primitive;
         BigInteger unscaled = new BigInteger(String.valueOf(value + 1));
-        return Decimal.apply(new BigDecimal(unscaled, type.scale()));
-      default:
-        throw new IllegalArgumentException(
-            "Cannot generate random value for unknown type: " + primitive);
-    }
-  }
-
-  @SuppressWarnings("RandomModInteger")
-  private static Object generatePrimitive(Type.PrimitiveType primitive,
-                                         Random random) {
-    int choice = random.nextInt(20);
-
-    switch (primitive.typeId()) {
-      case BOOLEAN:
-        return choice < 10;
-
-      case INTEGER:
-        switch (choice) {
-          case 1:
-            return Integer.MIN_VALUE;
-          case 2:
-            return Integer.MAX_VALUE;
-          case 3:
-            return 0;
-          default:
-            return random.nextInt();
-        }
-
-      case LONG:
-        switch (choice) {
-          case 1:
-            return Long.MIN_VALUE;
-          case 2:
-            return Long.MAX_VALUE;
-          case 3:
-            return 0L;
-          default:
-            return random.nextLong();
-        }
-
-      case FLOAT:
-        switch (choice) {
-          case 1:
-            return Float.MIN_VALUE;
-          case 2:
-            return -Float.MIN_VALUE;
-          case 3:
-            return Float.MAX_VALUE;
-          case 4:
-            return -Float.MAX_VALUE;
-          case 5:
-            return Float.NEGATIVE_INFINITY;
-          case 6:
-            return Float.POSITIVE_INFINITY;
-          case 7:
-            return 0.0F;
-          case 8:
-            return Float.NaN;
-          default:
-            return random.nextFloat();
-        }
-
-      case DOUBLE:
-        switch (choice) {
-          case 1:
-            return Double.MIN_VALUE;
-          case 2:
-            return -Double.MIN_VALUE;
-          case 3:
-            return Double.MAX_VALUE;
-          case 4:
-            return -Double.MAX_VALUE;
-          case 5:
-            return Double.NEGATIVE_INFINITY;
-          case 6:
-            return Double.POSITIVE_INFINITY;
-          case 7:
-            return 0.0D;
-          case 8:
-            return Double.NaN;
-          default:
-            return random.nextDouble();
-        }
-
-      case DATE:
-        // this will include negative values (dates before 1970-01-01)
-        return random.nextInt() % ABOUT_380_YEARS_IN_DAYS;
-
-      case TIME:
-        return (random.nextLong() & Integer.MAX_VALUE) % ONE_DAY_IN_MICROS;
-
-      case TIMESTAMP:
-        return random.nextLong() % FIFTY_YEARS_IN_MICROS;
-
-      case STRING:
-        return randomString(random);
-
-      case UUID:
-        byte[] uuidBytes = new byte[16];
-        random.nextBytes(uuidBytes);
-        // this will hash the uuidBytes
-        return uuidBytes;
-
-      case FIXED:
-        byte[] fixed = new byte[((Types.FixedType) primitive).length()];
-        random.nextBytes(fixed);
-        return fixed;
-
-      case BINARY:
-        byte[] binary = new byte[random.nextInt(50)];
-        random.nextBytes(binary);
-        return binary;
-
-      case DECIMAL:
-        Types.DecimalType type = (Types.DecimalType) primitive;
-        BigInteger unscaled = randomUnscaled(type.precision(), random);
-        return Decimal.apply(new BigDecimal(unscaled, type.scale()));
-
+        return new BigDecimal(unscaled, type.scale());
       default:
         throw new IllegalArgumentException(
             "Cannot generate random value for unknown type: " + primitive);
     }
   }
 
-  private static final long FIFTY_YEARS_IN_MICROS =
-      (50L * (365 * 3 + 366) * 24 * 60 * 60 * 1_000_000) / 4;
-  private static final int ABOUT_380_YEARS_IN_DAYS = 380 * 365;
-  private static final long ONE_DAY_IN_MICROS = 24 * 60 * 60 * 1_000_000L;
-  private static final String CHARS =
-      "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_-.!?";
-
-  private static UTF8String randomString(Random random) {
-    int length = random.nextInt(50);
-    byte[] buffer = new byte[length];
-
-    for (int i = 0; i < length; i += 1) {
-      buffer[i] = (byte) CHARS.charAt(random.nextInt(CHARS.length()));
-    }
-
-    return UTF8String.fromBytes(buffer);
-  }
-
-  private static final String DIGITS = "0123456789";
-
-  private static BigInteger randomUnscaled(int precision, Random random) {
-    int length = random.nextInt(precision);
-    if (length == 0) {
-      return BigInteger.ZERO;
-    }
-
-    StringBuilder sb = new StringBuilder();
-    for (int i = 0; i < length; i += 1) {
-      sb.append(DIGITS.charAt(random.nextInt(DIGITS.length())));
-    }
-  }
-
   private static class DictionaryEncodedDataGenerator extends RandomDataGenerator {
-    private DictionaryEncodedDataGenerator(Schema schema, long seed) {
-      super(schema, seed);
+    private DictionaryEncodedDataGenerator(Schema schema, long seed, float nullPercentage) {
+      super(schema, seed, nullPercentage);
     }
 
     @Override
@@ -511,7 +383,7 @@ private static class FallbackDataGenerator extends RandomDataGenerator {
     private long rowCount = 0;
 
     private FallbackDataGenerator(Schema schema, long seed, long numDictionaryEncoded) {
-      super(schema, seed);
+      super(schema, seed, DEFAULT_NULL_PERCENTAGE);
       this.dictionaryEncodedRows = numDictionaryEncoded;
     }
 
@@ -519,7 +391,7 @@ private FallbackDataGenerator(Schema schema, long seed, long numDictionaryEncode
     protected Object randomValue(Type.PrimitiveType primitive, Random rand) {
       this.rowCount += 1;
       if (rowCount > dictionaryEncodedRows) {
-        return generatePrimitive(primitive, rand);
+        return RandomUtil.generatePrimitive(primitive, rand);
       } else {
         return generateDictionaryEncodablePrimitive(primitive, rand);
       }
diff --git a/spark/src/test/java/org/apache/iceberg/spark/data/TestHelpers.java b/spark/src/test/java/org/apache/iceberg/spark/data/TestHelpers.java
index 743865da3aba..f603757c2c44 100644
--- a/spark/src/test/java/org/apache/iceberg/spark/data/TestHelpers.java
+++ b/spark/src/test/java/org/apache/iceberg/spark/data/TestHelpers.java
@@ -33,10 +33,12 @@
 import java.util.List;
 import java.util.Map;
 import java.util.UUID;
+import org.apache.arrow.vector.ValueVector;
 import org.apache.avro.generic.GenericData;
 import org.apache.avro.generic.GenericData.Record;
 import org.apache.iceberg.Schema;
 import org.apache.iceberg.relocated.com.google.common.collect.Lists;
+import org.apache.iceberg.spark.data.vectorized.IcebergArrowColumnVector;
 import org.apache.iceberg.types.Type;
 import org.apache.iceberg.types.Types;
 import org.apache.orc.storage.serde2.io.DateWritable;
@@ -54,6 +56,7 @@
 import org.apache.spark.sql.types.MapType;
 import org.apache.spark.sql.types.StructField;
 import org.apache.spark.sql.types.StructType;
+import org.apache.spark.sql.vectorized.ColumnVector;
 import org.apache.spark.sql.vectorized.ColumnarBatch;
 import org.apache.spark.unsafe.types.UTF8String;
 import org.junit.Assert;
@@ -80,12 +83,37 @@ public static void assertEqualsSafe(Types.StructType struct, Record rec, Row row
     }
   }
 
-  public static void assertEqualsBatch(Types.StructType struct, Iterator<Record> expected, ColumnarBatch batch) {
-    for (int r = 0; r < batch.numRows(); r++) {
-      assertEqualsUnsafe(struct, expected.next(), batch.getRow(r));
+  /**
+   * Asserts that each row of {@code batch} equals the next record from {@code expected}, field by field.
+   *
+   * @param struct struct type whose fields give the column order and types to compare
+   * @param expected source of expected records; one record is consumed per batch row
+   * @param batch the columnar batch under test
+   * @param checkArrowValidityVector when true, every column must be an {@link IcebergArrowColumnVector} and the
+   *                                 null flag of its Arrow vector is compared with the expected value
+   */
+  public static void assertEqualsBatch(Types.StructType struct, Iterator<Record> expected, ColumnarBatch batch,
+                                       boolean checkArrowValidityVector) {
+    // the field list does not change between rows, so resolve it once
+    List<Types.NestedField> fields = struct.fields();
+    for (int rowId = 0; rowId < batch.numRows(); rowId++) {
+      InternalRow row = batch.getRow(rowId);
+      Record rec = expected.next();
+      for (int i = 0; i < fields.size(); i += 1) {
+        Type fieldType = fields.get(i).type();
+        Object expectedValue = rec.get(i);
+        Object actualValue = row.isNullAt(i) ? null : row.get(i, convert(fieldType));
+        assertEqualsUnsafe(fieldType, expectedValue, actualValue);
+
+        if (checkArrowValidityVector) {
+          ColumnVector columnVector = batch.column(i);
+          ValueVector arrowVector = ((IcebergArrowColumnVector) columnVector).vectorAccessor().getVector();
+          Assert.assertEquals("Nullability doesn't match", expectedValue == null, arrowVector.isNull(rowId));
+        }
+      }
     }
   }
 
   private static void assertEqualsSafe(Types.ListType list, Collection<?> expected, List actual) {
     Type elementType = list.elementType();
     List<?> expectedElements = Lists.newArrayList(expected);
diff --git a/spark/src/test/java/org/apache/iceberg/spark/data/TestParquetAvroReader.java b/spark/src/test/java/org/apache/iceberg/spark/data/TestParquetAvroReader.java
index 1466deab2af2..d7bd696ad608 100644
--- a/spark/src/test/java/org/apache/iceberg/spark/data/TestParquetAvroReader.java
+++ b/spark/src/test/java/org/apache/iceberg/spark/data/TestParquetAvroReader.java
@@ -186,7 +186,7 @@ public void testWithOldReadPath() throws IOException {
 
   @Test
   public void testCorrectness() throws IOException {
-    Iterable<Record> records = RandomData.generate(COMPLEX_SCHEMA, 250_000, 34139);
+    Iterable<Record> records = RandomData.generate(COMPLEX_SCHEMA, 50_000, 34139);
 
     File testFile = temp.newFile();
     Assert.assertTrue("Delete should succeed", testFile.delete());
diff --git a/spark/src/test/java/org/apache/iceberg/spark/data/TestParquetAvroWriter.java b/spark/src/test/java/org/apache/iceberg/spark/data/TestParquetAvroWriter.java
index 0e97c37ffe79..dcfc873a5a67 100644
--- a/spark/src/test/java/org/apache/iceberg/spark/data/TestParquetAvroWriter.java
+++ b/spark/src/test/java/org/apache/iceberg/spark/data/TestParquetAvroWriter.java
@@ -74,7 +74,7 @@ public class TestParquetAvroWriter {
 
   @Test
   public void testCorrectness() throws IOException {
-    Iterable<Record> records = RandomData.generate(COMPLEX_SCHEMA, 250_000, 34139);
+    Iterable<Record> records = RandomData.generate(COMPLEX_SCHEMA, 50_000, 34139);
 
     File testFile = temp.newFile();
     Assert.assertTrue("Delete should succeed", testFile.delete());
diff --git a/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetWriter.java b/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetWriter.java
index 4ff784448e80..c75a87abc45c 100644
--- a/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetWriter.java
+++ b/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetWriter.java
@@ -71,7 +71,7 @@ public class TestSparkParquetWriter {
 
   @Test
   public void testCorrectness() throws IOException {
-    int numRows = 250_000;
+    int numRows = 50_000;
     Iterable<InternalRow> records = RandomData.generateSpark(COMPLEX_SCHEMA, numRows, 19981);
 
     File testFile = temp.newFile();
diff --git a/spark/src/test/java/org/apache/iceberg/spark/data/parquet/vectorized/TestParquetDictionaryEncodedVectorizedReads.java b/spark/src/test/java/org/apache/iceberg/spark/data/parquet/vectorized/TestParquetDictionaryEncodedVectorizedReads.java
index f7ecf4d1157f..7f2d9c32cac8 100644
--- a/spark/src/test/java/org/apache/iceberg/spark/data/parquet/vectorized/TestParquetDictionaryEncodedVectorizedReads.java
+++ b/spark/src/test/java/org/apache/iceberg/spark/data/parquet/vectorized/TestParquetDictionaryEncodedVectorizedReads.java
@@ -19,13 +19,24 @@
 
 package org.apache.iceberg.spark.data.parquet.vectorized;
 
+import java.io.IOException;
 import org.apache.avro.generic.GenericData;
 import org.apache.iceberg.Schema;
 import org.apache.iceberg.spark.data.RandomData;
+import org.junit.Ignore;
+import org.junit.Test;
 
 public class TestParquetDictionaryEncodedVectorizedReads extends TestParquetVectorizedReads {
+
   @Override
-  Iterable<GenericData.Record> generateData(int numRows, Schema schema) {
-    return RandomData.generateDictionaryEncodableData(schema, numRows, 0L);
+  Iterable<GenericData.Record> generateData(Schema schema, int numRecords, long seed, float nullPercentage) {
+    return RandomData.generateDictionaryEncodableData(schema, numRecords, seed, nullPercentage);
+  }
+
+  @Test
+  @Override
+  @Ignore("This code path is already tested in TestParquetVectorizedReads")
+  public void testVectorizedReadsWithNewContainers() throws IOException {
+    // intentionally empty: overrides the inherited test only so that it can be ignored for this subclass
+  }
 }
diff --git a/spark/src/test/java/org/apache/iceberg/spark/data/parquet/vectorized/TestParquetDictionaryFallbackToPlainEncodingVectorizedReads.java b/spark/src/test/java/org/apache/iceberg/spark/data/parquet/vectorized/TestParquetDictionaryFallbackToPlainEncodingVectorizedReads.java
index ea7c4a3f6d90..ad9d020c74f4 100644
--- a/spark/src/test/java/org/apache/iceberg/spark/data/parquet/vectorized/TestParquetDictionaryFallbackToPlainEncodingVectorizedReads.java
+++ b/spark/src/test/java/org/apache/iceberg/spark/data/parquet/vectorized/TestParquetDictionaryFallbackToPlainEncodingVectorizedReads.java
@@ -28,11 +28,21 @@
 import org.apache.iceberg.io.FileAppender;
 import org.apache.iceberg.parquet.Parquet;
 import org.apache.iceberg.spark.data.RandomData;
+import org.junit.Ignore;
+import org.junit.Test;
 
 public class TestParquetDictionaryFallbackToPlainEncodingVectorizedReads extends TestParquetVectorizedReads {
+  private static final int NUM_ROWS = 1_000_000;
+
+  @Override
+  protected int getNumRows() {
+    return NUM_ROWS;
+  }
+
   @Override
-  public Iterable<GenericData.Record> generateData(int numRows, Schema schema) {
-    return RandomData.generateFallbackData(schema, numRows, 0L, numRows / 20);
+  Iterable<GenericData.Record> generateData(Schema schema, int numRecords, long seed, float nullPercentage) {
+    // TODO: take nullPercentage into account when generating fallback encoding data
+    return RandomData.generateFallbackData(schema, numRecords, seed, numRecords / 20);
   }
 
   @Override
@@ -44,4 +54,17 @@ FileAppender<GenericData.Record> getParquetWriter(Schema schema, File testFile)
         .build();
   }
 
+  @Test
+  @Override
+  @Ignore("Fallback encoding not triggered when data is mostly null")
+  public void testMostlyNullsForOptionalFields() {
+    // intentionally empty: overrides the inherited test only so that it can be ignored for this subclass
+  }
+
+  @Test
+  @Override
+  @Ignore("This code path is already tested in TestParquetVectorizedReads")
+  public void testVectorizedReadsWithNewContainers() throws IOException {
+    // intentionally empty: overrides the inherited test only so that it can be ignored for this subclass
+  }
 }
diff --git a/spark/src/test/java/org/apache/iceberg/spark/data/parquet/vectorized/TestParquetVectorizedReads.java b/spark/src/test/java/org/apache/iceberg/spark/data/parquet/vectorized/TestParquetVectorizedReads.java
index 32600900a351..3e4f5f95c57e 100644
--- a/spark/src/test/java/org/apache/iceberg/spark/data/parquet/vectorized/TestParquetVectorizedReads.java
+++ b/spark/src/test/java/org/apache/iceberg/spark/data/parquet/vectorized/TestParquetVectorizedReads.java
@@ -23,34 +23,48 @@
 import java.io.IOException;
 import java.util.Iterator;
 import org.apache.avro.generic.GenericData;
+import org.apache.iceberg.AssertHelpers;
 import org.apache.iceberg.Files;
 import org.apache.iceberg.Schema;
 import org.apache.iceberg.io.CloseableIterable;
 import org.apache.iceberg.io.FileAppender;
 import org.apache.iceberg.parquet.Parquet;
+import org.apache.iceberg.relocated.com.google.common.collect.Lists;
 import org.apache.iceberg.spark.data.AvroDataTest;
 import org.apache.iceberg.spark.data.RandomData;
 import org.apache.iceberg.spark.data.TestHelpers;
 import org.apache.iceberg.spark.data.vectorized.VectorizedSparkParquetReaders;
 import org.apache.iceberg.types.TypeUtil;
 import org.apache.iceberg.types.Types;
+import org.apache.parquet.schema.GroupType;
+import org.apache.parquet.schema.MessageType;
+import org.apache.parquet.schema.Type;
 import org.apache.spark.sql.vectorized.ColumnarBatch;
 import org.junit.Assert;
 import org.junit.Assume;
 import org.junit.Ignore;
 import org.junit.Test;
 
+import static org.apache.iceberg.types.Types.NestedField.required;
+
 public class TestParquetVectorizedReads extends AvroDataTest {
   private static final int NUM_ROWS = 200_000;
 
   @Override
   protected void writeAndValidate(Schema schema) throws IOException {
+    writeAndValidate(schema, getNumRows(), 0L, RandomData.DEFAULT_NULL_PERCENTAGE, false, true);
+  }
+
+  private void writeAndValidate(
+      Schema schema, int numRecords, long seed, float nullPercentage,
+      boolean setAndCheckArrowValidityVector, boolean reuseContainers)
+      throws IOException {
     // Write test data
     Assume.assumeTrue("Parquet Avro cannot write non-string map keys", null == TypeUtil.find(
         schema,
         type -> type.isMapType() && type.asMapType().keyType() != Types.StringType.get()));
 
-    Iterable<GenericData.Record> expected = generateData(NUM_ROWS, schema);
+    Iterable<GenericData.Record> expected = generateData(schema, numRecords, seed, nullPercentage);
 
     // write a test parquet file using iceberg writer
     File testFile = temp.newFile();
@@ -59,7 +73,15 @@ protected void writeAndValidate(Schema schema) throws IOException {
     try (FileAppender<GenericData.Record> writer = getParquetWriter(schema, testFile)) {
       writer.addAll(expected);
     }
-    assertRecordsMatch(schema, NUM_ROWS, expected, testFile);
+    assertRecordsMatch(schema, numRecords, expected, testFile, setAndCheckArrowValidityVector, reuseContainers);
+  }
+
+  protected int getNumRows() {
+    return NUM_ROWS;
+  }
+
+  Iterable<GenericData.Record> generateData(Schema schema, int numRecords, long seed, float nullPercentage) {
+    return RandomData.generate(schema, numRecords, seed, nullPercentage);
   }
 
   FileAppender<GenericData.Record> getParquetWriter(Schema schema, File testFile) throws IOException {
@@ -69,25 +91,29 @@ FileAppender<GenericData.Record> getParquetWriter(Schema schema, File testFile)
         .build();
   }
 
-  Iterable<GenericData.Record> generateData(int numRows, Schema schema) {
-    return RandomData.generate(schema, numRows, 0L);
-  }
-
-  void assertRecordsMatch(Schema schema, int expectedSize, Iterable<GenericData.Record> expected, File testFile) throws IOException {
-    try (CloseableIterable<ColumnarBatch> batchReader = Parquet.read(Files.localInput(testFile))
+  /**
+   * Reads {@code testFile} back in batches and asserts that the rows equal {@code expected}, in order.
+   */
+  private void assertRecordsMatch(
+      Schema schema, int expectedSize, Iterable<GenericData.Record> expected, File testFile,
+      boolean setAndCheckArrowValidityVector, boolean reuseContainers) throws IOException {
+    Parquet.ReadBuilder readBuilder = Parquet.read(Files.localInput(testFile))
         .project(schema)
-        .reuseContainers()
         .recordsPerBatch(10000)
-        .createBatchedReaderFunc(type -> VectorizedSparkParquetReaders.buildReader(schema, type, 10000))
-        .build()) {
-
+        .createBatchedReaderFunc(type ->
+            VectorizedSparkParquetReaders.buildReader(schema, type, setAndCheckArrowValidityVector));
+    // only reuse containers when asked, so callers can also exercise the new-container path
+    if (reuseContainers) {
+      readBuilder.reuseContainers();
+    }
+    try (CloseableIterable<ColumnarBatch> batchReader = readBuilder.build()) {
       Iterator<GenericData.Record> expectedIter = expected.iterator();
       Iterator<ColumnarBatch> batches = batchReader.iterator();
       int numRowsRead = 0;
       while (batches.hasNext()) {
         ColumnarBatch batch = batches.next();
         numRowsRead += batch.numRows();
-        TestHelpers.assertEqualsBatch(schema.asStruct(), expectedIter, batch);
+        TestHelpers.assertEqualsBatch(schema.asStruct(), expectedIter, batch, setAndCheckArrowValidityVector);
       }
       Assert.assertEquals(expectedSize, numRowsRead);
     }
@@ -127,4 +153,44 @@ public void testMapOfStructs() {
   @Ignore
   public void testMixedTypes() {
   }
+
+  @Test
+  @Override
+  public void testNestedStruct() {
+    AssertHelpers.assertThrows(
+        "Vectorized reads are not supported yet for struct fields",
+        UnsupportedOperationException.class,
+        "Vectorized reads are not supported yet for struct fields",
+        () -> VectorizedSparkParquetReaders.buildReader(
+            TypeUtil.assignIncreasingFreshIds(new Schema(required(
+                1,
+                "struct",
+                SUPPORTED_PRIMITIVES))),
+            new MessageType("struct", new GroupType(Type.Repetition.OPTIONAL, "struct").withId(1)),
+            false));
+  }
+
+  @Test
+  public void testMostlyNullsForOptionalFields() throws IOException {
+    writeAndValidate(
+        TypeUtil.assignIncreasingFreshIds(new Schema(SUPPORTED_PRIMITIVES.fields())),
+        getNumRows(),
+        0L,
+        0.99f,
+        false,
+        true);
+  }
+
+  @Test
+  public void testSettingArrowValidityVector() throws IOException {
+    writeAndValidate(new Schema(
+            Lists.transform(SUPPORTED_PRIMITIVES.fields(), Types.NestedField::asOptional)),
+        getNumRows(), 0L, RandomData.DEFAULT_NULL_PERCENTAGE, true, true);
+  }
+
+  @Test
+  public void testVectorizedReadsWithNewContainers() throws IOException {
+    writeAndValidate(TypeUtil.assignIncreasingFreshIds(new Schema(SUPPORTED_PRIMITIVES.fields())),
+        getNumRows(), 0L, RandomData.DEFAULT_NULL_PERCENTAGE, true, false);
+  }
 }