From 91ee4acfebdcdce49e26177c874ccfe1257c02ef Mon Sep 17 00:00:00 2001
From: Rushan Jiang <rushanj@andrew.cmu.edu>
Date: Tue, 17 Jan 2023 17:17:42 -0500
Subject: [PATCH 01/20] add test base for hudi

---
 build.gradle                                  |  68 +++++++
 .../hudi/SparkHudiMigrationTestBase.java      |  74 +++++++
 .../iceberg/hudi/TestSnapshotHudiTable.java   | 180 ++++++++++++++++++
 settings.gradle                               |   2 +
 4 files changed, 324 insertions(+)
 create mode 100644 hudi/src/integration/java/org/apache/iceberg/hudi/SparkHudiMigrationTestBase.java
 create mode 100644 hudi/src/integration/java/org/apache/iceberg/hudi/TestSnapshotHudiTable.java
diff --git a/build.gradle b/build.gradle
index 7b14f3b73163..1dc6f04345a0 100644
--- a/build.gradle
+++ b/build.gradle
@@ -48,6 +48,10 @@ plugins {
   id 'nebula.dependency-recommender' version '11.0.0'
 }
 
+String scalaVersion = System.getProperty("scalaVersion") != null ? System.getProperty("scalaVersion") : System.getProperty("defaultScalaVersion")
+String sparkVersionsString = System.getProperty("sparkVersions") != null ? System.getProperty("sparkVersions") : System.getProperty("defaultSparkVersions")
+List<String> sparkVersions = sparkVersionsString != null && !sparkVersionsString.isEmpty() ? sparkVersionsString.split(",") : []
+
 try {
   // apply these plugins in a try-catch block so that we can handle cases without .git directory
   apply plugin: 'com.palantir.git-version'
@@ -438,6 +442,70 @@ project(':iceberg-aws') {
   }
 }
 
+project(':iceberg-hudi') {
+
+  configurations {
+    integrationImplementation.extendsFrom testImplementation
+    integrationRuntime.extendsFrom testRuntimeOnly
+  }
+
+  dependencies {
+    implementation project(path: ':iceberg-bundled-guava', configuration: 'shadow')
+    api project(':iceberg-api')
+    implementation project(':iceberg-common')
+    implementation project(':iceberg-core')
+    implementation project(':iceberg-parquet')
+    implementation project(':iceberg-orc')
+    implementation "com.fasterxml.jackson.core:jackson-databind"
+
+
+    compileOnly("org.apache.hadoop:hadoop-common") {
+      exclude group: 'org.apache.avro', module: 'avro'
+      exclude group: 'org.slf4j', module: 'slf4j-log4j12'
+      exclude group: 'javax.servlet', module: 'servlet-api'
+      exclude group: 'com.google.code.gson', module: 'gson'
+    }
+    if (sparkVersions.contains("3.3") && scalaVersion == "2.12") {
+      integrationImplementation("org.apache.hudi:hudi-spark3.3-bundle_2.12:0.12.2")
+      integrationImplementation project(path: ":iceberg-spark:iceberg-spark-3.3_${scalaVersion}")
+      integrationImplementation("org.apache.hadoop:hadoop-minicluster") {
+        exclude group: 'org.apache.avro', module: 'avro'
+        // to make sure netty libs only come from project(':iceberg-arrow')
+        exclude group: 'io.netty', module: 'netty-buffer'
+        exclude group: 'io.netty', module: 'netty-common'
+      }
+      integrationImplementation project(path: ':iceberg-hive-metastore')
+      integrationImplementation project(path: ':iceberg-hive-metastore', configuration: 'testArtifacts')
+      integrationImplementation("org.apache.spark:spark-hive_${scalaVersion}:3.3.1") {
+        exclude group: 'org.apache.avro', module: 'avro'
+        exclude group: 'org.apache.arrow'
+        exclude group: 'org.apache.parquet'
+        // to make sure netty libs only come from project(':iceberg-arrow')
+        exclude group: 'io.netty', module: 'netty-buffer'
+        exclude group: 'io.netty', module: 'netty-common'
+        exclude group: 'org.roaringbitmap'
+      }
+    }
+  }
+
+  if (sparkVersions.contains("3.3") && scalaVersion == "2.12") {
+    sourceSets {
+      integration {
+        java.srcDir "$projectDir/src/integration/java"
+        resources.srcDir "$projectDir/src/integration/resources"
+        compileClasspath += main.output + test.output
+        runtimeClasspath += main.output + test.output
+      }
+    }
+
+    task integrationTest(type: Test) {
+      testClassesDirs = sourceSets.integration.output.classesDirs
+      classpath = sourceSets.integration.runtimeClasspath
+    }
+    check.dependsOn integrationTest
+  }
+}
+
 project(':iceberg-gcp') {
   dependencies {
     implementation project(path: ':iceberg-bundled-guava', configuration: 'shadow')
diff --git a/hudi/src/integration/java/org/apache/iceberg/hudi/SparkHudiMigrationTestBase.java b/hudi/src/integration/java/org/apache/iceberg/hudi/SparkHudiMigrationTestBase.java
new file mode 100644
index 000000000000..42703c4403ae
--- /dev/null
+++ b/hudi/src/integration/java/org/apache/iceberg/hudi/SparkHudiMigrationTestBase.java
@@ -0,0 +1,74 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.iceberg.hudi;
+
+import java.util.Map;
+import org.apache.hadoop.hive.conf.HiveConf;
+import org.apache.iceberg.hive.TestHiveMetastore;
+import org.apache.spark.sql.SparkSession;
+import org.apache.spark.sql.internal.SQLConf;
+import org.junit.AfterClass;
+import org.junit.BeforeClass;
+
+@SuppressWarnings("VisibilityModifier")
+public abstract class SparkHudiMigrationTestBase {
+  protected static TestHiveMetastore metastore = null;
+  protected static HiveConf hiveConf = null;
+  protected static SparkSession spark = null;
+
+  @BeforeClass
+  public static void startMetastoreAndSpark() {
+    SparkHudiMigrationTestBase.metastore = new TestHiveMetastore();
+    metastore.start();
+    SparkHudiMigrationTestBase.hiveConf = metastore.hiveConf();
+
+    SparkHudiMigrationTestBase.spark =
+        SparkSession.builder()
+            .master("local[2]")
+            .config(SQLConf.PARTITION_OVERWRITE_MODE().key(), "dynamic")
+            .config(
+                "spark.hadoop." + HiveConf.ConfVars.METASTOREURIS.varname,
+                hiveConf.get(HiveConf.ConfVars.METASTOREURIS.varname))
+            .config("spark.sql.legacy.respectNullabilityInTextDatasetConversion", "true")
+            .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
+            .config("spark.sql.extensions", "org.apache.spark.sql.hudi.HoodieSparkSessionExtension")
+            .enableHiveSupport()
+            .getOrCreate();
+  }
+
+  @AfterClass
+  public static void stopMetastoreAndSpark() throws Exception {
+    if (metastore != null) {
+      metastore.stop();
+      SparkHudiMigrationTestBase.metastore = null;
+    }
+    if (spark != null) {
+      spark.stop();
+      SparkHudiMigrationTestBase.spark = null;
+    }
+  }
+
+  public SparkHudiMigrationTestBase(
+      String catalogName, String implementation, Map<String, String> config) {
+
+    spark.conf().set("spark.sql.catalog." + catalogName, implementation);
+    config.forEach(
+        (key, value) -> spark.conf().set("spark.sql.catalog." + catalogName + "." + key, value));
+  }
+}
diff --git a/hudi/src/integration/java/org/apache/iceberg/hudi/TestSnapshotHudiTable.java b/hudi/src/integration/java/org/apache/iceberg/hudi/TestSnapshotHudiTable.java
new file mode 100644
index 000000000000..3268b7f5d820
--- /dev/null
+++ b/hudi/src/integration/java/org/apache/iceberg/hudi/TestSnapshotHudiTable.java
@@ -0,0 +1,180 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.iceberg.hudi;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.List;
+import java.util.Map;
+import org.apache.hudi.DataSourceWriteOptions;
+import org.apache.hudi.QuickstartUtils;
+import org.apache.hudi.config.HoodieWriteConfig;
+import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap;
+import org.apache.iceberg.relocated.com.google.common.collect.Lists;
+import org.apache.iceberg.spark.SparkSessionCatalog;
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.SQLContext;
+import org.apache.spark.sql.SaveMode;
+import org.apache.spark.sql.hudi.catalog.HoodieCatalog;
+import org.junit.Before;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.TemporaryFolder;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+@RunWith(Parameterized.class)
+public class TestSnapshotHudiTable extends SparkHudiMigrationTestBase {
+
+  private static final Logger LOG = LoggerFactory.getLogger(TestSnapshotHudiTable.class.getName());
+  private static final String row1 =
+      "{\"name\":\"Michael\",\"addresses\":[{\"city\":\"SanJose\",\"state\":\"CA\"},{\"city\":\"Sandiago\",\"state\":\"CA\"}],"
+          + "\"address_nested\":{\"current\":{\"state\":\"NY\",\"city\":\"NewYork\"},\"previous\":{\"state\":\"NJ\",\"city\":\"Newark\"}},"
+          + "\"properties\":{\"hair\":\"brown\",\"eye\":\"black\"},\"secondProp\":{\"height\":\"6\"},\"subjects\":[[\"Java\",\"Scala\",\"C++\"],"
+          + "[\"Spark\",\"Java\"]],\"id\":1,\"magic_number\":1.123123123123}";
+  private static final String row2 =
+      "{\"name\":\"Test\",\"addresses\":[{\"city\":\"SanJos123123e\",\"state\":\"CA\"},{\"city\":\"Sand12312iago\",\"state\":\"CA\"}],"
+          + "\"address_nested\":{\"current\":{\"state\":\"N12Y\",\"city\":\"NewY1231ork\"}},\"properties\":{\"hair\":\"brown\",\"eye\":\"black\"},"
+          + "\"secondProp\":{\"height\":\"6\"},\"subjects\":[[\"Java\",\"Scala\",\"C++\"],[\"Spark\",\"Java\"]],\"id\":2,\"magic_number\":2.123123123123}";
+  private static final String row3 =
+      "{\"name\":\"Test\",\"addresses\":[{\"city\":\"SanJose\",\"state\":\"CA\"},{\"city\":\"Sandiago\",\"state\":\"CA\"}],"
+          + "\"properties\":{\"hair\":\"brown\",\"eye\":\"black\"},\"secondProp\":{\"height\":\"6\"},\"subjects\":"
+          + "[[\"Java\",\"Scala\",\"C++\"],[\"Spark\",\"Java\"]],\"id\":3,\"magic_number\":3.123123123123}";
+  private static final String row4 =
+      "{\"name\":\"John\",\"addresses\":[{\"city\":\"LA\",\"state\":\"CA\"},{\"city\":\"Sandiago\",\"state\":\"CA\"}],"
+          + "\"address_nested\":{\"current\":{\"state\":\"NY\",\"city\":\"NewYork\"},\"previous\":{\"state\":\"NJ123\"}},"
+          + "\"properties\":{\"hair\":\"b12rown\",\"eye\":\"bla3221ck\"},\"secondProp\":{\"height\":\"633\"},\"subjects\":"
+          + "[[\"Spark\",\"Java\"]],\"id\":4,\"magic_number\":4.123123123123}";
+  private static final String row5 =
+      "{\"name\":\"Jonas\",\"addresses\":[{\"city\":\"Pittsburgh\",\"state\":\"PA\"},{\"city\":\"Sandiago\",\"state\":\"CA\"}],"
+          + "\"address_nested\":{\"current\":{\"state\":\"PA\",\"city\":\"Haha\"},\"previous\":{\"state\":\"NJ\"}},"
+          + "\"properties\":{\"hair\":\"black\",\"eye\":\"black\"},\"secondProp\":{\"height\":\"7\"},\"subjects\":[[\"Java\",\"Scala\",\"C++\"],"
+          + "[\"Spark\",\"Java\"]],\"id\":5,\"magic_number\":5.123123123123}";
+  private static final String SNAPSHOT_SOURCE_PROP = "snapshot_source";
+  private static final String DELTA_SOURCE_VALUE = "delta";
+  private static final String ORIGINAL_LOCATION_PROP = "original_location";
+  private static final String NAMESPACE = "delta_conversion_test";
+  private static final String defaultSparkCatalog = "spark_catalog";
+  private static final String icebergCatalogName = "iceberg_hive";
+  private String partitionedIdentifier;
+  private String unpartitionedIdentifier;
+  private String externalDataFilesIdentifier;
+  private final String partitionedTableName = "partitioned_table";
+  private final String unpartitionedTableName = "unpartitioned_table";
+  private final String externalDataFilesTableName = "external_data_files_table";
+  private String partitionedLocation;
+  private String unpartitionedLocation;
+  private String newIcebergTableLocation;
+  private String externalDataFilesTableLocation;
+
+  @Parameterized.Parameters(name = "Catalog Name {0} - Options {2}")
+  public static Object[][] parameters() {
+    return new Object[][] {
+      new Object[] {
+        icebergCatalogName,
+        SparkSessionCatalog.class.getName(),
+        ImmutableMap.of(
+            "type",
+            "hive",
+            "default-namespace",
+            "default",
+            "parquet-enabled",
+            "true",
+            "cache-enabled",
+            "false" // Spark will delete tables using v1, leaving the cache out of sync
+            )
+      }
+    };
+  }
+
+  @Rule public TemporaryFolder temp1 = new TemporaryFolder();
+  @Rule public TemporaryFolder temp2 = new TemporaryFolder();
+  @Rule public TemporaryFolder temp3 = new TemporaryFolder();
+  @Rule public TemporaryFolder temp4 = new TemporaryFolder();
+
+  public TestSnapshotHudiTable(
+      String catalogName, String implementation, Map<String, String> config) {
+    super(catalogName, implementation, config);
+    spark.conf().set("spark.sql.catalog." + defaultSparkCatalog, HoodieCatalog.class.getName());
+  }
+
+  @Before
+  public void before() throws IOException {
+    File partitionedFolder = temp1.newFolder();
+    File unpartitionedFolder = temp2.newFolder();
+    File newIcebergTableFolder = temp3.newFolder();
+    File externalDataFilesTableFolder = temp4.newFolder();
+    partitionedLocation = partitionedFolder.toURI().toString();
+    unpartitionedLocation = unpartitionedFolder.toURI().toString();
+    newIcebergTableLocation = newIcebergTableFolder.toURI().toString();
+    externalDataFilesTableLocation = externalDataFilesTableFolder.toURI().toString();
+
+    spark.sql(String.format("CREATE DATABASE IF NOT EXISTS %s", NAMESPACE));
+
+    partitionedIdentifier = destName(defaultSparkCatalog, partitionedTableName);
+    unpartitionedIdentifier = destName(defaultSparkCatalog, unpartitionedTableName);
+    externalDataFilesIdentifier = destName(defaultSparkCatalog, externalDataFilesTableName);
+
+    spark.sql(String.format("DROP TABLE IF EXISTS %s", partitionedIdentifier));
+    spark.sql(String.format("DROP TABLE IF EXISTS %s", unpartitionedIdentifier));
+    spark.sql(String.format("DROP TABLE IF EXISTS %s", externalDataFilesIdentifier));
+
+    // hard code the dataframe
+    List<String> jsonList = Lists.newArrayList();
+    jsonList.add(row1);
+    jsonList.add(row2);
+    jsonList.add(row3);
+    jsonList.add(row4);
+    jsonList.add(row5);
+    JavaSparkContext javaSparkContext = JavaSparkContext.fromSparkContext(spark.sparkContext());
+    SQLContext sqlContext = new SQLContext(javaSparkContext);
+    JavaRDD<String> rdd = javaSparkContext.parallelize(jsonList);
+    Dataset<Row> df = sqlContext.read().json(rdd);
+
+    df.write()
+        .format("hudi")
+        .options(QuickstartUtils.getQuickstartWriteConfigs())
+        .option(DataSourceWriteOptions.RECORDKEY_FIELD().key(), "id")
+        .option(DataSourceWriteOptions.PRECOMBINE_FIELD().key(), "name")
+        .option(DataSourceWriteOptions.PARTITIONPATH_FIELD().key(), "")
+        .option(HoodieWriteConfig.TABLE_NAME, unpartitionedIdentifier)
+        .mode(SaveMode.Overwrite)
+        .save(unpartitionedLocation);
+  }
+
+  @Test
+  public void TestHudiTableWrite() {
+    Dataset<Row> df = spark.read().format("hudi").load(unpartitionedLocation);
+    LOG.info("Generated dataframe shcema: {}", df.schema().treeString());
+    LOG.info("Generated dataframe: {}", df.showString(10, 20,false));
+    df.show();
+  }
+
+  private String destName(String catalogName, String dest) {
+    if (catalogName.equals(defaultSparkCatalog)) {
+      return NAMESPACE + "." + catalogName + "_" + dest;
+    }
+    return catalogName + "." + NAMESPACE + "." + catalogName + "_" + dest;
+  }
+}
diff --git a/settings.gradle b/settings.gradle
index c5ac07e080c2..5201184f42af 100644
--- a/settings.gradle
+++ b/settings.gradle
@@ -35,6 +35,7 @@ include 'nessie'
 include 'gcp'
 include 'dell'
 include 'snowflake'
+include 'hudi'
 
 project(':api').name = 'iceberg-api'
 project(':common').name = 'iceberg-common'
@@ -53,6 +54,7 @@ project(':nessie').name = 'iceberg-nessie'
 project(':gcp').name = 'iceberg-gcp'
 project(':dell').name = 'iceberg-dell'
 project(':snowflake').name = 'iceberg-snowflake'
+project(':hudi').name = 'iceberg-hudi'
 
 if (null != System.getProperty("allVersions")) {
   System.setProperty("flinkVersions", System.getProperty("knownFlinkVersions"))

From 57f93f17bf8b355fc1afe24df9d8696d45737d0f Mon Sep 17 00:00:00 2001
From: Rushan Jiang <rushanj@andrew.cmu.edu>
Date: Tue, 17 Jan 2023 22:00:17 -0500
Subject: [PATCH 02/20] write test data to a partitioned hudi table

---
 .../iceberg/hudi/TestSnapshotHudiTable.java   | 70 +++++++++++++++++--
 1 file changed, 65 insertions(+), 5 deletions(-)

diff --git a/hudi/src/integration/java/org/apache/iceberg/hudi/TestSnapshotHudiTable.java b/hudi/src/integration/java/org/apache/iceberg/hudi/TestSnapshotHudiTable.java
index 3268b7f5d820..9bfc85c9cc11 100644
--- a/hudi/src/integration/java/org/apache/iceberg/hudi/TestSnapshotHudiTable.java
+++ b/hudi/src/integration/java/org/apache/iceberg/hudi/TestSnapshotHudiTable.java
@@ -119,6 +119,50 @@ public TestSnapshotHudiTable(
     spark.conf().set("spark.sql.catalog." + defaultSparkCatalog, HoodieCatalog.class.getName());
   }
 
+  /**
+   * The test hardcodes a nested dataframe to test the snapshot feature. The schema of the
+   * created dataframe is:
+   *
+   * <pre>
+   *  root
+   *  |-- address_nested: struct (nullable = true)
+   *  |    |-- current: struct (nullable = true)
+   *  |    |    |-- city: string (nullable = true)
+   *  |    |    |-- state: string (nullable = true)
+   *  |    |-- previous: struct (nullable = true)
+   *  |    |    |-- city: string (nullable = true)
+   *  |    |    |-- state: string (nullable = true)
+   *  |-- addresses: array (nullable = true)
+   *  |    |-- element: struct (containsNull = true)
+   *  |    |    |-- city: string (nullable = true)
+   *  |    |    |-- state: string (nullable = true)
+   *  |-- id: long (nullable = true)
+   *  |-- magic_number: double (nullable = true)
+   *  |-- name: string (nullable = true)
+   *  |-- properties: struct (nullable = true)
+   *  |    |-- eye: string (nullable = true)
+   *  |    |-- hair: string (nullable = true)
+   *  |-- secondProp: struct (nullable = true)
+   *  |    |-- height: string (nullable = true)
+   *  |-- subjects: array (nullable = true)
+   *  |    |-- element: array (containsNull = true)
+   *  |    |    |-- element: string (containsNull = true)
+   * </pre>
+   *
+   * The dataframe content is (by calling df.show()):
+   *
+   * <pre>
+   * +--------------------+--------------------+---+--------------+-------+--------------------+----------+--------------------+
+   * |      address_nested|           addresses| id|  magic_number|   name|          properties|secondProp|            subjects|
+   * +--------------------+--------------------+---+--------------+-------+--------------------+----------+--------------------+
+   * |{{NewYork, NY}, {...|[{SanJose, CA}, {...|  1|1.123123123123|Michael|      {black, brown}|       {6}|[[Java, Scala, C+...|
+   * |{{NewY1231ork, N1...|[{SanJos123123e, ...|  2|2.123123123123|   Test|      {black, brown}|       {6}|[[Java, Scala, C+...|
+   * |                null|[{SanJose, CA}, {...|  3|3.123123123123|   Test|      {black, brown}|       {6}|[[Java, Scala, C+...|
+   * |{{NewYork, NY}, {...|[{LA, CA}, {Sandi...|  4|4.123123123123|   John|{bla3221ck, b12rown}|     {633}|     [[Spark, Java]]|
+   * |{{Haha, PA}, {nul...|[{Pittsburgh, PA}...|  5|5.123123123123|  Jonas|      {black, black}|       {7}|[[Java, Scala, C+...|
+   * +--------------------+--------------------+---+--------------+-------+--------------------+----------+--------------------+
+   * </pre>
+   */
   @Before
   public void before() throws IOException {
     File partitionedFolder = temp1.newFolder();
@@ -155,7 +199,17 @@ public void before() throws IOException {
     df.write()
         .format("hudi")
         .options(QuickstartUtils.getQuickstartWriteConfigs())
-        .option(DataSourceWriteOptions.RECORDKEY_FIELD().key(), "id")
+        .option(DataSourceWriteOptions.RECORDKEY_FIELD().key(), "magic_number")
+        .option(DataSourceWriteOptions.PRECOMBINE_FIELD().key(), "name")
+        .option(DataSourceWriteOptions.PARTITIONPATH_FIELD().key(), "id")
+        .option(HoodieWriteConfig.TABLE_NAME, partitionedIdentifier)
+        .mode(SaveMode.Overwrite)
+        .save(partitionedLocation);
+
+    df.write()
+        .format("hudi")
+        .options(QuickstartUtils.getQuickstartWriteConfigs())
+        .option(DataSourceWriteOptions.RECORDKEY_FIELD().key(), "magic_number")
         .option(DataSourceWriteOptions.PRECOMBINE_FIELD().key(), "name")
         .option(DataSourceWriteOptions.PARTITIONPATH_FIELD().key(), "")
         .option(HoodieWriteConfig.TABLE_NAME, unpartitionedIdentifier)
@@ -164,11 +218,17 @@ public void before() throws IOException {
   }
 
   @Test
-  public void TestHudiTableWrite() {
+  public void TestHudiUnpartitionedTableWrite() {
     Dataset<Row> df = spark.read().format("hudi").load(unpartitionedLocation);
-    LOG.info("Generated dataframe shcema: {}", df.schema().treeString());
-    LOG.info("Generated dataframe: {}", df.showString(10, 20,false));
-    df.show();
+    LOG.info("Generated unpartitioned dataframe shcema: {}", df.schema().treeString());
+    LOG.info("Generated unpartitioned dataframe: {}", df.showString(10, 20, false));
+  }
+
+  @Test
+  public void TestHudiPartitionedTableWrite() {
+    Dataset<Row> df = spark.read().format("hudi").load(partitionedLocation);
+    LOG.info("Generated partitioned dataframe shcema: {}", df.schema().treeString());
+    LOG.info("Generated partitioned dataframe: {}", df.showString(10, 20, false));
   }
 
   private String destName(String catalogName, String dest) {

From c09198848baf5af354487252c119ae6e49c82cd2 Mon Sep 17 00:00:00 2001
From: Rushan Jiang <rushanj@andrew.cmu.edu>
Date: Wed, 18 Jan 2023 13:39:00 -0500
Subject: [PATCH 03/20] add SnapshotHudiTable action skeleton and meta client test (test fail)

---
 build.gradle                                  |  1 +
 ...udiToIcebergMigrationSparkIntegration.java | 29 +++++++++
 .../iceberg/hudi/TestSnapshotHudiTable.java   |  7 +++
 .../hudi/BaseSnapshotHudiTableAction.java     | 61 +++++++++++++++++++
 .../BaseSnapshotHudiTableActionResult.java    | 33 ++++++++++
 ...HudiToIcebergMigrationActionsProvider.java | 41 +++++++++++++
 .../iceberg/hudi/SnapshotHudiTable.java       | 34 +++++++++++
 7 files changed, 206 insertions(+)
 create mode 100644 hudi/src/integration/java/org/apache/iceberg/hudi/HudiToIcebergMigrationSparkIntegration.java
 create mode 100644 hudi/src/main/java/org/apache/iceberg/hudi/BaseSnapshotHudiTableAction.java
 create mode 100644 hudi/src/main/java/org/apache/iceberg/hudi/BaseSnapshotHudiTableActionResult.java
 create mode 100644 hudi/src/main/java/org/apache/iceberg/hudi/HudiToIcebergMigrationActionsProvider.java
 create mode 100644 hudi/src/main/java/org/apache/iceberg/hudi/SnapshotHudiTable.java

diff --git a/build.gradle b/build.gradle
index 1dc6f04345a0..c0c0f05f1ae0 100644
--- a/build.gradle
+++ b/build.gradle
@@ -458,6 +458,7 @@ project(':iceberg-hudi') {
     implementation project(':iceberg-orc')
     implementation "com.fasterxml.jackson.core:jackson-databind"
 
+    compileOnly("org.apache.hudi:hudi-common:0.12.2")
 
     compileOnly("org.apache.hadoop:hadoop-common") {
       exclude group: 'org.apache.avro', module: 'avro'
diff --git a/hudi/src/integration/java/org/apache/iceberg/hudi/HudiToIcebergMigrationSparkIntegration.java b/hudi/src/integration/java/org/apache/iceberg/hudi/HudiToIcebergMigrationSparkIntegration.java
new file mode 100644
index 000000000000..ae213d99fb7f
--- /dev/null
+++ b/hudi/src/integration/java/org/apache/iceberg/hudi/HudiToIcebergMigrationSparkIntegration.java
@@ -0,0 +1,29 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.iceberg.hudi;
+
+import org.apache.spark.sql.SparkSession;
+
+public class HudiToIcebergMigrationSparkIntegration {
+  private HudiToIcebergMigrationSparkIntegration() {}
+
+  static SnapshotHudiTable snapshotHudiTable(SparkSession spark, String hudiTablePath) {
+    return new BaseSnapshotHudiTableAction(spark.sessionState().newHadoopConf(), hudiTablePath);
+  }
+}
diff --git a/hudi/src/integration/java/org/apache/iceberg/hudi/TestSnapshotHudiTable.java b/hudi/src/integration/java/org/apache/iceberg/hudi/TestSnapshotHudiTable.java
index 9bfc85c9cc11..060ddee428fd 100644
--- a/hudi/src/integration/java/org/apache/iceberg/hudi/TestSnapshotHudiTable.java
+++ b/hudi/src/integration/java/org/apache/iceberg/hudi/TestSnapshotHudiTable.java
@@ -231,6 +231,13 @@ public void TestHudiPartitionedTableWrite() {
     LOG.info("Generated partitioned dataframe: {}", df.showString(10, 20, false));
   }
 
+  @Test
+  public void TestHudiMetaClientAlpha() {
+    SnapshotHudiTable.Result result =
+        HudiToIcebergMigrationSparkIntegration.snapshotHudiTable(spark, unpartitionedLocation)
+            .execute();
+  }
+
   private String destName(String catalogName, String dest) {
     if (catalogName.equals(defaultSparkCatalog)) {
       return NAMESPACE + "." + catalogName + "_" + dest;
diff --git a/hudi/src/main/java/org/apache/iceberg/hudi/BaseSnapshotHudiTableAction.java b/hudi/src/main/java/org/apache/iceberg/hudi/BaseSnapshotHudiTableAction.java
new file mode 100644
index 000000000000..8da8746c1746
--- /dev/null
+++ b/hudi/src/main/java/org/apache/iceberg/hudi/BaseSnapshotHudiTableAction.java
@@ -0,0 +1,61 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.iceberg.hudi;
+
+import java.util.Map;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hudi.common.table.HoodieTableMetaClient;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+public class BaseSnapshotHudiTableAction implements SnapshotHudiTable {
+
+  private static final Logger LOG =
+      LoggerFactory.getLogger(BaseSnapshotHudiTableAction.class.getName());
+
+  private HoodieTableMetaClient HoodieMetaClient;
+
+  public BaseSnapshotHudiTableAction(
+      Configuration hoodieConfiguration, String hoodieTableBasePath) {
+    this.HoodieMetaClient = buildTableMetaClient(hoodieConfiguration, hoodieTableBasePath);
+  }
+
+  @Override
+  public SnapshotHudiTable tableProperties(Map<String, String> properties) {
+    return null;
+  }
+
+  @Override
+  public SnapshotHudiTable tableProperty(String key, String value) {
+    return null;
+  }
+
+  @Override
+  public Result execute() {
+    LOG.info("Alpha test: hoodie table base path: {}", HoodieMetaClient.getBasePathV2());
+
+    return null;
+  }
+
+  private static HoodieTableMetaClient buildTableMetaClient(Configuration conf, String basePath) {
+    HoodieTableMetaClient metaClient =
+        HoodieTableMetaClient.builder().setConf(conf).setBasePath(basePath).build();
+    return metaClient;
+  }
+}
diff --git a/hudi/src/main/java/org/apache/iceberg/hudi/BaseSnapshotHudiTableActionResult.java b/hudi/src/main/java/org/apache/iceberg/hudi/BaseSnapshotHudiTableActionResult.java
new file mode 100644
index 000000000000..ba6c85ab97d3
--- /dev/null
+++ b/hudi/src/main/java/org/apache/iceberg/hudi/BaseSnapshotHudiTableActionResult.java
@@ -0,0 +1,33 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.iceberg.hudi;
+
+public class BaseSnapshotHudiTableActionResult implements SnapshotHudiTable.Result {
+
+  private final long snapshotFilesCount;
+
+  public BaseSnapshotHudiTableActionResult(long snapshotFilesCount) {
+    this.snapshotFilesCount = snapshotFilesCount;
+  }
+
+  @Override
+  public long snapshotFilesCount() {
+    return snapshotFilesCount;
+  }
+}
diff --git a/hudi/src/main/java/org/apache/iceberg/hudi/HudiToIcebergMigrationActionsProvider.java b/hudi/src/main/java/org/apache/iceberg/hudi/HudiToIcebergMigrationActionsProvider.java
new file mode 100644
index 000000000000..0a1e0808af43
--- /dev/null
+++ b/hudi/src/main/java/org/apache/iceberg/hudi/HudiToIcebergMigrationActionsProvider.java
@@ -0,0 +1,41 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.iceberg.hudi;
+
+public interface HudiToIcebergMigrationActionsProvider {
+
+  default SnapshotHudiTable snapshotHudiTable() {
+    throw new UnsupportedOperationException("snapshotHudiTable is not supported");
+  }
+
+  static HudiToIcebergMigrationActionsProvider defaultProvider() {
+    return DefaultHudiToIcebergMigrationActions.defaultMigrationActions();
+  }
+
+  class DefaultHudiToIcebergMigrationActions implements HudiToIcebergMigrationActionsProvider {
+    private static final DefaultHudiToIcebergMigrationActions INSTANCE =
+        new DefaultHudiToIcebergMigrationActions();
+
+    private DefaultHudiToIcebergMigrationActions() {}
+
+    public static DefaultHudiToIcebergMigrationActions defaultMigrationActions() {
+      return INSTANCE;
+    }
+  }
+}
diff --git a/hudi/src/main/java/org/apache/iceberg/hudi/SnapshotHudiTable.java b/hudi/src/main/java/org/apache/iceberg/hudi/SnapshotHudiTable.java
new file mode 100644
index 000000000000..a5208809b314
--- /dev/null
+++ b/hudi/src/main/java/org/apache/iceberg/hudi/SnapshotHudiTable.java
@@ -0,0 +1,34 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.iceberg.hudi;
+
+import java.util.Map;
+import org.apache.iceberg.actions.Action;
+
+public interface SnapshotHudiTable extends Action<SnapshotHudiTable, SnapshotHudiTable.Result> {
+
+  SnapshotHudiTable tableProperties(Map<String, String> properties);
+
+  SnapshotHudiTable tableProperty(String key, String value);
+
+  interface Result {
+
+    long snapshotFilesCount();
+  }
+}

From dda6a1f76aa5bdbcd4616741bbde37498a8317b6 Mon Sep 17 00:00:00 2001
From: Rushan Jiang <rushanj@andrew.cmu.edu>
Date: Wed, 18 Jan 2023 13:54:34 -0500
Subject: [PATCH 04/20] test work

---
 build.gradle                                                   | 2 +-
 .../java/org/apache/iceberg/hudi/TestSnapshotHudiTable.java    | 3 ++-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/build.gradle b/build.gradle
index c0c0f05f1ae0..598d4d1a27ab 100644
--- a/build.gradle
+++ b/build.gradle
@@ -458,7 +458,7 @@ project(':iceberg-hudi') {
     implementation project(':iceberg-orc')
     implementation "com.fasterxml.jackson.core:jackson-databind"
 
-    compileOnly("org.apache.hudi:hudi-common:0.12.2")
+    compileOnly("org.apache.hudi:hudi-spark3.3-bundle_2.12:0.12.2")
 
     compileOnly("org.apache.hadoop:hadoop-common") {
       exclude group: 'org.apache.avro', module: 'avro'
diff --git a/hudi/src/integration/java/org/apache/iceberg/hudi/TestSnapshotHudiTable.java b/hudi/src/integration/java/org/apache/iceberg/hudi/TestSnapshotHudiTable.java
index 060ddee428fd..c6b36501ba67 100644
--- a/hudi/src/integration/java/org/apache/iceberg/hudi/TestSnapshotHudiTable.java
+++ b/hudi/src/integration/java/org/apache/iceberg/hudi/TestSnapshotHudiTable.java
@@ -233,8 +233,9 @@ public void TestHudiPartitionedTableWrite() {
 
   @Test
   public void TestHudiMetaClientAlpha() {
+    LOG.info("Alpha test reference: hoodie table path: {}", partitionedLocation);
     SnapshotHudiTable.Result result =
-        HudiToIcebergMigrationSparkIntegration.snapshotHudiTable(spark, unpartitionedLocation)
+        HudiToIcebergMigrationSparkIntegration.snapshotHudiTable(spark, partitionedLocation)
             .execute();
   }
 

From 0ef358366d46201f96e3b88e979c576493f2d8f1 Mon Sep 17 00:00:00 2001
From: Rushan Jiang <rushanj@andrew.cmu.edu>
Date: Thu, 19 Jan 2023 12:30:35 -0500
Subject: [PATCH 05/20] work out schema conversion

---
 build.gradle                                  |   3 +
 .../iceberg/hudi/TestSnapshotHudiTable.java   |  53 +++++++
 .../hudi/BaseSnapshotHudiTableAction.java     |  60 +++++++-
 .../iceberg/hudi/HudiDataTypeToType.java      | 136 ++++++++++++++++++
 .../iceberg/hudi/HudiDataTypeVisitor.java     |  59 ++++++++
 5 files changed, 306 insertions(+), 5 deletions(-)
 create mode 100644 hudi/src/main/java/org/apache/iceberg/hudi/HudiDataTypeToType.java
 create mode 100644 hudi/src/main/java/org/apache/iceberg/hudi/HudiDataTypeVisitor.java

diff --git a/build.gradle b/build.gradle
index 598d4d1a27ab..f02d0230c1cf 100644
--- a/build.gradle
+++ b/build.gradle
@@ -459,6 +459,9 @@ project(':iceberg-hudi') {
     implementation "com.fasterxml.jackson.core:jackson-databind"
 
     compileOnly("org.apache.hudi:hudi-spark3.3-bundle_2.12:0.12.2")
+    implementation("org.apache.avro:avro") {
+      exclude group: 'org.tukaani' // xz compression is not supported
+    }
 
     compileOnly("org.apache.hadoop:hadoop-common") {
       exclude group: 'org.apache.avro', module: 'avro'
diff --git a/hudi/src/integration/java/org/apache/iceberg/hudi/TestSnapshotHudiTable.java b/hudi/src/integration/java/org/apache/iceberg/hudi/TestSnapshotHudiTable.java
index c6b36501ba67..b48a7b07c32b 100644
--- a/hudi/src/integration/java/org/apache/iceberg/hudi/TestSnapshotHudiTable.java
+++ b/hudi/src/integration/java/org/apache/iceberg/hudi/TestSnapshotHudiTable.java
@@ -24,6 +24,7 @@
 import java.util.Map;
 import org.apache.hudi.DataSourceWriteOptions;
 import org.apache.hudi.QuickstartUtils;
+import org.apache.hudi.common.table.HoodieTableMetaClient;
 import org.apache.hudi.config.HoodieWriteConfig;
 import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap;
 import org.apache.iceberg.relocated.com.google.common.collect.Lists;
@@ -231,6 +232,58 @@ public void TestHudiPartitionedTableWrite() {
     LOG.info("Generated partitioned dataframe: {}", df.showString(10, 20, false));
   }
 
+  @Test
+  public void TestHudiMetaClientExploration() {
+    HoodieTableMetaClient hoodieTableMetaClient =
+        HoodieTableMetaClient.builder()
+            .setConf(spark.sessionState().newHadoopConf())
+            .setBasePath(partitionedLocation)
+            .setLoadActiveTimelineOnLoad(true)
+            .build();
+
+    LOG.info("Alpha test: hoodie table base path: {}", hoodieTableMetaClient.getBasePathV2());
+    LOG.info(
+        "Alpha test: hoodie getBootStrapIndexByFileId: {}",
+        hoodieTableMetaClient.getBootstrapIndexByFileIdFolderNameFolderPath());
+    LOG.info(
+        "Alpha test: hoodie getBootStrapIndexByPartitionPath: {}",
+        hoodieTableMetaClient.getBootstrapIndexByPartitionFolderPath());
+    LOG.info(
+        "Alpha test: hoodie getCommitActionType: {}", hoodieTableMetaClient.getCommitActionType());
+    LOG.info(
+        "Alpha test: hoodie getCommitsAndCompactionTimeline: {}",
+        hoodieTableMetaClient.getCommitsAndCompactionTimeline());
+    LOG.info(
+        "Alpha test: hoodie getCommitsTimeline: {}", hoodieTableMetaClient.getCommitsTimeline());
+    LOG.info("Alpha test: hoodie getCommitTimeline: {}", hoodieTableMetaClient.getCommitTimeline());
+    LOG.info(
+        "Alpha test: hoodie getConsistencyGuardConfig: {}",
+        hoodieTableMetaClient.getConsistencyGuardConfig().toString());
+    LOG.info(
+        "Alpha test: hoodie getFileSystemRetryConfig: {}",
+        hoodieTableMetaClient.getFileSystemRetryConfig().toString());
+    LOG.info(
+        "Alpha test: hoodie getHashingMetadataPath: {}",
+        hoodieTableMetaClient.getHashingMetadataPath());
+    LOG.info(
+        "Alpha test: hoodie getMetaAuxiliaryPath: {}",
+        hoodieTableMetaClient.getMetaAuxiliaryPath());
+    LOG.info("Alpha test: hoodie getMetaPath: {}", hoodieTableMetaClient.getMetaPath());
+    LOG.info(
+        "Alpha test: hoodie getMetastoreConfig: {}",
+        hoodieTableMetaClient.getMetastoreConfig().toString());
+    LOG.info(
+        "Alpha test: hoodie getSchemaFolderName: {}", hoodieTableMetaClient.getSchemaFolderName());
+    LOG.info(
+        "Alpha test: hoodie getTableConfig: {}", hoodieTableMetaClient.getTableConfig().toString());
+    LOG.info(
+        "Alpha test: hoodie getTableType: {}", hoodieTableMetaClient.getTableType().toString());
+    LOG.info("Alpha test: hoodie getTempFolderPath: {}", hoodieTableMetaClient.getTempFolderPath());
+    LOG.info(
+        "Alpha test: hoodie getTimelineLayoutVersion: {}",
+        hoodieTableMetaClient.getTimelineLayoutVersion());
+  }
+
   @Test
   public void TestHudiMetaClientAlpha() {
     LOG.info("Alpha test reference: hoodie table path: {}", partitionedLocation);
diff --git a/hudi/src/main/java/org/apache/iceberg/hudi/BaseSnapshotHudiTableAction.java b/hudi/src/main/java/org/apache/iceberg/hudi/BaseSnapshotHudiTableAction.java
index 8da8746c1746..292d348b1fa3 100644
--- a/hudi/src/main/java/org/apache/iceberg/hudi/BaseSnapshotHudiTableAction.java
+++ b/hudi/src/main/java/org/apache/iceberg/hudi/BaseSnapshotHudiTableAction.java
@@ -21,6 +21,13 @@
 import java.util.Map;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hudi.common.table.HoodieTableMetaClient;
+import org.apache.hudi.common.table.TableSchemaResolver;
+import org.apache.hudi.common.util.Option;
+import org.apache.hudi.exception.HoodieException;
+import org.apache.hudi.internal.schema.InternalSchema;
+import org.apache.hudi.internal.schema.convert.AvroInternalSchemaConverter;
+import org.apache.iceberg.Schema;
+import org.apache.iceberg.types.Type;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -29,11 +36,11 @@ public class BaseSnapshotHudiTableAction implements SnapshotHudiTable {
   private static final Logger LOG =
       LoggerFactory.getLogger(BaseSnapshotHudiTableAction.class.getName());
 
-  private HoodieTableMetaClient HoodieMetaClient;
+  private HoodieTableMetaClient hoodieTableMetaClient;
 
   public BaseSnapshotHudiTableAction(
       Configuration hoodieConfiguration, String hoodieTableBasePath) {
-    this.HoodieMetaClient = buildTableMetaClient(hoodieConfiguration, hoodieTableBasePath);
+    this.hoodieTableMetaClient = buildTableMetaClient(hoodieConfiguration, hoodieTableBasePath);
   }
 
   @Override
@@ -48,14 +55,57 @@ public SnapshotHudiTable tableProperty(String key, String value) {
 
   @Override
   public Result execute() {
-    LOG.info("Alpha test: hoodie table base path: {}", HoodieMetaClient.getBasePathV2());
-
+    LOG.info("Alpha test: hoodie table base path: {}", hoodieTableMetaClient.getBasePathV2());
+    LOG.info(
+        "Alpha test: hoodie getBootStrapIndexByFileId: {}",
+        hoodieTableMetaClient.getBootstrapIndexByFileIdFolderNameFolderPath());
+    LOG.info(
+        "Alpha test: hoodie getBootStrapIndexByPartitionPath: {}",
+        hoodieTableMetaClient.getBootstrapIndexByPartitionFolderPath());
+    InternalSchema hudiSchema = getHudiSchema();
+    LOG.info("Alpha test: hoodie table schema: {}", hudiSchema);
+    LOG.info("Alpha test: get record type: {}", hudiSchema.getRecord());
+    Schema icebergSchema = getIcebergSchema(hudiSchema);
+    LOG.info("Alpha test: get converted schema: {}", icebergSchema);
     return null;
   }
 
+  private InternalSchema getHudiSchema() {
+    TableSchemaResolver schemaUtil = new TableSchemaResolver(hoodieTableMetaClient);
+    Option<InternalSchema> hudiSchema = schemaUtil.getTableInternalSchemaFromCommitMetadata();
+    LOG.info("Alpha test: hoodie schema: {}", hudiSchema);
+    LOG.info("Alpha test: active timeline: {}", hoodieTableMetaClient.getActiveTimeline());
+    LOG.info(
+        "Alpha test: active timeline commit timeline: {}",
+        hoodieTableMetaClient.getActiveTimeline().getCommitsTimeline());
+    LOG.info(
+        "Alpha test: active timeline commit timeline instants: {}",
+        hoodieTableMetaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants());
+    // TODO: need to add support for parquet format table
+    return hudiSchema.orElseGet(
+        () -> {
+          try {
+            return AvroInternalSchemaConverter.convert(schemaUtil.getTableAvroSchema());
+          } catch (Exception e) {
+            throw new HoodieException("cannot find schema for current table", e);
+          }
+        });
+  }
+
+  private Schema getIcebergSchema(InternalSchema hudiSchema) {
+    Type converted =
+        HudiDataTypeVisitor.visit(
+            hudiSchema.getRecord(), new HudiDataTypeToType(hudiSchema.getRecord()));
+    return new Schema(converted.asNestedType().asStructType().fields());
+  }
+
   private static HoodieTableMetaClient buildTableMetaClient(Configuration conf, String basePath) {
     HoodieTableMetaClient metaClient =
-        HoodieTableMetaClient.builder().setConf(conf).setBasePath(basePath).build();
+        HoodieTableMetaClient.builder()
+            .setConf(conf)
+            .setBasePath(basePath)
+            .setLoadActiveTimelineOnLoad(true)
+            .build();
     return metaClient;
   }
 }
diff --git a/hudi/src/main/java/org/apache/iceberg/hudi/HudiDataTypeToType.java b/hudi/src/main/java/org/apache/iceberg/hudi/HudiDataTypeToType.java
new file mode 100644
index 000000000000..370e192fead8
--- /dev/null
+++ b/hudi/src/main/java/org/apache/iceberg/hudi/HudiDataTypeToType.java
@@ -0,0 +1,136 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.iceberg.hudi;
+
+import java.util.List;
+import org.apache.hudi.internal.schema.Types;
+import org.apache.iceberg.exceptions.ValidationException;
+import org.apache.iceberg.relocated.com.google.common.collect.Lists;
+import org.apache.iceberg.types.Type;
+
+public class HudiDataTypeToType extends HudiDataTypeVisitor<Type> {
+  private final Types.RecordType root;
+  private int nextId = 0;
+
+  HudiDataTypeToType() {
+    this.root = null;
+  }
+
+  HudiDataTypeToType(Types.RecordType root) {
+    this.root = root;
+    this.nextId = root.fields().size();
+  }
+
+  private int getNextId() {
+    int next = nextId;
+    nextId += 1;
+    return next;
+  }
+
+  @SuppressWarnings("ReferenceEquality")
+  @Override
+  public Type record(Types.RecordType record, List<Type> fieldResults) {
+    List<Types.Field> fields = record.fields();
+    List<org.apache.iceberg.types.Types.NestedField> newFields =
+        Lists.newArrayListWithExpectedSize(fields.size());
+    boolean isRoot = root == record;
+    for (int i = 0; i < fields.size(); i += 1) {
+      Types.Field field = fields.get(i);
+      Type type = fieldResults.get(i);
+      int id;
+      if (isRoot) {
+        id = i;
+      } else {
+        id = getNextId();
+      }
+
+      String doc = field.doc();
+      if (field.isOptional()) {
+        newFields.add(
+            org.apache.iceberg.types.Types.NestedField.optional(id, field.name(), type, doc));
+      } else {
+        newFields.add(
+            org.apache.iceberg.types.Types.NestedField.required(id, field.name(), type, doc));
+      }
+    }
+
+    return org.apache.iceberg.types.Types.StructType.of(newFields);
+  }
+
+  @Override
+  public Type field(Types.Field field, Type typeResult) {
+    return typeResult;
+  }
+
+  @Override
+  public Type map(Types.MapType map, Type keyResult, Type valueResult) {
+    if (map.isValueOptional()) {
+      return org.apache.iceberg.types.Types.MapType.ofOptional(
+          getNextId(), getNextId(), keyResult, valueResult);
+    } else {
+      return org.apache.iceberg.types.Types.MapType.ofRequired(
+          getNextId(), getNextId(), keyResult, valueResult);
+    }
+  }
+
+  @Override
+  public Type array(Types.ArrayType array, Type elementResult) {
+    if (array.isElementOptional()) {
+      return org.apache.iceberg.types.Types.ListType.ofOptional(getNextId(), elementResult);
+    } else {
+      return org.apache.iceberg.types.Types.ListType.ofRequired(getNextId(), elementResult);
+    }
+  }
+
+  @SuppressWarnings("checkstyle:CyclomaticComplexity")
+  @Override
+  public Type atomic(org.apache.hudi.internal.schema.Type atomic) {
+    if (atomic instanceof Types.BooleanType) {
+      return org.apache.iceberg.types.Types.BooleanType.get();
+    } else if (atomic instanceof Types.IntType) {
+      return org.apache.iceberg.types.Types.IntegerType.get();
+    } else if (atomic instanceof Types.LongType) {
+      return org.apache.iceberg.types.Types.LongType.get();
+    } else if (atomic instanceof Types.FloatType) {
+      return org.apache.iceberg.types.Types.FloatType.get();
+    } else if (atomic instanceof Types.DoubleType) {
+      return org.apache.iceberg.types.Types.DoubleType.get();
+    } else if (atomic instanceof Types.DateType) {
+      return org.apache.iceberg.types.Types.DateType.get();
+    } else if (atomic instanceof Types.TimestampType) {
+      return org.apache.iceberg.types.Types.TimestampType.withZone();
+    } else if (atomic instanceof Types.StringType) {
+      return org.apache.iceberg.types.Types.StringType.get();
+    } else if (atomic instanceof Types.BinaryType) {
+      return org.apache.iceberg.types.Types.BinaryType.get();
+    } else if (atomic instanceof Types.UUIDType) {
+      return org.apache.iceberg.types.Types.UUIDType.get();
+    } else if (atomic instanceof Types.DecimalType) {
+      return org.apache.iceberg.types.Types.DecimalType.of(
+          ((Types.DecimalType) atomic).precision(), ((Types.DecimalType) atomic).scale());
+    } else if (atomic instanceof Types.FixedType) {
+      return org.apache.iceberg.types.Types.FixedType.ofLength(
+          ((Types.FixedType) atomic).getFixedSize());
+    } else if (atomic instanceof Types.TimeType) {
+      return org.apache.iceberg.types.Types.TimeType.get();
+    }
+
+    throw new ValidationException("Not a supported type: %s", atomic.getClass().getName());
+  }
+}
diff --git a/hudi/src/main/java/org/apache/iceberg/hudi/HudiDataTypeVisitor.java b/hudi/src/main/java/org/apache/iceberg/hudi/HudiDataTypeVisitor.java
new file mode 100644
index 000000000000..caedeb5eba29
--- /dev/null
+++ b/hudi/src/main/java/org/apache/iceberg/hudi/HudiDataTypeVisitor.java
@@ -0,0 +1,59 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.iceberg.hudi;
+
+import java.util.List;
+import org.apache.hudi.internal.schema.Type;
+import org.apache.hudi.internal.schema.Types;
+import org.apache.iceberg.relocated.com.google.common.collect.Lists;
+
+public abstract class HudiDataTypeVisitor<T> {
+
+  public static <T> T visit(Type type, HudiDataTypeVisitor<T> visitor) {
+    if (type instanceof Types.RecordType) {
+      List<Types.Field> fields = ((Types.RecordType) type).fields();
+      List<T> fieldResults = Lists.newArrayListWithExpectedSize(fields.size());
+
+      for (Types.Field field : fields) {
+        fieldResults.add(visitor.field(field, visit(field.type(), visitor)));
+      }
+
+      return visitor.record((Types.RecordType) type, fieldResults);
+    } else if (type instanceof Types.MapType) {
+      return visitor.map(
+          (Types.MapType) type,
+          visit(((Types.MapType) type).keyType(), visitor),
+          visit(((Types.MapType) type).valueType(), visitor));
+    } else if (type instanceof Types.ArrayType) {
+      return visitor.array(
+          (Types.ArrayType) type, visit(((Types.ArrayType) type).elementType(), visitor));
+    }
+    return visitor.atomic(type);
+  }
+
+  public abstract T record(Types.RecordType record, List<T> fieldResults);
+
+  public abstract T field(Types.Field field, T typeResult);
+
+  public abstract T array(Types.ArrayType array, T elementResult);
+
+  public abstract T map(Types.MapType map, T keyResult, T valueResult);
+
+  public abstract T atomic(Type atomic);
+}

From ced38a1df2341b151cc96b4aca32c632d164cbc3 Mon Sep 17 00:00:00 2001
From: Rushan Jiang <rushanj@andrew.cmu.edu>
Date: Sat, 21 Jan 2023 01:02:17 -0500
Subject: [PATCH 06/20] rename some methods

---
 .../org/apache/iceberg/hudi/BaseSnapshotHudiTableAction.java | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/hudi/src/main/java/org/apache/iceberg/hudi/BaseSnapshotHudiTableAction.java b/hudi/src/main/java/org/apache/iceberg/hudi/BaseSnapshotHudiTableAction.java
index 292d348b1fa3..dbfdbb6082b9 100644
--- a/hudi/src/main/java/org/apache/iceberg/hudi/BaseSnapshotHudiTableAction.java
+++ b/hudi/src/main/java/org/apache/iceberg/hudi/BaseSnapshotHudiTableAction.java
@@ -65,7 +65,7 @@ public Result execute() {
     InternalSchema hudiSchema = getHudiSchema();
     LOG.info("Alpha test: hoodie table schema: {}", hudiSchema);
     LOG.info("Alpha test: get record type: {}", hudiSchema.getRecord());
-    Schema icebergSchema = getIcebergSchema(hudiSchema);
+    Schema icebergSchema = convertToIcebergSchema(hudiSchema);
     LOG.info("Alpha test: get converted schema: {}", icebergSchema);
     return null;
   }
@@ -81,7 +81,6 @@ private InternalSchema getHudiSchema() {
     LOG.info(
         "Alpha test: active timeline commit timeline instants: {}",
         hoodieTableMetaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants());
-    // TODO: need to add support for parquet format table
     return hudiSchema.orElseGet(
         () -> {
           try {
@@ -92,7 +91,7 @@ private InternalSchema getHudiSchema() {
         });
   }
 
-  private Schema getIcebergSchema(InternalSchema hudiSchema) {
+  private Schema convertToIcebergSchema(InternalSchema hudiSchema) {
     Type converted =
         HudiDataTypeVisitor.visit(
             hudiSchema.getRecord(), new HudiDataTypeToType(hudiSchema.getRecord()));

From 7286cab90e60ce0952b3a88002347804bd46365e Mon Sep 17 00:00:00 2001
From: Rushan Jiang <rushanj@andrew.cmu.edu>
Date: Sat, 21 Jan 2023 16:17:11 -0500
Subject: [PATCH 07/20] COW first draft, but currently cannot get file groups

---
 ...udiToIcebergMigrationSparkIntegration.java |  17 +-
 .../iceberg/hudi/TestSnapshotHudiTable.java   |   4 +-
 .../hudi/BaseSnapshotHudiTableAction.java     | 283 +++++++++++++++++-
 3 files changed, 298 insertions(+), 6 deletions(-)

diff --git a/hudi/src/integration/java/org/apache/iceberg/hudi/HudiToIcebergMigrationSparkIntegration.java b/hudi/src/integration/java/org/apache/iceberg/hudi/HudiToIcebergMigrationSparkIntegration.java
index ae213d99fb7f..cfeca68687b1 100644
--- a/hudi/src/integration/java/org/apache/iceberg/hudi/HudiToIcebergMigrationSparkIntegration.java
+++ b/hudi/src/integration/java/org/apache/iceberg/hudi/HudiToIcebergMigrationSparkIntegration.java
@@ -18,12 +18,25 @@
  */
 package org.apache.iceberg.hudi;
 
+import org.apache.iceberg.catalog.TableIdentifier;
+import org.apache.iceberg.spark.Spark3Util;
 import org.apache.spark.sql.SparkSession;
+import org.apache.spark.sql.connector.catalog.CatalogPlugin;
 
 public class HudiToIcebergMigrationSparkIntegration {
   private HudiToIcebergMigrationSparkIntegration() {}
 
-  static SnapshotHudiTable snapshotHudiTable(SparkSession spark, String hudiTablePath) {
-    return new BaseSnapshotHudiTableAction(spark.sessionState().newHadoopConf(), hudiTablePath);
+  static SnapshotHudiTable snapshotHudiTable(
+      SparkSession spark, String hudiTablePath, String newTableIdentifier) {
+    String ctx = "hudi snapshot target";
+    CatalogPlugin defaultCatalog = spark.sessionState().catalogManager().currentCatalog();
+    Spark3Util.CatalogAndIdentifier catalogAndIdentifier =
+        Spark3Util.catalogAndIdentifier(ctx, spark, newTableIdentifier, defaultCatalog);
+
+    return new BaseSnapshotHudiTableAction(
+        spark.sessionState().newHadoopConf(),
+        hudiTablePath,
+        Spark3Util.loadIcebergCatalog(spark, catalogAndIdentifier.catalog().name()),
+        TableIdentifier.parse(catalogAndIdentifier.identifier().toString()));
   }
 }
diff --git a/hudi/src/integration/java/org/apache/iceberg/hudi/TestSnapshotHudiTable.java b/hudi/src/integration/java/org/apache/iceberg/hudi/TestSnapshotHudiTable.java
index b48a7b07c32b..0111b1f20776 100644
--- a/hudi/src/integration/java/org/apache/iceberg/hudi/TestSnapshotHudiTable.java
+++ b/hudi/src/integration/java/org/apache/iceberg/hudi/TestSnapshotHudiTable.java
@@ -287,8 +287,10 @@ public void TestHudiMetaClientExploration() {
   @Test
   public void TestHudiMetaClientAlpha() {
     LOG.info("Alpha test reference: hoodie table path: {}", partitionedLocation);
+    String newTableIdentifier = destName(icebergCatalogName, "alpha_iceberg_table");
     SnapshotHudiTable.Result result =
-        HudiToIcebergMigrationSparkIntegration.snapshotHudiTable(spark, partitionedLocation)
+        HudiToIcebergMigrationSparkIntegration.snapshotHudiTable(
+                spark, partitionedLocation, newTableIdentifier)
             .execute();
   }
 
diff --git a/hudi/src/main/java/org/apache/iceberg/hudi/BaseSnapshotHudiTableAction.java b/hudi/src/main/java/org/apache/iceberg/hudi/BaseSnapshotHudiTableAction.java
index dbfdbb6082b9..241d97bf370b 100644
--- a/hudi/src/main/java/org/apache/iceberg/hudi/BaseSnapshotHudiTableAction.java
+++ b/hudi/src/main/java/org/apache/iceberg/hudi/BaseSnapshotHudiTableAction.java
@@ -18,15 +18,58 @@
  */
 package org.apache.iceberg.hudi;
 
+import java.util.List;
 import java.util.Map;
+import java.util.stream.Collectors;
+import java.util.stream.Stream;
+import javax.annotation.Nullable;
 import org.apache.hadoop.conf.Configuration;
+import org.apache.hudi.common.config.HoodieMetadataConfig;
+import org.apache.hudi.common.engine.HoodieEngineContext;
+import org.apache.hudi.common.engine.HoodieLocalEngineContext;
+import org.apache.hudi.common.model.HoodieBaseFile;
+import org.apache.hudi.common.model.HoodieFileGroup;
+import org.apache.hudi.common.model.HoodieFileGroupId;
+import org.apache.hudi.common.table.HoodieTableConfig;
 import org.apache.hudi.common.table.HoodieTableMetaClient;
 import org.apache.hudi.common.table.TableSchemaResolver;
+import org.apache.hudi.common.table.timeline.HoodieInstant;
+import org.apache.hudi.common.table.timeline.HoodieTimeline;
+import org.apache.hudi.common.table.view.FileSystemViewManager;
+import org.apache.hudi.common.table.view.HoodieTableFileSystemView;
 import org.apache.hudi.common.util.Option;
 import org.apache.hudi.exception.HoodieException;
+import org.apache.hudi.hadoop.utils.HoodieInputFormatUtils;
 import org.apache.hudi.internal.schema.InternalSchema;
 import org.apache.hudi.internal.schema.convert.AvroInternalSchemaConverter;
+import org.apache.iceberg.AppendFiles;
+import org.apache.iceberg.DataFile;
+import org.apache.iceberg.DataFiles;
+import org.apache.iceberg.DeleteFiles;
+import org.apache.iceberg.FileFormat;
+import org.apache.iceberg.Metrics;
+import org.apache.iceberg.MetricsConfig;
+import org.apache.iceberg.OverwriteFiles;
+import org.apache.iceberg.PartitionSpec;
 import org.apache.iceberg.Schema;
+import org.apache.iceberg.SnapshotSummary;
+import org.apache.iceberg.Table;
+import org.apache.iceberg.TableProperties;
+import org.apache.iceberg.Transaction;
+import org.apache.iceberg.avro.Avro;
+import org.apache.iceberg.catalog.Catalog;
+import org.apache.iceberg.catalog.TableIdentifier;
+import org.apache.iceberg.exceptions.ValidationException;
+import org.apache.iceberg.hadoop.HadoopFileIO;
+import org.apache.iceberg.io.InputFile;
+import org.apache.iceberg.mapping.MappingUtil;
+import org.apache.iceberg.mapping.NameMapping;
+import org.apache.iceberg.mapping.NameMappingParser;
+import org.apache.iceberg.orc.OrcMetrics;
+import org.apache.iceberg.parquet.ParquetUtil;
+import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap;
+import org.apache.iceberg.relocated.com.google.common.collect.Lists;
+import org.apache.iceberg.relocated.com.google.common.collect.Maps;
 import org.apache.iceberg.types.Type;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -35,12 +78,35 @@ public class BaseSnapshotHudiTableAction implements SnapshotHudiTable {
 
   private static final Logger LOG =
       LoggerFactory.getLogger(BaseSnapshotHudiTableAction.class.getName());
-
+  private static final String SNAPSHOT_SOURCE_PROP = "snapshot_source";
+  private static final String HOODIE_SOURCE_VALUE = "hudi";
+  private static final String ORIGINAL_LOCATION_PROP = "original_location";
+  private static final String PARQUET_SUFFIX = ".parquet";
+  private static final String AVRO_SUFFIX = ".avro";
+  private static final String ORC_SUFFIX = ".orc";
   private HoodieTableMetaClient hoodieTableMetaClient;
+  private HoodieTableConfig hoodieTableConfig;
+  private HoodieEngineContext hoodieEngineContext;
+  private HoodieMetadataConfig hoodieMetadataConfig;
+  private String hoodieTableBasePath;
+  private Catalog icebergCatalog;
+  private TableIdentifier newTableIdentifier;
+  private HadoopFileIO hoodieFileIO;
+  private ImmutableMap.Builder<String, String> additionalPropertiesBuilder = ImmutableMap.builder();
 
   public BaseSnapshotHudiTableAction(
-      Configuration hoodieConfiguration, String hoodieTableBasePath) {
+      Configuration hoodieConfiguration,
+      String hoodieTableBasePath,
+      Catalog icebergCatalog,
+      TableIdentifier newTableIdentifier) {
     this.hoodieTableMetaClient = buildTableMetaClient(hoodieConfiguration, hoodieTableBasePath);
+    this.hoodieTableConfig = hoodieTableMetaClient.getTableConfig();
+    this.hoodieEngineContext = new HoodieLocalEngineContext(hoodieConfiguration);
+    this.hoodieTableBasePath = hoodieTableBasePath;
+    this.hoodieMetadataConfig = HoodieInputFormatUtils.buildMetadataConfig(hoodieConfiguration);
+    this.hoodieFileIO = new HadoopFileIO(hoodieConfiguration);
+    this.icebergCatalog = icebergCatalog;
+    this.newTableIdentifier = newTableIdentifier;
   }
 
   @Override
@@ -67,7 +133,165 @@ public Result execute() {
     LOG.info("Alpha test: get record type: {}", hudiSchema.getRecord());
     Schema icebergSchema = convertToIcebergSchema(hudiSchema);
     LOG.info("Alpha test: get converted schema: {}", icebergSchema);
-    return null;
+    PartitionSpec partitionSpec = getPartitionSpecFromHoodieMetadataData(icebergSchema);
+    LOG.info("Alpha test: get partition spec: {}", partitionSpec);
+    // TODO: add support for newTableLocation
+    Transaction icebergTransaction =
+        icebergCatalog.newCreateTableTransaction(
+            newTableIdentifier, icebergSchema, partitionSpec, destTableProperties());
+    icebergTransaction
+        .table()
+        .updateProperties()
+        .set(
+            TableProperties.DEFAULT_NAME_MAPPING,
+            NameMappingParser.toJson(MappingUtil.create(icebergTransaction.table().schema())))
+        .commit();
+
+    HoodieTimeline timeline =
+        hoodieTableMetaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants();
+    LOG.info("Alpha test: hoodie timeline: {}", timeline);
+    HoodieTableFileSystemView hoodieTableFileSystemView =
+        FileSystemViewManager.createInMemoryFileSystemViewWithTimeline(
+            hoodieEngineContext, hoodieTableMetaClient, hoodieMetadataConfig, timeline);
+    Stream<HoodieInstant> completedInstants = timeline.getInstants();
+    LOG.info("Alpha test: get completed instants: {}", completedInstants);
+    // file group id -> Map<timestamp, HoodieBaseFile>
+    Map<HoodieFileGroupId, Map<String, HoodieBaseFile>> allStampedDataFiles =
+        hoodieTableFileSystemView
+            .getAllFileGroups()
+            .collect(
+                ImmutableMap.toImmutableMap(
+                    HoodieFileGroup::getFileGroupId,
+                    fileGroup ->
+                        fileGroup
+                            .getAllBaseFiles()
+                            .collect(
+                                ImmutableMap.toImmutableMap(
+                                    HoodieBaseFile::getCommitTime, baseFile -> baseFile))));
+    List<HoodieFileGroup> testGroups =
+        hoodieTableFileSystemView.getAllFileGroups().collect(Collectors.toList());
+    LOG.info("Alpha test: get all stamped data files: {}", allStampedDataFiles);
+    LOG.info("Alpha test: get all file groups: {}", testGroups);
+    Map<HoodieFileGroupId, DataFile> convertedDataFiles = Maps.newHashMap();
+    completedInstants.forEachOrdered(
+        instant -> {
+          LOG.info("Alpha test: get completed instant: {}", instant);
+          // copyInstants to iceberg table
+          // TODO: need to verify the order of the instants, make sure it is from the oldest to the
+          // newest
+          commitHoodieInstantToIcebergTransaction(
+              instant,
+              hoodieTableFileSystemView.getAllFileGroups(),
+              allStampedDataFiles,
+              convertedDataFiles,
+              icebergTransaction);
+        });
+
+    long totalDataFiles =
+        Long.parseLong(
+            icebergTransaction
+                .table()
+                .currentSnapshot()
+                .summary()
+                .get(SnapshotSummary.TOTAL_DATA_FILES_PROP));
+    icebergTransaction.commitTransaction();
+    LOG.info(
+        "Successfully created Iceberg table {} from hudi table at {}, total data file count: {}",
+        newTableIdentifier,
+        hoodieTableBasePath,
+        totalDataFiles);
+    return new BaseSnapshotHudiTableActionResult(totalDataFiles);
+  }
+
+  public void commitHoodieInstantToIcebergTransaction(
+      HoodieInstant instant,
+      Stream<HoodieFileGroup> fileGroups,
+      Map<HoodieFileGroupId, Map<String, HoodieBaseFile>> allStampedDataFiles,
+      Map<HoodieFileGroupId, DataFile> convertedDataFiles,
+      Transaction transaction) {
+    List<DataFile> filesToAdd = Lists.newArrayList();
+    List<DataFile> filesToRemove = Lists.newArrayList();
+
+    // TODO: need to add synchronization if want to rely on parallelism here
+    fileGroups
+        .sequential()
+        .forEach(
+            fileGroup -> {
+              HoodieFileGroupId fileGroupId = fileGroup.getFileGroupId();
+              LOG.info("Alpha test: get file group: {}", fileGroup);
+              DataFile currentDataFile =
+                  buildDataFileFromHoodieBaseFile(
+                      instant,
+                      fileGroup,
+                      allStampedDataFiles.get(fileGroupId),
+                      transaction.table());
+              if (currentDataFile != null) {
+                filesToAdd.add(currentDataFile);
+                DataFile previousDataFile = convertedDataFiles.get(fileGroupId);
+                if (previousDataFile != null) {
+                  // need to delete the previous data file since a new version will be added
+                  filesToRemove.add(previousDataFile);
+                }
+                convertedDataFiles.put(fileGroupId, currentDataFile);
+              }
+            });
+    LOG.info("Alpha test: get files to add: {} at instant {}", filesToAdd, instant);
+    // filesToRemove only grows together with filesToAdd (see the loop above), so a delete-only
+    // commit is impossible; an instant that wrote no base file produces no Iceberg snapshot.
+    if (filesToAdd.isEmpty()) {
+      return;
+    }
+    if (!filesToRemove.isEmpty()) {
+      // OverwriteFiles case: new versions replace files converted at an earlier instant
+      OverwriteFiles overwriteFiles = transaction.newOverwrite();
+      filesToAdd.forEach(overwriteFiles::addFile);
+      filesToRemove.forEach(overwriteFiles::deleteFile);
+      overwriteFiles.commit();
+    } else {
+      // AppendFiles case: no previously converted file is superseded at this instant
+      AppendFiles appendFiles = transaction.newAppend();
+      filesToAdd.forEach(appendFiles::appendFile);
+      appendFiles.commit();
+    }
+  }
+
+  @Nullable
+  private DataFile buildDataFileFromHoodieBaseFile(
+      HoodieInstant instant,
+      HoodieFileGroup fileGroup,
+      Map<String, HoodieBaseFile> stampedDataFiles,
+      Table table) {
+    HoodieBaseFile baseFile = stampedDataFiles.get(instant.getTimestamp());
+    if (baseFile == null) {
+      LOG.info(
+          "Alpha test: does not have base file for instant: {}, fileGroupId {}",
+          instant,
+          fileGroup.getFileGroupId());
+      return null;
+    }
+
+    PartitionSpec spec = table.spec();
+    // TODO: need to verify the path is absolute
+    String path = baseFile.getPath();
+    long fileSize = baseFile.getFileSize();
+    String partitionPath = fileGroup.getPartitionPath();
+
+    MetricsConfig metricsConfig = MetricsConfig.forTable(table);
+    String nameMappingString = table.properties().get(TableProperties.DEFAULT_NAME_MAPPING);
+    NameMapping nameMapping =
+        nameMappingString != null ? NameMappingParser.fromJson(nameMappingString) : null;
+
+    InputFile file = hoodieFileIO.newInputFile(path);
+    FileFormat format = determineFileFormatFromPath(path);
+    Metrics metrics = getMetricsForFile(file, format, metricsConfig, nameMapping);
+
+    return DataFiles.builder(spec)
+        .withPath(path)
+        .withFormat(format)
+        .withFileSizeInBytes(fileSize)
+        .withPartitionPath(partitionPath) // TODO: need to verify the partition path is correct
+        .withMetrics(metrics)
+        .build();
   }
 
   private InternalSchema getHudiSchema() {
@@ -98,6 +322,32 @@ private Schema convertToIcebergSchema(InternalSchema hudiSchema) {
     return new Schema(converted.asNestedType().asStructType().fields());
   }
 
+  private PartitionSpec getPartitionSpecFromHoodieMetadataData(Schema schema) {
+    Option<String[]> configuredFields = hoodieTableConfig.getPartitionFields();
+    if (!configuredFields.isPresent()) {
+      return PartitionSpec.unpartitioned();
+    }
+    // each partition field named in the hoodie table config becomes an identity partition
+    PartitionSpec.Builder specBuilder = PartitionSpec.builderFor(schema);
+    for (String fieldName : configuredFields.get()) {
+      specBuilder.identity(fieldName);
+    }
+    return specBuilder.build();
+  }
+
+  private Map<String, String> destTableProperties() {
+    // Merge into a local map rather than the shared builder: the builder rejects duplicate keys
+    // at build(), so a repeated call or a key it already holds would throw. Later puts win.
+    // TODO: need to check which hoodie properties to add to
+    Map<String, String> properties = Maps.newHashMap();
+    properties.putAll(hoodieTableConfig.propsMap());
+    properties.putAll(additionalPropertiesBuilder.build());
+    properties.put(SNAPSHOT_SOURCE_PROP, HOODIE_SOURCE_VALUE);
+    properties.put(ORIGINAL_LOCATION_PROP, hoodieTableMetaClient.getBasePathV2().toString());
+
+    return ImmutableMap.copyOf(properties);
+  }
+
   private static HoodieTableMetaClient buildTableMetaClient(Configuration conf, String basePath) {
     HoodieTableMetaClient metaClient =
         HoodieTableMetaClient.builder()
@@ -107,4 +357,31 @@ private static HoodieTableMetaClient buildTableMetaClient(Configuration conf, St
             .build();
     return metaClient;
   }
+
+  private FileFormat determineFileFormatFromPath(String path) {
+    // only the file name suffix is inspected; the file itself is never opened here
+    boolean isParquet = path.endsWith(PARQUET_SUFFIX);
+    boolean isAvro = path.endsWith(AVRO_SUFFIX);
+    if (!isParquet && !isAvro && !path.endsWith(ORC_SUFFIX)) {
+      throw new ValidationException("Cannot determine file format from path %s", path);
+    }
+
+    FileFormat avroOrOrc = isAvro ? FileFormat.AVRO : FileFormat.ORC;
+    return isParquet ? FileFormat.PARQUET : avroOrOrc;
+  }
+
+  private Metrics getMetricsForFile(
+      InputFile file, FileFormat format, MetricsConfig metricsSpec, NameMapping mapping) {
+    switch (format) {
+      case PARQUET:
+        return ParquetUtil.fileMetrics(file, metricsSpec, mapping);
+      case ORC:
+        return OrcMetrics.fromInputFile(file, metricsSpec, mapping);
+      case AVRO:
+        // for Avro only the row count is populated; the remaining metrics are left null
+        return new Metrics(Avro.rowCount(file), null, null, null, null);
+      default:
+        throw new ValidationException("Cannot get metrics from file format: %s", format);
+    }
+  }
 }

From bbb5c3649d743bde1caf242c209bd8560db41965 Mon Sep 17 00:00:00 2001
From: Rushan Jiang <rushanj@andrew.cmu.edu>
Date: Sat, 21 Jan 2023 23:58:07 -0500
Subject: [PATCH 08/20] prepare for draft PR discussion

---
 build.gradle                                  |  3 +
 .../iceberg/hudi/TestSnapshotHudiTable.java   |  8 +--
 .../hudi/BaseSnapshotHudiTableAction.java     | 61 ++++++++++++++++---
 3 files changed, 59 insertions(+), 13 deletions(-)

diff --git a/build.gradle b/build.gradle
index f02d0230c1cf..a2c284166c49 100644
--- a/build.gradle
+++ b/build.gradle
@@ -458,6 +458,9 @@ project(':iceberg-hudi') {
     implementation project(':iceberg-orc')
     implementation "com.fasterxml.jackson.core:jackson-databind"
 
+    // TODO: we only need hudi-common here, however, hudi-common has some dependency conflicts with hudi-spark-bundle
+    // which is currently used by the integration test. We should fix this in the future.
+    // Also, hudi uses java8, may need to assess if we can use hudi in java11.
     compileOnly("org.apache.hudi:hudi-spark3.3-bundle_2.12:0.12.2")
     implementation("org.apache.avro:avro") {
       exclude group: 'org.tukaani' // xz compression is not supported
diff --git a/hudi/src/integration/java/org/apache/iceberg/hudi/TestSnapshotHudiTable.java b/hudi/src/integration/java/org/apache/iceberg/hudi/TestSnapshotHudiTable.java
index 0111b1f20776..b28d9289a126 100644
--- a/hudi/src/integration/java/org/apache/iceberg/hudi/TestSnapshotHudiTable.java
+++ b/hudi/src/integration/java/org/apache/iceberg/hudi/TestSnapshotHudiTable.java
@@ -219,21 +219,21 @@ public void before() throws IOException {
   }
 
   @Test
-  public void TestHudiUnpartitionedTableWrite() {
+  public void testHudiUnpartitionedTableWrite() {
     Dataset<Row> df = spark.read().format("hudi").load(unpartitionedLocation);
     LOG.info("Generated unpartitioned dataframe shcema: {}", df.schema().treeString());
     LOG.info("Generated unpartitioned dataframe: {}", df.showString(10, 20, false));
   }
 
   @Test
-  public void TestHudiPartitionedTableWrite() {
+  public void testHudiPartitionedTableWrite() {
     Dataset<Row> df = spark.read().format("hudi").load(partitionedLocation);
     LOG.info("Generated partitioned dataframe shcema: {}", df.schema().treeString());
     LOG.info("Generated partitioned dataframe: {}", df.showString(10, 20, false));
   }
 
   @Test
-  public void TestHudiMetaClientExploration() {
+  public void testHudiMetaClientExploration() {
     HoodieTableMetaClient hoodieTableMetaClient =
         HoodieTableMetaClient.builder()
             .setConf(spark.sessionState().newHadoopConf())
@@ -285,7 +285,7 @@ public void TestHudiMetaClientExploration() {
   }
 
   @Test
-  public void TestHudiMetaClientAlpha() {
+  public void testHudiMetaClientAlpha() {
     LOG.info("Alpha test reference: hoodie table path: {}", partitionedLocation);
     String newTableIdentifier = destName(icebergCatalogName, "alpha_iceberg_table");
     SnapshotHudiTable.Result result =
diff --git a/hudi/src/main/java/org/apache/iceberg/hudi/BaseSnapshotHudiTableAction.java b/hudi/src/main/java/org/apache/iceberg/hudi/BaseSnapshotHudiTableAction.java
index 241d97bf370b..3fc11b84fc22 100644
--- a/hudi/src/main/java/org/apache/iceberg/hudi/BaseSnapshotHudiTableAction.java
+++ b/hudi/src/main/java/org/apache/iceberg/hudi/BaseSnapshotHudiTableAction.java
@@ -52,6 +52,7 @@
 import org.apache.iceberg.OverwriteFiles;
 import org.apache.iceberg.PartitionSpec;
 import org.apache.iceberg.Schema;
+import org.apache.iceberg.Snapshot;
 import org.apache.iceberg.SnapshotSummary;
 import org.apache.iceberg.Table;
 import org.apache.iceberg.TableProperties;
@@ -128,6 +129,8 @@ public Result execute() {
     LOG.info(
         "Alpha test: hoodie getBootStrapIndexByPartitionPath: {}",
         hoodieTableMetaClient.getBootstrapIndexByPartitionFolderPath());
+
+    // Convert Hoodie table schema to Iceberg schema and extract the partition spec
     InternalSchema hudiSchema = getHudiSchema();
     LOG.info("Alpha test: hoodie table schema: {}", hudiSchema);
     LOG.info("Alpha test: get record type: {}", hudiSchema.getRecord());
@@ -135,10 +138,15 @@ public Result execute() {
     LOG.info("Alpha test: get converted schema: {}", icebergSchema);
     PartitionSpec partitionSpec = getPartitionSpecFromHoodieMetadataData(icebergSchema);
     LOG.info("Alpha test: get partition spec: {}", partitionSpec);
+
     // TODO: add support for newTableLocation
     Transaction icebergTransaction =
         icebergCatalog.newCreateTableTransaction(
             newTableIdentifier, icebergSchema, partitionSpec, destTableProperties());
+    // A name mapping is required to read the existing data files correctly, because Iceberg
+    // assigns field ids by its own rule.
+    // Although that rule currently seems to match Hudi's, the match is not guaranteed by any
+    // API.
     icebergTransaction
         .table()
         .updateProperties()
@@ -147,15 +155,21 @@ public Result execute() {
             NameMappingParser.toJson(MappingUtil.create(icebergTransaction.table().schema())))
         .commit();
 
+    // Pre-process the timeline, we only need to process all COMPLETED commit for COW table
+    // Commit that has been rollbacked will not be in either REQUESTED or INFLIGHT state
     HoodieTimeline timeline =
         hoodieTableMetaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants();
     LOG.info("Alpha test: hoodie timeline: {}", timeline);
+    // Initialize the FileSystemView for querying table data files
+    // TODO: need to choose the correct implementation of the FileSystemView
     HoodieTableFileSystemView hoodieTableFileSystemView =
         FileSystemViewManager.createInMemoryFileSystemViewWithTimeline(
             hoodieEngineContext, hoodieTableMetaClient, hoodieMetadataConfig, timeline);
+    // get all instants on the timeline
     Stream<HoodieInstant> completedInstants = timeline.getInstants();
     LOG.info("Alpha test: get completed instants: {}", completedInstants);
     // file group id -> Map<timestamp, HoodieBaseFile>
+    // This pre-process aims to make a timestamp to HoodieBaseFile map for each file group
     Map<HoodieFileGroupId, Map<String, HoodieBaseFile>> allStampedDataFiles =
         hoodieTableFileSystemView
             .getAllFileGroups()
@@ -168,17 +182,24 @@ public Result execute() {
                             .collect(
                                 ImmutableMap.toImmutableMap(
                                     HoodieBaseFile::getCommitTime, baseFile -> baseFile))));
+    // BEGIN TEST ONLY CODE
     List<HoodieFileGroup> testGroups =
         hoodieTableFileSystemView.getAllFileGroups().collect(Collectors.toList());
     LOG.info("Alpha test: get all stamped data files: {}", allStampedDataFiles);
     LOG.info("Alpha test: get all file groups: {}", testGroups);
+    // END TEST ONLY CODE
+
+    // Tracks whether a previous version of the data file has been added to the iceberg table
     Map<HoodieFileGroupId, DataFile> convertedDataFiles = Maps.newHashMap();
+    // Replay the timeline from beginning to the end
     completedInstants.forEachOrdered(
         instant -> {
           LOG.info("Alpha test: get completed instant: {}", instant);
           // copyInstants to iceberg table
           // TODO: need to verify the order of the instants, make sure it is from the oldest to the
           // newest
+
+          // commit each instant as a transaction to the iceberg table
           commitHoodieInstantToIcebergTransaction(
               instant,
               hoodieTableFileSystemView.getAllFileGroups(),
@@ -186,14 +207,11 @@ public Result execute() {
               convertedDataFiles,
               icebergTransaction);
         });
-
+    Snapshot icebergSnapshot = icebergTransaction.table().currentSnapshot();
     long totalDataFiles =
-        Long.parseLong(
-            icebergTransaction
-                .table()
-                .currentSnapshot()
-                .summary()
-                .get(SnapshotSummary.TOTAL_DATA_FILES_PROP));
+        icebergSnapshot != null
+            ? Long.parseLong(icebergSnapshot.summary().get(SnapshotSummary.TOTAL_DATA_FILES_PROP))
+            : 0;
     icebergTransaction.commitTransaction();
     LOG.info(
         "Successfully created Iceberg table {} from hudi table at {}, total data file count: {}",
@@ -203,6 +221,17 @@ public Result execute() {
     return new BaseSnapshotHudiTableActionResult(totalDataFiles);
   }
 
+  /**
+   * In COW Hoodie table, each file group is a combination of different versions of the same data
+   * file.
+   *
+   * <p>During each write, a new version of the file will be copied and modified to be a new version
+   * in the file group. Therefore, when committing the datafile to the iceberg table, we need to
+   * make sure that the older version of the data file is deleted before adding the newer version of
+   * the data file.
+   *
+   * <p>In other words, the COW behavior can be mapped to the overwrite operation in the iceberg.
+   */
   public void commitHoodieInstantToIcebergTransaction(
       HoodieInstant instant,
       Stream<HoodieFileGroup> fileGroups,
@@ -212,7 +241,7 @@ public void commitHoodieInstantToIcebergTransaction(
     List<DataFile> filesToAdd = Lists.newArrayList();
     List<DataFile> filesToRemove = Lists.newArrayList();
 
-    // TODO: need to add synchronization if want to rely on parallelism here
+    // TODO: may need to add synchronization lock for parallelism
     fileGroups
         .sequential()
         .forEach(
@@ -225,13 +254,17 @@ public void commitHoodieInstantToIcebergTransaction(
                       fileGroup,
                       allStampedDataFiles.get(fileGroupId),
                       transaction.table());
+
               if (currentDataFile != null) {
                 filesToAdd.add(currentDataFile);
+
                 DataFile previousDataFile = convertedDataFiles.get(fileGroupId);
                 if (previousDataFile != null) {
                   // need to delete the previous data file since a new version will be added
                   filesToRemove.add(previousDataFile);
                 }
+
+                // update the converted data file map
                 convertedDataFiles.put(fileGroupId, currentDataFile);
               }
             });
@@ -271,7 +304,7 @@ private DataFile buildDataFileFromHoodieBaseFile(
     }
 
     PartitionSpec spec = table.spec();
-    // TODO: need to verify the path is absolute
+    // TODO: need to verify the path is absolute (the field's name is fullPath)
     String path = baseFile.getPath();
     long fileSize = baseFile.getFileSize();
     String partitionPath = fileGroup.getPartitionPath();
@@ -294,6 +327,11 @@ private DataFile buildDataFileFromHoodieBaseFile(
         .build();
   }
 
+  /**
+   * Taken from <a
+   * href="https://github.com/apache/hudi/blob/a70355f44571036d7f99b3ca3cb240674bd1cf91/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/BaseHoodieWriteClient.java#L1405-L1414">getInternalSchema</a>
+   * in HoodieWriteClient.
+   */
   private InternalSchema getHudiSchema() {
     TableSchemaResolver schemaUtil = new TableSchemaResolver(hoodieTableMetaClient);
     Option<InternalSchema> hudiSchema = schemaUtil.getTableInternalSchemaFromCommitMetadata();
@@ -315,6 +353,11 @@ private InternalSchema getHudiSchema() {
         });
   }
 
+  /**
+   * Use nested type visitor to convert the internal schema to iceberg schema.
+   *
+   * <p>just like what we did with spark table's schema and delta lake table's schema.
+   */
   private Schema convertToIcebergSchema(InternalSchema hudiSchema) {
     Type converted =
         HudiDataTypeVisitor.visit(

From 44f7f693e35e642cbc3d3865f075572c064a5468 Mon Sep 17 00:00:00 2001
From: Rushan Jiang <rushanj@andrew.cmu.edu>
Date: Tue, 24 Jan 2023 22:28:43 -0500
Subject: [PATCH 09/20] fix get all file groups

---
 .../hudi/BaseSnapshotHudiTableAction.java     | 44 ++++++++++---------
 1 file changed, 24 insertions(+), 20 deletions(-)

diff --git a/hudi/src/main/java/org/apache/iceberg/hudi/BaseSnapshotHudiTableAction.java b/hudi/src/main/java/org/apache/iceberg/hudi/BaseSnapshotHudiTableAction.java
index 3fc11b84fc22..25be14d4db2d 100644
--- a/hudi/src/main/java/org/apache/iceberg/hudi/BaseSnapshotHudiTableAction.java
+++ b/hudi/src/main/java/org/apache/iceberg/hudi/BaseSnapshotHudiTableAction.java
@@ -18,6 +18,7 @@
  */
 package org.apache.iceberg.hudi;
 
+import java.io.IOException;
 import java.util.List;
 import java.util.Map;
 import java.util.stream.Collectors;
@@ -27,6 +28,7 @@
 import org.apache.hudi.common.config.HoodieMetadataConfig;
 import org.apache.hudi.common.engine.HoodieEngineContext;
 import org.apache.hudi.common.engine.HoodieLocalEngineContext;
+import org.apache.hudi.common.fs.FSUtils;
 import org.apache.hudi.common.model.HoodieBaseFile;
 import org.apache.hudi.common.model.HoodieFileGroup;
 import org.apache.hudi.common.model.HoodieFileGroupId;
@@ -42,6 +44,9 @@
 import org.apache.hudi.hadoop.utils.HoodieInputFormatUtils;
 import org.apache.hudi.internal.schema.InternalSchema;
 import org.apache.hudi.internal.schema.convert.AvroInternalSchemaConverter;
+import org.apache.hudi.metadata.HoodieTableMetadata;
+
+import org.apache.hadoop.fs.Path;
 import org.apache.iceberg.AppendFiles;
 import org.apache.iceberg.DataFile;
 import org.apache.iceberg.DataFiles;
@@ -122,22 +127,11 @@ public SnapshotHudiTable tableProperty(String key, String value) {
 
   @Override
   public Result execute() {
-    LOG.info("Alpha test: hoodie table base path: {}", hoodieTableMetaClient.getBasePathV2());
-    LOG.info(
-        "Alpha test: hoodie getBootStrapIndexByFileId: {}",
-        hoodieTableMetaClient.getBootstrapIndexByFileIdFolderNameFolderPath());
-    LOG.info(
-        "Alpha test: hoodie getBootStrapIndexByPartitionPath: {}",
-        hoodieTableMetaClient.getBootstrapIndexByPartitionFolderPath());
 
     // Convert Hoodie table schema to Iceberg schema and extract the partition spec
     InternalSchema hudiSchema = getHudiSchema();
-    LOG.info("Alpha test: hoodie table schema: {}", hudiSchema);
-    LOG.info("Alpha test: get record type: {}", hudiSchema.getRecord());
     Schema icebergSchema = convertToIcebergSchema(hudiSchema);
-    LOG.info("Alpha test: get converted schema: {}", icebergSchema);
     PartitionSpec partitionSpec = getPartitionSpecFromHoodieMetadataData(icebergSchema);
-    LOG.info("Alpha test: get partition spec: {}", partitionSpec);
 
     // TODO: add support for newTableLocation
     Transaction icebergTransaction =
@@ -159,20 +153,29 @@ public Result execute() {
     // Commit that has been rollbacked will not be in either REQUESTED or INFLIGHT state
     HoodieTimeline timeline =
         hoodieTableMetaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants();
-    LOG.info("Alpha test: hoodie timeline: {}", timeline);
     // Initialize the FileSystemView for querying table data files
     // TODO: need to choose the correct implementation of the FileSystemView
-    HoodieTableFileSystemView hoodieTableFileSystemView =
-        FileSystemViewManager.createInMemoryFileSystemViewWithTimeline(
-            hoodieEngineContext, hoodieTableMetaClient, hoodieMetadataConfig, timeline);
+//    HoodieTableFileSystemView hoodieTableFileSystemView =
+//        FileSystemViewManager.createInMemoryFileSystemViewWithTimeline(
+//            hoodieEngineContext, hoodieTableMetaClient, hoodieMetadataConfig, timeline);
+    HoodieTableFileSystemView hoodieTableFileSystemView = new HoodieTableFileSystemView(
+        hoodieTableMetaClient, timeline);
     // get all instants on the timeline
     Stream<HoodieInstant> completedInstants = timeline.getInstants();
-    LOG.info("Alpha test: get completed instants: {}", completedInstants);
+    List<String> partitionPaths = FSUtils.getAllPartitionPaths(hoodieEngineContext, hoodieMetadataConfig, hoodieTableMetaClient.getBasePathV2().toString());
+    try {
+      for (String partitionPath : partitionPaths) {
+        Path fullPartitionPath = FSUtils.getPartitionPath(hoodieTableMetaClient.getBasePathV2(), partitionPath);
+        hoodieTableFileSystemView.addFilesToView(FSUtils.getAllDataFilesInPartition(hoodieTableMetaClient.getFs(), fullPartitionPath));
+      }
+    } catch (IOException e) {
+      throw new RuntimeException("Failed to get all data files in partition", e);
+    }
     // file group id -> Map<timestamp, HoodieBaseFile>
     // This pre-process aims to make a timestamp to HoodieBaseFile map for each file group
     Map<HoodieFileGroupId, Map<String, HoodieBaseFile>> allStampedDataFiles =
         hoodieTableFileSystemView
-            .getAllFileGroups()
+            .fetchAllStoredFileGroups()
             .collect(
                 ImmutableMap.toImmutableMap(
                     HoodieFileGroup::getFileGroupId,
@@ -182,9 +185,10 @@ public Result execute() {
                             .collect(
                                 ImmutableMap.toImmutableMap(
                                     HoodieBaseFile::getCommitTime, baseFile -> baseFile))));
+
     // BEGIN TEST ONLY CODE
-    List<HoodieFileGroup> testGroups =
-        hoodieTableFileSystemView.getAllFileGroups().collect(Collectors.toList());
+    List<HoodieBaseFile> testGroups =
+        hoodieTableFileSystemView.getLatestBaseFiles().collect(Collectors.toList());
     LOG.info("Alpha test: get all stamped data files: {}", allStampedDataFiles);
     LOG.info("Alpha test: get all file groups: {}", testGroups);
     // END TEST ONLY CODE
@@ -307,7 +311,7 @@ private DataFile buildDataFileFromHoodieBaseFile(
     // TODO: need to verify the path is absolute (the field's name is fullPath)
     String path = baseFile.getPath();
     long fileSize = baseFile.getFileSize();
-    String partitionPath = fileGroup.getPartitionPath();
+    String partitionPath = FSUtils.getPartitionPath(hoodieTableMetaClient.getBasePathV2(), fileGroup.getPartitionPath()).toString();
 
     MetricsConfig metricsConfig = MetricsConfig.forTable(table);
     String nameMappingString = table.properties().get(TableProperties.DEFAULT_NAME_MAPPING);

From 43e30de5b08b84d2a3a09529f836e2485a0d8db3 Mon Sep 17 00:00:00 2001
From: Rushan Jiang <rushanj@andrew.cmu.edu>
Date: Tue, 24 Jan 2023 22:50:46 -0500
Subject: [PATCH 10/20] successfully snapshot first hoodie table

---
 .../iceberg/hudi/TestSnapshotHudiTable.java   | 19 +++++++++++++++++++
 .../hudi/BaseSnapshotHudiTableAction.java     | 13 +++++++++++--
 2 files changed, 30 insertions(+), 2 deletions(-)

diff --git a/hudi/src/integration/java/org/apache/iceberg/hudi/TestSnapshotHudiTable.java b/hudi/src/integration/java/org/apache/iceberg/hudi/TestSnapshotHudiTable.java
index b28d9289a126..abd91fd87cb1 100644
--- a/hudi/src/integration/java/org/apache/iceberg/hudi/TestSnapshotHudiTable.java
+++ b/hudi/src/integration/java/org/apache/iceberg/hudi/TestSnapshotHudiTable.java
@@ -36,6 +36,7 @@
 import org.apache.spark.sql.SQLContext;
 import org.apache.spark.sql.SaveMode;
 import org.apache.spark.sql.hudi.catalog.HoodieCatalog;
+import org.assertj.core.api.Assertions;
 import org.junit.Before;
 import org.junit.Rule;
 import org.junit.Test;
@@ -292,6 +293,24 @@ public void testHudiMetaClientAlpha() {
         HudiToIcebergMigrationSparkIntegration.snapshotHudiTable(
                 spark, partitionedLocation, newTableIdentifier)
             .execute();
+
+    checkSnapshotIntegrity(partitionedIdentifier, newTableIdentifier);
+  }
+
+  private void checkSnapshotIntegrity(
+      String hudiTableIdentifier,
+      String icebergTableIdentifier) {
+
+//    List<Row> deltaTableContents =
+//        spark.sql("SELECT * FROM " + hudiTableIdentifier).collectAsList();
+    List<Row> icebergTableContents =
+        spark.sql("SELECT * FROM " + icebergTableIdentifier).collectAsList();
+    LOG.info("Iceberg table contents: {}", spark.sql("SELECT * FROM " + icebergTableIdentifier).showString(10, 20, false));
+    return;
+
+//    Assertions.assertThat(deltaTableContents).hasSize(icebergTableContents.size());
+//    Assertions.assertThat(icebergTableContents).containsAll(deltaTableContents);
+//    Assertions.assertThat(deltaTableContents).containsAll(icebergTableContents);
   }
 
   private String destName(String catalogName, String dest) {
diff --git a/hudi/src/main/java/org/apache/iceberg/hudi/BaseSnapshotHudiTableAction.java b/hudi/src/main/java/org/apache/iceberg/hudi/BaseSnapshotHudiTableAction.java
index 25be14d4db2d..06f61cf1b62c 100644
--- a/hudi/src/main/java/org/apache/iceberg/hudi/BaseSnapshotHudiTableAction.java
+++ b/hudi/src/main/java/org/apache/iceberg/hudi/BaseSnapshotHudiTableAction.java
@@ -55,6 +55,7 @@
 import org.apache.iceberg.Metrics;
 import org.apache.iceberg.MetricsConfig;
 import org.apache.iceberg.OverwriteFiles;
+import org.apache.iceberg.PartitionField;
 import org.apache.iceberg.PartitionSpec;
 import org.apache.iceberg.Schema;
 import org.apache.iceberg.Snapshot;
@@ -311,7 +312,7 @@ private DataFile buildDataFileFromHoodieBaseFile(
     // TODO: need to verify the path is absolute (the field's name is fullPath)
     String path = baseFile.getPath();
     long fileSize = baseFile.getFileSize();
-    String partitionPath = FSUtils.getPartitionPath(hoodieTableMetaClient.getBasePathV2(), fileGroup.getPartitionPath()).toString();
+    String partitionValue = fileGroup.getPartitionPath();
 
     MetricsConfig metricsConfig = MetricsConfig.forTable(table);
     String nameMappingString = table.properties().get(TableProperties.DEFAULT_NAME_MAPPING);
@@ -322,11 +323,19 @@ private DataFile buildDataFileFromHoodieBaseFile(
     FileFormat format = determineFileFormatFromPath(path);
     Metrics metrics = getMetricsForFile(file, format, metricsConfig, nameMapping);
 
+    List<PartitionField> testFields = spec.fields();
+
+    String partition =
+        spec.fields().stream()
+            .map(PartitionField::name)
+            .map(name -> String.format("%s=%s", name, partitionValue))
+            .collect(Collectors.joining("/"));
+
     return DataFiles.builder(spec)
         .withPath(path)
         .withFormat(format)
         .withFileSizeInBytes(fileSize)
-        .withPartitionPath(partitionPath) // TODO: need to verify the partition path is correct
+        .withPartitionPath(partition) // TODO: need to handle multiple partition fields
         .withMetrics(metrics)
         .build();
   }

From cb382db62718f1dd439b5a699e3702c32b73d3be Mon Sep 17 00:00:00 2001
From: Rushan Jiang <rushanj@andrew.cmu.edu>
Date: Mon, 30 Jan 2023 00:18:02 -0500
Subject: [PATCH 11/20] pass tests for all primitive types and a partitioned table

---
 .../iceberg/hudi/TestSnapshotHudiTable.java   | 211 +++++++++---------
 1 file changed, 102 insertions(+), 109 deletions(-)

diff --git a/hudi/src/integration/java/org/apache/iceberg/hudi/TestSnapshotHudiTable.java b/hudi/src/integration/java/org/apache/iceberg/hudi/TestSnapshotHudiTable.java
index abd91fd87cb1..071e6151093c 100644
--- a/hudi/src/integration/java/org/apache/iceberg/hudi/TestSnapshotHudiTable.java
+++ b/hudi/src/integration/java/org/apache/iceberg/hudi/TestSnapshotHudiTable.java
@@ -18,20 +18,34 @@
  */
 package org.apache.iceberg.hudi;
 
+import static org.apache.spark.sql.functions.current_date;
+import static org.apache.spark.sql.functions.date_add;
+import static org.apache.spark.sql.functions.expr;
+
 import java.io.File;
 import java.io.IOException;
+import java.util.Arrays;
+import java.util.Comparator;
+import java.util.HashSet;
 import java.util.List;
 import java.util.Map;
+import java.util.Set;
+import java.util.stream.Collectors;
+
+import org.apache.hudi.DataSourceReadOptions;
 import org.apache.hudi.DataSourceWriteOptions;
 import org.apache.hudi.QuickstartUtils;
 import org.apache.hudi.common.table.HoodieTableMetaClient;
 import org.apache.hudi.config.HoodieWriteConfig;
 import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap;
 import org.apache.iceberg.relocated.com.google.common.collect.Lists;
+import org.apache.iceberg.relocated.com.google.common.collect.Sets;
 import org.apache.iceberg.spark.SparkSessionCatalog;
 import org.apache.spark.api.java.JavaRDD;
 import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.sql.Column;
 import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.RelationalGroupedDataset;
 import org.apache.spark.sql.Row;
 import org.apache.spark.sql.SQLContext;
 import org.apache.spark.sql.SaveMode;
@@ -50,29 +64,6 @@
 public class TestSnapshotHudiTable extends SparkHudiMigrationTestBase {
 
   private static final Logger LOG = LoggerFactory.getLogger(TestSnapshotHudiTable.class.getName());
-  private static final String row1 =
-      "{\"name\":\"Michael\",\"addresses\":[{\"city\":\"SanJose\",\"state\":\"CA\"},{\"city\":\"Sandiago\",\"state\":\"CA\"}],"
-          + "\"address_nested\":{\"current\":{\"state\":\"NY\",\"city\":\"NewYork\"},\"previous\":{\"state\":\"NJ\",\"city\":\"Newark\"}},"
-          + "\"properties\":{\"hair\":\"brown\",\"eye\":\"black\"},\"secondProp\":{\"height\":\"6\"},\"subjects\":[[\"Java\",\"Scala\",\"C++\"],"
-          + "[\"Spark\",\"Java\"]],\"id\":1,\"magic_number\":1.123123123123}";
-  private static final String row2 =
-      "{\"name\":\"Test\",\"addresses\":[{\"city\":\"SanJos123123e\",\"state\":\"CA\"},{\"city\":\"Sand12312iago\",\"state\":\"CA\"}],"
-          + "\"address_nested\":{\"current\":{\"state\":\"N12Y\",\"city\":\"NewY1231ork\"}},\"properties\":{\"hair\":\"brown\",\"eye\":\"black\"},"
-          + "\"secondProp\":{\"height\":\"6\"},\"subjects\":[[\"Java\",\"Scala\",\"C++\"],[\"Spark\",\"Java\"]],\"id\":2,\"magic_number\":2.123123123123}";
-  private static final String row3 =
-      "{\"name\":\"Test\",\"addresses\":[{\"city\":\"SanJose\",\"state\":\"CA\"},{\"city\":\"Sandiago\",\"state\":\"CA\"}],"
-          + "\"properties\":{\"hair\":\"brown\",\"eye\":\"black\"},\"secondProp\":{\"height\":\"6\"},\"subjects\":"
-          + "[[\"Java\",\"Scala\",\"C++\"],[\"Spark\",\"Java\"]],\"id\":3,\"magic_number\":3.123123123123}";
-  private static final String row4 =
-      "{\"name\":\"John\",\"addresses\":[{\"city\":\"LA\",\"state\":\"CA\"},{\"city\":\"Sandiago\",\"state\":\"CA\"}],"
-          + "\"address_nested\":{\"current\":{\"state\":\"NY\",\"city\":\"NewYork\"},\"previous\":{\"state\":\"NJ123\"}},"
-          + "\"properties\":{\"hair\":\"b12rown\",\"eye\":\"bla3221ck\"},\"secondProp\":{\"height\":\"633\"},\"subjects\":"
-          + "[[\"Spark\",\"Java\"]],\"id\":4,\"magic_number\":4.123123123123}";
-  private static final String row5 =
-      "{\"name\":\"Jonas\",\"addresses\":[{\"city\":\"Pittsburgh\",\"state\":\"PA\"},{\"city\":\"Sandiago\",\"state\":\"CA\"}],"
-          + "\"address_nested\":{\"current\":{\"state\":\"PA\",\"city\":\"Haha\"},\"previous\":{\"state\":\"NJ\"}},"
-          + "\"properties\":{\"hair\":\"black\",\"eye\":\"black\"},\"secondProp\":{\"height\":\"7\"},\"subjects\":[[\"Java\",\"Scala\",\"C++\"],"
-          + "[\"Spark\",\"Java\"]],\"id\":5,\"magic_number\":5.123123123123}";
   private static final String SNAPSHOT_SOURCE_PROP = "snapshot_source";
   private static final String DELTA_SOURCE_VALUE = "delta";
   private static final String ORIGINAL_LOCATION_PROP = "original_location";
@@ -186,37 +177,27 @@ public void before() throws IOException {
     spark.sql(String.format("DROP TABLE IF EXISTS %s", unpartitionedIdentifier));
     spark.sql(String.format("DROP TABLE IF EXISTS %s", externalDataFilesIdentifier));
 
-    // hard code the dataframe
-    List<String> jsonList = Lists.newArrayList();
-    jsonList.add(row1);
-    jsonList.add(row2);
-    jsonList.add(row3);
-    jsonList.add(row4);
-    jsonList.add(row5);
-    JavaSparkContext javaSparkContext = JavaSparkContext.fromSparkContext(spark.sparkContext());
-    SQLContext sqlContext = new SQLContext(javaSparkContext);
-    JavaRDD<String> rdd = javaSparkContext.parallelize(jsonList);
-    Dataset<Row> df = sqlContext.read().json(rdd);
+    Dataset<Row> df = typeTestDataFrame();
 
     df.write()
         .format("hudi")
         .options(QuickstartUtils.getQuickstartWriteConfigs())
-        .option(DataSourceWriteOptions.RECORDKEY_FIELD().key(), "magic_number")
-        .option(DataSourceWriteOptions.PRECOMBINE_FIELD().key(), "name")
-        .option(DataSourceWriteOptions.PARTITIONPATH_FIELD().key(), "id")
+        .option(DataSourceWriteOptions.RECORDKEY_FIELD().key(), "decimalCol")
+        .option(DataSourceWriteOptions.PRECOMBINE_FIELD().key(), "intCol")
+        .option(DataSourceWriteOptions.PARTITIONPATH_FIELD().key(), "partitionPath")
         .option(HoodieWriteConfig.TABLE_NAME, partitionedIdentifier)
         .mode(SaveMode.Overwrite)
         .save(partitionedLocation);
 
-    df.write()
-        .format("hudi")
-        .options(QuickstartUtils.getQuickstartWriteConfigs())
-        .option(DataSourceWriteOptions.RECORDKEY_FIELD().key(), "magic_number")
-        .option(DataSourceWriteOptions.PRECOMBINE_FIELD().key(), "name")
-        .option(DataSourceWriteOptions.PARTITIONPATH_FIELD().key(), "")
-        .option(HoodieWriteConfig.TABLE_NAME, unpartitionedIdentifier)
-        .mode(SaveMode.Overwrite)
-        .save(unpartitionedLocation);
+//    df.write()
+//        .format("hudi")
+//        .options(QuickstartUtils.getQuickstartWriteConfigs())
+//        .option(DataSourceWriteOptions.RECORDKEY_FIELD().key(), "magic_number")
+//        .option(DataSourceWriteOptions.PRECOMBINE_FIELD().key(), "name")
+//        .option(DataSourceWriteOptions.PARTITIONPATH_FIELD().key(), "")
+//        .option(HoodieWriteConfig.TABLE_NAME, unpartitionedIdentifier)
+//        .mode(SaveMode.Overwrite)
+//        .save(unpartitionedLocation);
   }
 
   @Test
@@ -233,58 +214,6 @@ public void testHudiPartitionedTableWrite() {
     LOG.info("Generated partitioned dataframe: {}", df.showString(10, 20, false));
   }
 
-  @Test
-  public void testHudiMetaClientExploration() {
-    HoodieTableMetaClient hoodieTableMetaClient =
-        HoodieTableMetaClient.builder()
-            .setConf(spark.sessionState().newHadoopConf())
-            .setBasePath(partitionedLocation)
-            .setLoadActiveTimelineOnLoad(true)
-            .build();
-
-    LOG.info("Alpha test: hoodie table base path: {}", hoodieTableMetaClient.getBasePathV2());
-    LOG.info(
-        "Alpha test: hoodie getBootStrapIndexByFileId: {}",
-        hoodieTableMetaClient.getBootstrapIndexByFileIdFolderNameFolderPath());
-    LOG.info(
-        "Alpha test: hoodie getBootStrapIndexByPartitionPath: {}",
-        hoodieTableMetaClient.getBootstrapIndexByPartitionFolderPath());
-    LOG.info(
-        "Alpha test: hoodie getCommitActionType: {}", hoodieTableMetaClient.getCommitActionType());
-    LOG.info(
-        "Alpha test: hoodie getCommitsAndCompactionTimeline: {}",
-        hoodieTableMetaClient.getCommitsAndCompactionTimeline());
-    LOG.info(
-        "Alpha test: hoodie getCommitsTimeline: {}", hoodieTableMetaClient.getCommitsTimeline());
-    LOG.info("Alpha test: hoodie getCommitTimeline: {}", hoodieTableMetaClient.getCommitTimeline());
-    LOG.info(
-        "Alpha test: hoodie getConsistencyGuardConfig: {}",
-        hoodieTableMetaClient.getConsistencyGuardConfig().toString());
-    LOG.info(
-        "Alpha test: hoodie getFileSystemRetryConfig: {}",
-        hoodieTableMetaClient.getFileSystemRetryConfig().toString());
-    LOG.info(
-        "Alpha test: hoodie getHashingMetadataPath: {}",
-        hoodieTableMetaClient.getHashingMetadataPath());
-    LOG.info(
-        "Alpha test: hoodie getMetaAuxiliaryPath: {}",
-        hoodieTableMetaClient.getMetaAuxiliaryPath());
-    LOG.info("Alpha test: hoodie getMetaPath: {}", hoodieTableMetaClient.getMetaPath());
-    LOG.info(
-        "Alpha test: hoodie getMetastoreConfig: {}",
-        hoodieTableMetaClient.getMetastoreConfig().toString());
-    LOG.info(
-        "Alpha test: hoodie getSchemaFolderName: {}", hoodieTableMetaClient.getSchemaFolderName());
-    LOG.info(
-        "Alpha test: hoodie getTableConfig: {}", hoodieTableMetaClient.getTableConfig().toString());
-    LOG.info(
-        "Alpha test: hoodie getTableType: {}", hoodieTableMetaClient.getTableType().toString());
-    LOG.info("Alpha test: hoodie getTempFolderPath: {}", hoodieTableMetaClient.getTempFolderPath());
-    LOG.info(
-        "Alpha test: hoodie getTimelineLayoutVersion: {}",
-        hoodieTableMetaClient.getTimelineLayoutVersion());
-  }
-
   @Test
   public void testHudiMetaClientAlpha() {
     LOG.info("Alpha test reference: hoodie table path: {}", partitionedLocation);
@@ -294,29 +223,93 @@ public void testHudiMetaClientAlpha() {
                 spark, partitionedLocation, newTableIdentifier)
             .execute();
 
-    checkSnapshotIntegrity(partitionedIdentifier, newTableIdentifier);
+    checkSnapshotIntegrity(partitionedLocation, newTableIdentifier);
   }
 
   private void checkSnapshotIntegrity(
-      String hudiTableIdentifier,
+      String hudiTableLocation,
       String icebergTableIdentifier) {
-
-//    List<Row> deltaTableContents =
-//        spark.sql("SELECT * FROM " + hudiTableIdentifier).collectAsList();
+    Dataset<Row> hudiResult = spark.read().format("hudi").option(DataSourceReadOptions.QUERY_TYPE_OPT_KEY(), DataSourceReadOptions.QUERY_TYPE_SNAPSHOT_OPT_VAL()).load(hudiTableLocation);
+    Dataset<Row> icebergResult = spark.sql("SELECT * FROM " + icebergTableIdentifier);
+    // Need to sort the column by names since hudi tends to return the columns in a different order (put the one used for partitioning last)
+//    Dataset<Row> hudiSortedResult = hudiResult.groupBy(getColumns(hudiResult)).count();
+//    Dataset<Row> icebergSortedResult = icebergResult.groupBy(getColumns(icebergResult)).count();
+    List<Row> hudiTableContents =
+        hudiResult.collectAsList();
     List<Row> icebergTableContents =
-        spark.sql("SELECT * FROM " + icebergTableIdentifier).collectAsList();
-    LOG.info("Iceberg table contents: {}", spark.sql("SELECT * FROM " + icebergTableIdentifier).showString(10, 20, false));
-    return;
+        icebergResult.collectAsList();
+    LOG.info("Hudi table contents: {}", hudiResult.showString(10, 20, false));
+    LOG.info("Iceberg table contents: {}", icebergResult.showString(10, 20, false));
+    Assertions.assertThat(hudiTableContents).hasSize(icebergTableContents.size());
+    Assertions.assertThat(hudiTableContents).containsAll(icebergTableContents);
+    Assertions.assertThat(icebergTableContents).containsAll(hudiTableContents); // TODO: may change to containsExactlyInAnyOrderElementsOf
+  }
 
-//    Assertions.assertThat(deltaTableContents).hasSize(icebergTableContents.size());
-//    Assertions.assertThat(icebergTableContents).containsAll(deltaTableContents);
-//    Assertions.assertThat(deltaTableContents).containsAll(icebergTableContents);
+  private Column[] getColumns(Dataset<Row> df) {
+    Column[] columns = new Column[df.columns().length];
+    for (int i = 0; i < df.columns().length; i++) {
+      columns[i] = df.col(df.columns()[i]);
+    }
+    Arrays.sort(columns, Comparator.comparing(Column::toString));
+    return columns;
   }
 
+
   private String destName(String catalogName, String dest) {
     if (catalogName.equals(defaultSparkCatalog)) {
       return NAMESPACE + "." + catalogName + "_" + dest;
     }
     return catalogName + "." + NAMESPACE + "." + catalogName + "_" + dest;
   }
+
+  private Dataset<Row> typeTestDataFrame() {
+    return spark
+        .range(0, 5, 1, 5)
+        .withColumnRenamed("id", "longCol")
+        .withColumn("intCol", expr("CAST(longCol AS INT)"))
+        .withColumn("floatCol", expr("CAST(longCol AS FLOAT)"))
+        .withColumn("doubleCol", expr("CAST(longCol AS DOUBLE)"))
+        .withColumn("dateCol", date_add(current_date(), 1))
+//        .withColumn("timestampCol", expr("TO_TIMESTAMP(dateCol)"))
+        .withColumn("stringCol", expr("CAST(dateCol AS STRING)"))
+        .withColumn("booleanCol", expr("longCol > 5"))
+        .withColumn("binaryCol", expr("CAST(longCol AS BINARY)"))
+        .withColumn("byteCol", expr("CAST(longCol AS BYTE)"))
+        .withColumn("decimalCol", expr("CAST(longCol AS DECIMAL(10, 2))"))
+        .withColumn("shortCol", expr("CAST(longCol AS SHORT)"))
+        .withColumn("mapCol", expr("MAP(stringCol, shortCol)")) // Hudi requires Map key to be String
+        .withColumn("arrayCol", expr("ARRAY(longCol)"))
+        .withColumn("structCol", expr("STRUCT(mapCol, arrayCol)"))
+        .withColumn("partitionPath", expr("CAST(longCol AS STRING)"));
+  }
+
+  private Dataset<Row> nestedDataFrame() {
+    return spark
+        .range(0, 5, 1, 5)
+        .withColumn("longCol", expr("id"))
+        .withColumn("decimalCol", expr("CAST(longCol AS DECIMAL(10, 2))"))
+        .withColumn("magic_number", expr("rand(5) * 100"))
+        .withColumn("dateCol", date_add(current_date(), 1))
+        .withColumn("dateString", expr("CAST(dateCol AS STRING)"))
+        .withColumn("random1", expr("CAST(rand(5) * 100 as LONG)"))
+        .withColumn("random2", expr("CAST(rand(51) * 100 as LONG)"))
+        .withColumn("random3", expr("CAST(rand(511) * 100 as LONG)"))
+        .withColumn("random4", expr("CAST(rand(15) * 100 as LONG)"))
+        .withColumn("random5", expr("CAST(rand(115) * 100 as LONG)"))
+        .withColumn("innerStruct1", expr("STRUCT(random1, random2)"))
+        .withColumn("innerStruct2", expr("STRUCT(random3, random4)"))
+        .withColumn("structCol1", expr("STRUCT(innerStruct1, innerStruct2)"))
+        .withColumn(
+            "innerStruct3",
+            expr("STRUCT(SHA1(CAST(random5 AS BINARY)), SHA1(CAST(random1 AS BINARY)))"))
+        .withColumn(
+            "structCol2",
+            expr(
+                "STRUCT(innerStruct3, STRUCT(SHA1(CAST(random2 AS BINARY)), SHA1(CAST(random3 AS BINARY))))"))
+        .withColumn("arrayCol", expr("ARRAY(random1, random2, random3, random4, random5)"))
+        .withColumn("mapCol1", expr("MAP(structCol1, structCol2)"))
+        .withColumn("mapCol2", expr("MAP(longCol, dateString)"))
+        .withColumn("mapCol3", expr("MAP(dateCol, arrayCol)"))
+        .withColumn("structCol3", expr("STRUCT(structCol2, mapCol3, arrayCol)"));
+  }
 }

From 62ef777e9d35ae87560ce685211fa67693d3e380 Mon Sep 17 00:00:00 2001
From: Rushan Jiang <rushanj@andrew.cmu.edu>
Date: Mon, 30 Jan 2023 22:43:04 -0500
Subject: [PATCH 12/20] find bugs when an ArrayType column is present

---
 build.gradle                                  |   1 +
 .../iceberg/hudi/TestSnapshotHudiTable.java   | 214 ++++++++++++------
 .../hudi/BaseSnapshotHudiTableAction.java     |  46 ++--
 3 files changed, 174 insertions(+), 87 deletions(-)

diff --git a/build.gradle b/build.gradle
index a2c284166c49..82d27288d7d1 100644
--- a/build.gradle
+++ b/build.gradle
@@ -473,6 +473,7 @@ project(':iceberg-hudi') {
       exclude group: 'com.google.code.gson', module: 'gson'
     }
     if (sparkVersions.contains("3.3") && scalaVersion == "2.12") {
+      integrationImplementation project(':iceberg-data')
       integrationImplementation("org.apache.hudi:hudi-spark3.3-bundle_2.12:0.12.2")
       integrationImplementation project(path: ":iceberg-spark:iceberg-spark-3.3_${scalaVersion}")
       integrationImplementation("org.apache.hadoop:hadoop-minicluster") {
diff --git a/hudi/src/integration/java/org/apache/iceberg/hudi/TestSnapshotHudiTable.java b/hudi/src/integration/java/org/apache/iceberg/hudi/TestSnapshotHudiTable.java
index 071e6151093c..bf21ebeb4bbe 100644
--- a/hudi/src/integration/java/org/apache/iceberg/hudi/TestSnapshotHudiTable.java
+++ b/hudi/src/integration/java/org/apache/iceberg/hudi/TestSnapshotHudiTable.java
@@ -24,31 +24,24 @@
 
 import java.io.File;
 import java.io.IOException;
-import java.util.Arrays;
-import java.util.Comparator;
-import java.util.HashSet;
 import java.util.List;
 import java.util.Map;
-import java.util.Set;
-import java.util.stream.Collectors;
-
 import org.apache.hudi.DataSourceReadOptions;
 import org.apache.hudi.DataSourceWriteOptions;
-import org.apache.hudi.QuickstartUtils;
-import org.apache.hudi.common.table.HoodieTableMetaClient;
 import org.apache.hudi.config.HoodieWriteConfig;
+import org.apache.iceberg.Table;
+import org.apache.iceberg.TableProperties;
+import org.apache.iceberg.catalog.TableIdentifier;
+import org.apache.iceberg.data.IcebergGenerics;
+import org.apache.iceberg.data.Record;
+import org.apache.iceberg.io.CloseableIterable;
 import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap;
-import org.apache.iceberg.relocated.com.google.common.collect.Lists;
-import org.apache.iceberg.relocated.com.google.common.collect.Sets;
-import org.apache.iceberg.spark.SparkSessionCatalog;
-import org.apache.spark.api.java.JavaRDD;
-import org.apache.spark.api.java.JavaSparkContext;
-import org.apache.spark.sql.Column;
+import org.apache.iceberg.spark.Spark3Util;
+import org.apache.iceberg.spark.SparkCatalog;
 import org.apache.spark.sql.Dataset;
-import org.apache.spark.sql.RelationalGroupedDataset;
 import org.apache.spark.sql.Row;
-import org.apache.spark.sql.SQLContext;
 import org.apache.spark.sql.SaveMode;
+import org.apache.spark.sql.connector.catalog.CatalogPlugin;
 import org.apache.spark.sql.hudi.catalog.HoodieCatalog;
 import org.assertj.core.api.Assertions;
 import org.junit.Before;
@@ -80,13 +73,15 @@ public class TestSnapshotHudiTable extends SparkHudiMigrationTestBase {
   private String unpartitionedLocation;
   private String newIcebergTableLocation;
   private String externalDataFilesTableLocation;
+  private Dataset<Row> typeTestDataframe = typeTestDataFrame();
+  private Dataset<Row> nestedDataframe = nestedDataFrame();
 
   @Parameterized.Parameters(name = "Catalog Name {0} - Options {2}")
   public static Object[][] parameters() {
     return new Object[][] {
       new Object[] {
         icebergCatalogName,
-        SparkSessionCatalog.class.getName(),
+        SparkCatalog.class.getName(),
         ImmutableMap.of(
             "type",
             "hive",
@@ -177,27 +172,39 @@ public void before() throws IOException {
     spark.sql(String.format("DROP TABLE IF EXISTS %s", unpartitionedIdentifier));
     spark.sql(String.format("DROP TABLE IF EXISTS %s", externalDataFilesIdentifier));
 
-    Dataset<Row> df = typeTestDataFrame();
-
-    df.write()
-        .format("hudi")
-        .options(QuickstartUtils.getQuickstartWriteConfigs())
-        .option(DataSourceWriteOptions.RECORDKEY_FIELD().key(), "decimalCol")
-        .option(DataSourceWriteOptions.PRECOMBINE_FIELD().key(), "intCol")
-        .option(DataSourceWriteOptions.PARTITIONPATH_FIELD().key(), "partitionPath")
-        .option(HoodieWriteConfig.TABLE_NAME, partitionedIdentifier)
-        .mode(SaveMode.Overwrite)
-        .save(partitionedLocation);
+    //    typeTestDataframe.write()
+    //        .format("hudi")
+    //        .options(QuickstartUtils.getQuickstartWriteConfigs())
+    //        .option(DataSourceWriteOptions.RECORDKEY_FIELD().key(), "decimalCol")
+    //        .option(DataSourceWriteOptions.PRECOMBINE_FIELD().key(), "intCol")
+    //        .option(DataSourceWriteOptions.PARTITIONPATH_FIELD().key(), "partitionPath")
+    //        .option(HoodieWriteConfig.TABLE_NAME, partitionedIdentifier)
+    //        .mode(SaveMode.Overwrite)
+    //        .save(partitionedLocation);
+    writeHoodieTable(
+        typeTestDataframe,
+        "decimalCol",
+        "intCol",
+        "partitionPath",
+        partitionedLocation,
+        partitionedIdentifier);
 
-//    df.write()
-//        .format("hudi")
-//        .options(QuickstartUtils.getQuickstartWriteConfigs())
-//        .option(DataSourceWriteOptions.RECORDKEY_FIELD().key(), "magic_number")
-//        .option(DataSourceWriteOptions.PRECOMBINE_FIELD().key(), "name")
-//        .option(DataSourceWriteOptions.PARTITIONPATH_FIELD().key(), "")
-//        .option(HoodieWriteConfig.TABLE_NAME, unpartitionedIdentifier)
-//        .mode(SaveMode.Overwrite)
-//        .save(unpartitionedLocation);
+    //    df.write()
+    //        .format("hudi")
+    //        .options(QuickstartUtils.getQuickstartWriteConfigs())
+    //        .option(DataSourceWriteOptions.RECORDKEY_FIELD().key(), "magic_number")
+    //        .option(DataSourceWriteOptions.PRECOMBINE_FIELD().key(), "name")
+    //        .option(DataSourceWriteOptions.PARTITIONPATH_FIELD().key(), "")
+    //        .option(HoodieWriteConfig.TABLE_NAME, unpartitionedIdentifier)
+    //        .mode(SaveMode.Overwrite)
+    //        .save(unpartitionedLocation);
+    writeHoodieTable(
+        typeTestDataframe,
+        "decimalCol",
+        "intCol",
+        "",
+        unpartitionedLocation,
+        unpartitionedIdentifier);
   }
 
   @Test
@@ -215,45 +222,95 @@ public void testHudiPartitionedTableWrite() {
   }
 
   @Test
-  public void testHudiMetaClientAlpha() {
+  public void testBasicPartitionedTable() {
     LOG.info("Alpha test reference: hoodie table path: {}", partitionedLocation);
     String newTableIdentifier = destName(icebergCatalogName, "alpha_iceberg_table");
     SnapshotHudiTable.Result result =
         HudiToIcebergMigrationSparkIntegration.snapshotHudiTable(
                 spark, partitionedLocation, newTableIdentifier)
             .execute();
+    Table table = getIcebergTable(newTableIdentifier);
+    queryManual(table);
+    // checkSnapshotIntegrity(partitionedLocation, newTableIdentifier);
+  }
 
-    checkSnapshotIntegrity(partitionedLocation, newTableIdentifier);
+  @Test
+  public void referenceIcebergTable() {
+    String newTableIdentifier = destName(icebergCatalogName, "reference_iceberg_table");
+    typeTestDataframe
+        .writeTo(newTableIdentifier)
+        .using("iceberg")
+        .tableProperty(
+            TableProperties.WRITE_DATA_LOCATION,
+            "/Users/jonasjiang/Workspace/Apache_Hudi_ws/hudi_table_test/unpartitioned_iceberg_ref")
+        .tableProperty(
+            TableProperties.WRITE_METADATA_LOCATION,
+            "/Users/jonasjiang/Workspace/Apache_Hudi_ws/hudi_table_test/unpartitioned_iceberg_ref/metadata")
+        .createOrReplace();
+    Table table = getIcebergTable(newTableIdentifier);
+    queryManual(table);
   }
 
-  private void checkSnapshotIntegrity(
-      String hudiTableLocation,
-      String icebergTableIdentifier) {
-    Dataset<Row> hudiResult = spark.read().format("hudi").option(DataSourceReadOptions.QUERY_TYPE_OPT_KEY(), DataSourceReadOptions.QUERY_TYPE_SNAPSHOT_OPT_VAL()).load(hudiTableLocation);
+  @Test
+  public void testBasicUnpartitionedTable() {
+    String newTableIdentifier = destName(icebergCatalogName, "alpha_iceberg_table_2");
+    SnapshotHudiTable.Result result =
+        HudiToIcebergMigrationSparkIntegration.snapshotHudiTable(
+                spark, unpartitionedLocation, newTableIdentifier)
+            .execute();
+
+    Dataset<Row> hudiResult =
+        spark
+            .read()
+            .format("hudi")
+            .option(
+                DataSourceReadOptions.QUERY_TYPE_OPT_KEY(),
+                DataSourceReadOptions.QUERY_TYPE_SNAPSHOT_OPT_VAL())
+            .load(unpartitionedLocation);
+    LOG.info("Hudi table contents: {}", hudiResult.showString(10, 20, false));
+    Table table = getIcebergTable(newTableIdentifier);
+    queryManual(table);
+    checkSnapshotIntegrity(unpartitionedLocation, newTableIdentifier);
+  }
+
+  private void checkSnapshotIntegrity(String hudiTableLocation, String icebergTableIdentifier) {
+    Dataset<Row> hudiResult =
+        spark
+            .read()
+            .format("hudi")
+            .option(
+                DataSourceReadOptions.QUERY_TYPE_OPT_KEY(),
+                DataSourceReadOptions.QUERY_TYPE_SNAPSHOT_OPT_VAL())
+            .load(hudiTableLocation);
     Dataset<Row> icebergResult = spark.sql("SELECT * FROM " + icebergTableIdentifier);
-    // Need to sort the column by names since hudi tends to return the columns in a different order (put the one used for partitioning last)
-//    Dataset<Row> hudiSortedResult = hudiResult.groupBy(getColumns(hudiResult)).count();
-//    Dataset<Row> icebergSortedResult = icebergResult.groupBy(getColumns(icebergResult)).count();
-    List<Row> hudiTableContents =
-        hudiResult.collectAsList();
-    List<Row> icebergTableContents =
-        icebergResult.collectAsList();
     LOG.info("Hudi table contents: {}", hudiResult.showString(10, 20, false));
     LOG.info("Iceberg table contents: {}", icebergResult.showString(10, 20, false));
+    // TODO: adjust test technique since hudi tends to return the columns in a different order (put
+    // the one used for partitioning last)
+    List<Row> hudiTableContents = hudiResult.collectAsList();
+    List<Row> icebergTableContents = icebergResult.collectAsList();
+
     Assertions.assertThat(hudiTableContents).hasSize(icebergTableContents.size());
     Assertions.assertThat(hudiTableContents).containsAll(icebergTableContents);
-    Assertions.assertThat(icebergTableContents).containsAll(hudiTableContents); // TODO: may change to containsExactlyInAnyOrderElementsOf
+    Assertions.assertThat(icebergTableContents)
+        .containsAll(hudiTableContents); // TODO: may change to containsExactlyInAnyOrderElementsOf
   }
 
-  private Column[] getColumns(Dataset<Row> df) {
-    Column[] columns = new Column[df.columns().length];
-    for (int i = 0; i < df.columns().length; i++) {
-      columns[i] = df.col(df.columns()[i]);
+  private void queryManual(Table table) {
+    CloseableIterable<Record> records = IcebergGenerics.read(table).build();
+    for (Record record : records) {
+      LOG.info("Alpha Test Iceberg Record: {}", record);
     }
-    Arrays.sort(columns, Comparator.comparing(Column::toString));
-    return columns;
   }
 
+  private Table getIcebergTable(String icebergTableIdentifier) {
+    CatalogPlugin defaultCatalog = spark.sessionState().catalogManager().currentCatalog();
+    Spark3Util.CatalogAndIdentifier catalogAndIdent =
+        Spark3Util.catalogAndIdentifier(
+            "test catalog", spark, icebergTableIdentifier, defaultCatalog);
+    return Spark3Util.loadIcebergCatalog(spark, catalogAndIdent.catalog().name())
+        .loadTable(TableIdentifier.parse(catalogAndIdent.identifier().toString()));
+  }
 
   private String destName(String catalogName, String dest) {
     if (catalogName.equals(defaultSparkCatalog)) {
@@ -267,20 +324,23 @@ private Dataset<Row> typeTestDataFrame() {
         .range(0, 5, 1, 5)
         .withColumnRenamed("id", "longCol")
         .withColumn("intCol", expr("CAST(longCol AS INT)"))
-        .withColumn("floatCol", expr("CAST(longCol AS FLOAT)"))
-        .withColumn("doubleCol", expr("CAST(longCol AS DOUBLE)"))
+        //        .withColumn("floatCol", expr("CAST(longCol AS FLOAT)"))
+        //        .withColumn("doubleCol", expr("CAST(longCol AS DOUBLE)"))
         .withColumn("dateCol", date_add(current_date(), 1))
-//        .withColumn("timestampCol", expr("TO_TIMESTAMP(dateCol)"))
+        //        .withColumn("timestampCol", expr("TO_TIMESTAMP(dateCol)"))
         .withColumn("stringCol", expr("CAST(dateCol AS STRING)"))
-        .withColumn("booleanCol", expr("longCol > 5"))
-        .withColumn("binaryCol", expr("CAST(longCol AS BINARY)"))
-        .withColumn("byteCol", expr("CAST(longCol AS BYTE)"))
+        //        .withColumn("booleanCol", expr("longCol > 5"))
+        //        .withColumn("binaryCol", expr("CAST(longCol AS BINARY)"))
+        //        .withColumn("byteCol", expr("CAST(longCol AS BYTE)"))
         .withColumn("decimalCol", expr("CAST(longCol AS DECIMAL(10, 2))"))
-        .withColumn("shortCol", expr("CAST(longCol AS SHORT)"))
-        .withColumn("mapCol", expr("MAP(stringCol, shortCol)")) // Hudi requires Map key to be String
-        .withColumn("arrayCol", expr("ARRAY(longCol)"))
-        .withColumn("structCol", expr("STRUCT(mapCol, arrayCol)"))
-        .withColumn("partitionPath", expr("CAST(longCol AS STRING)"));
+        //        .withColumn("shortCol", expr("CAST(longCol AS SHORT)"))
+        .withColumn("mapCol", expr("MAP(stringCol, intCol)")) // Hudi requires Map key to be String
+        .withColumn("arrayCol", expr("ARRAY(dateCol)"))
+        //        .withColumn("structCol", expr("STRUCT(longCol AS a, longCol AS b)"))
+        .withColumn(
+            "partitionPath",
+            expr("CAST(longCol AS STRING)")); // For test convenience, please put the partition col
+    // in the end.
   }
 
   private Dataset<Row> nestedDataFrame() {
@@ -312,4 +372,22 @@ private Dataset<Row> nestedDataFrame() {
         .withColumn("mapCol3", expr("MAP(dateCol, arrayCol)"))
         .withColumn("structCol3", expr("STRUCT(structCol2, mapCol3, arrayCol)"));
   }
+
+  private void writeHoodieTable(
+      Dataset<Row> df,
+      String recordKey,
+      String preCombineKey,
+      String partitionPathField,
+      String tableLocation,
+      String tableIdentifier) {
+    df.write()
+        .format("hudi")
+        //        .options(QuickstartUtils.getQuickstartWriteConfigs())
+        .option(DataSourceWriteOptions.RECORDKEY_FIELD().key(), recordKey)
+        .option(DataSourceWriteOptions.PRECOMBINE_FIELD().key(), preCombineKey)
+        .option(DataSourceWriteOptions.PARTITIONPATH_FIELD().key(), partitionPathField)
+        .option(HoodieWriteConfig.TBL_NAME.key(), tableIdentifier)
+        .mode(SaveMode.Append)
+        .save(tableLocation);
+  }
 }
diff --git a/hudi/src/main/java/org/apache/iceberg/hudi/BaseSnapshotHudiTableAction.java b/hudi/src/main/java/org/apache/iceberg/hudi/BaseSnapshotHudiTableAction.java
index 06f61cf1b62c..69f631180a5e 100644
--- a/hudi/src/main/java/org/apache/iceberg/hudi/BaseSnapshotHudiTableAction.java
+++ b/hudi/src/main/java/org/apache/iceberg/hudi/BaseSnapshotHudiTableAction.java
@@ -25,6 +25,7 @@
 import java.util.stream.Stream;
 import javax.annotation.Nullable;
 import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
 import org.apache.hudi.common.config.HoodieMetadataConfig;
 import org.apache.hudi.common.engine.HoodieEngineContext;
 import org.apache.hudi.common.engine.HoodieLocalEngineContext;
@@ -37,16 +38,12 @@
 import org.apache.hudi.common.table.TableSchemaResolver;
 import org.apache.hudi.common.table.timeline.HoodieInstant;
 import org.apache.hudi.common.table.timeline.HoodieTimeline;
-import org.apache.hudi.common.table.view.FileSystemViewManager;
 import org.apache.hudi.common.table.view.HoodieTableFileSystemView;
 import org.apache.hudi.common.util.Option;
 import org.apache.hudi.exception.HoodieException;
 import org.apache.hudi.hadoop.utils.HoodieInputFormatUtils;
 import org.apache.hudi.internal.schema.InternalSchema;
 import org.apache.hudi.internal.schema.convert.AvroInternalSchemaConverter;
-import org.apache.hudi.metadata.HoodieTableMetadata;
-
-import org.apache.hadoop.fs.Path;
 import org.apache.iceberg.AppendFiles;
 import org.apache.iceberg.DataFile;
 import org.apache.iceberg.DataFiles;
@@ -118,12 +115,14 @@ public BaseSnapshotHudiTableAction(
 
   @Override
   public SnapshotHudiTable tableProperties(Map<String, String> properties) {
-    return null;
+    additionalPropertiesBuilder.putAll(properties);
+    return this;
   }
 
   @Override
   public SnapshotHudiTable tableProperty(String key, String value) {
-    return null;
+    additionalPropertiesBuilder.put(key, value);
+    return this;
   }
 
   @Override
@@ -137,17 +136,20 @@ public Result execute() {
     // TODO: add support for newTableLocation
     Transaction icebergTransaction =
         icebergCatalog.newCreateTableTransaction(
-            newTableIdentifier, icebergSchema, partitionSpec, destTableProperties());
+            newTableIdentifier,
+            icebergSchema,
+            partitionSpec,
+            hoodieTableBasePath,
+            destTableProperties());
     // We need name mapping to ensure we can read data files correctly as iceberg table has its own
     // rule to assign field id
     // Although the field id rule seems to be the same as hudi, but the rule is not guaranteed by
     // any API
+    NameMapping nameMapping = MappingUtil.create(icebergTransaction.table().schema());
     icebergTransaction
         .table()
         .updateProperties()
-        .set(
-            TableProperties.DEFAULT_NAME_MAPPING,
-            NameMappingParser.toJson(MappingUtil.create(icebergTransaction.table().schema())))
+        .set(TableProperties.DEFAULT_NAME_MAPPING, NameMappingParser.toJson(nameMapping))
         .commit();
 
     // Pre-process the timeline, we only need to process all COMPLETED commit for COW table
@@ -156,18 +158,24 @@ public Result execute() {
         hoodieTableMetaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants();
     // Initialize the FileSystemView for querying table data files
     // TODO: need to choose the correct implementation of the FileSystemView
-//    HoodieTableFileSystemView hoodieTableFileSystemView =
-//        FileSystemViewManager.createInMemoryFileSystemViewWithTimeline(
-//            hoodieEngineContext, hoodieTableMetaClient, hoodieMetadataConfig, timeline);
-    HoodieTableFileSystemView hoodieTableFileSystemView = new HoodieTableFileSystemView(
-        hoodieTableMetaClient, timeline);
+    //    HoodieTableFileSystemView hoodieTableFileSystemView =
+    //        FileSystemViewManager.createInMemoryFileSystemViewWithTimeline(
+    //            hoodieEngineContext, hoodieTableMetaClient, hoodieMetadataConfig, timeline);
+    HoodieTableFileSystemView hoodieTableFileSystemView =
+        new HoodieTableFileSystemView(hoodieTableMetaClient, timeline);
     // get all instants on the timeline
     Stream<HoodieInstant> completedInstants = timeline.getInstants();
-    List<String> partitionPaths = FSUtils.getAllPartitionPaths(hoodieEngineContext, hoodieMetadataConfig, hoodieTableMetaClient.getBasePathV2().toString());
+    List<String> partitionPaths =
+        FSUtils.getAllPartitionPaths(
+            hoodieEngineContext,
+            hoodieMetadataConfig,
+            hoodieTableMetaClient.getBasePathV2().toString());
     try {
       for (String partitionPath : partitionPaths) {
-        Path fullPartitionPath = FSUtils.getPartitionPath(hoodieTableMetaClient.getBasePathV2(), partitionPath);
-        hoodieTableFileSystemView.addFilesToView(FSUtils.getAllDataFilesInPartition(hoodieTableMetaClient.getFs(), fullPartitionPath));
+        Path fullPartitionPath =
+            FSUtils.getPartitionPath(hoodieTableMetaClient.getBasePathV2(), partitionPath);
+        hoodieTableFileSystemView.addFilesToView(
+            FSUtils.getAllDataFilesInPartition(hoodieTableMetaClient.getFs(), fullPartitionPath));
       }
     } catch (IOException e) {
       throw new RuntimeException("Failed to get all data files in partition", e);
@@ -380,7 +388,7 @@ private Schema convertToIcebergSchema(InternalSchema hudiSchema) {
 
   private PartitionSpec getPartitionSpecFromHoodieMetadataData(Schema schema) {
     Option<String[]> partitionNames = hoodieTableConfig.getPartitionFields();
-    if (partitionNames.isPresent()) {
+    if (partitionNames.isPresent() && partitionNames.get().length > 0) {
       PartitionSpec.Builder builder = PartitionSpec.builderFor(schema);
       for (String partitionName : partitionNames.get()) {
         builder.identity(partitionName);

From 8fef8c9b9f3962792195e09e143372cbd75a67c1 Mon Sep 17 00:00:00 2001
From: Rushan Jiang <rushanj@andrew.cmu.edu>
Date: Tue, 31 Jan 2023 17:40:19 -0500
Subject: [PATCH 13/20] remove the need for hadoop-mr

---
 .../apache/iceberg/hudi/TestSnapshotHudiTable.java |  2 +-
 .../iceberg/hudi/BaseSnapshotHudiTableAction.java  | 14 ++++++++++----
 2 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/hudi/src/integration/java/org/apache/iceberg/hudi/TestSnapshotHudiTable.java b/hudi/src/integration/java/org/apache/iceberg/hudi/TestSnapshotHudiTable.java
index bf21ebeb4bbe..57a58cb383ab 100644
--- a/hudi/src/integration/java/org/apache/iceberg/hudi/TestSnapshotHudiTable.java
+++ b/hudi/src/integration/java/org/apache/iceberg/hudi/TestSnapshotHudiTable.java
@@ -231,7 +231,7 @@ public void testBasicPartitionedTable() {
             .execute();
     Table table = getIcebergTable(newTableIdentifier);
     queryManual(table);
-    // checkSnapshotIntegrity(partitionedLocation, newTableIdentifier);
+    checkSnapshotIntegrity(partitionedLocation, newTableIdentifier);
   }
 
   @Test
diff --git a/hudi/src/main/java/org/apache/iceberg/hudi/BaseSnapshotHudiTableAction.java b/hudi/src/main/java/org/apache/iceberg/hudi/BaseSnapshotHudiTableAction.java
index 69f631180a5e..2191c6a88f56 100644
--- a/hudi/src/main/java/org/apache/iceberg/hudi/BaseSnapshotHudiTableAction.java
+++ b/hudi/src/main/java/org/apache/iceberg/hudi/BaseSnapshotHudiTableAction.java
@@ -41,7 +41,6 @@
 import org.apache.hudi.common.table.view.HoodieTableFileSystemView;
 import org.apache.hudi.common.util.Option;
 import org.apache.hudi.exception.HoodieException;
-import org.apache.hudi.hadoop.utils.HoodieInputFormatUtils;
 import org.apache.hudi.internal.schema.InternalSchema;
 import org.apache.hudi.internal.schema.convert.AvroInternalSchemaConverter;
 import org.apache.iceberg.AppendFiles;
@@ -107,7 +106,7 @@ public BaseSnapshotHudiTableAction(
     this.hoodieTableConfig = hoodieTableMetaClient.getTableConfig();
     this.hoodieEngineContext = new HoodieLocalEngineContext(hoodieConfiguration);
     this.hoodieTableBasePath = hoodieTableBasePath;
-    this.hoodieMetadataConfig = HoodieInputFormatUtils.buildMetadataConfig(hoodieConfiguration);
+    this.hoodieMetadataConfig = buildMetadataConfig(hoodieConfiguration);
     this.hoodieFileIO = new HadoopFileIO(hoodieConfiguration);
     this.icebergCatalog = icebergCatalog;
     this.newTableIdentifier = newTableIdentifier;
@@ -198,8 +197,6 @@ public Result execute() {
     // BEGIN TEST ONLY CODE
     List<HoodieBaseFile> testGroups =
         hoodieTableFileSystemView.getLatestBaseFiles().collect(Collectors.toList());
-    LOG.info("Alpha test: get all stamped data files: {}", allStampedDataFiles);
-    LOG.info("Alpha test: get all file groups: {}", testGroups);
     // END TEST ONLY CODE
 
     // Help tracked if a previous version of the data file has been added to the iceberg table
@@ -448,4 +445,13 @@ private Metrics getMetricsForFile(
         throw new ValidationException("Cannot get metrics from file format: %s", format);
     }
   }
+
+  private HoodieMetadataConfig buildMetadataConfig(Configuration conf) {
+    return HoodieMetadataConfig.newBuilder()
+        .enable(
+            conf.getBoolean(
+                HoodieMetadataConfig.ENABLE.key(),
+                HoodieMetadataConfig.DEFAULT_METADATA_ENABLE_FOR_READERS))
+        .build();
+  }
 }

From 9db1ead9b263d2b5fe9475f6edba5db6c1b7a4cc Mon Sep 17 00:00:00 2001
From: Rushan Jiang <rushanj@andrew.cmu.edu>
Date: Tue, 31 Jan 2023 22:40:54 -0500
Subject: [PATCH 14/20] verify multiple commits

---
 .../iceberg/hudi/TestSnapshotHudiTable.java   | 274 +++++++++---------
 .../hudi/BaseSnapshotHudiTableAction.java     |   2 +
 2 files changed, 141 insertions(+), 135 deletions(-)

diff --git a/hudi/src/integration/java/org/apache/iceberg/hudi/TestSnapshotHudiTable.java b/hudi/src/integration/java/org/apache/iceberg/hudi/TestSnapshotHudiTable.java
index 57a58cb383ab..5a466f6fcee3 100644
--- a/hudi/src/integration/java/org/apache/iceberg/hudi/TestSnapshotHudiTable.java
+++ b/hudi/src/integration/java/org/apache/iceberg/hudi/TestSnapshotHudiTable.java
@@ -30,7 +30,6 @@
 import org.apache.hudi.DataSourceWriteOptions;
 import org.apache.hudi.config.HoodieWriteConfig;
 import org.apache.iceberg.Table;
-import org.apache.iceberg.TableProperties;
 import org.apache.iceberg.catalog.TableIdentifier;
 import org.apache.iceberg.data.IcebergGenerics;
 import org.apache.iceberg.data.Record;
@@ -65,14 +64,14 @@ public class TestSnapshotHudiTable extends SparkHudiMigrationTestBase {
   private static final String icebergCatalogName = "iceberg_hive";
   private String partitionedIdentifier;
   private String unpartitionedIdentifier;
-  private String externalDataFilesIdentifier;
+  private String multiCommitIdentifier;
   private final String partitionedTableName = "partitioned_table";
   private final String unpartitionedTableName = "unpartitioned_table";
-  private final String externalDataFilesTableName = "external_data_files_table";
+  private final String multiCommitTableName = "multi_commit_table";
   private String partitionedLocation;
   private String unpartitionedLocation;
   private String newIcebergTableLocation;
-  private String externalDataFilesTableLocation;
+  private String multiCommitTableLocation;
   private Dataset<Row> typeTestDataframe = typeTestDataFrame();
   private Dataset<Row> nestedDataframe = nestedDataFrame();
 
@@ -107,122 +106,38 @@ public TestSnapshotHudiTable(
     spark.conf().set("spark.sql.catalog." + defaultSparkCatalog, HoodieCatalog.class.getName());
   }
 
-  /**
-   * The test hardcode a nested dataframe to test the snapshot feature. The schema of created
-   * dataframe is:
-   *
-   * <pre>
-   *  root
-   *  |-- address_nested: struct (nullable = true)
-   *  |    |-- current: struct (nullable = true)
-   *  |    |    |-- city: string (nullable = true)
-   *  |    |    |-- state: string (nullable = true)
-   *  |    |-- previous: struct (nullable = true)
-   *  |    |    |-- city: string (nullable = true)
-   *  |    |    |-- state: string (nullable = true)
-   *  |-- addresses: array (nullable = true)
-   *  |    |-- element: struct (containsNull = true)
-   *  |    |    |-- city: string (nullable = true)
-   *  |    |    |-- state: string (nullable = true)
-   *  |-- id: long (nullable = true)
-   *  |-- magic_number: double (nullable = true)
-   *  |-- name: string (nullable = true)
-   *  |-- properties: struct (nullable = true)
-   *  |    |-- eye: string (nullable = true)
-   *  |    |-- hair: string (nullable = true)
-   *  |-- secondProp: struct (nullable = true)
-   *  |    |-- height: string (nullable = true)
-   *  |-- subjects: array (nullable = true)
-   *  |    |-- element: array (containsNull = true)
-   *  |    |    |-- element: string (containsNull = true)
-   * </pre>
-   *
-   * The dataframe content is (by calling df.show()):
-   *
-   * <pre>
-   * +--------------------+--------------------+---+--------------+-------+--------------------+----------+--------------------+
-   * |      address_nested|           addresses| id|  magic_number|   name|          properties|secondProp|            subjects|
-   * +--------------------+--------------------+---+--------------+-------+--------------------+----------+--------------------+
-   * |{{NewYork, NY}, {...|[{SanJose, CA}, {...|  1|1.123123123123|Michael|      {black, brown}|       {6}|[[Java, Scala, C+...|
-   * |{{NewY1231ork, N1...|[{SanJos123123e, ...|  2|2.123123123123|   Test|      {black, brown}|       {6}|[[Java, Scala, C+...|
-   * |                null|[{SanJose, CA}, {...|  3|3.123123123123|   Test|      {black, brown}|       {6}|[[Java, Scala, C+...|
-   * |{{NewYork, NY}, {...|[{LA, CA}, {Sandi...|  4|4.123123123123|   John|{bla3221ck, b12rown}|     {633}|     [[Spark, Java]]|
-   * |{{Haha, PA}, {nul...|[{Pittsburgh, PA}...|  5|5.123123123123|  Jonas|      {black, black}|       {7}|[[Java, Scala, C+...|
-   * +--------------------+--------------------+---+--------------+-------+--------------------+----------+--------------------+
-   * </pre>
-   */
   @Before
   public void before() throws IOException {
     File partitionedFolder = temp1.newFolder();
     File unpartitionedFolder = temp2.newFolder();
     File newIcebergTableFolder = temp3.newFolder();
-    File externalDataFilesTableFolder = temp4.newFolder();
+    File multiCommitTableFolder = temp4.newFolder();
     partitionedLocation = partitionedFolder.toURI().toString();
     unpartitionedLocation = unpartitionedFolder.toURI().toString();
     newIcebergTableLocation = newIcebergTableFolder.toURI().toString();
-    externalDataFilesTableLocation = externalDataFilesTableFolder.toURI().toString();
+    multiCommitTableLocation = multiCommitTableFolder.toURI().toString();
 
     spark.sql(String.format("CREATE DATABASE IF NOT EXISTS %s", NAMESPACE));
 
     partitionedIdentifier = destName(defaultSparkCatalog, partitionedTableName);
     unpartitionedIdentifier = destName(defaultSparkCatalog, unpartitionedTableName);
-    externalDataFilesIdentifier = destName(defaultSparkCatalog, externalDataFilesTableName);
+    multiCommitIdentifier = destName(defaultSparkCatalog, multiCommitTableName);
 
     spark.sql(String.format("DROP TABLE IF EXISTS %s", partitionedIdentifier));
     spark.sql(String.format("DROP TABLE IF EXISTS %s", unpartitionedIdentifier));
-    spark.sql(String.format("DROP TABLE IF EXISTS %s", externalDataFilesIdentifier));
+    spark.sql(String.format("DROP TABLE IF EXISTS %s", multiCommitIdentifier));
+  }
 
-    //    typeTestDataframe.write()
-    //        .format("hudi")
-    //        .options(QuickstartUtils.getQuickstartWriteConfigs())
-    //        .option(DataSourceWriteOptions.RECORDKEY_FIELD().key(), "decimalCol")
-    //        .option(DataSourceWriteOptions.PRECOMBINE_FIELD().key(), "intCol")
-    //        .option(DataSourceWriteOptions.PARTITIONPATH_FIELD().key(), "partitionPath")
-    //        .option(HoodieWriteConfig.TABLE_NAME, partitionedIdentifier)
-    //        .mode(SaveMode.Overwrite)
-    //        .save(partitionedLocation);
+  @Test
+  public void testBasicPartitionedTable() {
     writeHoodieTable(
         typeTestDataframe,
         "decimalCol",
         "intCol",
         "partitionPath",
+        SaveMode.Overwrite,
         partitionedLocation,
         partitionedIdentifier);
-
-    //    df.write()
-    //        .format("hudi")
-    //        .options(QuickstartUtils.getQuickstartWriteConfigs())
-    //        .option(DataSourceWriteOptions.RECORDKEY_FIELD().key(), "magic_number")
-    //        .option(DataSourceWriteOptions.PRECOMBINE_FIELD().key(), "name")
-    //        .option(DataSourceWriteOptions.PARTITIONPATH_FIELD().key(), "")
-    //        .option(HoodieWriteConfig.TABLE_NAME, unpartitionedIdentifier)
-    //        .mode(SaveMode.Overwrite)
-    //        .save(unpartitionedLocation);
-    writeHoodieTable(
-        typeTestDataframe,
-        "decimalCol",
-        "intCol",
-        "",
-        unpartitionedLocation,
-        unpartitionedIdentifier);
-  }
-
-  @Test
-  public void testHudiUnpartitionedTableWrite() {
-    Dataset<Row> df = spark.read().format("hudi").load(unpartitionedLocation);
-    LOG.info("Generated unpartitioned dataframe shcema: {}", df.schema().treeString());
-    LOG.info("Generated unpartitioned dataframe: {}", df.showString(10, 20, false));
-  }
-
-  @Test
-  public void testHudiPartitionedTableWrite() {
-    Dataset<Row> df = spark.read().format("hudi").load(partitionedLocation);
-    LOG.info("Generated partitioned dataframe shcema: {}", df.schema().treeString());
-    LOG.info("Generated partitioned dataframe: {}", df.showString(10, 20, false));
-  }
-
-  @Test
-  public void testBasicPartitionedTable() {
     LOG.info("Alpha test reference: hoodie table path: {}", partitionedLocation);
     String newTableIdentifier = destName(icebergCatalogName, "alpha_iceberg_table");
     SnapshotHudiTable.Result result =
@@ -234,52 +149,93 @@ public void testBasicPartitionedTable() {
     checkSnapshotIntegrity(partitionedLocation, newTableIdentifier);
   }
 
-  @Test
-  public void referenceIcebergTable() {
-    String newTableIdentifier = destName(icebergCatalogName, "reference_iceberg_table");
-    typeTestDataframe
-        .writeTo(newTableIdentifier)
-        .using("iceberg")
-        .tableProperty(
-            TableProperties.WRITE_DATA_LOCATION,
-            "/Users/jonasjiang/Workspace/Apache_Hudi_ws/hudi_table_test/unpartitioned_iceberg_ref")
-        .tableProperty(
-            TableProperties.WRITE_METADATA_LOCATION,
-            "/Users/jonasjiang/Workspace/Apache_Hudi_ws/hudi_table_test/unpartitioned_iceberg_ref/metadata")
-        .createOrReplace();
-    Table table = getIcebergTable(newTableIdentifier);
-    queryManual(table);
-  }
-
   @Test
   public void testBasicUnpartitionedTable() {
+    writeHoodieTable(
+        typeTestDataframe,
+        "decimalCol",
+        "intCol",
+        "",
+        SaveMode.Overwrite,
+        unpartitionedLocation,
+        unpartitionedIdentifier);
     String newTableIdentifier = destName(icebergCatalogName, "alpha_iceberg_table_2");
     SnapshotHudiTable.Result result =
         HudiToIcebergMigrationSparkIntegration.snapshotHudiTable(
                 spark, unpartitionedLocation, newTableIdentifier)
             .execute();
-
-    Dataset<Row> hudiResult =
-        spark
-            .read()
-            .format("hudi")
-            .option(
-                DataSourceReadOptions.QUERY_TYPE_OPT_KEY(),
-                DataSourceReadOptions.QUERY_TYPE_SNAPSHOT_OPT_VAL())
-            .load(unpartitionedLocation);
-    LOG.info("Hudi table contents: {}", hudiResult.showString(10, 20, false));
-    Table table = getIcebergTable(newTableIdentifier);
-    queryManual(table);
     checkSnapshotIntegrity(unpartitionedLocation, newTableIdentifier);
   }
 
+  @Test
+  public void testMultiCommitTable() {
+    Dataset<Row> initialDataFrame = multiDataFrame(0, 2);
+    writeHoodieTable(
+        initialDataFrame,
+        "decimalCol",
+        "magic_number",
+        "partitionPath",
+        SaveMode.Append,
+        multiCommitTableLocation,
+        multiCommitIdentifier);
+    writeHoodieTable(
+        initialDataFrame,
+        "decimalCol",
+        "magic_number",
+        "partitionPath",
+        SaveMode.Append,
+        multiCommitTableLocation,
+        multiCommitIdentifier);
+    writeHoodieTable(
+        multiDataFrame(2, 5),
+        "decimalCol",
+        "magic_number",
+        "partitionPath",
+        SaveMode.Append,
+        multiCommitTableLocation,
+        multiCommitIdentifier);
+    writeHoodieTable(
+        multiDataFrame(0, 1),
+        "decimalCol",
+        "magic_number",
+        "partitionPath",
+        SaveMode.Append,
+        multiCommitTableLocation,
+        multiCommitIdentifier);
+    Dataset<Row> toDelete = multiDataFrame(4, 5);
+    writeHoodieTable(
+        toDelete,
+        "decimalCol",
+        "magic_number",
+        "partitionPath",
+        SaveMode.Append,
+        multiCommitTableLocation,
+        multiCommitIdentifier);
+    writeHoodieTableOperation(
+        toDelete,
+        DataSourceWriteOptions.DELETE_OPERATION_OPT_VAL(),
+        "decimalCol",
+        "magic_number",
+        "partitionPath",
+        SaveMode.Append,
+        multiCommitTableLocation,
+        multiCommitIdentifier);
+
+    String newTableIdentifier = destName(icebergCatalogName, "alpha_iceberg_table_3");
+    SnapshotHudiTable.Result result =
+        HudiToIcebergMigrationSparkIntegration.snapshotHudiTable(
+                spark, multiCommitTableLocation, newTableIdentifier)
+            .execute();
+    checkSnapshotIntegrity(multiCommitTableLocation, newTableIdentifier);
+  }
+
   private void checkSnapshotIntegrity(String hudiTableLocation, String icebergTableIdentifier) {
     Dataset<Row> hudiResult =
         spark
             .read()
             .format("hudi")
             .option(
-                DataSourceReadOptions.QUERY_TYPE_OPT_KEY(),
+                DataSourceReadOptions.QUERY_TYPE().key(),
                 DataSourceReadOptions.QUERY_TYPE_SNAPSHOT_OPT_VAL())
             .load(hudiTableLocation);
     Dataset<Row> icebergResult = spark.sql("SELECT * FROM " + icebergTableIdentifier);
@@ -324,25 +280,52 @@ private Dataset<Row> typeTestDataFrame() {
         .range(0, 5, 1, 5)
         .withColumnRenamed("id", "longCol")
         .withColumn("intCol", expr("CAST(longCol AS INT)"))
-        //        .withColumn("floatCol", expr("CAST(longCol AS FLOAT)"))
-        //        .withColumn("doubleCol", expr("CAST(longCol AS DOUBLE)"))
+        .withColumn("floatCol", expr("CAST(longCol AS FLOAT)"))
+        .withColumn("doubleCol", expr("CAST(longCol AS DOUBLE)"))
         .withColumn("dateCol", date_add(current_date(), 1))
         //        .withColumn("timestampCol", expr("TO_TIMESTAMP(dateCol)"))
         .withColumn("stringCol", expr("CAST(dateCol AS STRING)"))
-        //        .withColumn("booleanCol", expr("longCol > 5"))
-        //        .withColumn("binaryCol", expr("CAST(longCol AS BINARY)"))
-        //        .withColumn("byteCol", expr("CAST(longCol AS BYTE)"))
+        .withColumn("booleanCol", expr("longCol > 5"))
+        .withColumn("binaryCol", expr("CAST(longCol AS BINARY)"))
+        .withColumn("byteCol", expr("CAST(longCol AS BYTE)"))
         .withColumn("decimalCol", expr("CAST(longCol AS DECIMAL(10, 2))"))
-        //        .withColumn("shortCol", expr("CAST(longCol AS SHORT)"))
+        .withColumn("shortCol", expr("CAST(longCol AS SHORT)"))
         .withColumn("mapCol", expr("MAP(stringCol, intCol)")) // Hudi requires Map key to be String
-        .withColumn("arrayCol", expr("ARRAY(dateCol)"))
-        //        .withColumn("structCol", expr("STRUCT(longCol AS a, longCol AS b)"))
+        //        .withColumn("arrayCol", expr("ARRAY(dateCol)")) // hudi's parquet handles array
+        // type differently from iceberg
+        .withColumn("structCol", expr("STRUCT(longCol AS a, longCol AS b)"))
         .withColumn(
             "partitionPath",
             expr("CAST(longCol AS STRING)")); // For test convenience, please put the partition col
     // in the end.
   }
 
+  private Dataset<Row> multiDataFrame(int start, int end) {
+    return spark
+        .range(start, end, 1, end - start)
+        .withColumn("longCol", expr("id"))
+        .withColumn("decimalCol", expr("CAST(longCol AS DECIMAL(10, 2))"))
+        .withColumn("magic_number", expr("rand(5) * 100"))
+        .withColumn("dateCol", date_add(current_date(), 1))
+        .withColumn("dateString", expr("CAST(dateCol AS STRING)"))
+        .withColumn("random1", expr("CAST(rand(5) * 100 as LONG)"))
+        .withColumn("random2", expr("CAST(rand(51) * 100 as LONG)"))
+        .withColumn("random3", expr("CAST(rand(511) * 100 as LONG)"))
+        .withColumn("random4", expr("CAST(rand(15) * 100 as LONG)"))
+        .withColumn("random5", expr("CAST(rand(115) * 100 as LONG)"))
+        .withColumn("innerStruct1", expr("STRUCT(random1, random2)"))
+        .withColumn("innerStruct2", expr("STRUCT(random3, random4)"))
+        .withColumn("structCol1", expr("STRUCT(innerStruct1, innerStruct2)"))
+        .withColumn(
+            "innerStruct3",
+            expr("STRUCT(SHA1(CAST(random5 AS BINARY)), SHA1(CAST(random1 AS BINARY)))"))
+        .withColumn(
+            "structCol2",
+            expr(
+                "STRUCT(innerStruct3, STRUCT(SHA1(CAST(random2 AS BINARY)), SHA1(CAST(random3 AS BINARY))))"))
+        .withColumn("partitionPath", expr("CAST(id AS STRING)"));
+  }
+
   private Dataset<Row> nestedDataFrame() {
     return spark
         .range(0, 5, 1, 5)
@@ -378,6 +361,7 @@ private void writeHoodieTable(
       String recordKey,
       String preCombineKey,
       String partitionPathField,
+      SaveMode saveMode,
       String tableLocation,
       String tableIdentifier) {
     df.write()
@@ -387,7 +371,27 @@ private void writeHoodieTable(
         .option(DataSourceWriteOptions.PRECOMBINE_FIELD().key(), preCombineKey)
         .option(DataSourceWriteOptions.PARTITIONPATH_FIELD().key(), partitionPathField)
         .option(HoodieWriteConfig.TBL_NAME.key(), tableIdentifier)
-        .mode(SaveMode.Append)
+        .mode(saveMode)
+        .save(tableLocation);
+  }
+
+  private void writeHoodieTableOperation(
+      Dataset<Row> df,
+      String operationKey,
+      String recordKey,
+      String preCombineKey,
+      String partitionPathField,
+      SaveMode saveMode,
+      String tableLocation,
+      String tableIdentifier) {
+    df.write()
+        .format("hudi")
+        .option(DataSourceWriteOptions.OPERATION().key(), operationKey)
+        .option(DataSourceWriteOptions.RECORDKEY_FIELD().key(), recordKey)
+        .option(DataSourceWriteOptions.PRECOMBINE_FIELD().key(), preCombineKey)
+        .option(DataSourceWriteOptions.PARTITIONPATH_FIELD().key(), partitionPathField)
+        .option(HoodieWriteConfig.TBL_NAME.key(), tableIdentifier)
+        .mode(saveMode)
         .save(tableLocation);
   }
 }
diff --git a/hudi/src/main/java/org/apache/iceberg/hudi/BaseSnapshotHudiTableAction.java b/hudi/src/main/java/org/apache/iceberg/hudi/BaseSnapshotHudiTableAction.java
index 2191c6a88f56..5c80cef94f68 100644
--- a/hudi/src/main/java/org/apache/iceberg/hudi/BaseSnapshotHudiTableAction.java
+++ b/hudi/src/main/java/org/apache/iceberg/hudi/BaseSnapshotHudiTableAction.java
@@ -153,6 +153,8 @@ public Result execute() {
 
     // Pre-process the timeline, we only need to process all COMPLETED commit for COW table
     // Commit that has been rollbacked will not be in either REQUESTED or INFLIGHT state
+    HoodieTimeline commitsTimeline = hoodieTableMetaClient.getCommitsTimeline();
+    HoodieTimeline archivedTimeline = hoodieTableMetaClient.getArchivedTimeline();
     HoodieTimeline timeline =
         hoodieTableMetaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants();
     // Initialize the FileSystemView for querying table data files

From 94dddef5b6ff02551fc223cb03718418e54b6bd7 Mon Sep 17 00:00:00 2001
From: Rushan Jiang <rushanj@andrew.cmu.edu>
Date: Wed, 1 Feb 2023 21:59:10 -0500
Subject: [PATCH 15/20] make arrayType possible by enforcing the new list type
 in parquet

---
 .../apache/iceberg/hudi/SparkHudiMigrationTestBase.java   | 1 +
 .../org/apache/iceberg/hudi/TestSnapshotHudiTable.java    | 8 +++-----
 2 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/hudi/src/integration/java/org/apache/iceberg/hudi/SparkHudiMigrationTestBase.java b/hudi/src/integration/java/org/apache/iceberg/hudi/SparkHudiMigrationTestBase.java
index 42703c4403ae..7fd5f9bd69f1 100644
--- a/hudi/src/integration/java/org/apache/iceberg/hudi/SparkHudiMigrationTestBase.java
+++ b/hudi/src/integration/java/org/apache/iceberg/hudi/SparkHudiMigrationTestBase.java
@@ -45,6 +45,7 @@ public static void startMetastoreAndSpark() {
             .config(
                 "spark.hadoop." + HiveConf.ConfVars.METASTOREURIS.varname,
                 hiveConf.get(HiveConf.ConfVars.METASTOREURIS.varname))
+            .config("spark.hadoop.parquet.avro.write-old-list-structure", "false")
             .config("spark.sql.legacy.respectNullabilityInTextDatasetConversion", "true")
             .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
             .config("spark.sql.extensions", "org.apache.spark.sql.hudi.HoodieSparkSessionExtension")
diff --git a/hudi/src/integration/java/org/apache/iceberg/hudi/TestSnapshotHudiTable.java b/hudi/src/integration/java/org/apache/iceberg/hudi/TestSnapshotHudiTable.java
index 5a466f6fcee3..d5cc02234645 100644
--- a/hudi/src/integration/java/org/apache/iceberg/hudi/TestSnapshotHudiTable.java
+++ b/hudi/src/integration/java/org/apache/iceberg/hudi/TestSnapshotHudiTable.java
@@ -283,7 +283,7 @@ private Dataset<Row> typeTestDataFrame() {
         .withColumn("floatCol", expr("CAST(longCol AS FLOAT)"))
         .withColumn("doubleCol", expr("CAST(longCol AS DOUBLE)"))
         .withColumn("dateCol", date_add(current_date(), 1))
-        //        .withColumn("timestampCol", expr("TO_TIMESTAMP(dateCol)"))
+        .withColumn("timestampCol", expr("TO_TIMESTAMP(dateCol)"))
         .withColumn("stringCol", expr("CAST(dateCol AS STRING)"))
         .withColumn("booleanCol", expr("longCol > 5"))
         .withColumn("binaryCol", expr("CAST(longCol AS BINARY)"))
@@ -291,13 +291,11 @@ private Dataset<Row> typeTestDataFrame() {
         .withColumn("decimalCol", expr("CAST(longCol AS DECIMAL(10, 2))"))
         .withColumn("shortCol", expr("CAST(longCol AS SHORT)"))
         .withColumn("mapCol", expr("MAP(stringCol, intCol)")) // Hudi requires Map key to be String
-        //        .withColumn("arrayCol", expr("ARRAY(dateCol)")) // hudi's parquet handles array
-        // type differently from iceberg
+        .withColumn("arrayCol", expr("ARRAY(dateCol)"))
         .withColumn("structCol", expr("STRUCT(longCol AS a, longCol AS b)"))
         .withColumn(
             "partitionPath",
-            expr("CAST(longCol AS STRING)")); // For test convenience, please put the partition col
-    // in the end.
+            expr("CAST(longCol AS STRING)"));
   }
 
   private Dataset<Row> multiDataFrame(int start, int end) {

From cc0a9bb001d08a6ce23d4f7b18995beebc05e860 Mon Sep 17 00:00:00 2001
From: Rushan Jiang <rushanj@andrew.cmu.edu>
Date: Thu, 2 Feb 2023 00:20:24 -0500
Subject: [PATCH 16/20] add tests, refactor the base action implementation,
 and add CI

---
 .github/workflows/hudi-conversion-ci.yaml     | 86 ++++++++++++++++++
 ...udiToIcebergMigrationSparkIntegration.java | 21 +++--
 .../iceberg/hudi/TestSnapshotHudiTable.java   | 90 +++++++++++++++----
 .../hudi/BaseSnapshotHudiTableAction.java     | 89 ++++++++----------
 ...HudiToIcebergMigrationActionsProvider.java | 20 ++++-
 .../iceberg/hudi/SnapshotHudiTable.java       | 55 +++++++++++-
 6 files changed, 282 insertions(+), 79 deletions(-)
 create mode 100644 .github/workflows/hudi-conversion-ci.yaml

diff --git a/.github/workflows/hudi-conversion-ci.yaml b/.github/workflows/hudi-conversion-ci.yaml
new file mode 100644
index 000000000000..3e2b1018acec
--- /dev/null
+++ b/.github/workflows/hudi-conversion-ci.yaml
@@ -0,0 +1,86 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+name: "Hudi Conversion CI"
+on:
+  push:
+    branches:
+      - 'master'
+      - '0.**'
+    tags:
+      - 'apache-iceberg-**'
+  pull_request:
+    paths-ignore:
+      - '.github/ISSUE_TEMPLATE/iceberg_bug_report.yml'
+      - '.github/workflows/python-ci.yml'
+      - '.github/workflows/flink-ci.yml'
+      - '.github/workflows/hive-ci.yml'
+      - '.gitignore'
+      - '.asf.yml'
+      - 'dev/**'
+      - 'mr/**'
+      - 'hive3/**'
+      - 'hive3-orc-bundle/**'
+      - 'hive-runtime/**'
+      - 'flink/**'
+      - 'pig/**'
+      - 'python/**'
+      - 'python_legacy/**'
+      - 'docs/**'
+      - 'open-api/**'
+      - 'format/**'
+      - '.gitattributes'
+      - 'README.md'
+      - 'CONTRIBUTING.md'
+      - 'LICENSE'
+      - 'NOTICE'
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: ${{ github.event_name == 'pull_request' }}
+
+jobs:
+  hudi-conversion-scala-2-12-tests:
+    runs-on: ubuntu-20.04
+    strategy:
+      matrix:
+        jvm: [8, 11]
+    env:
+      SPARK_LOCAL_IP: localhost
+    steps:
+      - uses: actions/checkout@v3
+      - uses: actions/setup-java@v3
+        with:
+          distribution: zulu
+          java-version: ${{ matrix.jvm }}
+      - uses: actions/cache@v3
+        with:
+          path: |
+            ~/.gradle/caches
+            ~/.gradle/wrapper
+          key: ${{ runner.os }}-gradle-${{ hashFiles('**/*.gradle*', '**/gradle-wrapper.properties') }}
+          restore-keys: ${{ runner.os }}-gradle-
+      - run: echo -e "$(ip addr show eth0 | grep "inet\b" | awk '{print $2}' | cut -d/ -f1)\t$(hostname -f) $(hostname -s)" | sudo tee -a /etc/hosts
+      - run: ./gradlew -DsparkVersions=3.3 -DscalaVersion=2.12 -DhiveVersions= -DflinkVersions= :iceberg-hudi:check -Pquick=true -x javadoc
+      - uses: actions/upload-artifact@v3
+        if: failure()
+        with:
+          name: test logs
+          path: |
+            **/build/testlogs
diff --git a/hudi/src/integration/java/org/apache/iceberg/hudi/HudiToIcebergMigrationSparkIntegration.java b/hudi/src/integration/java/org/apache/iceberg/hudi/HudiToIcebergMigrationSparkIntegration.java
index cfeca68687b1..cf06fa9556cc 100644
--- a/hudi/src/integration/java/org/apache/iceberg/hudi/HudiToIcebergMigrationSparkIntegration.java
+++ b/hudi/src/integration/java/org/apache/iceberg/hudi/HudiToIcebergMigrationSparkIntegration.java
@@ -19,6 +19,7 @@
 package org.apache.iceberg.hudi;
 
 import org.apache.iceberg.catalog.TableIdentifier;
+import org.apache.iceberg.relocated.com.google.common.base.Preconditions;
 import org.apache.iceberg.spark.Spark3Util;
 import org.apache.spark.sql.SparkSession;
 import org.apache.spark.sql.connector.catalog.CatalogPlugin;
@@ -28,15 +29,23 @@ private HudiToIcebergMigrationSparkIntegration() {}
 
   static SnapshotHudiTable snapshotHudiTable(
       SparkSession spark, String hudiTablePath, String newTableIdentifier) {
+    Preconditions.checkArgument(
+        spark != null, "The SparkSession cannot be null, please provide a valid SparkSession");
+    Preconditions.checkArgument(
+        newTableIdentifier != null,
+        "The table identifier cannot be null, please provide a valid table identifier for the new iceberg table");
+    Preconditions.checkArgument(
+        hudiTablePath != null,
+        "The hudi table location cannot be null, please provide a valid location of the hudi table to be snapshot");
     String ctx = "hudi snapshot target";
     CatalogPlugin defaultCatalog = spark.sessionState().catalogManager().currentCatalog();
     Spark3Util.CatalogAndIdentifier catalogAndIdentifier =
         Spark3Util.catalogAndIdentifier(ctx, spark, newTableIdentifier, defaultCatalog);
-
-    return new BaseSnapshotHudiTableAction(
-        spark.sessionState().newHadoopConf(),
-        hudiTablePath,
-        Spark3Util.loadIcebergCatalog(spark, catalogAndIdentifier.catalog().name()),
-        TableIdentifier.parse(catalogAndIdentifier.identifier().toString()));
+    return HudiToIcebergMigrationActionsProvider.defaultProvider()
+        .snapshotHudiTable(hudiTablePath)
+        .as(TableIdentifier.parse(catalogAndIdentifier.identifier().toString()))
+        .hoodieConfiguration(spark.sessionState().newHadoopConf())
+        .icebergCatalog(
+            Spark3Util.loadIcebergCatalog(spark, catalogAndIdentifier.catalog().name()));
   }
 }
diff --git a/hudi/src/integration/java/org/apache/iceberg/hudi/TestSnapshotHudiTable.java b/hudi/src/integration/java/org/apache/iceberg/hudi/TestSnapshotHudiTable.java
index d5cc02234645..5d3e33d4921d 100644
--- a/hudi/src/integration/java/org/apache/iceberg/hudi/TestSnapshotHudiTable.java
+++ b/hudi/src/integration/java/org/apache/iceberg/hudi/TestSnapshotHudiTable.java
@@ -31,9 +31,6 @@
 import org.apache.hudi.config.HoodieWriteConfig;
 import org.apache.iceberg.Table;
 import org.apache.iceberg.catalog.TableIdentifier;
-import org.apache.iceberg.data.IcebergGenerics;
-import org.apache.iceberg.data.Record;
-import org.apache.iceberg.io.CloseableIterable;
 import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap;
 import org.apache.iceberg.spark.Spark3Util;
 import org.apache.iceberg.spark.SparkCatalog;
@@ -57,7 +54,7 @@ public class TestSnapshotHudiTable extends SparkHudiMigrationTestBase {
 
   private static final Logger LOG = LoggerFactory.getLogger(TestSnapshotHudiTable.class.getName());
   private static final String SNAPSHOT_SOURCE_PROP = "snapshot_source";
-  private static final String DELTA_SOURCE_VALUE = "delta";
+  private static final String HUDI_SOURCE_VALUE = "hudi";
   private static final String ORIGINAL_LOCATION_PROP = "original_location";
   private static final String NAMESPACE = "delta_conversion_test";
   private static final String defaultSparkCatalog = "spark_catalog";
@@ -144,9 +141,9 @@ public void testBasicPartitionedTable() {
         HudiToIcebergMigrationSparkIntegration.snapshotHudiTable(
                 spark, partitionedLocation, newTableIdentifier)
             .execute();
-    Table table = getIcebergTable(newTableIdentifier);
-    queryManual(table);
     checkSnapshotIntegrity(partitionedLocation, newTableIdentifier);
+    checkIcebergTableLocation(newTableIdentifier, partitionedLocation);
+    checkIcebergTableProperties(newTableIdentifier, ImmutableMap.of(), partitionedLocation);
   }
 
   @Test
@@ -165,6 +162,8 @@ public void testBasicUnpartitionedTable() {
                 spark, unpartitionedLocation, newTableIdentifier)
             .execute();
     checkSnapshotIntegrity(unpartitionedLocation, newTableIdentifier);
+    checkIcebergTableLocation(newTableIdentifier, unpartitionedLocation);
+    checkIcebergTableProperties(newTableIdentifier, ImmutableMap.of(), unpartitionedLocation);
   }
 
   @Test
@@ -227,6 +226,49 @@ public void testMultiCommitTable() {
                 spark, multiCommitTableLocation, newTableIdentifier)
             .execute();
     checkSnapshotIntegrity(multiCommitTableLocation, newTableIdentifier);
+    checkIcebergTableLocation(newTableIdentifier, multiCommitTableLocation);
+    checkIcebergTableProperties(newTableIdentifier, ImmutableMap.of(), multiCommitTableLocation);
+  }
+
+  @Test
+  public void testSnapshotWithNewLocation() {
+    writeHoodieTable(
+        typeTestDataframe,
+        "decimalCol",
+        "intCol",
+        "partitionPath",
+        SaveMode.Overwrite,
+        partitionedLocation,
+        partitionedIdentifier);
+    String newTableIdentifier = destName(icebergCatalogName, "alpha_iceberg_table_4");
+    SnapshotHudiTable.Result result =
+        HudiToIcebergMigrationSparkIntegration.snapshotHudiTable(
+                spark, partitionedLocation, newTableIdentifier)
+            .tableLocation(newIcebergTableLocation)
+            .execute();
+    checkSnapshotIntegrity(partitionedLocation, newTableIdentifier);
+    checkIcebergTableLocation(newTableIdentifier, newIcebergTableLocation);
+  }
+
+  @Test
+  public void testSnapshotWithAdditionalProperties() {
+    writeHoodieTable(
+        typeTestDataframe,
+        "decimalCol",
+        "intCol",
+        "partitionPath",
+        SaveMode.Overwrite,
+        partitionedLocation,
+        partitionedIdentifier);
+    String newTableIdentifier = destName(icebergCatalogName, "alpha_iceberg_table_5");
+    SnapshotHudiTable.Result result =
+        HudiToIcebergMigrationSparkIntegration.snapshotHudiTable(
+                spark, partitionedLocation, newTableIdentifier)
+            .tableProperties(ImmutableMap.of("test", "test"))
+            .execute();
+    checkSnapshotIntegrity(partitionedLocation, newTableIdentifier);
+    checkIcebergTableProperties(
+        newTableIdentifier, ImmutableMap.of("test", "test"), partitionedLocation);
   }
 
   private void checkSnapshotIntegrity(String hudiTableLocation, String icebergTableIdentifier) {
@@ -239,24 +281,36 @@ private void checkSnapshotIntegrity(String hudiTableLocation, String icebergTabl
                 DataSourceReadOptions.QUERY_TYPE_SNAPSHOT_OPT_VAL())
             .load(hudiTableLocation);
     Dataset<Row> icebergResult = spark.sql("SELECT * FROM " + icebergTableIdentifier);
-    LOG.info("Hudi table contents: {}", hudiResult.showString(10, 20, false));
-    LOG.info("Iceberg table contents: {}", icebergResult.showString(10, 20, false));
     // TODO: adjust test technique since hudi tends to return the columns in a different order (put
     // the one used for partitioning last)
     List<Row> hudiTableContents = hudiResult.collectAsList();
     List<Row> icebergTableContents = icebergResult.collectAsList();
 
     Assertions.assertThat(hudiTableContents).hasSize(icebergTableContents.size());
-    Assertions.assertThat(hudiTableContents).containsAll(icebergTableContents);
-    Assertions.assertThat(icebergTableContents)
-        .containsAll(hudiTableContents); // TODO: may change to containsExactlyInAnyOrderElementsOf
+    Assertions.assertThat(hudiTableContents)
+        .containsExactlyInAnyOrderElementsOf(icebergTableContents);
   }
 
-  private void queryManual(Table table) {
-    CloseableIterable<Record> records = IcebergGenerics.read(table).build();
-    for (Record record : records) {
-      LOG.info("Alpha Test Iceberg Record: {}", record);
-    }
+  private void checkIcebergTableLocation(String icebergTableIdentifier, String expectedLoacation) {
+    Table table = getIcebergTable(icebergTableIdentifier);
+    Assertions.assertThat(table.location()).isEqualTo(expectedLoacation);
+  }
+
+  private void checkIcebergTableProperties(
+      String icebergTableIdentifier,
+      Map<String, String> expectedAdditionalProperties,
+      String hudiTableLocation) {
+    Table icebergTable = getIcebergTable(icebergTableIdentifier);
+    ImmutableMap.Builder<String, String> expectedPropertiesBuilder = ImmutableMap.builder();
+    // The snapshot action will put some fixed properties to the table
+    expectedPropertiesBuilder.put(SNAPSHOT_SOURCE_PROP, HUDI_SOURCE_VALUE);
+    expectedPropertiesBuilder.putAll(expectedAdditionalProperties);
+    ImmutableMap<String, String> expectedProperties = expectedPropertiesBuilder.build();
+
+    Assertions.assertThat(icebergTable.properties().entrySet())
+        .containsAll(expectedProperties.entrySet());
+    Assertions.assertThat(icebergTable.properties())
+        .containsEntry(ORIGINAL_LOCATION_PROP, hudiTableLocation);
   }
 
   private Table getIcebergTable(String icebergTableIdentifier) {
@@ -293,9 +347,7 @@ private Dataset<Row> typeTestDataFrame() {
         .withColumn("mapCol", expr("MAP(stringCol, intCol)")) // Hudi requires Map key to be String
         .withColumn("arrayCol", expr("ARRAY(dateCol)"))
         .withColumn("structCol", expr("STRUCT(longCol AS a, longCol AS b)"))
-        .withColumn(
-            "partitionPath",
-            expr("CAST(longCol AS STRING)"));
+        .withColumn("partitionPath", expr("CAST(longCol AS STRING)"));
   }
 
   private Dataset<Row> multiDataFrame(int start, int end) {
diff --git a/hudi/src/main/java/org/apache/iceberg/hudi/BaseSnapshotHudiTableAction.java b/hudi/src/main/java/org/apache/iceberg/hudi/BaseSnapshotHudiTableAction.java
index 5c80cef94f68..7b1e25791d3c 100644
--- a/hudi/src/main/java/org/apache/iceberg/hudi/BaseSnapshotHudiTableAction.java
+++ b/hudi/src/main/java/org/apache/iceberg/hudi/BaseSnapshotHudiTableAction.java
@@ -94,22 +94,13 @@ public class BaseSnapshotHudiTableAction implements SnapshotHudiTable {
   private String hoodieTableBasePath;
   private Catalog icebergCatalog;
   private TableIdentifier newTableIdentifier;
+  private String newTableLocation;
   private HadoopFileIO hoodieFileIO;
   private ImmutableMap.Builder<String, String> additionalPropertiesBuilder = ImmutableMap.builder();
 
-  public BaseSnapshotHudiTableAction(
-      Configuration hoodieConfiguration,
-      String hoodieTableBasePath,
-      Catalog icebergCatalog,
-      TableIdentifier newTableIdentifier) {
-    this.hoodieTableMetaClient = buildTableMetaClient(hoodieConfiguration, hoodieTableBasePath);
-    this.hoodieTableConfig = hoodieTableMetaClient.getTableConfig();
-    this.hoodieEngineContext = new HoodieLocalEngineContext(hoodieConfiguration);
+  public BaseSnapshotHudiTableAction(String hoodieTableBasePath) {
     this.hoodieTableBasePath = hoodieTableBasePath;
-    this.hoodieMetadataConfig = buildMetadataConfig(hoodieConfiguration);
-    this.hoodieFileIO = new HadoopFileIO(hoodieConfiguration);
-    this.icebergCatalog = icebergCatalog;
-    this.newTableIdentifier = newTableIdentifier;
+    this.newTableLocation = hoodieTableBasePath;
   }
 
   @Override
@@ -119,8 +110,36 @@ public SnapshotHudiTable tableProperties(Map<String, String> properties) {
   }
 
   @Override
-  public SnapshotHudiTable tableProperty(String key, String value) {
-    additionalPropertiesBuilder.put(key, value);
+  public SnapshotHudiTable tableProperty(String name, String value) {
+    additionalPropertiesBuilder.put(name, value);
+    return this;
+  }
+
+  @Override
+  public SnapshotHudiTable tableLocation(String location) {
+    this.newTableLocation = location;
+    return this;
+  }
+
+  @Override
+  public SnapshotHudiTable as(TableIdentifier identifier) {
+    this.newTableIdentifier = identifier;
+    return this;
+  }
+
+  @Override
+  public SnapshotHudiTable icebergCatalog(Catalog catalog) {
+    this.icebergCatalog = catalog;
+    return this;
+  }
+
+  @Override
+  public SnapshotHudiTable hoodieConfiguration(Configuration configuration) {
+    this.hoodieTableMetaClient = buildTableMetaClient(configuration, hoodieTableBasePath);
+    this.hoodieTableConfig = hoodieTableMetaClient.getTableConfig();
+    this.hoodieEngineContext = new HoodieLocalEngineContext(configuration);
+    this.hoodieMetadataConfig = buildMetadataConfig(configuration);
+    this.hoodieFileIO = new HadoopFileIO(configuration);
     return this;
   }
 
@@ -132,18 +151,15 @@ public Result execute() {
     Schema icebergSchema = convertToIcebergSchema(hudiSchema);
     PartitionSpec partitionSpec = getPartitionSpecFromHoodieMetadataData(icebergSchema);
 
-    // TODO: add support for newTableLocation
     Transaction icebergTransaction =
         icebergCatalog.newCreateTableTransaction(
             newTableIdentifier,
             icebergSchema,
             partitionSpec,
-            hoodieTableBasePath,
+            newTableLocation,
             destTableProperties());
-    // We need name mapping to ensure we can read data files correctly as iceberg table has its own
+    // Need name mapping to ensure we can read data files correctly as iceberg table has its own
     // rule to assign field id
-    // Although the field id rule seems to be the same as hudi, but the rule is not guaranteed by
-    // any API
     NameMapping nameMapping = MappingUtil.create(icebergTransaction.table().schema());
     icebergTransaction
         .table()
@@ -153,19 +169,11 @@ public Result execute() {
 
     // Pre-process the timeline, we only need to process all COMPLETED commit for COW table
     // Commit that has been rollbacked will not be in either REQUESTED or INFLIGHT state
-    HoodieTimeline commitsTimeline = hoodieTableMetaClient.getCommitsTimeline();
-    HoodieTimeline archivedTimeline = hoodieTableMetaClient.getArchivedTimeline();
     HoodieTimeline timeline =
         hoodieTableMetaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants();
     // Initialize the FileSystemView for querying table data files
-    // TODO: need to choose the correct implementation of the FileSystemView
-    //    HoodieTableFileSystemView hoodieTableFileSystemView =
-    //        FileSystemViewManager.createInMemoryFileSystemViewWithTimeline(
-    //            hoodieEngineContext, hoodieTableMetaClient, hoodieMetadataConfig, timeline);
     HoodieTableFileSystemView hoodieTableFileSystemView =
         new HoodieTableFileSystemView(hoodieTableMetaClient, timeline);
-    // get all instants on the timeline
-    Stream<HoodieInstant> completedInstants = timeline.getInstants();
     List<String> partitionPaths =
         FSUtils.getAllPartitionPaths(
             hoodieEngineContext,
@@ -181,6 +189,8 @@ public Result execute() {
     } catch (IOException e) {
       throw new RuntimeException("Failed to get all data files in partition", e);
     }
+    // get all instants on the timeline
+    Stream<HoodieInstant> completedInstants = timeline.getInstants();
     // file group id -> Map<timestamp, HoodieBaseFile>
     // This pre-process aims to make a timestamp to HoodieBaseFile map for each file group
     Map<HoodieFileGroupId, Map<String, HoodieBaseFile>> allStampedDataFiles =
@@ -196,21 +206,11 @@ public Result execute() {
                                 ImmutableMap.toImmutableMap(
                                     HoodieBaseFile::getCommitTime, baseFile -> baseFile))));
 
-    // BEGIN TEST ONLY CODE
-    List<HoodieBaseFile> testGroups =
-        hoodieTableFileSystemView.getLatestBaseFiles().collect(Collectors.toList());
-    // END TEST ONLY CODE
-
     // Help tracked if a previous version of the data file has been added to the iceberg table
     Map<HoodieFileGroupId, DataFile> convertedDataFiles = Maps.newHashMap();
     // Replay the timeline from beginning to the end
     completedInstants.forEachOrdered(
         instant -> {
-          LOG.info("Alpha test: get completed instant: {}", instant);
-          // copyInstants to iceberg table
-          // TODO: need to verify the order of the instants, make sure it is from the oldest to the
-          // newest
-
           // commit each instant as a transaction to the iceberg table
           commitHoodieInstantToIcebergTransaction(
               instant,
@@ -253,7 +253,6 @@ public void commitHoodieInstantToIcebergTransaction(
     List<DataFile> filesToAdd = Lists.newArrayList();
     List<DataFile> filesToRemove = Lists.newArrayList();
 
-    // TODO: may need to add synchronization lock for parallelism
     fileGroups
         .sequential()
         .forEach(
@@ -316,7 +315,6 @@ private DataFile buildDataFileFromHoodieBaseFile(
     }
 
     PartitionSpec spec = table.spec();
-    // TODO: need to verify the path is absolute (the field's name is fullPath)
     String path = baseFile.getPath();
     long fileSize = baseFile.getFileSize();
     String partitionValue = fileGroup.getPartitionPath();
@@ -330,8 +328,6 @@ private DataFile buildDataFileFromHoodieBaseFile(
     FileFormat format = determineFileFormatFromPath(path);
     Metrics metrics = getMetricsForFile(file, format, metricsConfig, nameMapping);
 
-    List<PartitionField> testFields = spec.fields();
-
     String partition =
         spec.fields().stream()
             .map(PartitionField::name)
@@ -355,14 +351,6 @@ private DataFile buildDataFileFromHoodieBaseFile(
   private InternalSchema getHudiSchema() {
     TableSchemaResolver schemaUtil = new TableSchemaResolver(hoodieTableMetaClient);
     Option<InternalSchema> hudiSchema = schemaUtil.getTableInternalSchemaFromCommitMetadata();
-    LOG.info("Alpha test: hoodie schema: {}", hudiSchema);
-    LOG.info("Alpha test: active timeline: {}", hoodieTableMetaClient.getActiveTimeline());
-    LOG.info(
-        "Alpha test: active timeline commit timeline: {}",
-        hoodieTableMetaClient.getActiveTimeline().getCommitsTimeline());
-    LOG.info(
-        "Alpha test: active timeline commit timeline instants: {}",
-        hoodieTableMetaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants());
     return hudiSchema.orElseGet(
         () -> {
           try {
@@ -399,14 +387,13 @@ private PartitionSpec getPartitionSpecFromHoodieMetadataData(Schema schema) {
   }
 
   private Map<String, String> destTableProperties() {
-    // TODO: need to check which hoodie properties to add to
     additionalPropertiesBuilder.putAll(hoodieTableConfig.propsMap());
     additionalPropertiesBuilder.putAll(
         ImmutableMap.of(
             SNAPSHOT_SOURCE_PROP,
             HOODIE_SOURCE_VALUE,
             ORIGINAL_LOCATION_PROP,
-            hoodieTableMetaClient.getBasePathV2().toString()));
+            hoodieTableBasePath));
 
     return additionalPropertiesBuilder.build();
   }
diff --git a/hudi/src/main/java/org/apache/iceberg/hudi/HudiToIcebergMigrationActionsProvider.java b/hudi/src/main/java/org/apache/iceberg/hudi/HudiToIcebergMigrationActionsProvider.java
index 0a1e0808af43..8ba58e2ed203 100644
--- a/hudi/src/main/java/org/apache/iceberg/hudi/HudiToIcebergMigrationActionsProvider.java
+++ b/hudi/src/main/java/org/apache/iceberg/hudi/HudiToIcebergMigrationActionsProvider.java
@@ -18,12 +18,28 @@
  */
 package org.apache.iceberg.hudi;
 
+/**
+ * An API that provides actions for migration from an Apache Hudi table to an Iceberg table. Query
+ * engines can use {@code defaultProvider()} to access default action implementations, or implement
+ * this provider to supply a different implementation if necessary.
+ */
 public interface HudiToIcebergMigrationActionsProvider {
 
-  default SnapshotHudiTable snapshotHudiTable() {
-    throw new UnsupportedOperationException("snapshotHudiTable is not supported");
+  /**
+   * Initiates an action to snapshot an existing Hudi table to an Iceberg table.
+   *
+   * @param sourceTableLocation the location of the Hudi table
+   * @return a {@link SnapshotHudiTable} action
+   */
+  default SnapshotHudiTable snapshotHudiTable(String sourceTableLocation) {
+    return new BaseSnapshotHudiTableAction(sourceTableLocation);
   }
 
+  /**
+   * Get the default implementation of {@link HudiToIcebergMigrationActionsProvider}
+   *
+   * @return an instance with access to all default actions
+   */
   static HudiToIcebergMigrationActionsProvider defaultProvider() {
     return DefaultHudiToIcebergMigrationActions.defaultMigrationActions();
   }
diff --git a/hudi/src/main/java/org/apache/iceberg/hudi/SnapshotHudiTable.java b/hudi/src/main/java/org/apache/iceberg/hudi/SnapshotHudiTable.java
index a5208809b314..cf86139516d8 100644
--- a/hudi/src/main/java/org/apache/iceberg/hudi/SnapshotHudiTable.java
+++ b/hudi/src/main/java/org/apache/iceberg/hudi/SnapshotHudiTable.java
@@ -19,16 +19,69 @@
 package org.apache.iceberg.hudi;
 
 import java.util.Map;
+import org.apache.hadoop.conf.Configuration;
 import org.apache.iceberg.actions.Action;
+import org.apache.iceberg.catalog.Catalog;
+import org.apache.iceberg.catalog.TableIdentifier;
 
 public interface SnapshotHudiTable extends Action<SnapshotHudiTable, SnapshotHudiTable.Result> {
 
+  /**
+   * Sets table properties in the newly created Iceberg table. Any properties with the same key name
+   * will be overwritten.
+   *
+   * @param properties a map of properties to set
+   * @return this for method chaining
+   */
   SnapshotHudiTable tableProperties(Map<String, String> properties);
 
-  SnapshotHudiTable tableProperty(String key, String value);
+  /**
+   * Sets a table property in the newly created Iceberg table. Any properties with the same key will
+   * be overwritten.
+   *
+   * @param name a table property name
+   * @param value a table property value
+   * @return this for method chaining
+   */
+  SnapshotHudiTable tableProperty(String name, String value);
 
+  /**
+   * Sets the location of the newly created Iceberg table. Default location is the same as the Hudi
+   * table.
+   *
+   * @param location a path to the new table location
+   * @return this for method chaining
+   */
+  SnapshotHudiTable tableLocation(String location);
+
+  /**
+   * Sets the identifier of the new Iceberg table. Must be set before executing the action.
+   *
+   * @param identifier a table identifier (namespace, name)
+   * @return this for method chaining
+   */
+  SnapshotHudiTable as(TableIdentifier identifier);
+
+  /**
+   * Sets the catalog of the new Iceberg table. Must be set before executing the action.
+   *
+   * @param catalog a catalog
+   * @return this for method chaining
+   */
+  SnapshotHudiTable icebergCatalog(Catalog catalog);
+
+  /**
+   * Sets the Hadoop configuration used to access the Hudi table's timeline and file groups.
+   *
+   * @param conf a Hadoop configuration; must be set before executing the action
+   * @return this for method chaining
+   */
+  SnapshotHudiTable hoodieConfiguration(Configuration conf);
+
+  /** The action result that contains a summary of the execution. */
   interface Result {
 
+    /** Returns the number of snapshot data files. */
     long snapshotFilesCount();
   }
 }

From 3ee5a6b5e7b5e17198a5ca97b08aefbb0e741dfe Mon Sep 17 00:00:00 2001
From: Rushan Jiang <rushanj@andrew.cmu.edu>
Date: Thu, 2 Feb 2023 01:33:39 -0500
Subject: [PATCH 17/20] resolve dependency issue (kind of...)

---
 build.gradle   | 13 ++++++++-----
 versions.props |  1 +
 2 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/build.gradle b/build.gradle
index 82d27288d7d1..8de344026481 100644
--- a/build.gradle
+++ b/build.gradle
@@ -458,10 +458,10 @@ project(':iceberg-hudi') {
     implementation project(':iceberg-orc')
     implementation "com.fasterxml.jackson.core:jackson-databind"
 
-    // TODO: we only need hudi-common here, however, hudi-common has some dependency conflicts with hudi-spark-bundle
-    // which is currently used by the integration test. We should fix this in the future.
-    // Also, hudi uses java8, may need to assess if we can use hudi in java11.
-    compileOnly("org.apache.hudi:hudi-spark3.3-bundle_2.12:0.12.2")
+    // Hudi uses java8, may need to assess if we can use hudi in java11.
+    compileOnly("org.apache.hudi:hudi-common")
+    // Added to resolve dependency conflicts with hudi-spark-bundle
+    compileOnly("org.apache.hudi:hudi-client-common")
     implementation("org.apache.avro:avro") {
       exclude group: 'org.tukaani' // xz compression is not supported
     }
@@ -474,7 +474,10 @@ project(':iceberg-hudi') {
     }
     if (sparkVersions.contains("3.3") && scalaVersion == "2.12") {
       integrationImplementation project(':iceberg-data')
-      integrationImplementation("org.apache.hudi:hudi-spark3.3-bundle_2.12:0.12.2")
+      integrationImplementation("org.apache.hudi:hudi-spark3.3-bundle_2.12") {
+        exclude group: 'org.apache.hudi', module: 'hudi-common'
+        exclude group: 'org.apache.hudi', module: 'hudi-client-common'
+      }
       integrationImplementation project(path: ":iceberg-spark:iceberg-spark-3.3_${scalaVersion}")
       integrationImplementation("org.apache.hadoop:hadoop-minicluster") {
         exclude group: 'org.apache.avro', module: 'avro'
diff --git a/versions.props b/versions.props
index 99dbea48a244..3739ab1748cf 100644
--- a/versions.props
+++ b/versions.props
@@ -29,6 +29,7 @@ org.scala-lang.modules:scala-collection-compat_2.13 = 2.6.0
 com.emc.ecs:object-client-bundle = 3.3.2
 org.immutables:value = 2.9.2
 net.snowflake:snowflake-jdbc = 3.13.22
+org.apache.hudi:* = 0.12.0
 
 # test deps
 org.junit.vintage:junit-vintage-engine = 5.8.2

From fb479a6cec5f10d0f8d4737b840f74d80493abd9 Mon Sep 17 00:00:00 2001
From: Rushan Jiang <rushanj@andrew.cmu.edu>
Date: Tue, 7 Feb 2023 21:42:29 -0500
Subject: [PATCH 18/20] handle multiple partition columns

---
 .../iceberg/hudi/TestSnapshotHudiTable.java   | 47 ++++++++++++++++++-
 .../hudi/BaseSnapshotHudiTableAction.java     | 18 +++++--
 2 files changed, 60 insertions(+), 5 deletions(-)

diff --git a/hudi/src/integration/java/org/apache/iceberg/hudi/TestSnapshotHudiTable.java b/hudi/src/integration/java/org/apache/iceberg/hudi/TestSnapshotHudiTable.java
index 5d3e33d4921d..55864a93f380 100644
--- a/hudi/src/integration/java/org/apache/iceberg/hudi/TestSnapshotHudiTable.java
+++ b/hudi/src/integration/java/org/apache/iceberg/hudi/TestSnapshotHudiTable.java
@@ -271,6 +271,27 @@ public void testSnapshotWithAdditionalProperties() {
         newTableIdentifier, ImmutableMap.of("test", "test"), partitionedLocation);
   }
 
+  @Test
+  public void testSnapshotWithComplexKeyGen() {
+    writeHoodieTableKeyGenerator(
+        multiDataFrame(0, 1),
+        "decimalCol,dateCol",
+        "magic_number",
+        "zpartitionPath,partitionPath,partitionPath2",
+        SaveMode.Append,
+        partitionedLocation,
+        partitionedIdentifier);
+    String newTableIdentifier = destName(icebergCatalogName, "alpha_iceberg_table_6");
+    SnapshotHudiTable.Result result =
+        HudiToIcebergMigrationSparkIntegration.snapshotHudiTable(
+                spark, partitionedLocation, newTableIdentifier)
+            .tableProperties(ImmutableMap.of("test", "test"))
+            .execute();
+    checkSnapshotIntegrity(partitionedLocation, newTableIdentifier);
+    checkIcebergTableProperties(
+        newTableIdentifier, ImmutableMap.of("test", "test"), partitionedLocation);
+  }
+
   private void checkSnapshotIntegrity(String hudiTableLocation, String icebergTableIdentifier) {
     Dataset<Row> hudiResult =
         spark
@@ -373,7 +394,9 @@ private Dataset<Row> multiDataFrame(int start, int end) {
             "structCol2",
             expr(
                 "STRUCT(innerStruct3, STRUCT(SHA1(CAST(random2 AS BINARY)), SHA1(CAST(random3 AS BINARY))))"))
-        .withColumn("partitionPath", expr("CAST(id AS STRING)"));
+        .withColumn("zpartitionPath", expr("CAST(dateCol AS STRING)"))
+        .withColumn("partitionPath", expr("CAST(id AS STRING)"))
+        .withColumn("partitionPath2", expr("CAST(random1 AS STRING)"));
   }
 
   private Dataset<Row> nestedDataFrame() {
@@ -425,6 +448,28 @@ private void writeHoodieTable(
         .save(tableLocation);
   }
 
+  private void writeHoodieTableKeyGenerator(
+      Dataset<Row> df,
+      String recordKey,
+      String preCombineKey,
+      String partitionPathField,
+      SaveMode saveMode,
+      String tableLocation,
+      String tableIdentifier) {
+    df.write()
+        .format("hudi")
+        // ComplexKeyGenerator: recordKey/partitionPathField may be comma-separated field lists.
+        .option(DataSourceWriteOptions.RECORDKEY_FIELD().key(), recordKey)
+        .option(DataSourceWriteOptions.PRECOMBINE_FIELD().key(), preCombineKey)
+        .option(DataSourceWriteOptions.PARTITIONPATH_FIELD().key(), partitionPathField)
+        .option(
+            DataSourceWriteOptions.KEYGENERATOR_CLASS_NAME().key(),
+            "org.apache.hudi.keygen.ComplexKeyGenerator")
+        .option(HoodieWriteConfig.TBL_NAME.key(), tableIdentifier)
+        .mode(saveMode)
+        .save(tableLocation);
+  }
+
   private void writeHoodieTableOperation(
       Dataset<Row> df,
       String operationKey,
diff --git a/hudi/src/main/java/org/apache/iceberg/hudi/BaseSnapshotHudiTableAction.java b/hudi/src/main/java/org/apache/iceberg/hudi/BaseSnapshotHudiTableAction.java
index 7b1e25791d3c..3a70d559f284 100644
--- a/hudi/src/main/java/org/apache/iceberg/hudi/BaseSnapshotHudiTableAction.java
+++ b/hudi/src/main/java/org/apache/iceberg/hudi/BaseSnapshotHudiTableAction.java
@@ -70,6 +70,7 @@
 import org.apache.iceberg.mapping.NameMappingParser;
 import org.apache.iceberg.orc.OrcMetrics;
 import org.apache.iceberg.parquet.ParquetUtil;
+import org.apache.iceberg.relocated.com.google.common.base.Preconditions;
 import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap;
 import org.apache.iceberg.relocated.com.google.common.collect.Lists;
 import org.apache.iceberg.relocated.com.google.common.collect.Maps;
@@ -317,7 +318,17 @@ private DataFile buildDataFileFromHoodieBaseFile(
     PartitionSpec spec = table.spec();
     String path = baseFile.getPath();
     long fileSize = baseFile.getFileSize();
-    String partitionValue = fileGroup.getPartitionPath();
+    String[] partitionValues = fileGroup.getPartitionPath().split("/");
+    List<PartitionField> partitionFields = spec.fields();
+    Preconditions.checkState(
+        partitionValues.length == partitionFields.size(), "Invalid partition values");
+    // map partition values to spec
+    ImmutableMap.Builder<String, String> partitionValueMapBuilder = ImmutableMap.builder();
+    ImmutableMap<String, String> partitionValueMap;
+    for (int i = 0; i < partitionFields.size(); i++) {
+      partitionValueMapBuilder.put(partitionFields.get(i).name(), partitionValues[i]);
+    }
+    partitionValueMap = partitionValueMapBuilder.build();
 
     MetricsConfig metricsConfig = MetricsConfig.forTable(table);
     String nameMappingString = table.properties().get(TableProperties.DEFAULT_NAME_MAPPING);
@@ -329,9 +340,8 @@ private DataFile buildDataFileFromHoodieBaseFile(
     Metrics metrics = getMetricsForFile(file, format, metricsConfig, nameMapping);
 
     String partition =
-        spec.fields().stream()
-            .map(PartitionField::name)
-            .map(name -> String.format("%s=%s", name, partitionValue))
+        partitionValueMap.entrySet().stream()
+            .map(e -> String.format("%s=%s", e.getKey(), e.getValue()))
             .collect(Collectors.joining("/"));
 
     return DataFiles.builder(spec)

From 42f97096c94c4ab8ce72dda39dda4a9fff66c4d0 Mon Sep 17 00:00:00 2001
From: Rushan Jiang <rushanj@andrew.cmu.edu>
Date: Tue, 7 Feb 2023 22:20:17 -0500
Subject: [PATCH 19/20] fix bug for unpartitioned table

---
 .../org/apache/iceberg/hudi/BaseSnapshotHudiTableAction.java    | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hudi/src/main/java/org/apache/iceberg/hudi/BaseSnapshotHudiTableAction.java b/hudi/src/main/java/org/apache/iceberg/hudi/BaseSnapshotHudiTableAction.java
index 3a70d559f284..d6f5efdae5c8 100644
--- a/hudi/src/main/java/org/apache/iceberg/hudi/BaseSnapshotHudiTableAction.java
+++ b/hudi/src/main/java/org/apache/iceberg/hudi/BaseSnapshotHudiTableAction.java
@@ -321,7 +321,7 @@ private DataFile buildDataFileFromHoodieBaseFile(
     String[] partitionValues = fileGroup.getPartitionPath().split("/");
     List<PartitionField> partitionFields = spec.fields();
     Preconditions.checkState(
-        partitionValues.length == partitionFields.size(), "Invalid partition values");
+        partitionValues.length == partitionFields.size() || partitionFields.isEmpty(), "Invalid partition values");
     // map partition values to spec
     ImmutableMap.Builder<String, String> partitionValueMapBuilder = ImmutableMap.builder();
     ImmutableMap<String, String> partitionValueMap;

From dde7fcd548a7cb59030972e397bb528ed3eba0c4 Mon Sep 17 00:00:00 2001
From: Rushan Jiang <rushanj@andrew.cmu.edu>
Date: Wed, 8 Feb 2023 01:53:27 -0500
Subject: [PATCH 20/20] checked multi partitions

---
 .../apache/iceberg/hudi/TestSnapshotHudiTable.java   | 12 ++++++------
 .../iceberg/hudi/BaseSnapshotHudiTableAction.java    |  3 ++-
 2 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/hudi/src/integration/java/org/apache/iceberg/hudi/TestSnapshotHudiTable.java b/hudi/src/integration/java/org/apache/iceberg/hudi/TestSnapshotHudiTable.java
index 55864a93f380..02c5db45d0b9 100644
--- a/hudi/src/integration/java/org/apache/iceberg/hudi/TestSnapshotHudiTable.java
+++ b/hudi/src/integration/java/org/apache/iceberg/hudi/TestSnapshotHudiTable.java
@@ -173,7 +173,7 @@ public void testMultiCommitTable() {
         initialDataFrame,
         "decimalCol",
         "magic_number",
-        "partitionPath",
+        "partitionPath2",
         SaveMode.Append,
         multiCommitTableLocation,
         multiCommitIdentifier);
@@ -181,7 +181,7 @@ public void testMultiCommitTable() {
         initialDataFrame,
         "decimalCol",
         "magic_number",
-        "partitionPath",
+        "partitionPath2",
         SaveMode.Append,
         multiCommitTableLocation,
         multiCommitIdentifier);
@@ -189,7 +189,7 @@ public void testMultiCommitTable() {
         multiDataFrame(2, 5),
         "decimalCol",
         "magic_number",
-        "partitionPath",
+        "partitionPath2",
         SaveMode.Append,
         multiCommitTableLocation,
         multiCommitIdentifier);
@@ -197,7 +197,7 @@ public void testMultiCommitTable() {
         multiDataFrame(0, 1),
         "decimalCol",
         "magic_number",
-        "partitionPath",
+        "partitionPath2",
         SaveMode.Append,
         multiCommitTableLocation,
         multiCommitIdentifier);
@@ -206,7 +206,7 @@ public void testMultiCommitTable() {
         toDelete,
         "decimalCol",
         "magic_number",
-        "partitionPath",
+        "partitionPath2",
         SaveMode.Append,
         multiCommitTableLocation,
         multiCommitIdentifier);
@@ -215,7 +215,7 @@ public void testMultiCommitTable() {
         DataSourceWriteOptions.DELETE_OPERATION_OPT_VAL(),
         "decimalCol",
         "magic_number",
-        "partitionPath",
+        "partitionPath2",
         SaveMode.Append,
         multiCommitTableLocation,
         multiCommitIdentifier);
diff --git a/hudi/src/main/java/org/apache/iceberg/hudi/BaseSnapshotHudiTableAction.java b/hudi/src/main/java/org/apache/iceberg/hudi/BaseSnapshotHudiTableAction.java
index d6f5efdae5c8..c80a6600f64f 100644
--- a/hudi/src/main/java/org/apache/iceberg/hudi/BaseSnapshotHudiTableAction.java
+++ b/hudi/src/main/java/org/apache/iceberg/hudi/BaseSnapshotHudiTableAction.java
@@ -321,7 +321,8 @@ private DataFile buildDataFileFromHoodieBaseFile(
     String[] partitionValues = fileGroup.getPartitionPath().split("/");
     List<PartitionField> partitionFields = spec.fields();
     Preconditions.checkState(
-        partitionValues.length == partitionFields.size() || partitionFields.isEmpty(), "Invalid partition values");
+        partitionValues.length == partitionFields.size() || partitionFields.isEmpty(),
+        "Invalid partition values");
     // map partition values to spec
     ImmutableMap.Builder<String, String> partitionValueMapBuilder = ImmutableMap.builder();
     ImmutableMap<String, String> partitionValueMap;