From 91ee4acfebdcdce49e26177c874ccfe1257c02ef Mon Sep 17 00:00:00 2001 From: Rushan Jiang Date: Tue, 17 Jan 2023 17:17:42 -0500 Subject: [PATCH 01/20] add test base for hudi --- build.gradle | 68 +++++++ .../hudi/SparkHudiMigrationTestBase.java | 74 +++++++ .../iceberg/hudi/TestSnapshotHudiTable.java | 180 ++++++++++++++++++ settings.gradle | 2 + 4 files changed, 324 insertions(+) create mode 100644 hudi/src/integration/java/org/apache/iceberg/hudi/SparkHudiMigrationTestBase.java create mode 100644 hudi/src/integration/java/org/apache/iceberg/hudi/TestSnapshotHudiTable.java diff --git a/build.gradle b/build.gradle index 7b14f3b73163..1dc6f04345a0 100644 --- a/build.gradle +++ b/build.gradle @@ -48,6 +48,10 @@ plugins { id 'nebula.dependency-recommender' version '11.0.0' } +String scalaVersion = System.getProperty("scalaVersion") != null ? System.getProperty("scalaVersion") : System.getProperty("defaultScalaVersion") +String sparkVersionsString = System.getProperty("sparkVersions") != null ? System.getProperty("sparkVersions") : System.getProperty("defaultSparkVersions") +List sparkVersions = sparkVersionsString != null && !sparkVersionsString.isEmpty() ? 
sparkVersionsString.split(",") : [] + try { // apply these plugins in a try-catch block so that we can handle cases without .git directory apply plugin: 'com.palantir.git-version' @@ -438,6 +442,70 @@ project(':iceberg-aws') { } } +project(':iceberg-hudi') { + + configurations { + integrationImplementation.extendsFrom testImplementation + integrationRuntime.extendsFrom testRuntimeOnly + } + + dependencies { + implementation project(path: ':iceberg-bundled-guava', configuration: 'shadow') + api project(':iceberg-api') + implementation project(':iceberg-common') + implementation project(':iceberg-core') + implementation project(':iceberg-parquet') + implementation project(':iceberg-orc') + implementation "com.fasterxml.jackson.core:jackson-databind" + + + compileOnly("org.apache.hadoop:hadoop-common") { + exclude group: 'org.apache.avro', module: 'avro' + exclude group: 'org.slf4j', module: 'slf4j-log4j12' + exclude group: 'javax.servlet', module: 'servlet-api' + exclude group: 'com.google.code.gson', module: 'gson' + } + if (sparkVersions.contains("3.3") && scalaVersion == "2.12") { + integrationImplementation("org.apache.hudi:hudi-spark3.3-bundle_2.12:0.12.2") + integrationImplementation project(path: ":iceberg-spark:iceberg-spark-3.3_${scalaVersion}") + integrationImplementation("org.apache.hadoop:hadoop-minicluster") { + exclude group: 'org.apache.avro', module: 'avro' + // to make sure netty libs only come from project(':iceberg-arrow') + exclude group: 'io.netty', module: 'netty-buffer' + exclude group: 'io.netty', module: 'netty-common' + } + integrationImplementation project(path: ':iceberg-hive-metastore') + integrationImplementation project(path: ':iceberg-hive-metastore', configuration: 'testArtifacts') + integrationImplementation("org.apache.spark:spark-hive_${scalaVersion}:3.3.1") { + exclude group: 'org.apache.avro', module: 'avro' + exclude group: 'org.apache.arrow' + exclude group: 'org.apache.parquet' + // to make sure netty libs only come from 
project(':iceberg-arrow') + exclude group: 'io.netty', module: 'netty-buffer' + exclude group: 'io.netty', module: 'netty-common' + exclude group: 'org.roaringbitmap' + } + } + } + + if (sparkVersions.contains("3.3") && scalaVersion == "2.12") { + sourceSets { + integration { + java.srcDir "$projectDir/src/integration/java" + resources.srcDir "$projectDir/src/integration/resources" + compileClasspath += main.output + test.output + runtimeClasspath += main.output + test.output + } + } + + task integrationTest(type: Test) { + testClassesDirs = sourceSets.integration.output.classesDirs + classpath = sourceSets.integration.runtimeClasspath + } + check.dependsOn integrationTest + } +} + project(':iceberg-gcp') { dependencies { implementation project(path: ':iceberg-bundled-guava', configuration: 'shadow') diff --git a/hudi/src/integration/java/org/apache/iceberg/hudi/SparkHudiMigrationTestBase.java b/hudi/src/integration/java/org/apache/iceberg/hudi/SparkHudiMigrationTestBase.java new file mode 100644 index 000000000000..42703c4403ae --- /dev/null +++ b/hudi/src/integration/java/org/apache/iceberg/hudi/SparkHudiMigrationTestBase.java @@ -0,0 +1,74 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.hudi; + +import java.util.Map; +import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.iceberg.hive.TestHiveMetastore; +import org.apache.spark.sql.SparkSession; +import org.apache.spark.sql.internal.SQLConf; +import org.junit.AfterClass; +import org.junit.BeforeClass; + +@SuppressWarnings("VisibilityModifier") +public abstract class SparkHudiMigrationTestBase { + protected static TestHiveMetastore metastore = null; + protected static HiveConf hiveConf = null; + protected static SparkSession spark = null; + + @BeforeClass + public static void startMetastoreAndSpark() { + SparkHudiMigrationTestBase.metastore = new TestHiveMetastore(); + metastore.start(); + SparkHudiMigrationTestBase.hiveConf = metastore.hiveConf(); + + SparkHudiMigrationTestBase.spark = + SparkSession.builder() + .master("local[2]") + .config(SQLConf.PARTITION_OVERWRITE_MODE().key(), "dynamic") + .config( + "spark.hadoop." + HiveConf.ConfVars.METASTOREURIS.varname, + hiveConf.get(HiveConf.ConfVars.METASTOREURIS.varname)) + .config("spark.sql.legacy.respectNullabilityInTextDatasetConversion", "true") + .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") + .config("spark.sql.extensions", "org.apache.spark.sql.hudi.HoodieSparkSessionExtension") + .enableHiveSupport() + .getOrCreate(); + } + + @AfterClass + public static void stopMetastoreAndSpark() throws Exception { + if (metastore != null) { + metastore.stop(); + SparkHudiMigrationTestBase.metastore = null; + } + if (spark != null) { + spark.stop(); + SparkHudiMigrationTestBase.spark = null; + } + } + + public SparkHudiMigrationTestBase( + String catalogName, String implementation, Map config) { + + spark.conf().set("spark.sql.catalog." + catalogName, implementation); + config.forEach( + (key, value) -> spark.conf().set("spark.sql.catalog." + catalogName + "." 
+ key, value)); + } +} diff --git a/hudi/src/integration/java/org/apache/iceberg/hudi/TestSnapshotHudiTable.java b/hudi/src/integration/java/org/apache/iceberg/hudi/TestSnapshotHudiTable.java new file mode 100644 index 000000000000..3268b7f5d820 --- /dev/null +++ b/hudi/src/integration/java/org/apache/iceberg/hudi/TestSnapshotHudiTable.java @@ -0,0 +1,180 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.hudi; + +import java.io.File; +import java.io.IOException; +import java.util.List; +import java.util.Map; +import org.apache.hudi.DataSourceWriteOptions; +import org.apache.hudi.QuickstartUtils; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.spark.SparkSessionCatalog; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.SQLContext; +import org.apache.spark.sql.SaveMode; +import org.apache.spark.sql.hudi.catalog.HoodieCatalog; +import org.junit.Before; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +@RunWith(Parameterized.class) +public class TestSnapshotHudiTable extends SparkHudiMigrationTestBase { + + private static final Logger LOG = LoggerFactory.getLogger(TestSnapshotHudiTable.class.getName()); + private static final String row1 = + "{\"name\":\"Michael\",\"addresses\":[{\"city\":\"SanJose\",\"state\":\"CA\"},{\"city\":\"Sandiago\",\"state\":\"CA\"}]," + + "\"address_nested\":{\"current\":{\"state\":\"NY\",\"city\":\"NewYork\"},\"previous\":{\"state\":\"NJ\",\"city\":\"Newark\"}}," + + "\"properties\":{\"hair\":\"brown\",\"eye\":\"black\"},\"secondProp\":{\"height\":\"6\"},\"subjects\":[[\"Java\",\"Scala\",\"C++\"]," + + "[\"Spark\",\"Java\"]],\"id\":1,\"magic_number\":1.123123123123}"; + private static final String row2 = + "{\"name\":\"Test\",\"addresses\":[{\"city\":\"SanJos123123e\",\"state\":\"CA\"},{\"city\":\"Sand12312iago\",\"state\":\"CA\"}]," + + 
"\"address_nested\":{\"current\":{\"state\":\"N12Y\",\"city\":\"NewY1231ork\"}},\"properties\":{\"hair\":\"brown\",\"eye\":\"black\"}," + + "\"secondProp\":{\"height\":\"6\"},\"subjects\":[[\"Java\",\"Scala\",\"C++\"],[\"Spark\",\"Java\"]],\"id\":2,\"magic_number\":2.123123123123}"; + private static final String row3 = + "{\"name\":\"Test\",\"addresses\":[{\"city\":\"SanJose\",\"state\":\"CA\"},{\"city\":\"Sandiago\",\"state\":\"CA\"}]," + + "\"properties\":{\"hair\":\"brown\",\"eye\":\"black\"},\"secondProp\":{\"height\":\"6\"},\"subjects\":" + + "[[\"Java\",\"Scala\",\"C++\"],[\"Spark\",\"Java\"]],\"id\":3,\"magic_number\":3.123123123123}"; + private static final String row4 = + "{\"name\":\"John\",\"addresses\":[{\"city\":\"LA\",\"state\":\"CA\"},{\"city\":\"Sandiago\",\"state\":\"CA\"}]," + + "\"address_nested\":{\"current\":{\"state\":\"NY\",\"city\":\"NewYork\"},\"previous\":{\"state\":\"NJ123\"}}," + + "\"properties\":{\"hair\":\"b12rown\",\"eye\":\"bla3221ck\"},\"secondProp\":{\"height\":\"633\"},\"subjects\":" + + "[[\"Spark\",\"Java\"]],\"id\":4,\"magic_number\":4.123123123123}"; + private static final String row5 = + "{\"name\":\"Jonas\",\"addresses\":[{\"city\":\"Pittsburgh\",\"state\":\"PA\"},{\"city\":\"Sandiago\",\"state\":\"CA\"}]," + + "\"address_nested\":{\"current\":{\"state\":\"PA\",\"city\":\"Haha\"},\"previous\":{\"state\":\"NJ\"}}," + + "\"properties\":{\"hair\":\"black\",\"eye\":\"black\"},\"secondProp\":{\"height\":\"7\"},\"subjects\":[[\"Java\",\"Scala\",\"C++\"]," + + "[\"Spark\",\"Java\"]],\"id\":5,\"magic_number\":5.123123123123}"; + private static final String SNAPSHOT_SOURCE_PROP = "snapshot_source"; + private static final String DELTA_SOURCE_VALUE = "delta"; + private static final String ORIGINAL_LOCATION_PROP = "original_location"; + private static final String NAMESPACE = "delta_conversion_test"; + private static final String defaultSparkCatalog = "spark_catalog"; + private static final String icebergCatalogName = "iceberg_hive"; + 
private String partitionedIdentifier; + private String unpartitionedIdentifier; + private String externalDataFilesIdentifier; + private final String partitionedTableName = "partitioned_table"; + private final String unpartitionedTableName = "unpartitioned_table"; + private final String externalDataFilesTableName = "external_data_files_table"; + private String partitionedLocation; + private String unpartitionedLocation; + private String newIcebergTableLocation; + private String externalDataFilesTableLocation; + + @Parameterized.Parameters(name = "Catalog Name {0} - Options {2}") + public static Object[][] parameters() { + return new Object[][] { + new Object[] { + icebergCatalogName, + SparkSessionCatalog.class.getName(), + ImmutableMap.of( + "type", + "hive", + "default-namespace", + "default", + "parquet-enabled", + "true", + "cache-enabled", + "false" // Spark will delete tables using v1, leaving the cache out of sync + ) + } + }; + } + + @Rule public TemporaryFolder temp1 = new TemporaryFolder(); + @Rule public TemporaryFolder temp2 = new TemporaryFolder(); + @Rule public TemporaryFolder temp3 = new TemporaryFolder(); + @Rule public TemporaryFolder temp4 = new TemporaryFolder(); + + public TestSnapshotHudiTable( + String catalogName, String implementation, Map config) { + super(catalogName, implementation, config); + spark.conf().set("spark.sql.catalog." 
+ defaultSparkCatalog, HoodieCatalog.class.getName()); + } + + @Before + public void before() throws IOException { + File partitionedFolder = temp1.newFolder(); + File unpartitionedFolder = temp2.newFolder(); + File newIcebergTableFolder = temp3.newFolder(); + File externalDataFilesTableFolder = temp4.newFolder(); + partitionedLocation = partitionedFolder.toURI().toString(); + unpartitionedLocation = unpartitionedFolder.toURI().toString(); + newIcebergTableLocation = newIcebergTableFolder.toURI().toString(); + externalDataFilesTableLocation = externalDataFilesTableFolder.toURI().toString(); + + spark.sql(String.format("CREATE DATABASE IF NOT EXISTS %s", NAMESPACE)); + + partitionedIdentifier = destName(defaultSparkCatalog, partitionedTableName); + unpartitionedIdentifier = destName(defaultSparkCatalog, unpartitionedTableName); + externalDataFilesIdentifier = destName(defaultSparkCatalog, externalDataFilesTableName); + + spark.sql(String.format("DROP TABLE IF EXISTS %s", partitionedIdentifier)); + spark.sql(String.format("DROP TABLE IF EXISTS %s", unpartitionedIdentifier)); + spark.sql(String.format("DROP TABLE IF EXISTS %s", externalDataFilesIdentifier)); + + // hard code the dataframe + List jsonList = Lists.newArrayList(); + jsonList.add(row1); + jsonList.add(row2); + jsonList.add(row3); + jsonList.add(row4); + jsonList.add(row5); + JavaSparkContext javaSparkContext = JavaSparkContext.fromSparkContext(spark.sparkContext()); + SQLContext sqlContext = new SQLContext(javaSparkContext); + JavaRDD rdd = javaSparkContext.parallelize(jsonList); + Dataset df = sqlContext.read().json(rdd); + + df.write() + .format("hudi") + .options(QuickstartUtils.getQuickstartWriteConfigs()) + .option(DataSourceWriteOptions.RECORDKEY_FIELD().key(), "id") + .option(DataSourceWriteOptions.PRECOMBINE_FIELD().key(), "name") + .option(DataSourceWriteOptions.PARTITIONPATH_FIELD().key(), "") + .option(HoodieWriteConfig.TABLE_NAME, unpartitionedIdentifier) + .mode(SaveMode.Overwrite) + 
.save(unpartitionedLocation); + } + + @Test + public void TestHudiTableWrite() { + Dataset df = spark.read().format("hudi").load(unpartitionedLocation); + LOG.info("Generated dataframe shcema: {}", df.schema().treeString()); + LOG.info("Generated dataframe: {}", df.showString(10, 20,false)); + df.show(); + } + + private String destName(String catalogName, String dest) { + if (catalogName.equals(defaultSparkCatalog)) { + return NAMESPACE + "." + catalogName + "_" + dest; + } + return catalogName + "." + NAMESPACE + "." + catalogName + "_" + dest; + } +} diff --git a/settings.gradle b/settings.gradle index c5ac07e080c2..5201184f42af 100644 --- a/settings.gradle +++ b/settings.gradle @@ -35,6 +35,7 @@ include 'nessie' include 'gcp' include 'dell' include 'snowflake' +include 'hudi' project(':api').name = 'iceberg-api' project(':common').name = 'iceberg-common' @@ -53,6 +54,7 @@ project(':nessie').name = 'iceberg-nessie' project(':gcp').name = 'iceberg-gcp' project(':dell').name = 'iceberg-dell' project(':snowflake').name = 'iceberg-snowflake' +project(':hudi').name = 'iceberg-hudi' if (null != System.getProperty("allVersions")) { System.setProperty("flinkVersions", System.getProperty("knownFlinkVersions")) From 57f93f17bf8b355fc1afe24df9d8696d45737d0f Mon Sep 17 00:00:00 2001 From: Rushan Jiang Date: Tue, 17 Jan 2023 22:00:17 -0500 Subject: [PATCH 02/20] add write data to partitioned hudi table --- .../iceberg/hudi/TestSnapshotHudiTable.java | 70 +++++++++++++++++-- 1 file changed, 65 insertions(+), 5 deletions(-) diff --git a/hudi/src/integration/java/org/apache/iceberg/hudi/TestSnapshotHudiTable.java b/hudi/src/integration/java/org/apache/iceberg/hudi/TestSnapshotHudiTable.java index 3268b7f5d820..9bfc85c9cc11 100644 --- a/hudi/src/integration/java/org/apache/iceberg/hudi/TestSnapshotHudiTable.java +++ b/hudi/src/integration/java/org/apache/iceberg/hudi/TestSnapshotHudiTable.java @@ -119,6 +119,50 @@ public TestSnapshotHudiTable( 
spark.conf().set("spark.sql.catalog." + defaultSparkCatalog, HoodieCatalog.class.getName()); } + /** + * The test hardcodes a nested dataframe to test the snapshot feature. The schema of the + * created dataframe is: + * + *
+   *  root
+   *  |-- address_nested: struct (nullable = true)
+   *  |    |-- current: struct (nullable = true)
+   *  |    |    |-- city: string (nullable = true)
+   *  |    |    |-- state: string (nullable = true)
+   *  |    |-- previous: struct (nullable = true)
+   *  |    |    |-- city: string (nullable = true)
+   *  |    |    |-- state: string (nullable = true)
+   *  |-- addresses: array (nullable = true)
+   *  |    |-- element: struct (containsNull = true)
+   *  |    |    |-- city: string (nullable = true)
+   *  |    |    |-- state: string (nullable = true)
+   *  |-- id: long (nullable = true)
+   *  |-- magic_number: double (nullable = true)
+   *  |-- name: string (nullable = true)
+   *  |-- properties: struct (nullable = true)
+   *  |    |-- eye: string (nullable = true)
+   *  |    |-- hair: string (nullable = true)
+   *  |-- secondProp: struct (nullable = true)
+   *  |    |-- height: string (nullable = true)
+   *  |-- subjects: array (nullable = true)
+   *  |    |-- element: array (containsNull = true)
+   *  |    |    |-- element: string (containsNull = true)
+   * 
+ * + * The dataframe content, as printed by df.show(), is: + * + *
+   * +--------------------+--------------------+---+--------------+-------+--------------------+----------+--------------------+
+   * |      address_nested|           addresses| id|  magic_number|   name|          properties|secondProp|            subjects|
+   * +--------------------+--------------------+---+--------------+-------+--------------------+----------+--------------------+
+   * |{{NewYork, NY}, {...|[{SanJose, CA}, {...|  1|1.123123123123|Michael|      {black, brown}|       {6}|[[Java, Scala, C+...|
+   * |{{NewY1231ork, N1...|[{SanJos123123e, ...|  2|2.123123123123|   Test|      {black, brown}|       {6}|[[Java, Scala, C+...|
+   * |                null|[{SanJose, CA}, {...|  3|3.123123123123|   Test|      {black, brown}|       {6}|[[Java, Scala, C+...|
+   * |{{NewYork, NY}, {...|[{LA, CA}, {Sandi...|  4|4.123123123123|   John|{bla3221ck, b12rown}|     {633}|     [[Spark, Java]]|
+   * |{{Haha, PA}, {nul...|[{Pittsburgh, PA}...|  5|5.123123123123|  Jonas|      {black, black}|       {7}|[[Java, Scala, C+...|
+   * +--------------------+--------------------+---+--------------+-------+--------------------+----------+--------------------+
+   * 
+ */ @Before public void before() throws IOException { File partitionedFolder = temp1.newFolder(); @@ -155,7 +199,17 @@ public void before() throws IOException { df.write() .format("hudi") .options(QuickstartUtils.getQuickstartWriteConfigs()) - .option(DataSourceWriteOptions.RECORDKEY_FIELD().key(), "id") + .option(DataSourceWriteOptions.RECORDKEY_FIELD().key(), "magic_number") + .option(DataSourceWriteOptions.PRECOMBINE_FIELD().key(), "name") + .option(DataSourceWriteOptions.PARTITIONPATH_FIELD().key(), "id") + .option(HoodieWriteConfig.TABLE_NAME, partitionedIdentifier) + .mode(SaveMode.Overwrite) + .save(partitionedLocation); + + df.write() + .format("hudi") + .options(QuickstartUtils.getQuickstartWriteConfigs()) + .option(DataSourceWriteOptions.RECORDKEY_FIELD().key(), "magic_number") .option(DataSourceWriteOptions.PRECOMBINE_FIELD().key(), "name") .option(DataSourceWriteOptions.PARTITIONPATH_FIELD().key(), "") .option(HoodieWriteConfig.TABLE_NAME, unpartitionedIdentifier) @@ -164,11 +218,17 @@ public void before() throws IOException { } @Test - public void TestHudiTableWrite() { + public void TestHudiUnpartitionedTableWrite() { Dataset df = spark.read().format("hudi").load(unpartitionedLocation); - LOG.info("Generated dataframe shcema: {}", df.schema().treeString()); - LOG.info("Generated dataframe: {}", df.showString(10, 20,false)); - df.show(); + LOG.info("Generated unpartitioned dataframe shcema: {}", df.schema().treeString()); + LOG.info("Generated unpartitioned dataframe: {}", df.showString(10, 20, false)); + } + + @Test + public void TestHudiPartitionedTableWrite() { + Dataset df = spark.read().format("hudi").load(partitionedLocation); + LOG.info("Generated partitioned dataframe shcema: {}", df.schema().treeString()); + LOG.info("Generated partitioned dataframe: {}", df.showString(10, 20, false)); } private String destName(String catalogName, String dest) { From c09198848baf5af354487252c119ae6e49c82cd2 Mon Sep 17 00:00:00 2001 From: Rushan Jiang Date: 
Wed, 18 Jan 2023 13:39:00 -0500 Subject: [PATCH 03/20] test fail --- build.gradle | 1 + ...udiToIcebergMigrationSparkIntegration.java | 29 +++++++++ .../iceberg/hudi/TestSnapshotHudiTable.java | 7 +++ .../hudi/BaseSnapshotHudiTableAction.java | 61 +++++++++++++++++++ .../BaseSnapshotHudiTableActionResult.java | 33 ++++++++++ ...HudiToIcebergMigrationActionsProvider.java | 41 +++++++++++++ .../iceberg/hudi/SnapshotHudiTable.java | 34 +++++++++++ 7 files changed, 206 insertions(+) create mode 100644 hudi/src/integration/java/org/apache/iceberg/hudi/HudiToIcebergMigrationSparkIntegration.java create mode 100644 hudi/src/main/java/org/apache/iceberg/hudi/BaseSnapshotHudiTableAction.java create mode 100644 hudi/src/main/java/org/apache/iceberg/hudi/BaseSnapshotHudiTableActionResult.java create mode 100644 hudi/src/main/java/org/apache/iceberg/hudi/HudiToIcebergMigrationActionsProvider.java create mode 100644 hudi/src/main/java/org/apache/iceberg/hudi/SnapshotHudiTable.java diff --git a/build.gradle b/build.gradle index 1dc6f04345a0..c0c0f05f1ae0 100644 --- a/build.gradle +++ b/build.gradle @@ -458,6 +458,7 @@ project(':iceberg-hudi') { implementation project(':iceberg-orc') implementation "com.fasterxml.jackson.core:jackson-databind" + compileOnly("org.apache.hudi:hudi-common:0.12.2") compileOnly("org.apache.hadoop:hadoop-common") { exclude group: 'org.apache.avro', module: 'avro' diff --git a/hudi/src/integration/java/org/apache/iceberg/hudi/HudiToIcebergMigrationSparkIntegration.java b/hudi/src/integration/java/org/apache/iceberg/hudi/HudiToIcebergMigrationSparkIntegration.java new file mode 100644 index 000000000000..ae213d99fb7f --- /dev/null +++ b/hudi/src/integration/java/org/apache/iceberg/hudi/HudiToIcebergMigrationSparkIntegration.java @@ -0,0 +1,29 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.hudi; + +import org.apache.spark.sql.SparkSession; + +public class HudiToIcebergMigrationSparkIntegration { + private HudiToIcebergMigrationSparkIntegration() {} + + static SnapshotHudiTable snapshotHudiTable(SparkSession spark, String hudiTablePath) { + return new BaseSnapshotHudiTableAction(spark.sessionState().newHadoopConf(), hudiTablePath); + } +} diff --git a/hudi/src/integration/java/org/apache/iceberg/hudi/TestSnapshotHudiTable.java b/hudi/src/integration/java/org/apache/iceberg/hudi/TestSnapshotHudiTable.java index 9bfc85c9cc11..060ddee428fd 100644 --- a/hudi/src/integration/java/org/apache/iceberg/hudi/TestSnapshotHudiTable.java +++ b/hudi/src/integration/java/org/apache/iceberg/hudi/TestSnapshotHudiTable.java @@ -231,6 +231,13 @@ public void TestHudiPartitionedTableWrite() { LOG.info("Generated partitioned dataframe: {}", df.showString(10, 20, false)); } + @Test + public void TestHudiMetaClientAlpha() { + SnapshotHudiTable.Result result = + HudiToIcebergMigrationSparkIntegration.snapshotHudiTable(spark, unpartitionedLocation) + .execute(); + } + private String destName(String catalogName, String dest) { if (catalogName.equals(defaultSparkCatalog)) { return NAMESPACE + "." 
+ catalogName + "_" + dest; diff --git a/hudi/src/main/java/org/apache/iceberg/hudi/BaseSnapshotHudiTableAction.java b/hudi/src/main/java/org/apache/iceberg/hudi/BaseSnapshotHudiTableAction.java new file mode 100644 index 000000000000..8da8746c1746 --- /dev/null +++ b/hudi/src/main/java/org/apache/iceberg/hudi/BaseSnapshotHudiTableAction.java @@ -0,0 +1,61 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.hudi; + +import java.util.Map; +import org.apache.hadoop.conf.Configuration; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public class BaseSnapshotHudiTableAction implements SnapshotHudiTable { + + private static final Logger LOG = + LoggerFactory.getLogger(BaseSnapshotHudiTableAction.class.getName()); + + private HoodieTableMetaClient HoodieMetaClient; + + public BaseSnapshotHudiTableAction( + Configuration hoodieConfiguration, String hoodieTableBasePath) { + this.HoodieMetaClient = buildTableMetaClient(hoodieConfiguration, hoodieTableBasePath); + } + + @Override + public SnapshotHudiTable tableProperties(Map properties) { + return null; + } + + @Override + public SnapshotHudiTable tableProperty(String key, String value) { + return null; + } + + @Override + public Result execute() { + LOG.info("Alpha test: hoodie table base path: {}", HoodieMetaClient.getBasePathV2()); + + return null; + } + + private static HoodieTableMetaClient buildTableMetaClient(Configuration conf, String basePath) { + HoodieTableMetaClient metaClient = + HoodieTableMetaClient.builder().setConf(conf).setBasePath(basePath).build(); + return metaClient; + } +} diff --git a/hudi/src/main/java/org/apache/iceberg/hudi/BaseSnapshotHudiTableActionResult.java b/hudi/src/main/java/org/apache/iceberg/hudi/BaseSnapshotHudiTableActionResult.java new file mode 100644 index 000000000000..ba6c85ab97d3 --- /dev/null +++ b/hudi/src/main/java/org/apache/iceberg/hudi/BaseSnapshotHudiTableActionResult.java @@ -0,0 +1,33 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.hudi; + +public class BaseSnapshotHudiTableActionResult implements SnapshotHudiTable.Result { + + private final long snapshotFilesCount; + + public BaseSnapshotHudiTableActionResult(long snapshotFilesCount) { + this.snapshotFilesCount = snapshotFilesCount; + } + + @Override + public long snapshotFilesCount() { + return snapshotFilesCount; + } +} diff --git a/hudi/src/main/java/org/apache/iceberg/hudi/HudiToIcebergMigrationActionsProvider.java b/hudi/src/main/java/org/apache/iceberg/hudi/HudiToIcebergMigrationActionsProvider.java new file mode 100644 index 000000000000..0a1e0808af43 --- /dev/null +++ b/hudi/src/main/java/org/apache/iceberg/hudi/HudiToIcebergMigrationActionsProvider.java @@ -0,0 +1,41 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.hudi; + +public interface HudiToIcebergMigrationActionsProvider { + + default SnapshotHudiTable snapshotHudiTable() { + throw new UnsupportedOperationException("snapshotHudiTable is not supported"); + } + + static HudiToIcebergMigrationActionsProvider defaultProvider() { + return DefaultHudiToIcebergMigrationActions.defaultMigrationActions(); + } + + class DefaultHudiToIcebergMigrationActions implements HudiToIcebergMigrationActionsProvider { + private static final DefaultHudiToIcebergMigrationActions INSTANCE = + new DefaultHudiToIcebergMigrationActions(); + + private DefaultHudiToIcebergMigrationActions() {} + + public static DefaultHudiToIcebergMigrationActions defaultMigrationActions() { + return INSTANCE; + } + } +} diff --git a/hudi/src/main/java/org/apache/iceberg/hudi/SnapshotHudiTable.java b/hudi/src/main/java/org/apache/iceberg/hudi/SnapshotHudiTable.java new file mode 100644 index 000000000000..a5208809b314 --- /dev/null +++ b/hudi/src/main/java/org/apache/iceberg/hudi/SnapshotHudiTable.java @@ -0,0 +1,34 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.hudi; + +import java.util.Map; +import org.apache.iceberg.actions.Action; + +public interface SnapshotHudiTable extends Action { + + SnapshotHudiTable tableProperties(Map properties); + + SnapshotHudiTable tableProperty(String key, String value); + + interface Result { + + long snapshotFilesCount(); + } +} From dda6a1f76aa5bdbcd4616741bbde37498a8317b6 Mon Sep 17 00:00:00 2001 From: Rushan Jiang Date: Wed, 18 Jan 2023 13:54:34 -0500 Subject: [PATCH 04/20] test work --- build.gradle | 2 +- .../java/org/apache/iceberg/hudi/TestSnapshotHudiTable.java | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/build.gradle b/build.gradle index c0c0f05f1ae0..598d4d1a27ab 100644 --- a/build.gradle +++ b/build.gradle @@ -458,7 +458,7 @@ project(':iceberg-hudi') { implementation project(':iceberg-orc') implementation "com.fasterxml.jackson.core:jackson-databind" - compileOnly("org.apache.hudi:hudi-common:0.12.2") + compileOnly("org.apache.hudi:hudi-spark3.3-bundle_2.12:0.12.2") compileOnly("org.apache.hadoop:hadoop-common") { exclude group: 'org.apache.avro', module: 'avro' diff --git a/hudi/src/integration/java/org/apache/iceberg/hudi/TestSnapshotHudiTable.java b/hudi/src/integration/java/org/apache/iceberg/hudi/TestSnapshotHudiTable.java index 060ddee428fd..c6b36501ba67 100644 --- a/hudi/src/integration/java/org/apache/iceberg/hudi/TestSnapshotHudiTable.java +++ b/hudi/src/integration/java/org/apache/iceberg/hudi/TestSnapshotHudiTable.java @@ -233,8 +233,9 @@ public void TestHudiPartitionedTableWrite() { @Test 
public void TestHudiMetaClientAlpha() { + LOG.info("Alpha test reference: hoodie table path: {}", partitionedLocation); SnapshotHudiTable.Result result = - HudiToIcebergMigrationSparkIntegration.snapshotHudiTable(spark, unpartitionedLocation) + HudiToIcebergMigrationSparkIntegration.snapshotHudiTable(spark, partitionedLocation) .execute(); } From 0ef358366d46201f96e3b88e979c576493f2d8f1 Mon Sep 17 00:00:00 2001 From: Rushan Jiang Date: Thu, 19 Jan 2023 12:30:35 -0500 Subject: [PATCH 05/20] work out schema conversion --- build.gradle | 3 + .../iceberg/hudi/TestSnapshotHudiTable.java | 53 +++++++ .../hudi/BaseSnapshotHudiTableAction.java | 60 +++++++- .../iceberg/hudi/HudiDataTypeToType.java | 136 ++++++++++++++++++ .../iceberg/hudi/HudiDataTypeVisitor.java | 59 ++++++++ 5 files changed, 306 insertions(+), 5 deletions(-) create mode 100644 hudi/src/main/java/org/apache/iceberg/hudi/HudiDataTypeToType.java create mode 100644 hudi/src/main/java/org/apache/iceberg/hudi/HudiDataTypeVisitor.java diff --git a/build.gradle b/build.gradle index 598d4d1a27ab..f02d0230c1cf 100644 --- a/build.gradle +++ b/build.gradle @@ -459,6 +459,9 @@ project(':iceberg-hudi') { implementation "com.fasterxml.jackson.core:jackson-databind" compileOnly("org.apache.hudi:hudi-spark3.3-bundle_2.12:0.12.2") + implementation("org.apache.avro:avro") { + exclude group: 'org.tukaani' // xz compression is not supported + } compileOnly("org.apache.hadoop:hadoop-common") { exclude group: 'org.apache.avro', module: 'avro' diff --git a/hudi/src/integration/java/org/apache/iceberg/hudi/TestSnapshotHudiTable.java b/hudi/src/integration/java/org/apache/iceberg/hudi/TestSnapshotHudiTable.java index c6b36501ba67..b48a7b07c32b 100644 --- a/hudi/src/integration/java/org/apache/iceberg/hudi/TestSnapshotHudiTable.java +++ b/hudi/src/integration/java/org/apache/iceberg/hudi/TestSnapshotHudiTable.java @@ -24,6 +24,7 @@ import java.util.Map; import org.apache.hudi.DataSourceWriteOptions; import 
org.apache.hudi.QuickstartUtils; +import org.apache.hudi.common.table.HoodieTableMetaClient; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; import org.apache.iceberg.relocated.com.google.common.collect.Lists; @@ -231,6 +232,58 @@ public void TestHudiPartitionedTableWrite() { LOG.info("Generated partitioned dataframe: {}", df.showString(10, 20, false)); } + @Test + public void TestHudiMetaClientExploration() { + HoodieTableMetaClient hoodieTableMetaClient = + HoodieTableMetaClient.builder() + .setConf(spark.sessionState().newHadoopConf()) + .setBasePath(partitionedLocation) + .setLoadActiveTimelineOnLoad(true) + .build(); + + LOG.info("Alpha test: hoodie table base path: {}", hoodieTableMetaClient.getBasePathV2()); + LOG.info( + "Alpha test: hoodie getBootStrapIndexByFileId: {}", + hoodieTableMetaClient.getBootstrapIndexByFileIdFolderNameFolderPath()); + LOG.info( + "Alpha test: hoodie getBootStrapIndexByPartitionPath: {}", + hoodieTableMetaClient.getBootstrapIndexByPartitionFolderPath()); + LOG.info( + "Alpha test: hoodie getCommitActionType: {}", hoodieTableMetaClient.getCommitActionType()); + LOG.info( + "Alpha test: hoodie getCommitsAndCompactionTimeline: {}", + hoodieTableMetaClient.getCommitsAndCompactionTimeline()); + LOG.info( + "Alpha test: hoodie getCommitsTimeline: {}", hoodieTableMetaClient.getCommitsTimeline()); + LOG.info("Alpha test: hoodie getCommitTimeline: {}", hoodieTableMetaClient.getCommitTimeline()); + LOG.info( + "Alpha test: hoodie getConsistencyGuardConfig: {}", + hoodieTableMetaClient.getConsistencyGuardConfig().toString()); + LOG.info( + "Alpha test: hoodie getFileSystemRetryConfig: {}", + hoodieTableMetaClient.getFileSystemRetryConfig().toString()); + LOG.info( + "Alpha test: hoodie getHashingMetadataPath: {}", + hoodieTableMetaClient.getHashingMetadataPath()); + LOG.info( + "Alpha test: hoodie getMetaAuxiliaryPath: {}", + 
hoodieTableMetaClient.getMetaAuxiliaryPath()); + LOG.info("Alpha test: hoodie getMetaPath: {}", hoodieTableMetaClient.getMetaPath()); + LOG.info( + "Alpha test: hoodie getMetastoreConfig: {}", + hoodieTableMetaClient.getMetastoreConfig().toString()); + LOG.info( + "Alpha test: hoodie getSchemaFolderName: {}", hoodieTableMetaClient.getSchemaFolderName()); + LOG.info( + "Alpha test: hoodie getTableConfig: {}", hoodieTableMetaClient.getTableConfig().toString()); + LOG.info( + "Alpha test: hoodie getTableType: {}", hoodieTableMetaClient.getTableType().toString()); + LOG.info("Alpha test: hoodie getTempFolderPath: {}", hoodieTableMetaClient.getTempFolderPath()); + LOG.info( + "Alpha test: hoodie getTimelineLayoutVersion: {}", + hoodieTableMetaClient.getTimelineLayoutVersion()); + } + @Test public void TestHudiMetaClientAlpha() { LOG.info("Alpha test reference: hoodie table path: {}", partitionedLocation); diff --git a/hudi/src/main/java/org/apache/iceberg/hudi/BaseSnapshotHudiTableAction.java b/hudi/src/main/java/org/apache/iceberg/hudi/BaseSnapshotHudiTableAction.java index 8da8746c1746..292d348b1fa3 100644 --- a/hudi/src/main/java/org/apache/iceberg/hudi/BaseSnapshotHudiTableAction.java +++ b/hudi/src/main/java/org/apache/iceberg/hudi/BaseSnapshotHudiTableAction.java @@ -21,6 +21,13 @@ import java.util.Map; import org.apache.hadoop.conf.Configuration; import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.table.TableSchemaResolver; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.exception.HoodieException; +import org.apache.hudi.internal.schema.InternalSchema; +import org.apache.hudi.internal.schema.convert.AvroInternalSchemaConverter; +import org.apache.iceberg.Schema; +import org.apache.iceberg.types.Type; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -29,11 +36,11 @@ public class BaseSnapshotHudiTableAction implements SnapshotHudiTable { private static final Logger LOG = 
LoggerFactory.getLogger(BaseSnapshotHudiTableAction.class.getName()); - private HoodieTableMetaClient HoodieMetaClient; + private HoodieTableMetaClient hoodieTableMetaClient; public BaseSnapshotHudiTableAction( Configuration hoodieConfiguration, String hoodieTableBasePath) { - this.HoodieMetaClient = buildTableMetaClient(hoodieConfiguration, hoodieTableBasePath); + this.hoodieTableMetaClient = buildTableMetaClient(hoodieConfiguration, hoodieTableBasePath); } @Override @@ -48,14 +55,57 @@ public SnapshotHudiTable tableProperty(String key, String value) { @Override public Result execute() { - LOG.info("Alpha test: hoodie table base path: {}", HoodieMetaClient.getBasePathV2()); - + LOG.info("Alpha test: hoodie table base path: {}", hoodieTableMetaClient.getBasePathV2()); + LOG.info( + "Alpha test: hoodie getBootStrapIndexByFileId: {}", + hoodieTableMetaClient.getBootstrapIndexByFileIdFolderNameFolderPath()); + LOG.info( + "Alpha test: hoodie getBootStrapIndexByPartitionPath: {}", + hoodieTableMetaClient.getBootstrapIndexByPartitionFolderPath()); + InternalSchema hudiSchema = getHudiSchema(); + LOG.info("Alpha test: hoodie table schema: {}", hudiSchema); + LOG.info("Alpha test: get record type: {}", hudiSchema.getRecord()); + Schema icebergSchema = getIcebergSchema(hudiSchema); + LOG.info("Alpha test: get converted schema: {}", icebergSchema); return null; } + private InternalSchema getHudiSchema() { + TableSchemaResolver schemaUtil = new TableSchemaResolver(hoodieTableMetaClient); + Option hudiSchema = schemaUtil.getTableInternalSchemaFromCommitMetadata(); + LOG.info("Alpha test: hoodie schema: {}", hudiSchema); + LOG.info("Alpha test: active timeline: {}", hoodieTableMetaClient.getActiveTimeline()); + LOG.info( + "Alpha test: active timeline commit timeline: {}", + hoodieTableMetaClient.getActiveTimeline().getCommitsTimeline()); + LOG.info( + "Alpha test: active timeline commit timeline instants: {}", + 
hoodieTableMetaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants()); + // TODO: need to add support for parquet format table + return hudiSchema.orElseGet( + () -> { + try { + return AvroInternalSchemaConverter.convert(schemaUtil.getTableAvroSchema()); + } catch (Exception e) { + throw new HoodieException("cannot find schema for current table"); + } + }); + } + + private Schema getIcebergSchema(InternalSchema hudiSchema) { + Type converted = + HudiDataTypeVisitor.visit( + hudiSchema.getRecord(), new HudiDataTypeToType(hudiSchema.getRecord())); + return new Schema(converted.asNestedType().asStructType().fields()); + } + private static HoodieTableMetaClient buildTableMetaClient(Configuration conf, String basePath) { HoodieTableMetaClient metaClient = - HoodieTableMetaClient.builder().setConf(conf).setBasePath(basePath).build(); + HoodieTableMetaClient.builder() + .setConf(conf) + .setBasePath(basePath) + .setLoadActiveTimelineOnLoad(true) + .build(); return metaClient; } } diff --git a/hudi/src/main/java/org/apache/iceberg/hudi/HudiDataTypeToType.java b/hudi/src/main/java/org/apache/iceberg/hudi/HudiDataTypeToType.java new file mode 100644 index 000000000000..370e192fead8 --- /dev/null +++ b/hudi/src/main/java/org/apache/iceberg/hudi/HudiDataTypeToType.java @@ -0,0 +1,136 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.hudi; + +import java.util.List; +import org.apache.hudi.internal.schema.Types; +import org.apache.iceberg.exceptions.ValidationException; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.types.Type; + +public class HudiDataTypeToType extends HudiDataTypeVisitor { + private final Types.RecordType root; + private int nextId = 0; + + HudiDataTypeToType() { + this.root = null; + } + + HudiDataTypeToType(Types.RecordType root) { + this.root = root; + this.nextId = root.fields().size(); + } + + private int getNextId() { + int next = nextId; + nextId += 1; + return next; + } + + @SuppressWarnings("ReferenceEquality") + @Override + public Type record(Types.RecordType record, List fieldResults) { + List fields = record.fields(); + List newFields = + Lists.newArrayListWithExpectedSize(fields.size()); + boolean isRoot = root == record; + for (int i = 0; i < fields.size(); i += 1) { + Types.Field field = fields.get(i); + Type type = fieldResults.get(i); + int id; + if (isRoot) { + id = i; + } else { + id = getNextId(); + } + + String doc = field.doc(); + if (field.isOptional()) { + newFields.add( + org.apache.iceberg.types.Types.NestedField.optional(id, field.name(), type, doc)); + } else { + newFields.add( + org.apache.iceberg.types.Types.NestedField.required(id, field.name(), type, doc)); + } + } + + return org.apache.iceberg.types.Types.StructType.of(newFields); + } + + @Override + public Type field(Types.Field field, Type typeResult) { + return typeResult; + } + + @Override + 
public Type map(Types.MapType map, Type keyResult, Type valueResult) { + if (map.isValueOptional()) { + return org.apache.iceberg.types.Types.MapType.ofOptional( + getNextId(), getNextId(), keyResult, valueResult); + } else { + return org.apache.iceberg.types.Types.MapType.ofRequired( + getNextId(), getNextId(), keyResult, valueResult); + } + } + + @Override + public Type array(Types.ArrayType array, Type elementResult) { + if (array.isElementOptional()) { + return org.apache.iceberg.types.Types.ListType.ofOptional(getNextId(), elementResult); + } else { + return org.apache.iceberg.types.Types.ListType.ofRequired(getNextId(), elementResult); + } + } + + @SuppressWarnings("checkstyle:CyclomaticComplexity") + @Override + public Type atomic(org.apache.hudi.internal.schema.Type atomic) { + if (atomic instanceof Types.BooleanType) { + return org.apache.iceberg.types.Types.BooleanType.get(); + } else if (atomic instanceof Types.IntType) { + return org.apache.iceberg.types.Types.IntegerType.get(); + } else if (atomic instanceof Types.LongType) { + return org.apache.iceberg.types.Types.LongType.get(); + } else if (atomic instanceof Types.FloatType) { + return org.apache.iceberg.types.Types.FloatType.get(); + } else if (atomic instanceof Types.DoubleType) { + return org.apache.iceberg.types.Types.DoubleType.get(); + } else if (atomic instanceof Types.DateType) { + return org.apache.iceberg.types.Types.DateType.get(); + } else if (atomic instanceof Types.TimestampType) { + return org.apache.iceberg.types.Types.TimestampType.withZone(); + } else if (atomic instanceof Types.StringType) { + return org.apache.iceberg.types.Types.StringType.get(); + } else if (atomic instanceof Types.BinaryType) { + return org.apache.iceberg.types.Types.BinaryType.get(); + } else if (atomic instanceof Types.UUIDType) { + return org.apache.iceberg.types.Types.UUIDType.get(); + } else if (atomic instanceof Types.DecimalType) { + return org.apache.iceberg.types.Types.DecimalType.of( + 
((Types.DecimalType) atomic).precision(), ((Types.DecimalType) atomic).scale()); + } else if (atomic instanceof Types.FixedType) { + return org.apache.iceberg.types.Types.FixedType.ofLength( + ((Types.FixedType) atomic).getFixedSize()); + } else if (atomic instanceof Types.TimeType) { + return org.apache.iceberg.types.Types.TimeType.get(); + } + + throw new ValidationException("Not a supported type: %s", atomic.getClass().getName()); + } +} diff --git a/hudi/src/main/java/org/apache/iceberg/hudi/HudiDataTypeVisitor.java b/hudi/src/main/java/org/apache/iceberg/hudi/HudiDataTypeVisitor.java new file mode 100644 index 000000000000..caedeb5eba29 --- /dev/null +++ b/hudi/src/main/java/org/apache/iceberg/hudi/HudiDataTypeVisitor.java @@ -0,0 +1,59 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.hudi; + +import java.util.List; +import org.apache.hudi.internal.schema.Type; +import org.apache.hudi.internal.schema.Types; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; + +public abstract class HudiDataTypeVisitor { + + public static T visit(Type type, HudiDataTypeVisitor visitor) { + if (type instanceof Types.RecordType) { + List fields = ((Types.RecordType) type).fields(); + List fieldResults = Lists.newArrayListWithExpectedSize(fields.size()); + + for (Types.Field field : fields) { + fieldResults.add(visitor.field(field, visit(field.type(), visitor))); + } + + return visitor.record((Types.RecordType) type, fieldResults); + } else if (type instanceof Types.MapType) { + return visitor.map( + (Types.MapType) type, + visit(((Types.MapType) type).keyType(), visitor), + visit(((Types.MapType) type).valueType(), visitor)); + } else if (type instanceof Types.ArrayType) { + return visitor.array( + (Types.ArrayType) type, visit(((Types.ArrayType) type).elementType(), visitor)); + } + return visitor.atomic(type); + } + + public abstract T record(Types.RecordType record, List fieldResults); + + public abstract T field(Types.Field field, T typeResult); + + public abstract T array(Types.ArrayType array, T elementResult); + + public abstract T map(Types.MapType map, T keyResult, T valueResult); + + public abstract T atomic(Type atomic); +} From ced38a1df2341b151cc96b4aca32c632d164cbc3 Mon Sep 17 00:00:00 2001 From: Rushan Jiang Date: Sat, 21 Jan 2023 01:02:17 -0500 Subject: [PATCH 06/20] rename some methods --- .../org/apache/iceberg/hudi/BaseSnapshotHudiTableAction.java | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/hudi/src/main/java/org/apache/iceberg/hudi/BaseSnapshotHudiTableAction.java b/hudi/src/main/java/org/apache/iceberg/hudi/BaseSnapshotHudiTableAction.java index 292d348b1fa3..dbfdbb6082b9 100644 --- a/hudi/src/main/java/org/apache/iceberg/hudi/BaseSnapshotHudiTableAction.java +++ 
b/hudi/src/main/java/org/apache/iceberg/hudi/BaseSnapshotHudiTableAction.java @@ -65,7 +65,7 @@ public Result execute() { InternalSchema hudiSchema = getHudiSchema(); LOG.info("Alpha test: hoodie table schema: {}", hudiSchema); LOG.info("Alpha test: get record type: {}", hudiSchema.getRecord()); - Schema icebergSchema = getIcebergSchema(hudiSchema); + Schema icebergSchema = convertToIcebergSchema(hudiSchema); LOG.info("Alpha test: get converted schema: {}", icebergSchema); return null; } @@ -81,7 +81,6 @@ private InternalSchema getHudiSchema() { LOG.info( "Alpha test: active timeline commit timeline instants: {}", hoodieTableMetaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants()); - // TODO: need to add support for parquet format table return hudiSchema.orElseGet( () -> { try { @@ -92,7 +91,7 @@ private InternalSchema getHudiSchema() { }); } - private Schema getIcebergSchema(InternalSchema hudiSchema) { + private Schema convertToIcebergSchema(InternalSchema hudiSchema) { Type converted = HudiDataTypeVisitor.visit( hudiSchema.getRecord(), new HudiDataTypeToType(hudiSchema.getRecord())); From 7286cab90e60ce0952b3a88002347804bd46365e Mon Sep 17 00:00:00 2001 From: Rushan Jiang Date: Sat, 21 Jan 2023 16:17:11 -0500 Subject: [PATCH 07/20] COW first draft, but currently cannot get file groups --- ...udiToIcebergMigrationSparkIntegration.java | 17 +- .../iceberg/hudi/TestSnapshotHudiTable.java | 4 +- .../hudi/BaseSnapshotHudiTableAction.java | 283 +++++++++++++++++- 3 files changed, 298 insertions(+), 6 deletions(-) diff --git a/hudi/src/integration/java/org/apache/iceberg/hudi/HudiToIcebergMigrationSparkIntegration.java b/hudi/src/integration/java/org/apache/iceberg/hudi/HudiToIcebergMigrationSparkIntegration.java index ae213d99fb7f..cfeca68687b1 100644 --- a/hudi/src/integration/java/org/apache/iceberg/hudi/HudiToIcebergMigrationSparkIntegration.java +++ 
b/hudi/src/integration/java/org/apache/iceberg/hudi/HudiToIcebergMigrationSparkIntegration.java @@ -18,12 +18,25 @@ */ package org.apache.iceberg.hudi; +import org.apache.iceberg.catalog.TableIdentifier; +import org.apache.iceberg.spark.Spark3Util; import org.apache.spark.sql.SparkSession; +import org.apache.spark.sql.connector.catalog.CatalogPlugin; public class HudiToIcebergMigrationSparkIntegration { private HudiToIcebergMigrationSparkIntegration() {} - static SnapshotHudiTable snapshotHudiTable(SparkSession spark, String hudiTablePath) { - return new BaseSnapshotHudiTableAction(spark.sessionState().newHadoopConf(), hudiTablePath); + static SnapshotHudiTable snapshotHudiTable( + SparkSession spark, String hudiTablePath, String newTableIdentifier) { + String ctx = "hudi snapshot target"; + CatalogPlugin defaultCatalog = spark.sessionState().catalogManager().currentCatalog(); + Spark3Util.CatalogAndIdentifier catalogAndIdentifier = + Spark3Util.catalogAndIdentifier(ctx, spark, newTableIdentifier, defaultCatalog); + + return new BaseSnapshotHudiTableAction( + spark.sessionState().newHadoopConf(), + hudiTablePath, + Spark3Util.loadIcebergCatalog(spark, catalogAndIdentifier.catalog().name()), + TableIdentifier.parse(catalogAndIdentifier.identifier().toString())); } } diff --git a/hudi/src/integration/java/org/apache/iceberg/hudi/TestSnapshotHudiTable.java b/hudi/src/integration/java/org/apache/iceberg/hudi/TestSnapshotHudiTable.java index b48a7b07c32b..0111b1f20776 100644 --- a/hudi/src/integration/java/org/apache/iceberg/hudi/TestSnapshotHudiTable.java +++ b/hudi/src/integration/java/org/apache/iceberg/hudi/TestSnapshotHudiTable.java @@ -287,8 +287,10 @@ public void TestHudiMetaClientExploration() { @Test public void TestHudiMetaClientAlpha() { LOG.info("Alpha test reference: hoodie table path: {}", partitionedLocation); + String newTableIdentifier = destName(icebergCatalogName, "alpha_iceberg_table"); SnapshotHudiTable.Result result = - 
HudiToIcebergMigrationSparkIntegration.snapshotHudiTable(spark, partitionedLocation) + HudiToIcebergMigrationSparkIntegration.snapshotHudiTable( + spark, partitionedLocation, newTableIdentifier) .execute(); } diff --git a/hudi/src/main/java/org/apache/iceberg/hudi/BaseSnapshotHudiTableAction.java b/hudi/src/main/java/org/apache/iceberg/hudi/BaseSnapshotHudiTableAction.java index dbfdbb6082b9..241d97bf370b 100644 --- a/hudi/src/main/java/org/apache/iceberg/hudi/BaseSnapshotHudiTableAction.java +++ b/hudi/src/main/java/org/apache/iceberg/hudi/BaseSnapshotHudiTableAction.java @@ -18,15 +18,58 @@ */ package org.apache.iceberg.hudi; +import java.util.List; import java.util.Map; +import java.util.stream.Collectors; +import java.util.stream.Stream; +import javax.annotation.Nullable; import org.apache.hadoop.conf.Configuration; +import org.apache.hudi.common.config.HoodieMetadataConfig; +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.engine.HoodieLocalEngineContext; +import org.apache.hudi.common.model.HoodieBaseFile; +import org.apache.hudi.common.model.HoodieFileGroup; +import org.apache.hudi.common.model.HoodieFileGroupId; +import org.apache.hudi.common.table.HoodieTableConfig; import org.apache.hudi.common.table.HoodieTableMetaClient; import org.apache.hudi.common.table.TableSchemaResolver; +import org.apache.hudi.common.table.timeline.HoodieInstant; +import org.apache.hudi.common.table.timeline.HoodieTimeline; +import org.apache.hudi.common.table.view.FileSystemViewManager; +import org.apache.hudi.common.table.view.HoodieTableFileSystemView; import org.apache.hudi.common.util.Option; import org.apache.hudi.exception.HoodieException; +import org.apache.hudi.hadoop.utils.HoodieInputFormatUtils; import org.apache.hudi.internal.schema.InternalSchema; import org.apache.hudi.internal.schema.convert.AvroInternalSchemaConverter; +import org.apache.iceberg.AppendFiles; +import org.apache.iceberg.DataFile; +import 
org.apache.iceberg.DataFiles; +import org.apache.iceberg.DeleteFiles; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.Metrics; +import org.apache.iceberg.MetricsConfig; +import org.apache.iceberg.OverwriteFiles; +import org.apache.iceberg.PartitionSpec; import org.apache.iceberg.Schema; +import org.apache.iceberg.SnapshotSummary; +import org.apache.iceberg.Table; +import org.apache.iceberg.TableProperties; +import org.apache.iceberg.Transaction; +import org.apache.iceberg.avro.Avro; +import org.apache.iceberg.catalog.Catalog; +import org.apache.iceberg.catalog.TableIdentifier; +import org.apache.iceberg.exceptions.ValidationException; +import org.apache.iceberg.hadoop.HadoopFileIO; +import org.apache.iceberg.io.InputFile; +import org.apache.iceberg.mapping.MappingUtil; +import org.apache.iceberg.mapping.NameMapping; +import org.apache.iceberg.mapping.NameMappingParser; +import org.apache.iceberg.orc.OrcMetrics; +import org.apache.iceberg.parquet.ParquetUtil; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; import org.apache.iceberg.types.Type; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -35,12 +78,35 @@ public class BaseSnapshotHudiTableAction implements SnapshotHudiTable { private static final Logger LOG = LoggerFactory.getLogger(BaseSnapshotHudiTableAction.class.getName()); - + private static final String SNAPSHOT_SOURCE_PROP = "snapshot_source"; + private static final String HOODIE_SOURCE_VALUE = "hudi"; + private static final String ORIGINAL_LOCATION_PROP = "original_location"; + private static final String PARQUET_SUFFIX = ".parquet"; + private static final String AVRO_SUFFIX = ".avro"; + private static final String ORC_SUFFIX = ".orc"; private HoodieTableMetaClient hoodieTableMetaClient; + private HoodieTableConfig hoodieTableConfig; + private 
HoodieEngineContext hoodieEngineContext; + private HoodieMetadataConfig hoodieMetadataConfig; + private String hoodieTableBasePath; + private Catalog icebergCatalog; + private TableIdentifier newTableIdentifier; + private HadoopFileIO hoodieFileIO; + private ImmutableMap.Builder additionalPropertiesBuilder = ImmutableMap.builder(); public BaseSnapshotHudiTableAction( - Configuration hoodieConfiguration, String hoodieTableBasePath) { + Configuration hoodieConfiguration, + String hoodieTableBasePath, + Catalog icebergCatalog, + TableIdentifier newTableIdentifier) { this.hoodieTableMetaClient = buildTableMetaClient(hoodieConfiguration, hoodieTableBasePath); + this.hoodieTableConfig = hoodieTableMetaClient.getTableConfig(); + this.hoodieEngineContext = new HoodieLocalEngineContext(hoodieConfiguration); + this.hoodieTableBasePath = hoodieTableBasePath; + this.hoodieMetadataConfig = HoodieInputFormatUtils.buildMetadataConfig(hoodieConfiguration); + this.hoodieFileIO = new HadoopFileIO(hoodieConfiguration); + this.icebergCatalog = icebergCatalog; + this.newTableIdentifier = newTableIdentifier; } @Override @@ -67,7 +133,165 @@ public Result execute() { LOG.info("Alpha test: get record type: {}", hudiSchema.getRecord()); Schema icebergSchema = convertToIcebergSchema(hudiSchema); LOG.info("Alpha test: get converted schema: {}", icebergSchema); - return null; + PartitionSpec partitionSpec = getPartitionSpecFromHoodieMetadataData(icebergSchema); + LOG.info("Alpha test: get partition spec: {}", partitionSpec); + // TODO: add support for newTableLocation + Transaction icebergTransaction = + icebergCatalog.newCreateTableTransaction( + newTableIdentifier, icebergSchema, partitionSpec, destTableProperties()); + icebergTransaction + .table() + .updateProperties() + .set( + TableProperties.DEFAULT_NAME_MAPPING, + NameMappingParser.toJson(MappingUtil.create(icebergTransaction.table().schema()))) + .commit(); + + HoodieTimeline timeline = + 
hoodieTableMetaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants(); + LOG.info("Alpha test: hoodie timeline: {}", timeline); + HoodieTableFileSystemView hoodieTableFileSystemView = + FileSystemViewManager.createInMemoryFileSystemViewWithTimeline( + hoodieEngineContext, hoodieTableMetaClient, hoodieMetadataConfig, timeline); + Stream completedInstants = timeline.getInstants(); + LOG.info("Alpha test: get completed instants: {}", completedInstants); + // file group id -> Map + Map> allStampedDataFiles = + hoodieTableFileSystemView + .getAllFileGroups() + .collect( + ImmutableMap.toImmutableMap( + HoodieFileGroup::getFileGroupId, + fileGroup -> + fileGroup + .getAllBaseFiles() + .collect( + ImmutableMap.toImmutableMap( + HoodieBaseFile::getCommitTime, baseFile -> baseFile)))); + List testGroups = + hoodieTableFileSystemView.getAllFileGroups().collect(Collectors.toList()); + LOG.info("Alpha test: get all stamped data files: {}", allStampedDataFiles); + LOG.info("Alpha test: get all file groups: {}", testGroups); + Map convertedDataFiles = Maps.newHashMap(); + completedInstants.forEachOrdered( + instant -> { + LOG.info("Alpha test: get completed instant: {}", instant); + // copyInstants to iceberg table + // TODO: need to verify the order of the instants, make sure it is from the oldest to the + // newest + commitHoodieInstantToIcebergTransaction( + instant, + hoodieTableFileSystemView.getAllFileGroups(), + allStampedDataFiles, + convertedDataFiles, + icebergTransaction); + }); + + long totalDataFiles = + Long.parseLong( + icebergTransaction + .table() + .currentSnapshot() + .summary() + .get(SnapshotSummary.TOTAL_DATA_FILES_PROP)); + icebergTransaction.commitTransaction(); + LOG.info( + "Successfully created Iceberg table {} from hudi table at {}, total data file count: {}", + newTableIdentifier, + hoodieTableBasePath, + totalDataFiles); + return new BaseSnapshotHudiTableActionResult(totalDataFiles); + } + + public void 
commitHoodieInstantToIcebergTransaction( + HoodieInstant instant, + Stream fileGroups, + Map> allStampedDataFiles, + Map convertedDataFiles, + Transaction transaction) { + List filesToAdd = Lists.newArrayList(); + List filesToRemove = Lists.newArrayList(); + + // TODO: need to add synchronization if want to rely on parallelism here + fileGroups + .sequential() + .forEach( + fileGroup -> { + HoodieFileGroupId fileGroupId = fileGroup.getFileGroupId(); + LOG.info("Alpha test: get file group: {}", fileGroup); + DataFile currentDataFile = + buildDataFileFromHoodieBaseFile( + instant, + fileGroup, + allStampedDataFiles.get(fileGroupId), + transaction.table()); + if (currentDataFile != null) { + filesToAdd.add(currentDataFile); + DataFile previousDataFile = convertedDataFiles.get(fileGroupId); + if (previousDataFile != null) { + // need to delete the previous data file since a new version will be added + filesToRemove.add(previousDataFile); + } + convertedDataFiles.put(fileGroupId, currentDataFile); + } + }); + LOG.info("Alpha test: get files to add: {} at instant {}", filesToAdd, instant); + if (filesToAdd.size() > 0 && filesToRemove.size() > 0) { + // OverwriteFiles case + OverwriteFiles overwriteFiles = transaction.newOverwrite(); + filesToAdd.forEach(overwriteFiles::addFile); + filesToRemove.forEach(overwriteFiles::deleteFile); + overwriteFiles.commit(); + } else if (filesToAdd.size() > 0) { + // AppendFiles case + AppendFiles appendFiles = transaction.newAppend(); + filesToAdd.forEach(appendFiles::appendFile); + appendFiles.commit(); + } else if (filesToRemove.size() > 0) { + // DeleteFiles case + DeleteFiles deleteFiles = transaction.newDelete(); + filesToRemove.forEach(deleteFiles::deleteFile); + deleteFiles.commit(); + } + } + + @Nullable + private DataFile buildDataFileFromHoodieBaseFile( + HoodieInstant instant, + HoodieFileGroup fileGroup, + Map stampedDataFiles, + Table table) { + HoodieBaseFile baseFile = stampedDataFiles.get(instant.getTimestamp()); + if 
(baseFile == null) { + LOG.info( + "Alpha test: does not have base file for instant: {}, fileGroupId {}", + instant, + fileGroup.getFileGroupId()); + return null; + } + + PartitionSpec spec = table.spec(); + // TODO: need to verify the path is absolute + String path = baseFile.getPath(); + long fileSize = baseFile.getFileSize(); + String partitionPath = fileGroup.getPartitionPath(); + + MetricsConfig metricsConfig = MetricsConfig.forTable(table); + String nameMappingString = table.properties().get(TableProperties.DEFAULT_NAME_MAPPING); + NameMapping nameMapping = + nameMappingString != null ? NameMappingParser.fromJson(nameMappingString) : null; + + InputFile file = hoodieFileIO.newInputFile(path); + FileFormat format = determineFileFormatFromPath(path); + Metrics metrics = getMetricsForFile(file, format, metricsConfig, nameMapping); + + return DataFiles.builder(spec) + .withPath(path) + .withFormat(format) + .withFileSizeInBytes(fileSize) + .withPartitionPath(partitionPath) // TODO: need to verify the partition path is correct + .withMetrics(metrics) + .build(); } private InternalSchema getHudiSchema() { @@ -98,6 +322,32 @@ private Schema convertToIcebergSchema(InternalSchema hudiSchema) { return new Schema(converted.asNestedType().asStructType().fields()); } + private PartitionSpec getPartitionSpecFromHoodieMetadataData(Schema schema) { + Option partitionNames = hoodieTableConfig.getPartitionFields(); + if (partitionNames.isPresent()) { + PartitionSpec.Builder builder = PartitionSpec.builderFor(schema); + for (String partitionName : partitionNames.get()) { + builder.identity(partitionName); + } + return builder.build(); + } + + return PartitionSpec.unpartitioned(); + } + + private Map destTableProperties() { + // TODO: need to check which hoodie properties to add to + additionalPropertiesBuilder.putAll(hoodieTableConfig.propsMap()); + additionalPropertiesBuilder.putAll( + ImmutableMap.of( + SNAPSHOT_SOURCE_PROP, + HOODIE_SOURCE_VALUE, + ORIGINAL_LOCATION_PROP, + 
hoodieTableMetaClient.getBasePathV2().toString())); + + return additionalPropertiesBuilder.build(); + } + private static HoodieTableMetaClient buildTableMetaClient(Configuration conf, String basePath) { HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder() @@ -107,4 +357,31 @@ private static HoodieTableMetaClient buildTableMetaClient(Configuration conf, St .build(); return metaClient; } + + private FileFormat determineFileFormatFromPath(String path) { + if (path.endsWith(PARQUET_SUFFIX)) { + return FileFormat.PARQUET; + } else if (path.endsWith(AVRO_SUFFIX)) { + return FileFormat.AVRO; + } else if (path.endsWith(ORC_SUFFIX)) { + return FileFormat.ORC; + } else { + throw new ValidationException("Cannot determine file format from path %s", path); + } + } + + private Metrics getMetricsForFile( + InputFile file, FileFormat format, MetricsConfig metricsSpec, NameMapping mapping) { + switch (format) { + case AVRO: + long rowCount = Avro.rowCount(file); + return new Metrics(rowCount, null, null, null, null); + case PARQUET: + return ParquetUtil.fileMetrics(file, metricsSpec, mapping); + case ORC: + return OrcMetrics.fromInputFile(file, metricsSpec, mapping); + default: + throw new ValidationException("Cannot get metrics from file format: %s", format); + } + } } From bbb5c3649d743bde1caf242c209bd8560db41965 Mon Sep 17 00:00:00 2001 From: Rushan Jiang Date: Sat, 21 Jan 2023 23:58:07 -0500 Subject: [PATCH 08/20] prepare for draft PR discussion --- build.gradle | 3 + .../iceberg/hudi/TestSnapshotHudiTable.java | 8 +-- .../hudi/BaseSnapshotHudiTableAction.java | 61 ++++++++++++++++--- 3 files changed, 59 insertions(+), 13 deletions(-) diff --git a/build.gradle b/build.gradle index f02d0230c1cf..a2c284166c49 100644 --- a/build.gradle +++ b/build.gradle @@ -458,6 +458,9 @@ project(':iceberg-hudi') { implementation project(':iceberg-orc') implementation "com.fasterxml.jackson.core:jackson-databind" + // TODO: we only need hudi-common here, however, hudi-common has 
some dependency conflicts with hudi-spark-bundle + // which is currently used by the integration test. We should fix this in the future. + // Also, hudi uses java8, may need to assess if we can use hudi in java11. compileOnly("org.apache.hudi:hudi-spark3.3-bundle_2.12:0.12.2") implementation("org.apache.avro:avro") { exclude group: 'org.tukaani' // xz compression is not supported diff --git a/hudi/src/integration/java/org/apache/iceberg/hudi/TestSnapshotHudiTable.java b/hudi/src/integration/java/org/apache/iceberg/hudi/TestSnapshotHudiTable.java index 0111b1f20776..b28d9289a126 100644 --- a/hudi/src/integration/java/org/apache/iceberg/hudi/TestSnapshotHudiTable.java +++ b/hudi/src/integration/java/org/apache/iceberg/hudi/TestSnapshotHudiTable.java @@ -219,21 +219,21 @@ public void before() throws IOException { } @Test - public void TestHudiUnpartitionedTableWrite() { + public void testHudiUnpartitionedTableWrite() { Dataset df = spark.read().format("hudi").load(unpartitionedLocation); LOG.info("Generated unpartitioned dataframe shcema: {}", df.schema().treeString()); LOG.info("Generated unpartitioned dataframe: {}", df.showString(10, 20, false)); } @Test - public void TestHudiPartitionedTableWrite() { + public void testHudiPartitionedTableWrite() { Dataset df = spark.read().format("hudi").load(partitionedLocation); LOG.info("Generated partitioned dataframe shcema: {}", df.schema().treeString()); LOG.info("Generated partitioned dataframe: {}", df.showString(10, 20, false)); } @Test - public void TestHudiMetaClientExploration() { + public void testHudiMetaClientExploration() { HoodieTableMetaClient hoodieTableMetaClient = HoodieTableMetaClient.builder() .setConf(spark.sessionState().newHadoopConf()) @@ -285,7 +285,7 @@ public void TestHudiMetaClientExploration() { } @Test - public void TestHudiMetaClientAlpha() { + public void testHudiMetaClientAlpha() { LOG.info("Alpha test reference: hoodie table path: {}", partitionedLocation); String newTableIdentifier = 
destName(icebergCatalogName, "alpha_iceberg_table"); SnapshotHudiTable.Result result = diff --git a/hudi/src/main/java/org/apache/iceberg/hudi/BaseSnapshotHudiTableAction.java b/hudi/src/main/java/org/apache/iceberg/hudi/BaseSnapshotHudiTableAction.java index 241d97bf370b..3fc11b84fc22 100644 --- a/hudi/src/main/java/org/apache/iceberg/hudi/BaseSnapshotHudiTableAction.java +++ b/hudi/src/main/java/org/apache/iceberg/hudi/BaseSnapshotHudiTableAction.java @@ -52,6 +52,7 @@ import org.apache.iceberg.OverwriteFiles; import org.apache.iceberg.PartitionSpec; import org.apache.iceberg.Schema; +import org.apache.iceberg.Snapshot; import org.apache.iceberg.SnapshotSummary; import org.apache.iceberg.Table; import org.apache.iceberg.TableProperties; @@ -128,6 +129,8 @@ public Result execute() { LOG.info( "Alpha test: hoodie getBootStrapIndexByPartitionPath: {}", hoodieTableMetaClient.getBootstrapIndexByPartitionFolderPath()); + + // Convert Hoodie table schema to Iceberg schema and extract the partition spec InternalSchema hudiSchema = getHudiSchema(); LOG.info("Alpha test: hoodie table schema: {}", hudiSchema); LOG.info("Alpha test: get record type: {}", hudiSchema.getRecord()); @@ -135,10 +138,15 @@ public Result execute() { LOG.info("Alpha test: get converted schema: {}", icebergSchema); PartitionSpec partitionSpec = getPartitionSpecFromHoodieMetadataData(icebergSchema); LOG.info("Alpha test: get partition spec: {}", partitionSpec); + // TODO: add support for newTableLocation Transaction icebergTransaction = icebergCatalog.newCreateTableTransaction( newTableIdentifier, icebergSchema, partitionSpec, destTableProperties()); + // We need name mapping to ensure we can read data files correctly as iceberg table has its own + // rule to assign field id + // Although the field id rule seems to be the same as hudi, but the rule is not guaranteed by + // any API icebergTransaction .table() .updateProperties() @@ -147,15 +155,21 @@ public Result execute() { 
NameMappingParser.toJson(MappingUtil.create(icebergTransaction.table().schema()))) .commit(); + // Pre-process the timeline, we only need to process all COMPLETED commit for COW table + // Commit that has been rollbacked will not be in either REQUESTED or INFLIGHT state HoodieTimeline timeline = hoodieTableMetaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants(); LOG.info("Alpha test: hoodie timeline: {}", timeline); + // Initialize the FileSystemView for querying table data files + // TODO: need to choose the correct implementation of the FileSystemView HoodieTableFileSystemView hoodieTableFileSystemView = FileSystemViewManager.createInMemoryFileSystemViewWithTimeline( hoodieEngineContext, hoodieTableMetaClient, hoodieMetadataConfig, timeline); + // get all instants on the timeline Stream completedInstants = timeline.getInstants(); LOG.info("Alpha test: get completed instants: {}", completedInstants); // file group id -> Map + // This pre-process aims to make a timestamp to HoodieBaseFile map for each file group Map> allStampedDataFiles = hoodieTableFileSystemView .getAllFileGroups() @@ -168,17 +182,24 @@ public Result execute() { .collect( ImmutableMap.toImmutableMap( HoodieBaseFile::getCommitTime, baseFile -> baseFile)))); + // BEGIN TEST ONLY CODE List testGroups = hoodieTableFileSystemView.getAllFileGroups().collect(Collectors.toList()); LOG.info("Alpha test: get all stamped data files: {}", allStampedDataFiles); LOG.info("Alpha test: get all file groups: {}", testGroups); + // END TEST ONLY CODE + + // Help tracked if a previous version of the data file has been added to the iceberg table Map convertedDataFiles = Maps.newHashMap(); + // Replay the timeline from beginning to the end completedInstants.forEachOrdered( instant -> { LOG.info("Alpha test: get completed instant: {}", instant); // copyInstants to iceberg table // TODO: need to verify the order of the instants, make sure it is from the oldest to the // newest + + // commit each 
instant as a transaction to the iceberg table commitHoodieInstantToIcebergTransaction( instant, hoodieTableFileSystemView.getAllFileGroups(), @@ -186,14 +207,11 @@ public Result execute() { convertedDataFiles, icebergTransaction); }); - + Snapshot icebergSnapshot = icebergTransaction.table().currentSnapshot(); long totalDataFiles = - Long.parseLong( - icebergTransaction - .table() - .currentSnapshot() - .summary() - .get(SnapshotSummary.TOTAL_DATA_FILES_PROP)); + icebergSnapshot != null + ? Long.parseLong(icebergSnapshot.summary().get(SnapshotSummary.TOTAL_DATA_FILES_PROP)) + : 0; icebergTransaction.commitTransaction(); LOG.info( "Successfully created Iceberg table {} from hudi table at {}, total data file count: {}", @@ -203,6 +221,17 @@ public Result execute() { return new BaseSnapshotHudiTableActionResult(totalDataFiles); } + /** + * In COW Hoodie table, each file group is a combination of different versions of the same data + * file. + * + *

During each write, a new version of the file will be copied and modified to be a new version + * in the file group. Therefore, when committing the datafile to the iceberg table, we need to + * make sure that the older version of the data file is deleted before adding the newer version of + * the data file. + * + *

In other words, the COW behavior can be mapped to the overwrite operation in the iceberg. + */ public void commitHoodieInstantToIcebergTransaction( HoodieInstant instant, Stream fileGroups, @@ -212,7 +241,7 @@ public void commitHoodieInstantToIcebergTransaction( List filesToAdd = Lists.newArrayList(); List filesToRemove = Lists.newArrayList(); - // TODO: need to add synchronization if want to rely on parallelism here + // TODO: may need to add synchronization lock for parallelism fileGroups .sequential() .forEach( @@ -225,13 +254,17 @@ public void commitHoodieInstantToIcebergTransaction( fileGroup, allStampedDataFiles.get(fileGroupId), transaction.table()); + if (currentDataFile != null) { filesToAdd.add(currentDataFile); + DataFile previousDataFile = convertedDataFiles.get(fileGroupId); if (previousDataFile != null) { // need to delete the previous data file since a new version will be added filesToRemove.add(previousDataFile); } + + // update the converted data file map convertedDataFiles.put(fileGroupId, currentDataFile); } }); @@ -271,7 +304,7 @@ private DataFile buildDataFileFromHoodieBaseFile( } PartitionSpec spec = table.spec(); - // TODO: need to verify the path is absolute + // TODO: need to verify the path is absolute (the field's name is fullPath) String path = baseFile.getPath(); long fileSize = baseFile.getFileSize(); String partitionPath = fileGroup.getPartitionPath(); @@ -294,6 +327,11 @@ private DataFile buildDataFileFromHoodieBaseFile( .build(); } + /** + * Taken from getInternalSchema + * in HoodieWriteClient. + */ private InternalSchema getHudiSchema() { TableSchemaResolver schemaUtil = new TableSchemaResolver(hoodieTableMetaClient); Option hudiSchema = schemaUtil.getTableInternalSchemaFromCommitMetadata(); @@ -315,6 +353,11 @@ private InternalSchema getHudiSchema() { }); } + /** + * Use nested type visitor to convert the internal schema to iceberg schema. + * + *

just like what we did with spark table's schema and delta lake table's schema. + */ private Schema convertToIcebergSchema(InternalSchema hudiSchema) { Type converted = HudiDataTypeVisitor.visit( From 44f7f693e35e642cbc3d3865f075572c064a5468 Mon Sep 17 00:00:00 2001 From: Rushan Jiang Date: Tue, 24 Jan 2023 22:28:43 -0500 Subject: [PATCH 09/20] fix get all file groups --- .../hudi/BaseSnapshotHudiTableAction.java | 44 ++++++++++--------- 1 file changed, 24 insertions(+), 20 deletions(-) diff --git a/hudi/src/main/java/org/apache/iceberg/hudi/BaseSnapshotHudiTableAction.java b/hudi/src/main/java/org/apache/iceberg/hudi/BaseSnapshotHudiTableAction.java index 3fc11b84fc22..25be14d4db2d 100644 --- a/hudi/src/main/java/org/apache/iceberg/hudi/BaseSnapshotHudiTableAction.java +++ b/hudi/src/main/java/org/apache/iceberg/hudi/BaseSnapshotHudiTableAction.java @@ -18,6 +18,7 @@ */ package org.apache.iceberg.hudi; +import java.io.IOException; import java.util.List; import java.util.Map; import java.util.stream.Collectors; @@ -27,6 +28,7 @@ import org.apache.hudi.common.config.HoodieMetadataConfig; import org.apache.hudi.common.engine.HoodieEngineContext; import org.apache.hudi.common.engine.HoodieLocalEngineContext; +import org.apache.hudi.common.fs.FSUtils; import org.apache.hudi.common.model.HoodieBaseFile; import org.apache.hudi.common.model.HoodieFileGroup; import org.apache.hudi.common.model.HoodieFileGroupId; @@ -42,6 +44,9 @@ import org.apache.hudi.hadoop.utils.HoodieInputFormatUtils; import org.apache.hudi.internal.schema.InternalSchema; import org.apache.hudi.internal.schema.convert.AvroInternalSchemaConverter; +import org.apache.hudi.metadata.HoodieTableMetadata; + +import org.apache.hadoop.fs.Path; import org.apache.iceberg.AppendFiles; import org.apache.iceberg.DataFile; import org.apache.iceberg.DataFiles; @@ -122,22 +127,11 @@ public SnapshotHudiTable tableProperty(String key, String value) { @Override public Result execute() { - LOG.info("Alpha test: hoodie 
table base path: {}", hoodieTableMetaClient.getBasePathV2()); - LOG.info( - "Alpha test: hoodie getBootStrapIndexByFileId: {}", - hoodieTableMetaClient.getBootstrapIndexByFileIdFolderNameFolderPath()); - LOG.info( - "Alpha test: hoodie getBootStrapIndexByPartitionPath: {}", - hoodieTableMetaClient.getBootstrapIndexByPartitionFolderPath()); // Convert Hoodie table schema to Iceberg schema and extract the partition spec InternalSchema hudiSchema = getHudiSchema(); - LOG.info("Alpha test: hoodie table schema: {}", hudiSchema); - LOG.info("Alpha test: get record type: {}", hudiSchema.getRecord()); Schema icebergSchema = convertToIcebergSchema(hudiSchema); - LOG.info("Alpha test: get converted schema: {}", icebergSchema); PartitionSpec partitionSpec = getPartitionSpecFromHoodieMetadataData(icebergSchema); - LOG.info("Alpha test: get partition spec: {}", partitionSpec); // TODO: add support for newTableLocation Transaction icebergTransaction = @@ -159,20 +153,29 @@ public Result execute() { // Commit that has been rollbacked will not be in either REQUESTED or INFLIGHT state HoodieTimeline timeline = hoodieTableMetaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants(); - LOG.info("Alpha test: hoodie timeline: {}", timeline); // Initialize the FileSystemView for querying table data files // TODO: need to choose the correct implementation of the FileSystemView - HoodieTableFileSystemView hoodieTableFileSystemView = - FileSystemViewManager.createInMemoryFileSystemViewWithTimeline( - hoodieEngineContext, hoodieTableMetaClient, hoodieMetadataConfig, timeline); +// HoodieTableFileSystemView hoodieTableFileSystemView = +// FileSystemViewManager.createInMemoryFileSystemViewWithTimeline( +// hoodieEngineContext, hoodieTableMetaClient, hoodieMetadataConfig, timeline); + HoodieTableFileSystemView hoodieTableFileSystemView = new HoodieTableFileSystemView( + hoodieTableMetaClient, timeline); // get all instants on the timeline Stream completedInstants = 
timeline.getInstants(); - LOG.info("Alpha test: get completed instants: {}", completedInstants); + List partitionPaths = FSUtils.getAllPartitionPaths(hoodieEngineContext, hoodieMetadataConfig, hoodieTableMetaClient.getBasePathV2().toString()); + try { + for (String partitionPath : partitionPaths) { + Path fullPartitionPath = FSUtils.getPartitionPath(hoodieTableMetaClient.getBasePathV2(), partitionPath); + hoodieTableFileSystemView.addFilesToView(FSUtils.getAllDataFilesInPartition(hoodieTableMetaClient.getFs(), fullPartitionPath)); + } + } catch (IOException e) { + throw new RuntimeException("Failed to get all data files in partition", e); + } // file group id -> Map // This pre-process aims to make a timestamp to HoodieBaseFile map for each file group Map> allStampedDataFiles = hoodieTableFileSystemView - .getAllFileGroups() + .fetchAllStoredFileGroups() .collect( ImmutableMap.toImmutableMap( HoodieFileGroup::getFileGroupId, @@ -182,9 +185,10 @@ public Result execute() { .collect( ImmutableMap.toImmutableMap( HoodieBaseFile::getCommitTime, baseFile -> baseFile)))); + // BEGIN TEST ONLY CODE - List testGroups = - hoodieTableFileSystemView.getAllFileGroups().collect(Collectors.toList()); + List testGroups = + hoodieTableFileSystemView.getLatestBaseFiles().collect(Collectors.toList()); LOG.info("Alpha test: get all stamped data files: {}", allStampedDataFiles); LOG.info("Alpha test: get all file groups: {}", testGroups); // END TEST ONLY CODE @@ -307,7 +311,7 @@ private DataFile buildDataFileFromHoodieBaseFile( // TODO: need to verify the path is absolute (the field's name is fullPath) String path = baseFile.getPath(); long fileSize = baseFile.getFileSize(); - String partitionPath = fileGroup.getPartitionPath(); + String partitionPath = FSUtils.getPartitionPath(hoodieTableMetaClient.getBasePathV2(), fileGroup.getPartitionPath()).toString(); MetricsConfig metricsConfig = MetricsConfig.forTable(table); String nameMappingString = 
table.properties().get(TableProperties.DEFAULT_NAME_MAPPING); From 43e30de5b08b84d2a3a09529f836e2485a0d8db3 Mon Sep 17 00:00:00 2001 From: Rushan Jiang Date: Tue, 24 Jan 2023 22:50:46 -0500 Subject: [PATCH 10/20] successfully snapshot first hoodie table --- .../iceberg/hudi/TestSnapshotHudiTable.java | 19 +++++++++++++++++++ .../hudi/BaseSnapshotHudiTableAction.java | 13 +++++++++++-- 2 files changed, 30 insertions(+), 2 deletions(-) diff --git a/hudi/src/integration/java/org/apache/iceberg/hudi/TestSnapshotHudiTable.java b/hudi/src/integration/java/org/apache/iceberg/hudi/TestSnapshotHudiTable.java index b28d9289a126..abd91fd87cb1 100644 --- a/hudi/src/integration/java/org/apache/iceberg/hudi/TestSnapshotHudiTable.java +++ b/hudi/src/integration/java/org/apache/iceberg/hudi/TestSnapshotHudiTable.java @@ -36,6 +36,7 @@ import org.apache.spark.sql.SQLContext; import org.apache.spark.sql.SaveMode; import org.apache.spark.sql.hudi.catalog.HoodieCatalog; +import org.assertj.core.api.Assertions; import org.junit.Before; import org.junit.Rule; import org.junit.Test; @@ -292,6 +293,24 @@ public void testHudiMetaClientAlpha() { HudiToIcebergMigrationSparkIntegration.snapshotHudiTable( spark, partitionedLocation, newTableIdentifier) .execute(); + + checkSnapshotIntegrity(partitionedIdentifier, newTableIdentifier); + } + + private void checkSnapshotIntegrity( + String hudiTableIdentifier, + String icebergTableIdentifier) { + +// List deltaTableContents = +// spark.sql("SELECT * FROM " + hudiTableIdentifier).collectAsList(); + List icebergTableContents = + spark.sql("SELECT * FROM " + icebergTableIdentifier).collectAsList(); + LOG.info("Iceberg table contents: {}", spark.sql("SELECT * FROM " + icebergTableIdentifier).showString(10, 20, false)); + return; + +// Assertions.assertThat(deltaTableContents).hasSize(icebergTableContents.size()); +// Assertions.assertThat(icebergTableContents).containsAll(deltaTableContents); +// 
Assertions.assertThat(deltaTableContents).containsAll(icebergTableContents); } private String destName(String catalogName, String dest) { diff --git a/hudi/src/main/java/org/apache/iceberg/hudi/BaseSnapshotHudiTableAction.java b/hudi/src/main/java/org/apache/iceberg/hudi/BaseSnapshotHudiTableAction.java index 25be14d4db2d..06f61cf1b62c 100644 --- a/hudi/src/main/java/org/apache/iceberg/hudi/BaseSnapshotHudiTableAction.java +++ b/hudi/src/main/java/org/apache/iceberg/hudi/BaseSnapshotHudiTableAction.java @@ -55,6 +55,7 @@ import org.apache.iceberg.Metrics; import org.apache.iceberg.MetricsConfig; import org.apache.iceberg.OverwriteFiles; +import org.apache.iceberg.PartitionField; import org.apache.iceberg.PartitionSpec; import org.apache.iceberg.Schema; import org.apache.iceberg.Snapshot; @@ -311,7 +312,7 @@ private DataFile buildDataFileFromHoodieBaseFile( // TODO: need to verify the path is absolute (the field's name is fullPath) String path = baseFile.getPath(); long fileSize = baseFile.getFileSize(); - String partitionPath = FSUtils.getPartitionPath(hoodieTableMetaClient.getBasePathV2(), fileGroup.getPartitionPath()).toString(); + String partitionValue = fileGroup.getPartitionPath(); MetricsConfig metricsConfig = MetricsConfig.forTable(table); String nameMappingString = table.properties().get(TableProperties.DEFAULT_NAME_MAPPING); @@ -322,11 +323,19 @@ private DataFile buildDataFileFromHoodieBaseFile( FileFormat format = determineFileFormatFromPath(path); Metrics metrics = getMetricsForFile(file, format, metricsConfig, nameMapping); + List testFields = spec.fields(); + + String partition = + spec.fields().stream() + .map(PartitionField::name) + .map(name -> String.format("%s=%s", name, partitionValue)) + .collect(Collectors.joining("/")); + return DataFiles.builder(spec) .withPath(path) .withFormat(format) .withFileSizeInBytes(fileSize) - .withPartitionPath(partitionPath) // TODO: need to verify the partition path is correct + .withPartitionPath(partition) // 
TODO: need to handle multiple partition fields .withMetrics(metrics) .build(); } From cb382db62718f1dd439b5a699e3702c32b73d3be Mon Sep 17 00:00:00 2001 From: Rushan Jiang Date: Mon, 30 Jan 2023 00:18:02 -0500 Subject: [PATCH 11/20] pass test for all primitive types and partition table --- .../iceberg/hudi/TestSnapshotHudiTable.java | 211 +++++++++--------- 1 file changed, 102 insertions(+), 109 deletions(-) diff --git a/hudi/src/integration/java/org/apache/iceberg/hudi/TestSnapshotHudiTable.java b/hudi/src/integration/java/org/apache/iceberg/hudi/TestSnapshotHudiTable.java index abd91fd87cb1..071e6151093c 100644 --- a/hudi/src/integration/java/org/apache/iceberg/hudi/TestSnapshotHudiTable.java +++ b/hudi/src/integration/java/org/apache/iceberg/hudi/TestSnapshotHudiTable.java @@ -18,20 +18,34 @@ */ package org.apache.iceberg.hudi; +import static org.apache.spark.sql.functions.current_date; +import static org.apache.spark.sql.functions.date_add; +import static org.apache.spark.sql.functions.expr; + import java.io.File; import java.io.IOException; +import java.util.Arrays; +import java.util.Comparator; +import java.util.HashSet; import java.util.List; import java.util.Map; +import java.util.Set; +import java.util.stream.Collectors; + +import org.apache.hudi.DataSourceReadOptions; import org.apache.hudi.DataSourceWriteOptions; import org.apache.hudi.QuickstartUtils; import org.apache.hudi.common.table.HoodieTableMetaClient; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.relocated.com.google.common.collect.Sets; import org.apache.iceberg.spark.SparkSessionCatalog; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.sql.Column; import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.RelationalGroupedDataset; import 
org.apache.spark.sql.Row; import org.apache.spark.sql.SQLContext; import org.apache.spark.sql.SaveMode; @@ -50,29 +64,6 @@ public class TestSnapshotHudiTable extends SparkHudiMigrationTestBase { private static final Logger LOG = LoggerFactory.getLogger(TestSnapshotHudiTable.class.getName()); - private static final String row1 = - "{\"name\":\"Michael\",\"addresses\":[{\"city\":\"SanJose\",\"state\":\"CA\"},{\"city\":\"Sandiago\",\"state\":\"CA\"}]," - + "\"address_nested\":{\"current\":{\"state\":\"NY\",\"city\":\"NewYork\"},\"previous\":{\"state\":\"NJ\",\"city\":\"Newark\"}}," - + "\"properties\":{\"hair\":\"brown\",\"eye\":\"black\"},\"secondProp\":{\"height\":\"6\"},\"subjects\":[[\"Java\",\"Scala\",\"C++\"]," - + "[\"Spark\",\"Java\"]],\"id\":1,\"magic_number\":1.123123123123}"; - private static final String row2 = - "{\"name\":\"Test\",\"addresses\":[{\"city\":\"SanJos123123e\",\"state\":\"CA\"},{\"city\":\"Sand12312iago\",\"state\":\"CA\"}]," - + "\"address_nested\":{\"current\":{\"state\":\"N12Y\",\"city\":\"NewY1231ork\"}},\"properties\":{\"hair\":\"brown\",\"eye\":\"black\"}," - + "\"secondProp\":{\"height\":\"6\"},\"subjects\":[[\"Java\",\"Scala\",\"C++\"],[\"Spark\",\"Java\"]],\"id\":2,\"magic_number\":2.123123123123}"; - private static final String row3 = - "{\"name\":\"Test\",\"addresses\":[{\"city\":\"SanJose\",\"state\":\"CA\"},{\"city\":\"Sandiago\",\"state\":\"CA\"}]," - + "\"properties\":{\"hair\":\"brown\",\"eye\":\"black\"},\"secondProp\":{\"height\":\"6\"},\"subjects\":" - + "[[\"Java\",\"Scala\",\"C++\"],[\"Spark\",\"Java\"]],\"id\":3,\"magic_number\":3.123123123123}"; - private static final String row4 = - "{\"name\":\"John\",\"addresses\":[{\"city\":\"LA\",\"state\":\"CA\"},{\"city\":\"Sandiago\",\"state\":\"CA\"}]," - + "\"address_nested\":{\"current\":{\"state\":\"NY\",\"city\":\"NewYork\"},\"previous\":{\"state\":\"NJ123\"}}," - + 
"\"properties\":{\"hair\":\"b12rown\",\"eye\":\"bla3221ck\"},\"secondProp\":{\"height\":\"633\"},\"subjects\":" - + "[[\"Spark\",\"Java\"]],\"id\":4,\"magic_number\":4.123123123123}"; - private static final String row5 = - "{\"name\":\"Jonas\",\"addresses\":[{\"city\":\"Pittsburgh\",\"state\":\"PA\"},{\"city\":\"Sandiago\",\"state\":\"CA\"}]," - + "\"address_nested\":{\"current\":{\"state\":\"PA\",\"city\":\"Haha\"},\"previous\":{\"state\":\"NJ\"}}," - + "\"properties\":{\"hair\":\"black\",\"eye\":\"black\"},\"secondProp\":{\"height\":\"7\"},\"subjects\":[[\"Java\",\"Scala\",\"C++\"]," - + "[\"Spark\",\"Java\"]],\"id\":5,\"magic_number\":5.123123123123}"; private static final String SNAPSHOT_SOURCE_PROP = "snapshot_source"; private static final String DELTA_SOURCE_VALUE = "delta"; private static final String ORIGINAL_LOCATION_PROP = "original_location"; @@ -186,37 +177,27 @@ public void before() throws IOException { spark.sql(String.format("DROP TABLE IF EXISTS %s", unpartitionedIdentifier)); spark.sql(String.format("DROP TABLE IF EXISTS %s", externalDataFilesIdentifier)); - // hard code the dataframe - List jsonList = Lists.newArrayList(); - jsonList.add(row1); - jsonList.add(row2); - jsonList.add(row3); - jsonList.add(row4); - jsonList.add(row5); - JavaSparkContext javaSparkContext = JavaSparkContext.fromSparkContext(spark.sparkContext()); - SQLContext sqlContext = new SQLContext(javaSparkContext); - JavaRDD rdd = javaSparkContext.parallelize(jsonList); - Dataset df = sqlContext.read().json(rdd); + Dataset df = typeTestDataFrame(); df.write() .format("hudi") .options(QuickstartUtils.getQuickstartWriteConfigs()) - .option(DataSourceWriteOptions.RECORDKEY_FIELD().key(), "magic_number") - .option(DataSourceWriteOptions.PRECOMBINE_FIELD().key(), "name") - .option(DataSourceWriteOptions.PARTITIONPATH_FIELD().key(), "id") + .option(DataSourceWriteOptions.RECORDKEY_FIELD().key(), "decimalCol") + .option(DataSourceWriteOptions.PRECOMBINE_FIELD().key(), "intCol") + 
.option(DataSourceWriteOptions.PARTITIONPATH_FIELD().key(), "partitionPath") .option(HoodieWriteConfig.TABLE_NAME, partitionedIdentifier) .mode(SaveMode.Overwrite) .save(partitionedLocation); - df.write() - .format("hudi") - .options(QuickstartUtils.getQuickstartWriteConfigs()) - .option(DataSourceWriteOptions.RECORDKEY_FIELD().key(), "magic_number") - .option(DataSourceWriteOptions.PRECOMBINE_FIELD().key(), "name") - .option(DataSourceWriteOptions.PARTITIONPATH_FIELD().key(), "") - .option(HoodieWriteConfig.TABLE_NAME, unpartitionedIdentifier) - .mode(SaveMode.Overwrite) - .save(unpartitionedLocation); +// df.write() +// .format("hudi") +// .options(QuickstartUtils.getQuickstartWriteConfigs()) +// .option(DataSourceWriteOptions.RECORDKEY_FIELD().key(), "magic_number") +// .option(DataSourceWriteOptions.PRECOMBINE_FIELD().key(), "name") +// .option(DataSourceWriteOptions.PARTITIONPATH_FIELD().key(), "") +// .option(HoodieWriteConfig.TABLE_NAME, unpartitionedIdentifier) +// .mode(SaveMode.Overwrite) +// .save(unpartitionedLocation); } @Test @@ -233,58 +214,6 @@ public void testHudiPartitionedTableWrite() { LOG.info("Generated partitioned dataframe: {}", df.showString(10, 20, false)); } - @Test - public void testHudiMetaClientExploration() { - HoodieTableMetaClient hoodieTableMetaClient = - HoodieTableMetaClient.builder() - .setConf(spark.sessionState().newHadoopConf()) - .setBasePath(partitionedLocation) - .setLoadActiveTimelineOnLoad(true) - .build(); - - LOG.info("Alpha test: hoodie table base path: {}", hoodieTableMetaClient.getBasePathV2()); - LOG.info( - "Alpha test: hoodie getBootStrapIndexByFileId: {}", - hoodieTableMetaClient.getBootstrapIndexByFileIdFolderNameFolderPath()); - LOG.info( - "Alpha test: hoodie getBootStrapIndexByPartitionPath: {}", - hoodieTableMetaClient.getBootstrapIndexByPartitionFolderPath()); - LOG.info( - "Alpha test: hoodie getCommitActionType: {}", hoodieTableMetaClient.getCommitActionType()); - LOG.info( - "Alpha test: hoodie 
getCommitsAndCompactionTimeline: {}", - hoodieTableMetaClient.getCommitsAndCompactionTimeline()); - LOG.info( - "Alpha test: hoodie getCommitsTimeline: {}", hoodieTableMetaClient.getCommitsTimeline()); - LOG.info("Alpha test: hoodie getCommitTimeline: {}", hoodieTableMetaClient.getCommitTimeline()); - LOG.info( - "Alpha test: hoodie getConsistencyGuardConfig: {}", - hoodieTableMetaClient.getConsistencyGuardConfig().toString()); - LOG.info( - "Alpha test: hoodie getFileSystemRetryConfig: {}", - hoodieTableMetaClient.getFileSystemRetryConfig().toString()); - LOG.info( - "Alpha test: hoodie getHashingMetadataPath: {}", - hoodieTableMetaClient.getHashingMetadataPath()); - LOG.info( - "Alpha test: hoodie getMetaAuxiliaryPath: {}", - hoodieTableMetaClient.getMetaAuxiliaryPath()); - LOG.info("Alpha test: hoodie getMetaPath: {}", hoodieTableMetaClient.getMetaPath()); - LOG.info( - "Alpha test: hoodie getMetastoreConfig: {}", - hoodieTableMetaClient.getMetastoreConfig().toString()); - LOG.info( - "Alpha test: hoodie getSchemaFolderName: {}", hoodieTableMetaClient.getSchemaFolderName()); - LOG.info( - "Alpha test: hoodie getTableConfig: {}", hoodieTableMetaClient.getTableConfig().toString()); - LOG.info( - "Alpha test: hoodie getTableType: {}", hoodieTableMetaClient.getTableType().toString()); - LOG.info("Alpha test: hoodie getTempFolderPath: {}", hoodieTableMetaClient.getTempFolderPath()); - LOG.info( - "Alpha test: hoodie getTimelineLayoutVersion: {}", - hoodieTableMetaClient.getTimelineLayoutVersion()); - } - @Test public void testHudiMetaClientAlpha() { LOG.info("Alpha test reference: hoodie table path: {}", partitionedLocation); @@ -294,29 +223,93 @@ public void testHudiMetaClientAlpha() { spark, partitionedLocation, newTableIdentifier) .execute(); - checkSnapshotIntegrity(partitionedIdentifier, newTableIdentifier); + checkSnapshotIntegrity(partitionedLocation, newTableIdentifier); } private void checkSnapshotIntegrity( - String hudiTableIdentifier, + String 
hudiTableLocation, String icebergTableIdentifier) { - -// List deltaTableContents = -// spark.sql("SELECT * FROM " + hudiTableIdentifier).collectAsList(); + Dataset hudiResult = spark.read().format("hudi").option(DataSourceReadOptions.QUERY_TYPE_OPT_KEY(), DataSourceReadOptions.QUERY_TYPE_SNAPSHOT_OPT_VAL()).load(hudiTableLocation); + Dataset icebergResult = spark.sql("SELECT * FROM " + icebergTableIdentifier); + // Need to sort the column by names since hudi tends to return the columns in a different order (put the one used for partitioning last) +// Dataset hudiSortedResult = hudiResult.groupBy(getColumns(hudiResult)).count(); +// Dataset icebergSortedResult = icebergResult.groupBy(getColumns(icebergResult)).count(); + List hudiTableContents = + hudiResult.collectAsList(); List icebergTableContents = - spark.sql("SELECT * FROM " + icebergTableIdentifier).collectAsList(); - LOG.info("Iceberg table contents: {}", spark.sql("SELECT * FROM " + icebergTableIdentifier).showString(10, 20, false)); - return; + icebergResult.collectAsList(); + LOG.info("Hudi table contents: {}", hudiResult.showString(10, 20, false)); + LOG.info("Iceberg table contents: {}", icebergResult.showString(10, 20, false)); + Assertions.assertThat(hudiTableContents).hasSize(icebergTableContents.size()); + Assertions.assertThat(hudiTableContents).containsAll(icebergTableContents); + Assertions.assertThat(icebergTableContents).containsAll(hudiTableContents); // TODO: may change to containsExactlyInAnyOrderElementsOf + } -// Assertions.assertThat(deltaTableContents).hasSize(icebergTableContents.size()); -// Assertions.assertThat(icebergTableContents).containsAll(deltaTableContents); -// Assertions.assertThat(deltaTableContents).containsAll(icebergTableContents); + private Column[] getColumns(Dataset df) { + Column[] columns = new Column[df.columns().length]; + for (int i = 0; i < df.columns().length; i++) { + columns[i] = df.col(df.columns()[i]); + } + Arrays.sort(columns, 
Comparator.comparing(Column::toString)); + return columns; } + private String destName(String catalogName, String dest) { if (catalogName.equals(defaultSparkCatalog)) { return NAMESPACE + "." + catalogName + "_" + dest; } return catalogName + "." + NAMESPACE + "." + catalogName + "_" + dest; } + + private Dataset typeTestDataFrame() { + return spark + .range(0, 5, 1, 5) + .withColumnRenamed("id", "longCol") + .withColumn("intCol", expr("CAST(longCol AS INT)")) + .withColumn("floatCol", expr("CAST(longCol AS FLOAT)")) + .withColumn("doubleCol", expr("CAST(longCol AS DOUBLE)")) + .withColumn("dateCol", date_add(current_date(), 1)) +// .withColumn("timestampCol", expr("TO_TIMESTAMP(dateCol)")) + .withColumn("stringCol", expr("CAST(dateCol AS STRING)")) + .withColumn("booleanCol", expr("longCol > 5")) + .withColumn("binaryCol", expr("CAST(longCol AS BINARY)")) + .withColumn("byteCol", expr("CAST(longCol AS BYTE)")) + .withColumn("decimalCol", expr("CAST(longCol AS DECIMAL(10, 2))")) + .withColumn("shortCol", expr("CAST(longCol AS SHORT)")) + .withColumn("mapCol", expr("MAP(stringCol, shortCol)")) // Hudi requires Map key to be String + .withColumn("arrayCol", expr("ARRAY(longCol)")) + .withColumn("structCol", expr("STRUCT(mapCol, arrayCol)")) + .withColumn("partitionPath", expr("CAST(longCol AS STRING)")); + } + + private Dataset nestedDataFrame() { + return spark + .range(0, 5, 1, 5) + .withColumn("longCol", expr("id")) + .withColumn("decimalCol", expr("CAST(longCol AS DECIMAL(10, 2))")) + .withColumn("magic_number", expr("rand(5) * 100")) + .withColumn("dateCol", date_add(current_date(), 1)) + .withColumn("dateString", expr("CAST(dateCol AS STRING)")) + .withColumn("random1", expr("CAST(rand(5) * 100 as LONG)")) + .withColumn("random2", expr("CAST(rand(51) * 100 as LONG)")) + .withColumn("random3", expr("CAST(rand(511) * 100 as LONG)")) + .withColumn("random4", expr("CAST(rand(15) * 100 as LONG)")) + .withColumn("random5", expr("CAST(rand(115) * 100 as LONG)")) + 
.withColumn("innerStruct1", expr("STRUCT(random1, random2)")) + .withColumn("innerStruct2", expr("STRUCT(random3, random4)")) + .withColumn("structCol1", expr("STRUCT(innerStruct1, innerStruct2)")) + .withColumn( + "innerStruct3", + expr("STRUCT(SHA1(CAST(random5 AS BINARY)), SHA1(CAST(random1 AS BINARY)))")) + .withColumn( + "structCol2", + expr( + "STRUCT(innerStruct3, STRUCT(SHA1(CAST(random2 AS BINARY)), SHA1(CAST(random3 AS BINARY))))")) + .withColumn("arrayCol", expr("ARRAY(random1, random2, random3, random4, random5)")) + .withColumn("mapCol1", expr("MAP(structCol1, structCol2)")) + .withColumn("mapCol2", expr("MAP(longCol, dateString)")) + .withColumn("mapCol3", expr("MAP(dateCol, arrayCol)")) + .withColumn("structCol3", expr("STRUCT(structCol2, mapCol3, arrayCol)")); + } } From 62ef777e9d35ae87560ce685211fa67693d3e380 Mon Sep 17 00:00:00 2001 From: Rushan Jiang Date: Mon, 30 Jan 2023 22:43:04 -0500 Subject: [PATCH 12/20] find bugs when arrayType presents --- build.gradle | 1 + .../iceberg/hudi/TestSnapshotHudiTable.java | 214 ++++++++++++------ .../hudi/BaseSnapshotHudiTableAction.java | 46 ++-- 3 files changed, 174 insertions(+), 87 deletions(-) diff --git a/build.gradle b/build.gradle index a2c284166c49..82d27288d7d1 100644 --- a/build.gradle +++ b/build.gradle @@ -473,6 +473,7 @@ project(':iceberg-hudi') { exclude group: 'com.google.code.gson', module: 'gson' } if (sparkVersions.contains("3.3") && scalaVersion == "2.12") { + integrationImplementation project(':iceberg-data') integrationImplementation("org.apache.hudi:hudi-spark3.3-bundle_2.12:0.12.2") integrationImplementation project(path: ":iceberg-spark:iceberg-spark-3.3_${scalaVersion}") integrationImplementation("org.apache.hadoop:hadoop-minicluster") { diff --git a/hudi/src/integration/java/org/apache/iceberg/hudi/TestSnapshotHudiTable.java b/hudi/src/integration/java/org/apache/iceberg/hudi/TestSnapshotHudiTable.java index 071e6151093c..bf21ebeb4bbe 100644 --- 
a/hudi/src/integration/java/org/apache/iceberg/hudi/TestSnapshotHudiTable.java +++ b/hudi/src/integration/java/org/apache/iceberg/hudi/TestSnapshotHudiTable.java @@ -24,31 +24,24 @@ import java.io.File; import java.io.IOException; -import java.util.Arrays; -import java.util.Comparator; -import java.util.HashSet; import java.util.List; import java.util.Map; -import java.util.Set; -import java.util.stream.Collectors; - import org.apache.hudi.DataSourceReadOptions; import org.apache.hudi.DataSourceWriteOptions; -import org.apache.hudi.QuickstartUtils; -import org.apache.hudi.common.table.HoodieTableMetaClient; import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.iceberg.Table; +import org.apache.iceberg.TableProperties; +import org.apache.iceberg.catalog.TableIdentifier; +import org.apache.iceberg.data.IcebergGenerics; +import org.apache.iceberg.data.Record; +import org.apache.iceberg.io.CloseableIterable; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.relocated.com.google.common.collect.Sets; -import org.apache.iceberg.spark.SparkSessionCatalog; -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.sql.Column; +import org.apache.iceberg.spark.Spark3Util; +import org.apache.iceberg.spark.SparkCatalog; import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.RelationalGroupedDataset; import org.apache.spark.sql.Row; -import org.apache.spark.sql.SQLContext; import org.apache.spark.sql.SaveMode; +import org.apache.spark.sql.connector.catalog.CatalogPlugin; import org.apache.spark.sql.hudi.catalog.HoodieCatalog; import org.assertj.core.api.Assertions; import org.junit.Before; @@ -80,13 +73,15 @@ public class TestSnapshotHudiTable extends SparkHudiMigrationTestBase { private String unpartitionedLocation; private String newIcebergTableLocation; private String 
externalDataFilesTableLocation; + private Dataset typeTestDataframe = typeTestDataFrame(); + private Dataset nestedDataframe = nestedDataFrame(); @Parameterized.Parameters(name = "Catalog Name {0} - Options {2}") public static Object[][] parameters() { return new Object[][] { new Object[] { icebergCatalogName, - SparkSessionCatalog.class.getName(), + SparkCatalog.class.getName(), ImmutableMap.of( "type", "hive", @@ -177,27 +172,39 @@ public void before() throws IOException { spark.sql(String.format("DROP TABLE IF EXISTS %s", unpartitionedIdentifier)); spark.sql(String.format("DROP TABLE IF EXISTS %s", externalDataFilesIdentifier)); - Dataset df = typeTestDataFrame(); - - df.write() - .format("hudi") - .options(QuickstartUtils.getQuickstartWriteConfigs()) - .option(DataSourceWriteOptions.RECORDKEY_FIELD().key(), "decimalCol") - .option(DataSourceWriteOptions.PRECOMBINE_FIELD().key(), "intCol") - .option(DataSourceWriteOptions.PARTITIONPATH_FIELD().key(), "partitionPath") - .option(HoodieWriteConfig.TABLE_NAME, partitionedIdentifier) - .mode(SaveMode.Overwrite) - .save(partitionedLocation); + // typeTestDataframe.write() + // .format("hudi") + // .options(QuickstartUtils.getQuickstartWriteConfigs()) + // .option(DataSourceWriteOptions.RECORDKEY_FIELD().key(), "decimalCol") + // .option(DataSourceWriteOptions.PRECOMBINE_FIELD().key(), "intCol") + // .option(DataSourceWriteOptions.PARTITIONPATH_FIELD().key(), "partitionPath") + // .option(HoodieWriteConfig.TABLE_NAME, partitionedIdentifier) + // .mode(SaveMode.Overwrite) + // .save(partitionedLocation); + writeHoodieTable( + typeTestDataframe, + "decimalCol", + "intCol", + "partitionPath", + partitionedLocation, + partitionedIdentifier); -// df.write() -// .format("hudi") -// .options(QuickstartUtils.getQuickstartWriteConfigs()) -// .option(DataSourceWriteOptions.RECORDKEY_FIELD().key(), "magic_number") -// .option(DataSourceWriteOptions.PRECOMBINE_FIELD().key(), "name") -// 
.option(DataSourceWriteOptions.PARTITIONPATH_FIELD().key(), "") -// .option(HoodieWriteConfig.TABLE_NAME, unpartitionedIdentifier) -// .mode(SaveMode.Overwrite) -// .save(unpartitionedLocation); + // df.write() + // .format("hudi") + // .options(QuickstartUtils.getQuickstartWriteConfigs()) + // .option(DataSourceWriteOptions.RECORDKEY_FIELD().key(), "magic_number") + // .option(DataSourceWriteOptions.PRECOMBINE_FIELD().key(), "name") + // .option(DataSourceWriteOptions.PARTITIONPATH_FIELD().key(), "") + // .option(HoodieWriteConfig.TABLE_NAME, unpartitionedIdentifier) + // .mode(SaveMode.Overwrite) + // .save(unpartitionedLocation); + writeHoodieTable( + typeTestDataframe, + "decimalCol", + "intCol", + "", + unpartitionedLocation, + unpartitionedIdentifier); } @Test @@ -215,45 +222,95 @@ public void testHudiPartitionedTableWrite() { } @Test - public void testHudiMetaClientAlpha() { + public void testBasicPartitionedTable() { LOG.info("Alpha test reference: hoodie table path: {}", partitionedLocation); String newTableIdentifier = destName(icebergCatalogName, "alpha_iceberg_table"); SnapshotHudiTable.Result result = HudiToIcebergMigrationSparkIntegration.snapshotHudiTable( spark, partitionedLocation, newTableIdentifier) .execute(); + Table table = getIcebergTable(newTableIdentifier); + queryManual(table); + // checkSnapshotIntegrity(partitionedLocation, newTableIdentifier); + } - checkSnapshotIntegrity(partitionedLocation, newTableIdentifier); + @Test + public void referenceIcebergTable() { + String newTableIdentifier = destName(icebergCatalogName, "reference_iceberg_table"); + typeTestDataframe + .writeTo(newTableIdentifier) + .using("iceberg") + .tableProperty( + TableProperties.WRITE_DATA_LOCATION, + "/Users/jonasjiang/Workspace/Apache_Hudi_ws/hudi_table_test/unpartitioned_iceberg_ref") + .tableProperty( + TableProperties.WRITE_METADATA_LOCATION, + "/Users/jonasjiang/Workspace/Apache_Hudi_ws/hudi_table_test/unpartitioned_iceberg_ref/metadata") + 
.createOrReplace(); + Table table = getIcebergTable(newTableIdentifier); + queryManual(table); } - private void checkSnapshotIntegrity( - String hudiTableLocation, - String icebergTableIdentifier) { - Dataset hudiResult = spark.read().format("hudi").option(DataSourceReadOptions.QUERY_TYPE_OPT_KEY(), DataSourceReadOptions.QUERY_TYPE_SNAPSHOT_OPT_VAL()).load(hudiTableLocation); + @Test + public void testBasicUnpartitionedTable() { + String newTableIdentifier = destName(icebergCatalogName, "alpha_iceberg_table_2"); + SnapshotHudiTable.Result result = + HudiToIcebergMigrationSparkIntegration.snapshotHudiTable( + spark, unpartitionedLocation, newTableIdentifier) + .execute(); + + Dataset hudiResult = + spark + .read() + .format("hudi") + .option( + DataSourceReadOptions.QUERY_TYPE_OPT_KEY(), + DataSourceReadOptions.QUERY_TYPE_SNAPSHOT_OPT_VAL()) + .load(unpartitionedLocation); + LOG.info("Hudi table contents: {}", hudiResult.showString(10, 20, false)); + Table table = getIcebergTable(newTableIdentifier); + queryManual(table); + checkSnapshotIntegrity(unpartitionedLocation, newTableIdentifier); + } + + private void checkSnapshotIntegrity(String hudiTableLocation, String icebergTableIdentifier) { + Dataset hudiResult = + spark + .read() + .format("hudi") + .option( + DataSourceReadOptions.QUERY_TYPE_OPT_KEY(), + DataSourceReadOptions.QUERY_TYPE_SNAPSHOT_OPT_VAL()) + .load(hudiTableLocation); Dataset icebergResult = spark.sql("SELECT * FROM " + icebergTableIdentifier); - // Need to sort the column by names since hudi tends to return the columns in a different order (put the one used for partitioning last) -// Dataset hudiSortedResult = hudiResult.groupBy(getColumns(hudiResult)).count(); -// Dataset icebergSortedResult = icebergResult.groupBy(getColumns(icebergResult)).count(); - List hudiTableContents = - hudiResult.collectAsList(); - List icebergTableContents = - icebergResult.collectAsList(); LOG.info("Hudi table contents: {}", hudiResult.showString(10, 20, false)); 
LOG.info("Iceberg table contents: {}", icebergResult.showString(10, 20, false)); + // TODO: adjust test technique since hudi tends to return the columns in a different order (put + // the one used for partitioning last) + List hudiTableContents = hudiResult.collectAsList(); + List icebergTableContents = icebergResult.collectAsList(); + Assertions.assertThat(hudiTableContents).hasSize(icebergTableContents.size()); Assertions.assertThat(hudiTableContents).containsAll(icebergTableContents); - Assertions.assertThat(icebergTableContents).containsAll(hudiTableContents); // TODO: may change to containsExactlyInAnyOrderElementsOf + Assertions.assertThat(icebergTableContents) + .containsAll(hudiTableContents); // TODO: may change to containsExactlyInAnyOrderElementsOf } - private Column[] getColumns(Dataset df) { - Column[] columns = new Column[df.columns().length]; - for (int i = 0; i < df.columns().length; i++) { - columns[i] = df.col(df.columns()[i]); + private void queryManual(Table table) { + CloseableIterable records = IcebergGenerics.read(table).build(); + for (Record record : records) { + LOG.info("Alpha Test Iceberg Record: {}", record); } - Arrays.sort(columns, Comparator.comparing(Column::toString)); - return columns; } + private Table getIcebergTable(String icebergTableIdentifier) { + CatalogPlugin defaultCatalog = spark.sessionState().catalogManager().currentCatalog(); + Spark3Util.CatalogAndIdentifier catalogAndIdent = + Spark3Util.catalogAndIdentifier( + "test catalog", spark, icebergTableIdentifier, defaultCatalog); + return Spark3Util.loadIcebergCatalog(spark, catalogAndIdent.catalog().name()) + .loadTable(TableIdentifier.parse(catalogAndIdent.identifier().toString())); + } private String destName(String catalogName, String dest) { if (catalogName.equals(defaultSparkCatalog)) { @@ -267,20 +324,23 @@ private Dataset typeTestDataFrame() { .range(0, 5, 1, 5) .withColumnRenamed("id", "longCol") .withColumn("intCol", expr("CAST(longCol AS INT)")) - 
.withColumn("floatCol", expr("CAST(longCol AS FLOAT)")) - .withColumn("doubleCol", expr("CAST(longCol AS DOUBLE)")) + // .withColumn("floatCol", expr("CAST(longCol AS FLOAT)")) + // .withColumn("doubleCol", expr("CAST(longCol AS DOUBLE)")) .withColumn("dateCol", date_add(current_date(), 1)) -// .withColumn("timestampCol", expr("TO_TIMESTAMP(dateCol)")) + // .withColumn("timestampCol", expr("TO_TIMESTAMP(dateCol)")) .withColumn("stringCol", expr("CAST(dateCol AS STRING)")) - .withColumn("booleanCol", expr("longCol > 5")) - .withColumn("binaryCol", expr("CAST(longCol AS BINARY)")) - .withColumn("byteCol", expr("CAST(longCol AS BYTE)")) + // .withColumn("booleanCol", expr("longCol > 5")) + // .withColumn("binaryCol", expr("CAST(longCol AS BINARY)")) + // .withColumn("byteCol", expr("CAST(longCol AS BYTE)")) .withColumn("decimalCol", expr("CAST(longCol AS DECIMAL(10, 2))")) - .withColumn("shortCol", expr("CAST(longCol AS SHORT)")) - .withColumn("mapCol", expr("MAP(stringCol, shortCol)")) // Hudi requires Map key to be String - .withColumn("arrayCol", expr("ARRAY(longCol)")) - .withColumn("structCol", expr("STRUCT(mapCol, arrayCol)")) - .withColumn("partitionPath", expr("CAST(longCol AS STRING)")); + // .withColumn("shortCol", expr("CAST(longCol AS SHORT)")) + .withColumn("mapCol", expr("MAP(stringCol, intCol)")) // Hudi requires Map key to be String + .withColumn("arrayCol", expr("ARRAY(dateCol)")) + // .withColumn("structCol", expr("STRUCT(longCol AS a, longCol AS b)")) + .withColumn( + "partitionPath", + expr("CAST(longCol AS STRING)")); // For test convenience, please put the partition col + // in the end. 
} private Dataset nestedDataFrame() { @@ -312,4 +372,22 @@ private Dataset nestedDataFrame() { .withColumn("mapCol3", expr("MAP(dateCol, arrayCol)")) .withColumn("structCol3", expr("STRUCT(structCol2, mapCol3, arrayCol)")); } + + private void writeHoodieTable( + Dataset df, + String recordKey, + String preCombineKey, + String partitionPathField, + String tableLocation, + String tableIdentifier) { + df.write() + .format("hudi") + // .options(QuickstartUtils.getQuickstartWriteConfigs()) + .option(DataSourceWriteOptions.RECORDKEY_FIELD().key(), recordKey) + .option(DataSourceWriteOptions.PRECOMBINE_FIELD().key(), preCombineKey) + .option(DataSourceWriteOptions.PARTITIONPATH_FIELD().key(), partitionPathField) + .option(HoodieWriteConfig.TBL_NAME.key(), tableIdentifier) + .mode(SaveMode.Append) + .save(tableLocation); + } } diff --git a/hudi/src/main/java/org/apache/iceberg/hudi/BaseSnapshotHudiTableAction.java b/hudi/src/main/java/org/apache/iceberg/hudi/BaseSnapshotHudiTableAction.java index 06f61cf1b62c..69f631180a5e 100644 --- a/hudi/src/main/java/org/apache/iceberg/hudi/BaseSnapshotHudiTableAction.java +++ b/hudi/src/main/java/org/apache/iceberg/hudi/BaseSnapshotHudiTableAction.java @@ -25,6 +25,7 @@ import java.util.stream.Stream; import javax.annotation.Nullable; import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.Path; import org.apache.hudi.common.config.HoodieMetadataConfig; import org.apache.hudi.common.engine.HoodieEngineContext; import org.apache.hudi.common.engine.HoodieLocalEngineContext; @@ -37,16 +38,12 @@ import org.apache.hudi.common.table.TableSchemaResolver; import org.apache.hudi.common.table.timeline.HoodieInstant; import org.apache.hudi.common.table.timeline.HoodieTimeline; -import org.apache.hudi.common.table.view.FileSystemViewManager; import org.apache.hudi.common.table.view.HoodieTableFileSystemView; import org.apache.hudi.common.util.Option; import org.apache.hudi.exception.HoodieException; import 
org.apache.hudi.hadoop.utils.HoodieInputFormatUtils; import org.apache.hudi.internal.schema.InternalSchema; import org.apache.hudi.internal.schema.convert.AvroInternalSchemaConverter; -import org.apache.hudi.metadata.HoodieTableMetadata; - -import org.apache.hadoop.fs.Path; import org.apache.iceberg.AppendFiles; import org.apache.iceberg.DataFile; import org.apache.iceberg.DataFiles; @@ -118,12 +115,14 @@ public BaseSnapshotHudiTableAction( @Override public SnapshotHudiTable tableProperties(Map properties) { - return null; + additionalPropertiesBuilder.putAll(properties); + return this; } @Override public SnapshotHudiTable tableProperty(String key, String value) { - return null; + additionalPropertiesBuilder.put(key, value); + return this; } @Override @@ -137,17 +136,20 @@ public Result execute() { // TODO: add support for newTableLocation Transaction icebergTransaction = icebergCatalog.newCreateTableTransaction( - newTableIdentifier, icebergSchema, partitionSpec, destTableProperties()); + newTableIdentifier, + icebergSchema, + partitionSpec, + hoodieTableBasePath, + destTableProperties()); // We need name mapping to ensure we can read data files correctly as iceberg table has its own // rule to assign field id // Although the field id rule seems to be the same as hudi, but the rule is not guaranteed by // any API + NameMapping nameMapping = MappingUtil.create(icebergTransaction.table().schema()); icebergTransaction .table() .updateProperties() - .set( - TableProperties.DEFAULT_NAME_MAPPING, - NameMappingParser.toJson(MappingUtil.create(icebergTransaction.table().schema()))) + .set(TableProperties.DEFAULT_NAME_MAPPING, NameMappingParser.toJson(nameMapping)) .commit(); // Pre-process the timeline, we only need to process all COMPLETED commit for COW table @@ -156,18 +158,24 @@ public Result execute() { hoodieTableMetaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants(); // Initialize the FileSystemView for querying table data files // TODO: need 
to choose the correct implementation of the FileSystemView -// HoodieTableFileSystemView hoodieTableFileSystemView = -// FileSystemViewManager.createInMemoryFileSystemViewWithTimeline( -// hoodieEngineContext, hoodieTableMetaClient, hoodieMetadataConfig, timeline); - HoodieTableFileSystemView hoodieTableFileSystemView = new HoodieTableFileSystemView( - hoodieTableMetaClient, timeline); + // HoodieTableFileSystemView hoodieTableFileSystemView = + // FileSystemViewManager.createInMemoryFileSystemViewWithTimeline( + // hoodieEngineContext, hoodieTableMetaClient, hoodieMetadataConfig, timeline); + HoodieTableFileSystemView hoodieTableFileSystemView = + new HoodieTableFileSystemView(hoodieTableMetaClient, timeline); // get all instants on the timeline Stream completedInstants = timeline.getInstants(); - List partitionPaths = FSUtils.getAllPartitionPaths(hoodieEngineContext, hoodieMetadataConfig, hoodieTableMetaClient.getBasePathV2().toString()); + List partitionPaths = + FSUtils.getAllPartitionPaths( + hoodieEngineContext, + hoodieMetadataConfig, + hoodieTableMetaClient.getBasePathV2().toString()); try { for (String partitionPath : partitionPaths) { - Path fullPartitionPath = FSUtils.getPartitionPath(hoodieTableMetaClient.getBasePathV2(), partitionPath); - hoodieTableFileSystemView.addFilesToView(FSUtils.getAllDataFilesInPartition(hoodieTableMetaClient.getFs(), fullPartitionPath)); + Path fullPartitionPath = + FSUtils.getPartitionPath(hoodieTableMetaClient.getBasePathV2(), partitionPath); + hoodieTableFileSystemView.addFilesToView( + FSUtils.getAllDataFilesInPartition(hoodieTableMetaClient.getFs(), fullPartitionPath)); } } catch (IOException e) { throw new RuntimeException("Failed to get all data files in partition", e); @@ -380,7 +388,7 @@ private Schema convertToIcebergSchema(InternalSchema hudiSchema) { private PartitionSpec getPartitionSpecFromHoodieMetadataData(Schema schema) { Option partitionNames = hoodieTableConfig.getPartitionFields(); - if 
(partitionNames.isPresent()) { + if (partitionNames.isPresent() && partitionNames.get().length > 0) { PartitionSpec.Builder builder = PartitionSpec.builderFor(schema); for (String partitionName : partitionNames.get()) { builder.identity(partitionName); From 8fef8c9b9f3962792195e09e143372cbd75a67c1 Mon Sep 17 00:00:00 2001 From: Rushan Jiang Date: Tue, 31 Jan 2023 17:40:19 -0500 Subject: [PATCH 13/20] remove the need of hadoop-mr --- .../apache/iceberg/hudi/TestSnapshotHudiTable.java | 2 +- .../iceberg/hudi/BaseSnapshotHudiTableAction.java | 14 ++++++++++---- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/hudi/src/integration/java/org/apache/iceberg/hudi/TestSnapshotHudiTable.java b/hudi/src/integration/java/org/apache/iceberg/hudi/TestSnapshotHudiTable.java index bf21ebeb4bbe..57a58cb383ab 100644 --- a/hudi/src/integration/java/org/apache/iceberg/hudi/TestSnapshotHudiTable.java +++ b/hudi/src/integration/java/org/apache/iceberg/hudi/TestSnapshotHudiTable.java @@ -231,7 +231,7 @@ public void testBasicPartitionedTable() { .execute(); Table table = getIcebergTable(newTableIdentifier); queryManual(table); - // checkSnapshotIntegrity(partitionedLocation, newTableIdentifier); + checkSnapshotIntegrity(partitionedLocation, newTableIdentifier); } @Test diff --git a/hudi/src/main/java/org/apache/iceberg/hudi/BaseSnapshotHudiTableAction.java b/hudi/src/main/java/org/apache/iceberg/hudi/BaseSnapshotHudiTableAction.java index 69f631180a5e..2191c6a88f56 100644 --- a/hudi/src/main/java/org/apache/iceberg/hudi/BaseSnapshotHudiTableAction.java +++ b/hudi/src/main/java/org/apache/iceberg/hudi/BaseSnapshotHudiTableAction.java @@ -41,7 +41,6 @@ import org.apache.hudi.common.table.view.HoodieTableFileSystemView; import org.apache.hudi.common.util.Option; import org.apache.hudi.exception.HoodieException; -import org.apache.hudi.hadoop.utils.HoodieInputFormatUtils; import org.apache.hudi.internal.schema.InternalSchema; import 
org.apache.hudi.internal.schema.convert.AvroInternalSchemaConverter; import org.apache.iceberg.AppendFiles; @@ -107,7 +106,7 @@ public BaseSnapshotHudiTableAction( this.hoodieTableConfig = hoodieTableMetaClient.getTableConfig(); this.hoodieEngineContext = new HoodieLocalEngineContext(hoodieConfiguration); this.hoodieTableBasePath = hoodieTableBasePath; - this.hoodieMetadataConfig = HoodieInputFormatUtils.buildMetadataConfig(hoodieConfiguration); + this.hoodieMetadataConfig = buildMetadataConfig(hoodieConfiguration); this.hoodieFileIO = new HadoopFileIO(hoodieConfiguration); this.icebergCatalog = icebergCatalog; this.newTableIdentifier = newTableIdentifier; @@ -198,8 +197,6 @@ public Result execute() { // BEGIN TEST ONLY CODE List testGroups = hoodieTableFileSystemView.getLatestBaseFiles().collect(Collectors.toList()); - LOG.info("Alpha test: get all stamped data files: {}", allStampedDataFiles); - LOG.info("Alpha test: get all file groups: {}", testGroups); // END TEST ONLY CODE // Help tracked if a previous version of the data file has been added to the iceberg table @@ -448,4 +445,13 @@ private Metrics getMetricsForFile( throw new ValidationException("Cannot get metrics from file format: %s", format); } } + + private HoodieMetadataConfig buildMetadataConfig(Configuration conf) { + return HoodieMetadataConfig.newBuilder() + .enable( + conf.getBoolean( + HoodieMetadataConfig.ENABLE.key(), + HoodieMetadataConfig.DEFAULT_METADATA_ENABLE_FOR_READERS)) + .build(); + } } From 9db1ead9b263d2b5fe9475f6edba5db6c1b7a4cc Mon Sep 17 00:00:00 2001 From: Rushan Jiang Date: Tue, 31 Jan 2023 22:40:54 -0500 Subject: [PATCH 14/20] verify multiple commits --- .../iceberg/hudi/TestSnapshotHudiTable.java | 274 +++++++++--------- .../hudi/BaseSnapshotHudiTableAction.java | 2 + 2 files changed, 141 insertions(+), 135 deletions(-) diff --git a/hudi/src/integration/java/org/apache/iceberg/hudi/TestSnapshotHudiTable.java 
b/hudi/src/integration/java/org/apache/iceberg/hudi/TestSnapshotHudiTable.java index 57a58cb383ab..5a466f6fcee3 100644 --- a/hudi/src/integration/java/org/apache/iceberg/hudi/TestSnapshotHudiTable.java +++ b/hudi/src/integration/java/org/apache/iceberg/hudi/TestSnapshotHudiTable.java @@ -30,7 +30,6 @@ import org.apache.hudi.DataSourceWriteOptions; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.iceberg.Table; -import org.apache.iceberg.TableProperties; import org.apache.iceberg.catalog.TableIdentifier; import org.apache.iceberg.data.IcebergGenerics; import org.apache.iceberg.data.Record; @@ -65,14 +64,14 @@ public class TestSnapshotHudiTable extends SparkHudiMigrationTestBase { private static final String icebergCatalogName = "iceberg_hive"; private String partitionedIdentifier; private String unpartitionedIdentifier; - private String externalDataFilesIdentifier; + private String multiCommitIdentifier; private final String partitionedTableName = "partitioned_table"; private final String unpartitionedTableName = "unpartitioned_table"; - private final String externalDataFilesTableName = "external_data_files_table"; + private final String multiCommitTableName = "multi_commit_table"; private String partitionedLocation; private String unpartitionedLocation; private String newIcebergTableLocation; - private String externalDataFilesTableLocation; + private String multiCommitTableLocation; private Dataset typeTestDataframe = typeTestDataFrame(); private Dataset nestedDataframe = nestedDataFrame(); @@ -107,122 +106,38 @@ public TestSnapshotHudiTable( spark.conf().set("spark.sql.catalog." + defaultSparkCatalog, HoodieCatalog.class.getName()); } - /** - * The test hardcode a nested dataframe to test the snapshot feature. The schema of created - * dataframe is: - * - *

-   *  root
-   *  |-- address_nested: struct (nullable = true)
-   *  |    |-- current: struct (nullable = true)
-   *  |    |    |-- city: string (nullable = true)
-   *  |    |    |-- state: string (nullable = true)
-   *  |    |-- previous: struct (nullable = true)
-   *  |    |    |-- city: string (nullable = true)
-   *  |    |    |-- state: string (nullable = true)
-   *  |-- addresses: array (nullable = true)
-   *  |    |-- element: struct (containsNull = true)
-   *  |    |    |-- city: string (nullable = true)
-   *  |    |    |-- state: string (nullable = true)
-   *  |-- id: long (nullable = true)
-   *  |-- magic_number: double (nullable = true)
-   *  |-- name: string (nullable = true)
-   *  |-- properties: struct (nullable = true)
-   *  |    |-- eye: string (nullable = true)
-   *  |    |-- hair: string (nullable = true)
-   *  |-- secondProp: struct (nullable = true)
-   *  |    |-- height: string (nullable = true)
-   *  |-- subjects: array (nullable = true)
-   *  |    |-- element: array (containsNull = true)
-   *  |    |    |-- element: string (containsNull = true)
-   * 
- * - * The dataframe content is (by calling df.show()): - * - *
-   * +--------------------+--------------------+---+--------------+-------+--------------------+----------+--------------------+
-   * |      address_nested|           addresses| id|  magic_number|   name|          properties|secondProp|            subjects|
-   * +--------------------+--------------------+---+--------------+-------+--------------------+----------+--------------------+
-   * |{{NewYork, NY}, {...|[{SanJose, CA}, {...|  1|1.123123123123|Michael|      {black, brown}|       {6}|[[Java, Scala, C+...|
-   * |{{NewY1231ork, N1...|[{SanJos123123e, ...|  2|2.123123123123|   Test|      {black, brown}|       {6}|[[Java, Scala, C+...|
-   * |                null|[{SanJose, CA}, {...|  3|3.123123123123|   Test|      {black, brown}|       {6}|[[Java, Scala, C+...|
-   * |{{NewYork, NY}, {...|[{LA, CA}, {Sandi...|  4|4.123123123123|   John|{bla3221ck, b12rown}|     {633}|     [[Spark, Java]]|
-   * |{{Haha, PA}, {nul...|[{Pittsburgh, PA}...|  5|5.123123123123|  Jonas|      {black, black}|       {7}|[[Java, Scala, C+...|
-   * +--------------------+--------------------+---+--------------+-------+--------------------+----------+--------------------+
-   * 
- */ @Before public void before() throws IOException { File partitionedFolder = temp1.newFolder(); File unpartitionedFolder = temp2.newFolder(); File newIcebergTableFolder = temp3.newFolder(); - File externalDataFilesTableFolder = temp4.newFolder(); + File multiCommitTableFolder = temp4.newFolder(); partitionedLocation = partitionedFolder.toURI().toString(); unpartitionedLocation = unpartitionedFolder.toURI().toString(); newIcebergTableLocation = newIcebergTableFolder.toURI().toString(); - externalDataFilesTableLocation = externalDataFilesTableFolder.toURI().toString(); + multiCommitTableLocation = multiCommitTableFolder.toURI().toString(); spark.sql(String.format("CREATE DATABASE IF NOT EXISTS %s", NAMESPACE)); partitionedIdentifier = destName(defaultSparkCatalog, partitionedTableName); unpartitionedIdentifier = destName(defaultSparkCatalog, unpartitionedTableName); - externalDataFilesIdentifier = destName(defaultSparkCatalog, externalDataFilesTableName); + multiCommitIdentifier = destName(defaultSparkCatalog, multiCommitTableName); spark.sql(String.format("DROP TABLE IF EXISTS %s", partitionedIdentifier)); spark.sql(String.format("DROP TABLE IF EXISTS %s", unpartitionedIdentifier)); - spark.sql(String.format("DROP TABLE IF EXISTS %s", externalDataFilesIdentifier)); + spark.sql(String.format("DROP TABLE IF EXISTS %s", multiCommitIdentifier)); + } - // typeTestDataframe.write() - // .format("hudi") - // .options(QuickstartUtils.getQuickstartWriteConfigs()) - // .option(DataSourceWriteOptions.RECORDKEY_FIELD().key(), "decimalCol") - // .option(DataSourceWriteOptions.PRECOMBINE_FIELD().key(), "intCol") - // .option(DataSourceWriteOptions.PARTITIONPATH_FIELD().key(), "partitionPath") - // .option(HoodieWriteConfig.TABLE_NAME, partitionedIdentifier) - // .mode(SaveMode.Overwrite) - // .save(partitionedLocation); + @Test + public void testBasicPartitionedTable() { writeHoodieTable( typeTestDataframe, "decimalCol", "intCol", "partitionPath", + SaveMode.Overwrite, 
partitionedLocation, partitionedIdentifier); - - // df.write() - // .format("hudi") - // .options(QuickstartUtils.getQuickstartWriteConfigs()) - // .option(DataSourceWriteOptions.RECORDKEY_FIELD().key(), "magic_number") - // .option(DataSourceWriteOptions.PRECOMBINE_FIELD().key(), "name") - // .option(DataSourceWriteOptions.PARTITIONPATH_FIELD().key(), "") - // .option(HoodieWriteConfig.TABLE_NAME, unpartitionedIdentifier) - // .mode(SaveMode.Overwrite) - // .save(unpartitionedLocation); - writeHoodieTable( - typeTestDataframe, - "decimalCol", - "intCol", - "", - unpartitionedLocation, - unpartitionedIdentifier); - } - - @Test - public void testHudiUnpartitionedTableWrite() { - Dataset df = spark.read().format("hudi").load(unpartitionedLocation); - LOG.info("Generated unpartitioned dataframe shcema: {}", df.schema().treeString()); - LOG.info("Generated unpartitioned dataframe: {}", df.showString(10, 20, false)); - } - - @Test - public void testHudiPartitionedTableWrite() { - Dataset df = spark.read().format("hudi").load(partitionedLocation); - LOG.info("Generated partitioned dataframe shcema: {}", df.schema().treeString()); - LOG.info("Generated partitioned dataframe: {}", df.showString(10, 20, false)); - } - - @Test - public void testBasicPartitionedTable() { LOG.info("Alpha test reference: hoodie table path: {}", partitionedLocation); String newTableIdentifier = destName(icebergCatalogName, "alpha_iceberg_table"); SnapshotHudiTable.Result result = @@ -234,52 +149,93 @@ public void testBasicPartitionedTable() { checkSnapshotIntegrity(partitionedLocation, newTableIdentifier); } - @Test - public void referenceIcebergTable() { - String newTableIdentifier = destName(icebergCatalogName, "reference_iceberg_table"); - typeTestDataframe - .writeTo(newTableIdentifier) - .using("iceberg") - .tableProperty( - TableProperties.WRITE_DATA_LOCATION, - "/Users/jonasjiang/Workspace/Apache_Hudi_ws/hudi_table_test/unpartitioned_iceberg_ref") - .tableProperty( - 
TableProperties.WRITE_METADATA_LOCATION, - "/Users/jonasjiang/Workspace/Apache_Hudi_ws/hudi_table_test/unpartitioned_iceberg_ref/metadata") - .createOrReplace(); - Table table = getIcebergTable(newTableIdentifier); - queryManual(table); - } - @Test public void testBasicUnpartitionedTable() { + writeHoodieTable( + typeTestDataframe, + "decimalCol", + "intCol", + "", + SaveMode.Overwrite, + unpartitionedLocation, + unpartitionedIdentifier); String newTableIdentifier = destName(icebergCatalogName, "alpha_iceberg_table_2"); SnapshotHudiTable.Result result = HudiToIcebergMigrationSparkIntegration.snapshotHudiTable( spark, unpartitionedLocation, newTableIdentifier) .execute(); - - Dataset hudiResult = - spark - .read() - .format("hudi") - .option( - DataSourceReadOptions.QUERY_TYPE_OPT_KEY(), - DataSourceReadOptions.QUERY_TYPE_SNAPSHOT_OPT_VAL()) - .load(unpartitionedLocation); - LOG.info("Hudi table contents: {}", hudiResult.showString(10, 20, false)); - Table table = getIcebergTable(newTableIdentifier); - queryManual(table); checkSnapshotIntegrity(unpartitionedLocation, newTableIdentifier); } + @Test + public void testMultiCommitTable() { + Dataset initialDataFrame = multiDataFrame(0, 2); + writeHoodieTable( + initialDataFrame, + "decimalCol", + "magic_number", + "partitionPath", + SaveMode.Append, + multiCommitTableLocation, + multiCommitIdentifier); + writeHoodieTable( + initialDataFrame, + "decimalCol", + "magic_number", + "partitionPath", + SaveMode.Append, + multiCommitTableLocation, + multiCommitIdentifier); + writeHoodieTable( + multiDataFrame(2, 5), + "decimalCol", + "magic_number", + "partitionPath", + SaveMode.Append, + multiCommitTableLocation, + multiCommitIdentifier); + writeHoodieTable( + multiDataFrame(0, 1), + "decimalCol", + "magic_number", + "partitionPath", + SaveMode.Append, + multiCommitTableLocation, + multiCommitIdentifier); + Dataset toDelete = multiDataFrame(4, 5); + writeHoodieTable( + toDelete, + "decimalCol", + "magic_number", + 
"partitionPath", + SaveMode.Append, + multiCommitTableLocation, + multiCommitIdentifier); + writeHoodieTableOperation( + toDelete, + DataSourceWriteOptions.DELETE_OPERATION_OPT_VAL(), + "decimalCol", + "magic_number", + "partitionPath", + SaveMode.Append, + multiCommitTableLocation, + multiCommitIdentifier); + + String newTableIdentifier = destName(icebergCatalogName, "alpha_iceberg_table_3"); + SnapshotHudiTable.Result result = + HudiToIcebergMigrationSparkIntegration.snapshotHudiTable( + spark, multiCommitTableLocation, newTableIdentifier) + .execute(); + checkSnapshotIntegrity(multiCommitTableLocation, newTableIdentifier); + } + private void checkSnapshotIntegrity(String hudiTableLocation, String icebergTableIdentifier) { Dataset hudiResult = spark .read() .format("hudi") .option( - DataSourceReadOptions.QUERY_TYPE_OPT_KEY(), + DataSourceReadOptions.QUERY_TYPE().key(), DataSourceReadOptions.QUERY_TYPE_SNAPSHOT_OPT_VAL()) .load(hudiTableLocation); Dataset icebergResult = spark.sql("SELECT * FROM " + icebergTableIdentifier); @@ -324,25 +280,52 @@ private Dataset typeTestDataFrame() { .range(0, 5, 1, 5) .withColumnRenamed("id", "longCol") .withColumn("intCol", expr("CAST(longCol AS INT)")) - // .withColumn("floatCol", expr("CAST(longCol AS FLOAT)")) - // .withColumn("doubleCol", expr("CAST(longCol AS DOUBLE)")) + .withColumn("floatCol", expr("CAST(longCol AS FLOAT)")) + .withColumn("doubleCol", expr("CAST(longCol AS DOUBLE)")) .withColumn("dateCol", date_add(current_date(), 1)) // .withColumn("timestampCol", expr("TO_TIMESTAMP(dateCol)")) .withColumn("stringCol", expr("CAST(dateCol AS STRING)")) - // .withColumn("booleanCol", expr("longCol > 5")) - // .withColumn("binaryCol", expr("CAST(longCol AS BINARY)")) - // .withColumn("byteCol", expr("CAST(longCol AS BYTE)")) + .withColumn("booleanCol", expr("longCol > 5")) + .withColumn("binaryCol", expr("CAST(longCol AS BINARY)")) + .withColumn("byteCol", expr("CAST(longCol AS BYTE)")) .withColumn("decimalCol", 
expr("CAST(longCol AS DECIMAL(10, 2))")) - // .withColumn("shortCol", expr("CAST(longCol AS SHORT)")) + .withColumn("shortCol", expr("CAST(longCol AS SHORT)")) .withColumn("mapCol", expr("MAP(stringCol, intCol)")) // Hudi requires Map key to be String - .withColumn("arrayCol", expr("ARRAY(dateCol)")) - // .withColumn("structCol", expr("STRUCT(longCol AS a, longCol AS b)")) + // .withColumn("arrayCol", expr("ARRAY(dateCol)")) // hudi's parquet handles array + // type differently from iceberg + .withColumn("structCol", expr("STRUCT(longCol AS a, longCol AS b)")) .withColumn( "partitionPath", expr("CAST(longCol AS STRING)")); // For test convenience, please put the partition col // in the end. } + private Dataset multiDataFrame(int start, int end) { + return spark + .range(start, end, 1, end - start) + .withColumn("longCol", expr("id")) + .withColumn("decimalCol", expr("CAST(longCol AS DECIMAL(10, 2))")) + .withColumn("magic_number", expr("rand(5) * 100")) + .withColumn("dateCol", date_add(current_date(), 1)) + .withColumn("dateString", expr("CAST(dateCol AS STRING)")) + .withColumn("random1", expr("CAST(rand(5) * 100 as LONG)")) + .withColumn("random2", expr("CAST(rand(51) * 100 as LONG)")) + .withColumn("random3", expr("CAST(rand(511) * 100 as LONG)")) + .withColumn("random4", expr("CAST(rand(15) * 100 as LONG)")) + .withColumn("random5", expr("CAST(rand(115) * 100 as LONG)")) + .withColumn("innerStruct1", expr("STRUCT(random1, random2)")) + .withColumn("innerStruct2", expr("STRUCT(random3, random4)")) + .withColumn("structCol1", expr("STRUCT(innerStruct1, innerStruct2)")) + .withColumn( + "innerStruct3", + expr("STRUCT(SHA1(CAST(random5 AS BINARY)), SHA1(CAST(random1 AS BINARY)))")) + .withColumn( + "structCol2", + expr( + "STRUCT(innerStruct3, STRUCT(SHA1(CAST(random2 AS BINARY)), SHA1(CAST(random3 AS BINARY))))")) + .withColumn("partitionPath", expr("CAST(id AS STRING)")); + } + private Dataset nestedDataFrame() { return spark .range(0, 5, 1, 5) @@ -378,6 +361,7 
@@ private void writeHoodieTable( String recordKey, String preCombineKey, String partitionPathField, + SaveMode saveMode, String tableLocation, String tableIdentifier) { df.write() @@ -387,7 +371,27 @@ private void writeHoodieTable( .option(DataSourceWriteOptions.PRECOMBINE_FIELD().key(), preCombineKey) .option(DataSourceWriteOptions.PARTITIONPATH_FIELD().key(), partitionPathField) .option(HoodieWriteConfig.TBL_NAME.key(), tableIdentifier) - .mode(SaveMode.Append) + .mode(saveMode) + .save(tableLocation); + } + + private void writeHoodieTableOperation( + Dataset df, + String operationKey, + String recordKey, + String preCombineKey, + String partitionPathField, + SaveMode saveMode, + String tableLocation, + String tableIdentifier) { + df.write() + .format("hudi") + .option(DataSourceWriteOptions.OPERATION().key(), operationKey) + .option(DataSourceWriteOptions.RECORDKEY_FIELD().key(), recordKey) + .option(DataSourceWriteOptions.PRECOMBINE_FIELD().key(), preCombineKey) + .option(DataSourceWriteOptions.PARTITIONPATH_FIELD().key(), partitionPathField) + .option(HoodieWriteConfig.TBL_NAME.key(), tableIdentifier) + .mode(saveMode) .save(tableLocation); } } diff --git a/hudi/src/main/java/org/apache/iceberg/hudi/BaseSnapshotHudiTableAction.java b/hudi/src/main/java/org/apache/iceberg/hudi/BaseSnapshotHudiTableAction.java index 2191c6a88f56..5c80cef94f68 100644 --- a/hudi/src/main/java/org/apache/iceberg/hudi/BaseSnapshotHudiTableAction.java +++ b/hudi/src/main/java/org/apache/iceberg/hudi/BaseSnapshotHudiTableAction.java @@ -153,6 +153,8 @@ public Result execute() { // Pre-process the timeline, we only need to process all COMPLETED commit for COW table // Commit that has been rollbacked will not be in either REQUESTED or INFLIGHT state + HoodieTimeline commitsTimeline = hoodieTableMetaClient.getCommitsTimeline(); + HoodieTimeline archivedTimeline = hoodieTableMetaClient.getArchivedTimeline(); HoodieTimeline timeline = 
hoodieTableMetaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants(); // Initialize the FileSystemView for querying table data files From 94dddef5b6ff02551fc223cb03718418e54b6bd7 Mon Sep 17 00:00:00 2001 From: Rushan Jiang Date: Wed, 1 Feb 2023 21:59:10 -0500 Subject: [PATCH 15/20] make arrayType possible by enforcing the new list type in parquet --- .../apache/iceberg/hudi/SparkHudiMigrationTestBase.java | 1 + .../org/apache/iceberg/hudi/TestSnapshotHudiTable.java | 8 +++----- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/hudi/src/integration/java/org/apache/iceberg/hudi/SparkHudiMigrationTestBase.java b/hudi/src/integration/java/org/apache/iceberg/hudi/SparkHudiMigrationTestBase.java index 42703c4403ae..7fd5f9bd69f1 100644 --- a/hudi/src/integration/java/org/apache/iceberg/hudi/SparkHudiMigrationTestBase.java +++ b/hudi/src/integration/java/org/apache/iceberg/hudi/SparkHudiMigrationTestBase.java @@ -45,6 +45,7 @@ public static void startMetastoreAndSpark() { .config( "spark.hadoop." 
+ HiveConf.ConfVars.METASTOREURIS.varname, hiveConf.get(HiveConf.ConfVars.METASTOREURIS.varname)) + .config("spark.hadoop.parquet.avro.write-old-list-structure", "false") .config("spark.sql.legacy.respectNullabilityInTextDatasetConversion", "true") .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") .config("spark.sql.extensions", "org.apache.spark.sql.hudi.HoodieSparkSessionExtension") diff --git a/hudi/src/integration/java/org/apache/iceberg/hudi/TestSnapshotHudiTable.java b/hudi/src/integration/java/org/apache/iceberg/hudi/TestSnapshotHudiTable.java index 5a466f6fcee3..d5cc02234645 100644 --- a/hudi/src/integration/java/org/apache/iceberg/hudi/TestSnapshotHudiTable.java +++ b/hudi/src/integration/java/org/apache/iceberg/hudi/TestSnapshotHudiTable.java @@ -283,7 +283,7 @@ private Dataset typeTestDataFrame() { .withColumn("floatCol", expr("CAST(longCol AS FLOAT)")) .withColumn("doubleCol", expr("CAST(longCol AS DOUBLE)")) .withColumn("dateCol", date_add(current_date(), 1)) - // .withColumn("timestampCol", expr("TO_TIMESTAMP(dateCol)")) + .withColumn("timestampCol", expr("TO_TIMESTAMP(dateCol)")) .withColumn("stringCol", expr("CAST(dateCol AS STRING)")) .withColumn("booleanCol", expr("longCol > 5")) .withColumn("binaryCol", expr("CAST(longCol AS BINARY)")) @@ -291,13 +291,11 @@ private Dataset typeTestDataFrame() { .withColumn("decimalCol", expr("CAST(longCol AS DECIMAL(10, 2))")) .withColumn("shortCol", expr("CAST(longCol AS SHORT)")) .withColumn("mapCol", expr("MAP(stringCol, intCol)")) // Hudi requires Map key to be String - // .withColumn("arrayCol", expr("ARRAY(dateCol)")) // hudi's parquet handles array - // type differently from iceberg + .withColumn("arrayCol", expr("ARRAY(dateCol)")) .withColumn("structCol", expr("STRUCT(longCol AS a, longCol AS b)")) .withColumn( "partitionPath", - expr("CAST(longCol AS STRING)")); // For test convenience, please put the partition col - // in the end. 
+ expr("CAST(longCol AS STRING)")); } private Dataset multiDataFrame(int start, int end) { From cc0a9bb001d08a6ce23d4f7b18995beebc05e860 Mon Sep 17 00:00:00 2001 From: Rushan Jiang Date: Thu, 2 Feb 2023 00:20:24 -0500 Subject: [PATCH 16/20] add tests refactor the base action implementation and add ci --- .github/workflows/hudi-conversion-ci.yaml | 86 ++++++++++++++++++ ...udiToIcebergMigrationSparkIntegration.java | 21 +++-- .../iceberg/hudi/TestSnapshotHudiTable.java | 90 +++++++++++++++---- .../hudi/BaseSnapshotHudiTableAction.java | 89 ++++++++---------- ...HudiToIcebergMigrationActionsProvider.java | 20 ++++- .../iceberg/hudi/SnapshotHudiTable.java | 55 +++++++++++- 6 files changed, 282 insertions(+), 79 deletions(-) create mode 100644 .github/workflows/hudi-conversion-ci.yaml diff --git a/.github/workflows/hudi-conversion-ci.yaml b/.github/workflows/hudi-conversion-ci.yaml new file mode 100644 index 000000000000..3e2b1018acec --- /dev/null +++ b/.github/workflows/hudi-conversion-ci.yaml @@ -0,0 +1,86 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# + +name: "Hudi Conversion CI" +on: + push: + branches: + - 'master' + - '0.**' + tags: + - 'apache-iceberg-**' + pull_request: + paths-ignore: + - '.github/ISSUE_TEMPLATE/iceberg_bug_report.yml' + - '.github/workflows/python-ci.yml' + - '.github/workflows/flink-ci.yml' + - '.github/workflows/hive-ci.yml' + - '.gitignore' + - '.asf.yml' + - 'dev/**' + - 'mr/**' + - 'hive3/**' + - 'hive3-orc-bundle/**' + - 'hive-runtime/**' + - 'flink/**' + - 'pig/**' + - 'python/**' + - 'python_legacy/**' + - 'docs/**' + - 'open-api/**' + - 'format/**' + - '.gitattributes' + - 'README.md' + - 'CONTRIBUTING.md' + - 'LICENSE' + - 'NOTICE' + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: ${{ github.event_name == 'pull_request' }} + +jobs: + hudi-conversion-scala-2-12-tests: + runs-on: ubuntu-20.04 + strategy: + matrix: + jvm: [8, 11] + env: + SPARK_LOCAL_IP: localhost + steps: + - uses: actions/checkout@v3 + - uses: actions/setup-java@v3 + with: + distribution: zulu + java-version: ${{ matrix.jvm }} + - uses: actions/cache@v3 + with: + path: | + ~/.gradle/caches + ~/.gradle/wrapper + key: ${{ runner.os }}-gradle-${{ hashFiles('**/*.gradle*', '**/gradle-wrapper.properties') }} + restore-keys: ${{ runner.os }}-gradle- + - run: echo -e "$(ip addr show eth0 | grep "inet\b" | awk '{print $2}' | cut -d/ -f1)\t$(hostname -f) $(hostname -s)" | sudo tee -a /etc/hosts + - run: ./gradlew -DsparkVersions=3.3 -DscalaVersion=2.12 -DhiveVersions= -DflinkVersions= :iceberg-hudi:check -Pquick=true -x javadoc + - uses: actions/upload-artifact@v3 + if: failure() + with: + name: test logs + path: | + **/build/testlogs diff --git a/hudi/src/integration/java/org/apache/iceberg/hudi/HudiToIcebergMigrationSparkIntegration.java b/hudi/src/integration/java/org/apache/iceberg/hudi/HudiToIcebergMigrationSparkIntegration.java index cfeca68687b1..cf06fa9556cc 100644 --- a/hudi/src/integration/java/org/apache/iceberg/hudi/HudiToIcebergMigrationSparkIntegration.java +++ 
b/hudi/src/integration/java/org/apache/iceberg/hudi/HudiToIcebergMigrationSparkIntegration.java @@ -19,6 +19,7 @@ package org.apache.iceberg.hudi; import org.apache.iceberg.catalog.TableIdentifier; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; import org.apache.iceberg.spark.Spark3Util; import org.apache.spark.sql.SparkSession; import org.apache.spark.sql.connector.catalog.CatalogPlugin; @@ -28,15 +29,23 @@ private HudiToIcebergMigrationSparkIntegration() {} static SnapshotHudiTable snapshotHudiTable( SparkSession spark, String hudiTablePath, String newTableIdentifier) { + Preconditions.checkArgument( + spark != null, "The SparkSession cannot be null, please provide a valid SparkSession"); + Preconditions.checkArgument( + newTableIdentifier != null, + "The table identifier cannot be null, please provide a valid table identifier for the new iceberg table"); + Preconditions.checkArgument( + hudiTablePath != null, + "The hudi table location cannot be null, please provide a valid location of the delta lake table to be snapshot"); String ctx = "hudi snapshot target"; CatalogPlugin defaultCatalog = spark.sessionState().catalogManager().currentCatalog(); Spark3Util.CatalogAndIdentifier catalogAndIdentifier = Spark3Util.catalogAndIdentifier(ctx, spark, newTableIdentifier, defaultCatalog); - - return new BaseSnapshotHudiTableAction( - spark.sessionState().newHadoopConf(), - hudiTablePath, - Spark3Util.loadIcebergCatalog(spark, catalogAndIdentifier.catalog().name()), - TableIdentifier.parse(catalogAndIdentifier.identifier().toString())); + return HudiToIcebergMigrationActionsProvider.defaultProvider() + .snapshotHudiTable(hudiTablePath) + .as(TableIdentifier.parse(catalogAndIdentifier.identifier().toString())) + .hoodieConfiguration(spark.sessionState().newHadoopConf()) + .icebergCatalog( + Spark3Util.loadIcebergCatalog(spark, catalogAndIdentifier.catalog().name())); } } diff --git 
a/hudi/src/integration/java/org/apache/iceberg/hudi/TestSnapshotHudiTable.java b/hudi/src/integration/java/org/apache/iceberg/hudi/TestSnapshotHudiTable.java index d5cc02234645..5d3e33d4921d 100644 --- a/hudi/src/integration/java/org/apache/iceberg/hudi/TestSnapshotHudiTable.java +++ b/hudi/src/integration/java/org/apache/iceberg/hudi/TestSnapshotHudiTable.java @@ -31,9 +31,6 @@ import org.apache.hudi.config.HoodieWriteConfig; import org.apache.iceberg.Table; import org.apache.iceberg.catalog.TableIdentifier; -import org.apache.iceberg.data.IcebergGenerics; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.io.CloseableIterable; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; import org.apache.iceberg.spark.Spark3Util; import org.apache.iceberg.spark.SparkCatalog; @@ -57,7 +54,7 @@ public class TestSnapshotHudiTable extends SparkHudiMigrationTestBase { private static final Logger LOG = LoggerFactory.getLogger(TestSnapshotHudiTable.class.getName()); private static final String SNAPSHOT_SOURCE_PROP = "snapshot_source"; - private static final String DELTA_SOURCE_VALUE = "delta"; + private static final String HUDI_SOURCE_VALUE = "hudi"; private static final String ORIGINAL_LOCATION_PROP = "original_location"; private static final String NAMESPACE = "delta_conversion_test"; private static final String defaultSparkCatalog = "spark_catalog"; @@ -144,9 +141,9 @@ public void testBasicPartitionedTable() { HudiToIcebergMigrationSparkIntegration.snapshotHudiTable( spark, partitionedLocation, newTableIdentifier) .execute(); - Table table = getIcebergTable(newTableIdentifier); - queryManual(table); checkSnapshotIntegrity(partitionedLocation, newTableIdentifier); + checkIcebergTableLocation(newTableIdentifier, partitionedLocation); + checkIcebergTableProperties(newTableIdentifier, ImmutableMap.of(), partitionedLocation); } @Test @@ -165,6 +162,8 @@ public void testBasicUnpartitionedTable() { spark, unpartitionedLocation, 
newTableIdentifier) .execute(); checkSnapshotIntegrity(unpartitionedLocation, newTableIdentifier); + checkIcebergTableLocation(newTableIdentifier, unpartitionedLocation); + checkIcebergTableProperties(newTableIdentifier, ImmutableMap.of(), unpartitionedLocation); } @Test @@ -227,6 +226,49 @@ public void testMultiCommitTable() { spark, multiCommitTableLocation, newTableIdentifier) .execute(); checkSnapshotIntegrity(multiCommitTableLocation, newTableIdentifier); + checkIcebergTableLocation(newTableIdentifier, multiCommitTableLocation); + checkIcebergTableProperties(newTableIdentifier, ImmutableMap.of(), multiCommitTableLocation); + } + + @Test + public void testSnapshotWithNewLocation() { + writeHoodieTable( + typeTestDataframe, + "decimalCol", + "intCol", + "partitionPath", + SaveMode.Overwrite, + partitionedLocation, + partitionedIdentifier); + String newTableIdentifier = destName(icebergCatalogName, "alpha_iceberg_table_4"); + SnapshotHudiTable.Result result = + HudiToIcebergMigrationSparkIntegration.snapshotHudiTable( + spark, partitionedLocation, newTableIdentifier) + .tableLocation(newIcebergTableLocation) + .execute(); + checkSnapshotIntegrity(partitionedLocation, newTableIdentifier); + checkIcebergTableLocation(newTableIdentifier, newIcebergTableLocation); + } + + @Test + public void testSnapshotWithAdditionalProperties() { + writeHoodieTable( + typeTestDataframe, + "decimalCol", + "intCol", + "partitionPath", + SaveMode.Overwrite, + partitionedLocation, + partitionedIdentifier); + String newTableIdentifier = destName(icebergCatalogName, "alpha_iceberg_table_5"); + SnapshotHudiTable.Result result = + HudiToIcebergMigrationSparkIntegration.snapshotHudiTable( + spark, partitionedLocation, newTableIdentifier) + .tableProperties(ImmutableMap.of("test", "test")) + .execute(); + checkSnapshotIntegrity(partitionedLocation, newTableIdentifier); + checkIcebergTableProperties( + newTableIdentifier, ImmutableMap.of("test", "test"), partitionedLocation); } private void 
checkSnapshotIntegrity(String hudiTableLocation, String icebergTableIdentifier) { @@ -239,24 +281,36 @@ private void checkSnapshotIntegrity(String hudiTableLocation, String icebergTabl DataSourceReadOptions.QUERY_TYPE_SNAPSHOT_OPT_VAL()) .load(hudiTableLocation); Dataset icebergResult = spark.sql("SELECT * FROM " + icebergTableIdentifier); - LOG.info("Hudi table contents: {}", hudiResult.showString(10, 20, false)); - LOG.info("Iceberg table contents: {}", icebergResult.showString(10, 20, false)); // TODO: adjust test technique since hudi tends to return the columns in a different order (put // the one used for partitioning last) List hudiTableContents = hudiResult.collectAsList(); List icebergTableContents = icebergResult.collectAsList(); Assertions.assertThat(hudiTableContents).hasSize(icebergTableContents.size()); - Assertions.assertThat(hudiTableContents).containsAll(icebergTableContents); - Assertions.assertThat(icebergTableContents) - .containsAll(hudiTableContents); // TODO: may change to containsExactlyInAnyOrderElementsOf + Assertions.assertThat(hudiTableContents) + .containsExactlyInAnyOrderElementsOf(icebergTableContents); } - private void queryManual(Table table) { - CloseableIterable records = IcebergGenerics.read(table).build(); - for (Record record : records) { - LOG.info("Alpha Test Iceberg Record: {}", record); - } + private void checkIcebergTableLocation(String icebergTableIdentifier, String expectedLoacation) { + Table table = getIcebergTable(icebergTableIdentifier); + Assertions.assertThat(table.location()).isEqualTo(expectedLoacation); + } + + private void checkIcebergTableProperties( + String icebergTableIdentifier, + Map expectedAdditionalProperties, + String hudiTableLocation) { + Table icebergTable = getIcebergTable(icebergTableIdentifier); + ImmutableMap.Builder expectedPropertiesBuilder = ImmutableMap.builder(); + // The snapshot action will put some fixed properties to the table + expectedPropertiesBuilder.put(SNAPSHOT_SOURCE_PROP, 
HUDI_SOURCE_VALUE); + expectedPropertiesBuilder.putAll(expectedAdditionalProperties); + ImmutableMap expectedProperties = expectedPropertiesBuilder.build(); + + Assertions.assertThat(icebergTable.properties().entrySet()) + .containsAll(expectedProperties.entrySet()); + Assertions.assertThat(icebergTable.properties()) + .containsEntry(ORIGINAL_LOCATION_PROP, hudiTableLocation); } private Table getIcebergTable(String icebergTableIdentifier) { @@ -293,9 +347,7 @@ private Dataset typeTestDataFrame() { .withColumn("mapCol", expr("MAP(stringCol, intCol)")) // Hudi requires Map key to be String .withColumn("arrayCol", expr("ARRAY(dateCol)")) .withColumn("structCol", expr("STRUCT(longCol AS a, longCol AS b)")) - .withColumn( - "partitionPath", - expr("CAST(longCol AS STRING)")); + .withColumn("partitionPath", expr("CAST(longCol AS STRING)")); } private Dataset multiDataFrame(int start, int end) { diff --git a/hudi/src/main/java/org/apache/iceberg/hudi/BaseSnapshotHudiTableAction.java b/hudi/src/main/java/org/apache/iceberg/hudi/BaseSnapshotHudiTableAction.java index 5c80cef94f68..7b1e25791d3c 100644 --- a/hudi/src/main/java/org/apache/iceberg/hudi/BaseSnapshotHudiTableAction.java +++ b/hudi/src/main/java/org/apache/iceberg/hudi/BaseSnapshotHudiTableAction.java @@ -94,22 +94,13 @@ public class BaseSnapshotHudiTableAction implements SnapshotHudiTable { private String hoodieTableBasePath; private Catalog icebergCatalog; private TableIdentifier newTableIdentifier; + private String newTableLocation; private HadoopFileIO hoodieFileIO; private ImmutableMap.Builder additionalPropertiesBuilder = ImmutableMap.builder(); - public BaseSnapshotHudiTableAction( - Configuration hoodieConfiguration, - String hoodieTableBasePath, - Catalog icebergCatalog, - TableIdentifier newTableIdentifier) { - this.hoodieTableMetaClient = buildTableMetaClient(hoodieConfiguration, hoodieTableBasePath); - this.hoodieTableConfig = hoodieTableMetaClient.getTableConfig(); - this.hoodieEngineContext = new 
HoodieLocalEngineContext(hoodieConfiguration); + public BaseSnapshotHudiTableAction(String hoodieTableBasePath) { this.hoodieTableBasePath = hoodieTableBasePath; - this.hoodieMetadataConfig = buildMetadataConfig(hoodieConfiguration); - this.hoodieFileIO = new HadoopFileIO(hoodieConfiguration); - this.icebergCatalog = icebergCatalog; - this.newTableIdentifier = newTableIdentifier; + this.newTableLocation = hoodieTableBasePath; } @Override @@ -119,8 +110,36 @@ public SnapshotHudiTable tableProperties(Map properties) { } @Override - public SnapshotHudiTable tableProperty(String key, String value) { - additionalPropertiesBuilder.put(key, value); + public SnapshotHudiTable tableProperty(String name, String value) { + additionalPropertiesBuilder.put(name, value); + return this; + } + + @Override + public SnapshotHudiTable tableLocation(String location) { + this.newTableLocation = location; + return this; + } + + @Override + public SnapshotHudiTable as(TableIdentifier identifier) { + this.newTableIdentifier = identifier; + return this; + } + + @Override + public SnapshotHudiTable icebergCatalog(Catalog catalog) { + this.icebergCatalog = catalog; + return this; + } + + @Override + public SnapshotHudiTable hoodieConfiguration(Configuration configuration) { + this.hoodieTableMetaClient = buildTableMetaClient(configuration, hoodieTableBasePath); + this.hoodieTableConfig = hoodieTableMetaClient.getTableConfig(); + this.hoodieEngineContext = new HoodieLocalEngineContext(configuration); + this.hoodieMetadataConfig = buildMetadataConfig(configuration); + this.hoodieFileIO = new HadoopFileIO(configuration); return this; } @@ -132,18 +151,15 @@ public Result execute() { Schema icebergSchema = convertToIcebergSchema(hudiSchema); PartitionSpec partitionSpec = getPartitionSpecFromHoodieMetadataData(icebergSchema); - // TODO: add support for newTableLocation Transaction icebergTransaction = icebergCatalog.newCreateTableTransaction( newTableIdentifier, icebergSchema, partitionSpec, - 
hoodieTableBasePath, + newTableLocation, destTableProperties()); - // We need name mapping to ensure we can read data files correctly as iceberg table has its own + // Need name mapping to ensure we can read data files correctly as iceberg table has its own // rule to assign field id - // Although the field id rule seems to be the same as hudi, but the rule is not guaranteed by - // any API NameMapping nameMapping = MappingUtil.create(icebergTransaction.table().schema()); icebergTransaction .table() @@ -153,19 +169,11 @@ public Result execute() { // Pre-process the timeline, we only need to process all COMPLETED commit for COW table // Commit that has been rollbacked will not be in either REQUESTED or INFLIGHT state - HoodieTimeline commitsTimeline = hoodieTableMetaClient.getCommitsTimeline(); - HoodieTimeline archivedTimeline = hoodieTableMetaClient.getArchivedTimeline(); HoodieTimeline timeline = hoodieTableMetaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants(); // Initialize the FileSystemView for querying table data files - // TODO: need to choose the correct implementation of the FileSystemView - // HoodieTableFileSystemView hoodieTableFileSystemView = - // FileSystemViewManager.createInMemoryFileSystemViewWithTimeline( - // hoodieEngineContext, hoodieTableMetaClient, hoodieMetadataConfig, timeline); HoodieTableFileSystemView hoodieTableFileSystemView = new HoodieTableFileSystemView(hoodieTableMetaClient, timeline); - // get all instants on the timeline - Stream completedInstants = timeline.getInstants(); List partitionPaths = FSUtils.getAllPartitionPaths( hoodieEngineContext, @@ -181,6 +189,8 @@ public Result execute() { } catch (IOException e) { throw new RuntimeException("Failed to get all data files in partition", e); } + // get all instants on the timeline + Stream completedInstants = timeline.getInstants(); // file group id -> Map // This pre-process aims to make a timestamp to HoodieBaseFile map for each file group Map> 
allStampedDataFiles = @@ -196,21 +206,11 @@ public Result execute() { ImmutableMap.toImmutableMap( HoodieBaseFile::getCommitTime, baseFile -> baseFile)))); - // BEGIN TEST ONLY CODE - List testGroups = - hoodieTableFileSystemView.getLatestBaseFiles().collect(Collectors.toList()); - // END TEST ONLY CODE - // Help tracked if a previous version of the data file has been added to the iceberg table Map convertedDataFiles = Maps.newHashMap(); // Replay the timeline from beginning to the end completedInstants.forEachOrdered( instant -> { - LOG.info("Alpha test: get completed instant: {}", instant); - // copyInstants to iceberg table - // TODO: need to verify the order of the instants, make sure it is from the oldest to the - // newest - // commit each instant as a transaction to the iceberg table commitHoodieInstantToIcebergTransaction( instant, @@ -253,7 +253,6 @@ public void commitHoodieInstantToIcebergTransaction( List filesToAdd = Lists.newArrayList(); List filesToRemove = Lists.newArrayList(); - // TODO: may need to add synchronization lock for parallelism fileGroups .sequential() .forEach( @@ -316,7 +315,6 @@ private DataFile buildDataFileFromHoodieBaseFile( } PartitionSpec spec = table.spec(); - // TODO: need to verify the path is absolute (the field's name is fullPath) String path = baseFile.getPath(); long fileSize = baseFile.getFileSize(); String partitionValue = fileGroup.getPartitionPath(); @@ -330,8 +328,6 @@ private DataFile buildDataFileFromHoodieBaseFile( FileFormat format = determineFileFormatFromPath(path); Metrics metrics = getMetricsForFile(file, format, metricsConfig, nameMapping); - List testFields = spec.fields(); - String partition = spec.fields().stream() .map(PartitionField::name) @@ -355,14 +351,6 @@ private DataFile buildDataFileFromHoodieBaseFile( private InternalSchema getHudiSchema() { TableSchemaResolver schemaUtil = new TableSchemaResolver(hoodieTableMetaClient); Option hudiSchema = schemaUtil.getTableInternalSchemaFromCommitMetadata(); - 
LOG.info("Alpha test: hoodie schema: {}", hudiSchema); - LOG.info("Alpha test: active timeline: {}", hoodieTableMetaClient.getActiveTimeline()); - LOG.info( - "Alpha test: active timeline commit timeline: {}", - hoodieTableMetaClient.getActiveTimeline().getCommitsTimeline()); - LOG.info( - "Alpha test: active timeline commit timeline instants: {}", - hoodieTableMetaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants()); return hudiSchema.orElseGet( () -> { try { @@ -399,14 +387,13 @@ private PartitionSpec getPartitionSpecFromHoodieMetadataData(Schema schema) { } private Map destTableProperties() { - // TODO: need to check which hoodie properties to add to additionalPropertiesBuilder.putAll(hoodieTableConfig.propsMap()); additionalPropertiesBuilder.putAll( ImmutableMap.of( SNAPSHOT_SOURCE_PROP, HOODIE_SOURCE_VALUE, ORIGINAL_LOCATION_PROP, - hoodieTableMetaClient.getBasePathV2().toString())); + hoodieTableBasePath)); return additionalPropertiesBuilder.build(); } diff --git a/hudi/src/main/java/org/apache/iceberg/hudi/HudiToIcebergMigrationActionsProvider.java b/hudi/src/main/java/org/apache/iceberg/hudi/HudiToIcebergMigrationActionsProvider.java index 0a1e0808af43..8ba58e2ed203 100644 --- a/hudi/src/main/java/org/apache/iceberg/hudi/HudiToIcebergMigrationActionsProvider.java +++ b/hudi/src/main/java/org/apache/iceberg/hudi/HudiToIcebergMigrationActionsProvider.java @@ -18,12 +18,28 @@ */ package org.apache.iceberg.hudi; +/** + * An API that provide actions for migration from an Apache Hudi table to an Iceberg table. Query + * engines can use {@code defaultActions()} to access default action implementations, or implement + * this provider to supply a different implementation if necessary. 
+ */ public interface HudiToIcebergMigrationActionsProvider { - default SnapshotHudiTable snapshotHudiTable() { - throw new UnsupportedOperationException("snapshotHudiTable is not supported"); + /** + * Initiates an action to snapshot an existing Hudi table to an Iceberg table. + * + * @param sourceTableLocation the location of the Hudi table + * @return a {@link SnapshotHudiTable} action + */ + default SnapshotHudiTable snapshotHudiTable(String sourceTableLocation) { + return new BaseSnapshotHudiTableAction(sourceTableLocation); } + /** + * Gets the default implementation of {@link HudiToIcebergMigrationActionsProvider}. + * + * @return an instance with access to all default actions + */ static HudiToIcebergMigrationActionsProvider defaultProvider() { return DefaultHudiToIcebergMigrationActions.defaultMigrationActions(); } diff --git a/hudi/src/main/java/org/apache/iceberg/hudi/SnapshotHudiTable.java b/hudi/src/main/java/org/apache/iceberg/hudi/SnapshotHudiTable.java index a5208809b314..cf86139516d8 100644 --- a/hudi/src/main/java/org/apache/iceberg/hudi/SnapshotHudiTable.java +++ b/hudi/src/main/java/org/apache/iceberg/hudi/SnapshotHudiTable.java @@ -19,16 +19,72 @@ package org.apache.iceberg.hudi; import java.util.Map; +import org.apache.hadoop.conf.Configuration; import org.apache.iceberg.actions.Action; +import org.apache.iceberg.catalog.Catalog; +import org.apache.iceberg.catalog.TableIdentifier; public interface SnapshotHudiTable extends Action { + /** + * Sets table properties in the newly created Iceberg table. Any properties with the same key name + * will be overwritten. + * + * @param properties a map of properties to set + * @return this for method chaining + */ SnapshotHudiTable tableProperties(Map properties); - SnapshotHudiTable tableProperty(String key, String value); + /** + * Sets a table property in the newly created Iceberg table. Any properties with the same key will + * be overwritten.
+ * + * @param name a table property name + * @param value a table property value + * @return this for method chaining + */ + SnapshotHudiTable tableProperty(String name, String value); + /** + * Sets the location of the newly created Iceberg table. Default location is the same as the Hudi + * table. + * + * @param location a path to the new table location + * @return this for method chaining + */ + SnapshotHudiTable tableLocation(String location); + + /** + * Sets the identifier of the newly created Iceberg table. This is required to be set before + * executing the action. + * + * @param identifier a table identifier (namespace, name) + * @return this for method chaining + */ + SnapshotHudiTable as(TableIdentifier identifier); + + /** + * Sets the catalog of the newly created Iceberg table. This is required to be set before + * executing the action. + * + * @param catalog a catalog + * @return this for method chaining + */ + SnapshotHudiTable icebergCatalog(Catalog catalog); + + /** + * Sets the Hadoop configuration used to access the Hudi table's timeline and file groups. This + * is required to be set before executing the action. + * + * @param conf a Hadoop configuration + * @return this for method chaining + */ + SnapshotHudiTable hoodieConfiguration(Configuration conf); + + /** The action result that contains a summary of the execution. */ interface Result { + /** Returns the number of snapshot data files. */ long snapshotFilesCount(); } } From 3ee5a6b5e7b5e17198a5ca97b08aefbb0e741dfe Mon Sep 17 00:00:00 2001 From: Rushan Jiang Date: Thu, 2 Feb 2023 01:33:39 -0500 Subject: [PATCH 17/20] resolve dependency issue (kind of...)
--- build.gradle | 13 ++++++++----- versions.props | 1 + 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/build.gradle b/build.gradle index 82d27288d7d1..8de344026481 100644 --- a/build.gradle +++ b/build.gradle @@ -458,10 +458,10 @@ project(':iceberg-hudi') { implementation project(':iceberg-orc') implementation "com.fasterxml.jackson.core:jackson-databind" - // TODO: we only need hudi-common here, however, hudi-common has some dependency conflicts with hudi-spark-bundle - // which is currently used by the integration test. We should fix this in the future. - // Also, hudi uses java8, may need to assess if we can use hudi in java11. - compileOnly("org.apache.hudi:hudi-spark3.3-bundle_2.12:0.12.2") + // Hudi uses java8, may need to assess if we can use hudi in java11. + compileOnly("org.apache.hudi:hudi-common") + // Added to resolve dependency conflicts with hudi-spark-bundle + compileOnly("org.apache.hudi:hudi-client-common") implementation("org.apache.avro:avro") { exclude group: 'org.tukaani' // xz compression is not supported } @@ -474,7 +474,10 @@ project(':iceberg-hudi') { } if (sparkVersions.contains("3.3") && scalaVersion == "2.12") { integrationImplementation project(':iceberg-data') - integrationImplementation("org.apache.hudi:hudi-spark3.3-bundle_2.12:0.12.2") + integrationImplementation("org.apache.hudi:hudi-spark3.3-bundle_2.12") { + exclude group: 'org.apache.hudi', module: 'hudi-common' + exclude group: 'org.apache.hudi', module: 'hudi-client-common' + } integrationImplementation project(path: ":iceberg-spark:iceberg-spark-3.3_${scalaVersion}") integrationImplementation("org.apache.hadoop:hadoop-minicluster") { exclude group: 'org.apache.avro', module: 'avro' diff --git a/versions.props b/versions.props index 99dbea48a244..3739ab1748cf 100644 --- a/versions.props +++ b/versions.props @@ -29,6 +29,7 @@ org.scala-lang.modules:scala-collection-compat_2.13 = 2.6.0 com.emc.ecs:object-client-bundle = 3.3.2 org.immutables:value = 2.9.2 
net.snowflake:snowflake-jdbc = 3.13.22 +org.apache.hudi:* = 0.12.0 # test deps org.junit.vintage:junit-vintage-engine = 5.8.2 From fb479a6cec5f10d0f8d4737b840f74d80493abd9 Mon Sep 17 00:00:00 2001 From: Rushan Jiang Date: Tue, 7 Feb 2023 21:42:29 -0500 Subject: [PATCH 18/20] handle multiple partition columns --- .../iceberg/hudi/TestSnapshotHudiTable.java | 47 ++++++++++++++++++- .../hudi/BaseSnapshotHudiTableAction.java | 18 +++++-- 2 files changed, 60 insertions(+), 5 deletions(-) diff --git a/hudi/src/integration/java/org/apache/iceberg/hudi/TestSnapshotHudiTable.java b/hudi/src/integration/java/org/apache/iceberg/hudi/TestSnapshotHudiTable.java index 5d3e33d4921d..55864a93f380 100644 --- a/hudi/src/integration/java/org/apache/iceberg/hudi/TestSnapshotHudiTable.java +++ b/hudi/src/integration/java/org/apache/iceberg/hudi/TestSnapshotHudiTable.java @@ -271,6 +271,27 @@ public void testSnapshotWithAdditionalProperties() { newTableIdentifier, ImmutableMap.of("test", "test"), partitionedLocation); } + @Test + public void testSnapshotWithComplexKeyGen() { + writeHoodieTableKeyGenerator( + multiDataFrame(0, 1), + "decimalCol,dateCol", + "magic_number", + "zpartitionPath,partitionPath,partitionPath2", + SaveMode.Append, + partitionedLocation, + partitionedIdentifier); + String newTableIdentifier = destName(icebergCatalogName, "alpha_iceberg_table_6"); + SnapshotHudiTable.Result result = + HudiToIcebergMigrationSparkIntegration.snapshotHudiTable( + spark, partitionedLocation, newTableIdentifier) + .tableProperties(ImmutableMap.of("test", "test")) + .execute(); + checkSnapshotIntegrity(partitionedLocation, newTableIdentifier); + checkIcebergTableProperties( + newTableIdentifier, ImmutableMap.of("test", "test"), partitionedLocation); + } + private void checkSnapshotIntegrity(String hudiTableLocation, String icebergTableIdentifier) { Dataset hudiResult = spark @@ -373,7 +394,9 @@ private Dataset multiDataFrame(int start, int end) { "structCol2", expr( "STRUCT(innerStruct3, 
STRUCT(SHA1(CAST(random2 AS BINARY)), SHA1(CAST(random3 AS BINARY))))")) - .withColumn("partitionPath", expr("CAST(id AS STRING)")); + .withColumn("zpartitionPath", expr("CAST(dateCol AS STRING)")) + .withColumn("partitionPath", expr("CAST(id AS STRING)")) + .withColumn("partitionPath2", expr("CAST(random1 AS STRING)")); } private Dataset nestedDataFrame() { @@ -425,6 +448,28 @@ private void writeHoodieTable( .save(tableLocation); } + private void writeHoodieTableKeyGenerator( + Dataset df, + String recordKey, + String preCombineKey, + String partitionPathField, + SaveMode saveMode, + String tableLocation, + String tableIdentifier) { + df.write() + .format("hudi") + // .options(QuickstartUtils.getQuickstartWriteConfigs()) + .option(DataSourceWriteOptions.RECORDKEY_FIELD().key(), recordKey) + .option(DataSourceWriteOptions.PRECOMBINE_FIELD().key(), preCombineKey) + .option(DataSourceWriteOptions.PARTITIONPATH_FIELD().key(), partitionPathField) + .option( + DataSourceWriteOptions.KEYGENERATOR_CLASS_NAME().key(), + "org.apache.hudi.keygen.ComplexKeyGenerator") + .option(HoodieWriteConfig.TBL_NAME.key(), tableIdentifier) + .mode(saveMode) + .save(tableLocation); + } + private void writeHoodieTableOperation( Dataset df, String operationKey, diff --git a/hudi/src/main/java/org/apache/iceberg/hudi/BaseSnapshotHudiTableAction.java b/hudi/src/main/java/org/apache/iceberg/hudi/BaseSnapshotHudiTableAction.java index 7b1e25791d3c..3a70d559f284 100644 --- a/hudi/src/main/java/org/apache/iceberg/hudi/BaseSnapshotHudiTableAction.java +++ b/hudi/src/main/java/org/apache/iceberg/hudi/BaseSnapshotHudiTableAction.java @@ -70,6 +70,7 @@ import org.apache.iceberg.mapping.NameMappingParser; import org.apache.iceberg.orc.OrcMetrics; import org.apache.iceberg.parquet.ParquetUtil; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; import 
org.apache.iceberg.relocated.com.google.common.collect.Lists; import org.apache.iceberg.relocated.com.google.common.collect.Maps; @@ -317,7 +318,17 @@ private DataFile buildDataFileFromHoodieBaseFile( PartitionSpec spec = table.spec(); String path = baseFile.getPath(); long fileSize = baseFile.getFileSize(); - String partitionValue = fileGroup.getPartitionPath(); + String[] partitionValues = fileGroup.getPartitionPath().split("/"); + List partitionFields = spec.fields(); + Preconditions.checkState( + partitionValues.length == partitionFields.size(), "Invalid partition values"); + // map partition values to spec + ImmutableMap.Builder partitionValueMapBuilder = ImmutableMap.builder(); + ImmutableMap partitionValueMap; + for (int i = 0; i < partitionFields.size(); i++) { + partitionValueMapBuilder.put(partitionFields.get(i).name(), partitionValues[i]); + } + partitionValueMap = partitionValueMapBuilder.build(); MetricsConfig metricsConfig = MetricsConfig.forTable(table); String nameMappingString = table.properties().get(TableProperties.DEFAULT_NAME_MAPPING); @@ -329,9 +340,8 @@ private DataFile buildDataFileFromHoodieBaseFile( Metrics metrics = getMetricsForFile(file, format, metricsConfig, nameMapping); String partition = - spec.fields().stream() - .map(PartitionField::name) - .map(name -> String.format("%s=%s", name, partitionValue)) + partitionValueMap.entrySet().stream() + .map(e -> String.format("%s=%s", e.getKey(), e.getValue())) .collect(Collectors.joining("/")); return DataFiles.builder(spec) From 42f97096c94c4ab8ce72dda39dda4a9fff66c4d0 Mon Sep 17 00:00:00 2001 From: Rushan Jiang Date: Tue, 7 Feb 2023 22:20:17 -0500 Subject: [PATCH 19/20] fix bug for unpartitioned table --- .../org/apache/iceberg/hudi/BaseSnapshotHudiTableAction.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hudi/src/main/java/org/apache/iceberg/hudi/BaseSnapshotHudiTableAction.java b/hudi/src/main/java/org/apache/iceberg/hudi/BaseSnapshotHudiTableAction.java index 
3a70d559f284..d6f5efdae5c8 100644 --- a/hudi/src/main/java/org/apache/iceberg/hudi/BaseSnapshotHudiTableAction.java +++ b/hudi/src/main/java/org/apache/iceberg/hudi/BaseSnapshotHudiTableAction.java @@ -321,7 +321,7 @@ private DataFile buildDataFileFromHoodieBaseFile( String[] partitionValues = fileGroup.getPartitionPath().split("/"); List partitionFields = spec.fields(); Preconditions.checkState( - partitionValues.length == partitionFields.size(), "Invalid partition values"); + partitionValues.length == partitionFields.size() || partitionFields.isEmpty(), "Invalid partition values"); // map partition values to spec ImmutableMap.Builder partitionValueMapBuilder = ImmutableMap.builder(); ImmutableMap partitionValueMap; From dde7fcd548a7cb59030972e397bb528ed3eba0c4 Mon Sep 17 00:00:00 2001 From: Rushan Jiang Date: Wed, 8 Feb 2023 01:53:27 -0500 Subject: [PATCH 20/20] checked multi partitions --- .../apache/iceberg/hudi/TestSnapshotHudiTable.java | 12 ++++++------ .../iceberg/hudi/BaseSnapshotHudiTableAction.java | 3 ++- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/hudi/src/integration/java/org/apache/iceberg/hudi/TestSnapshotHudiTable.java b/hudi/src/integration/java/org/apache/iceberg/hudi/TestSnapshotHudiTable.java index 55864a93f380..02c5db45d0b9 100644 --- a/hudi/src/integration/java/org/apache/iceberg/hudi/TestSnapshotHudiTable.java +++ b/hudi/src/integration/java/org/apache/iceberg/hudi/TestSnapshotHudiTable.java @@ -173,7 +173,7 @@ public void testMultiCommitTable() { initialDataFrame, "decimalCol", "magic_number", - "partitionPath", + "partitionPath2", SaveMode.Append, multiCommitTableLocation, multiCommitIdentifier); @@ -181,7 +181,7 @@ public void testMultiCommitTable() { initialDataFrame, "decimalCol", "magic_number", - "partitionPath", + "partitionPath2", SaveMode.Append, multiCommitTableLocation, multiCommitIdentifier); @@ -189,7 +189,7 @@ public void testMultiCommitTable() { multiDataFrame(2, 5), "decimalCol", "magic_number", - 
"partitionPath", + "partitionPath2", SaveMode.Append, multiCommitTableLocation, multiCommitIdentifier); @@ -197,7 +197,7 @@ public void testMultiCommitTable() { multiDataFrame(0, 1), "decimalCol", "magic_number", - "partitionPath", + "partitionPath2", SaveMode.Append, multiCommitTableLocation, multiCommitIdentifier); @@ -206,7 +206,7 @@ public void testMultiCommitTable() { toDelete, "decimalCol", "magic_number", - "partitionPath", + "partitionPath2", SaveMode.Append, multiCommitTableLocation, multiCommitIdentifier); @@ -215,7 +215,7 @@ public void testMultiCommitTable() { DataSourceWriteOptions.DELETE_OPERATION_OPT_VAL(), "decimalCol", "magic_number", - "partitionPath", + "partitionPath2", SaveMode.Append, multiCommitTableLocation, multiCommitIdentifier); diff --git a/hudi/src/main/java/org/apache/iceberg/hudi/BaseSnapshotHudiTableAction.java b/hudi/src/main/java/org/apache/iceberg/hudi/BaseSnapshotHudiTableAction.java index d6f5efdae5c8..c80a6600f64f 100644 --- a/hudi/src/main/java/org/apache/iceberg/hudi/BaseSnapshotHudiTableAction.java +++ b/hudi/src/main/java/org/apache/iceberg/hudi/BaseSnapshotHudiTableAction.java @@ -321,7 +321,8 @@ private DataFile buildDataFileFromHoodieBaseFile( String[] partitionValues = fileGroup.getPartitionPath().split("/"); List partitionFields = spec.fields(); Preconditions.checkState( - partitionValues.length == partitionFields.size() || partitionFields.isEmpty(), "Invalid partition values"); + partitionValues.length == partitionFields.size() || partitionFields.isEmpty(), + "Invalid partition values"); // map partition values to spec ImmutableMap.Builder partitionValueMapBuilder = ImmutableMap.builder(); ImmutableMap partitionValueMap;