From 9b9d5533c4f5471d2bcb978be880bbd167caf8ea Mon Sep 17 00:00:00 2001 From: Ahmed Abualsaud Date: Mon, 23 Feb 2026 11:43:05 -0500 Subject: [PATCH 01/15] add files transform and schematransform --- .../apache/beam/sdk/io/iceberg/AddFiles.java | 279 ++++++++++++++++++ .../AddFilesSchemaTransformProvider.java | 112 +++++++ 2 files changed, 391 insertions(+) create mode 100644 sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/AddFiles.java create mode 100644 sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/AddFilesSchemaTransformProvider.java diff --git a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/AddFiles.java b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/AddFiles.java new file mode 100644 index 000000000000..324e5e80e049 --- /dev/null +++ b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/AddFiles.java @@ -0,0 +1,279 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.sdk.io.iceberg; + +import org.apache.beam.sdk.schemas.Schema; +import org.apache.beam.sdk.schemas.SchemaCoder; +import org.apache.beam.sdk.schemas.SchemaRegistry; +import org.apache.beam.sdk.schemas.transforms.SchemaTransform; +import org.apache.beam.sdk.transforms.DoFn; +import org.apache.beam.sdk.transforms.GroupIntoBatches; +import org.apache.beam.sdk.transforms.MapElements; +import org.apache.beam.sdk.transforms.ParDo; +import org.apache.beam.sdk.transforms.Redistribute; +import org.apache.beam.sdk.transforms.Values; +import org.apache.beam.sdk.transforms.WithKeys; +import org.apache.beam.sdk.values.PCollection; +import org.apache.beam.sdk.values.PCollectionRowTuple; +import org.apache.beam.sdk.values.PCollectionTuple; +import org.apache.beam.sdk.values.Row; +import org.apache.beam.sdk.values.TupleTag; +import org.apache.beam.sdk.values.TupleTagList; +import org.apache.beam.sdk.values.TypeDescriptors; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Splitter; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.Lists; +import org.apache.iceberg.AppendFiles; +import org.apache.iceberg.DataFile; +import org.apache.iceberg.DataFiles; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.Metrics; +import org.apache.iceberg.MetricsConfig; +import org.apache.iceberg.Snapshot; +import org.apache.iceberg.Table; +import org.apache.iceberg.avro.Avro; +import org.apache.iceberg.catalog.TableIdentifier; +import org.apache.iceberg.io.InputFile; +import org.apache.iceberg.orc.OrcMetrics; +import org.apache.iceberg.parquet.ParquetUtil; +import org.checkerframework.checker.nullness.qual.MonotonicNonNull; +import 
org.checkerframework.checker.nullness.qual.Nullable; +import org.joda.time.Duration; + +import java.util.List; + +import static org.apache.beam.sdk.io.iceberg.AddFiles.ConvertToDataFile.DATA_FILES; +import static org.apache.beam.sdk.io.iceberg.AddFiles.ConvertToDataFile.ERRORS; +import static org.apache.beam.sdk.io.iceberg.AddFiles.ConvertToDataFile.ERROR_SCHEMA; +import static org.apache.beam.sdk.util.Preconditions.checkStateNotNull; + +public class AddFiles extends SchemaTransform { + private final IcebergCatalogConfig catalogConfig; + private final String tableIdentifier; + private final Duration intervalTrigger; + private final int numFilesTrigger; + private final @Nullable String locationPrefix; + private static final int DEFAULT_FILES_TRIGGER = 100_000; + private static final Duration DEFAULT_TRIGGER_INTERVAL = Duration.standardMinutes(10); + + public AddFiles( + IcebergCatalogConfig catalogConfig, + String tableIdentifier, + @Nullable String locationPrefix, + @Nullable Integer numFilesTrigger, + @Nullable Duration intervalTrigger) { + System.out.println("got catalog config: " + catalogConfig); + this.catalogConfig = catalogConfig; + this.tableIdentifier = tableIdentifier; + this.intervalTrigger = intervalTrigger != null ? intervalTrigger : DEFAULT_TRIGGER_INTERVAL; + this.numFilesTrigger = numFilesTrigger != null ? numFilesTrigger : DEFAULT_FILES_TRIGGER; + this.locationPrefix = locationPrefix; + } + + @Override + public PCollectionRowTuple expand(PCollectionRowTuple input) { + PCollection filePaths = input.getSinglePCollection(); + Schema inputSchema = filePaths.getSchema(); + Preconditions.checkState( + inputSchema.getFieldCount() == 1 + && inputSchema.getField(0).getType().getTypeName().equals(Schema.TypeName.STRING) + && !inputSchema.getField(0).getType().getNullable(), + "Incoming Row Schema contain only one (required) field of type String."); + + PCollectionTuple dataFiles = + filePaths + .apply( + "ExtractPaths", + MapElements.into(TypeDescriptors.strings()) + .via(row -> checkStateNotNull(row.getString(0)))) + .apply(Redistribute.arbitrarily()) + .apply( + "ConvertToDataFiles", + ParDo.of(new ConvertToDataFile(catalogConfig, tableIdentifier, locationPrefix)) + .withOutputTags(DATA_FILES, TupleTagList.of(ERRORS))); + SchemaCoder sdfSchema; + try { + sdfSchema = SchemaRegistry.createDefault().getSchemaCoder(SerializableDataFile.class); + } catch (Exception e) { + throw new RuntimeException(e); + } + dataFiles.get(ERRORS).setRowSchema(ERROR_SCHEMA); + PCollection snapshots = + dataFiles + .get(DATA_FILES) + .setCoder(sdfSchema) + .apply("AddStaticKey", WithKeys.of((Void) null)) + .apply( + GroupIntoBatches.ofSize(numFilesTrigger) + .withMaxBufferingDuration(intervalTrigger)) + .apply("DropKey", Values.create()) + .apply("AddFilesToIceberg", ParDo.of(new AddFilesDoFn(catalogConfig, tableIdentifier))) + .setRowSchema(SnapshotInfo.getSchema()); + + return PCollectionRowTuple.of("snapshots", snapshots); + } + + static class ConvertToDataFile extends DoFn { + private final IcebergCatalogConfig catalogConfig; + private final String identifier; + public static final TupleTag ERRORS = new TupleTag<>(); + public static final TupleTag DATA_FILES = new TupleTag<>(); + public static final Schema ERROR_SCHEMA = + Schema.builder().addStringField("file").addStringField("error").build(); + private final @Nullable String prefix; + private static volatile @MonotonicNonNull Table sharedTable; + private boolean isPartitioned = false; + + public ConvertToDataFile( + IcebergCatalogConfig 
catalogConfig, String identifier, @Nullable String prefix) { + this.catalogConfig = catalogConfig; + this.identifier = identifier; + this.prefix = prefix; + } + + @Setup + public void start() { + if (sharedTable == null) { + synchronized (ConvertToDataFile.class) { + if (sharedTable == null) { + sharedTable = catalogConfig.catalog().loadTable(TableIdentifier.parse(identifier)); + } + } + } + isPartitioned = sharedTable.spec().isPartitioned(); + Preconditions.checkArgument( + !isPartitioned || prefix != null, + "A location prefix must be specified when adding files to a partitioned table."); + } + + private static final String UNKNOWN_FORMAT_ERROR = "Could not determine the file's format"; + private static final String PREFIX_ERROR = "File did not start with the specified prefix"; + + @ProcessElement + public void process(@Element String filePath, MultiOutputReceiver output) { + final Table table = checkStateNotNull(sharedTable); + boolean isPartitioned = table.spec().isPartitioned(); + if (isPartitioned && !filePath.startsWith(checkStateNotNull(prefix))) { + output + .get(ERRORS) + .output(Row.withSchema(ERROR_SCHEMA).addValues(filePath, PREFIX_ERROR).build()); + return; + } + + InputFile inputFile = table.io().newInputFile(filePath); + FileFormat format; + try { + format = inferFormat(inputFile.location()); + } catch (UnknownFormat e) { + output + .get(ERRORS) + .output(Row.withSchema(ERROR_SCHEMA).addValues(filePath, UNKNOWN_FORMAT_ERROR).build()); + return; + } + Metrics metrics = getFileMetrics(inputFile, format, MetricsConfig.forTable(table)); + + String partitionPath = getPartitionPath(filePath); + DataFile df = + DataFiles.builder(table.spec()) + .withPath(filePath) + .withFormat(format) + .withMetrics(metrics) + .withFileSizeInBytes(inputFile.getLength()) + .withPartitionPath(partitionPath) + .build(); + + output.get(DATA_FILES).output(SerializableDataFile.from(df, partitionPath)); + } + + private String getPartitionPath(String filePath) { + if (!isPartitioned) { + return ""; + } + String partitionPath = filePath.substring(checkStateNotNull(prefix).length()); + List components = Lists.newArrayList(Splitter.on('/').split(partitionPath)); + if (!components.isEmpty()) { + components = components.subList(0, components.size() - 1); + } + return String.join("/", components); + } + } + + static class AddFilesDoFn extends DoFn, Row> { + private final IcebergCatalogConfig catalogConfig; + private final String identifier; + private @MonotonicNonNull Table table = null; + + public AddFilesDoFn(IcebergCatalogConfig catalogConfig, String identifier) { + this.catalogConfig = catalogConfig; + this.identifier = identifier; + } + + @Setup + public void start() { + table = catalogConfig.catalog().loadTable(TableIdentifier.parse(identifier)); + } + + @ProcessElement + public void process(@Element Iterable files, OutputReceiver output) { + if (!files.iterator().hasNext()) { + return; + } + Table table = checkStateNotNull(this.table); + + AppendFiles appendFiles = table.newAppend(); + for (SerializableDataFile file : files) { + DataFile df = file.createDataFile(table.specs()); + appendFiles.appendFile(df); + } + + appendFiles.commit(); + Snapshot snapshot = table.currentSnapshot(); + output.output(SnapshotInfo.fromSnapshot(snapshot).toRow()); + } + } + + @SuppressWarnings("argument") + public static Metrics getFileMetrics(InputFile file, FileFormat format, MetricsConfig config) { + switch (format) { + case PARQUET: + return ParquetUtil.fileMetrics(file, config); + case ORC: + return 
OrcMetrics.fromInputFile(file, config); + case AVRO: + return new Metrics(Avro.rowCount(file), null, null, null, null); + default: + throw new UnsupportedOperationException("Unsupported format: " + format); + } + } + + public static FileFormat inferFormat(String path) { + String lowerPath = path.toLowerCase(); + + if (lowerPath.endsWith(".parquet") || lowerPath.endsWith(".pqt")) { + return FileFormat.PARQUET; + } else if (lowerPath.endsWith(".orc")) { + return FileFormat.ORC; + } else if (lowerPath.endsWith(".avro")) { + return FileFormat.AVRO; + } + + throw new UnknownFormat(); + } + + static class UnknownFormat extends IllegalArgumentException {} +} diff --git a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/AddFilesSchemaTransformProvider.java b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/AddFilesSchemaTransformProvider.java new file mode 100644 index 000000000000..b402902cba1d --- /dev/null +++ b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/AddFilesSchemaTransformProvider.java @@ -0,0 +1,112 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.sdk.io.iceberg; + +import com.google.auto.service.AutoService; +import com.google.auto.value.AutoValue; +import org.apache.beam.sdk.schemas.AutoValueSchema; +import org.apache.beam.sdk.schemas.annotations.DefaultSchema; +import org.apache.beam.sdk.schemas.annotations.SchemaFieldDescription; +import org.apache.beam.sdk.schemas.transforms.SchemaTransformProvider; +import org.apache.beam.sdk.schemas.transforms.TypedSchemaTransformProvider; +import org.checkerframework.checker.nullness.qual.Nullable; +import org.joda.time.Duration; + +import java.util.Map; + +import static org.apache.beam.sdk.io.iceberg.AddFilesSchemaTransformProvider.Configuration; + +@AutoService(SchemaTransformProvider.class) +public class AddFilesSchemaTransformProvider extends TypedSchemaTransformProvider { + @Override + public AddFiles from(Configuration configuration) { + @Nullable Integer frequency = configuration.getTriggeringFrequencySeconds(); + + return new AddFiles( + configuration.getIcebergCatalog(), + configuration.getTable(), + configuration.getLocationPrefix(), + configuration.getAppendBatchSize(), + frequency != null ? 
Duration.standardSeconds(frequency) : null); + } + + @Override + public String identifier() { + return "beam:schematransform:iceberg_add_files:v1"; + } + + @DefaultSchema(AutoValueSchema.class) + @AutoValue + public abstract static class Configuration { + public static Builder builder() { + return new AutoValue_AddFilesSchemaTransformProvider_Configuration.Builder(); + } + + @SchemaFieldDescription("A fully-qualified table identifier.") + public abstract String getTable(); + + @SchemaFieldDescription("Properties used to set up the Iceberg catalog.") + public abstract @Nullable Map getCatalogProperties(); + + @SchemaFieldDescription("Properties passed to the Hadoop ") + public abstract @Nullable Map getConfigProperties(); + + @SchemaFieldDescription( + "For a streaming pipeline, sets the frequency at which incoming files are appended. Defaults to 600 (10 minutes). " + + "A commit is triggered when either this or append batch size is reached.") + public abstract @Nullable Integer getTriggeringFrequencySeconds(); + + @SchemaFieldDescription( + "For a streaming pipeline, sets the desired number of appended files per commit. Defaults to 100,000 files. " + + "A commit is triggered when either this or append triggering interval is reached.") + public abstract @Nullable Integer getAppendBatchSize(); + + @SchemaFieldDescription( + "The prefix shared among all partitions. For example, a data file may have the following" + + " location:\n" + + "'file:/Users/user/Documents/tmp/namespace/table_name/data/id=13/name=beam/data_file.parquet'\n\n" + + "The provided prefix should go up until the partition information:\n" + + "'file:/Users/user/Documents/tmp/namespace/table_name/data/'.\n" + + "Required if the table is partitioned. ") + public abstract @Nullable String getLocationPrefix(); + + @AutoValue.Builder + public abstract static class Builder { + public abstract Builder setTable(String table); + + public abstract Builder setCatalogProperties(Map catalogProperties); + + public abstract Builder setConfigProperties(Map confProperties); + + public abstract Builder setTriggeringFrequencySeconds(Integer triggeringFrequencySeconds); + + public abstract Builder setAppendBatchSize(Integer size); + + public abstract Builder setLocationPrefix(String prefix); + + public abstract Configuration build(); + } + + public IcebergCatalogConfig getIcebergCatalog() { + return IcebergCatalogConfig.builder() + .setCatalogProperties(getCatalogProperties()) + .setConfigProperties(getConfigProperties()) + .build(); + } + } +} From 98328b8e85bed28a66298c5bbdab490433ef7e2c Mon Sep 17 00:00:00 2001 From: Ahmed Abualsaud Date: Tue, 24 Feb 2026 13:01:21 -0500 Subject: [PATCH 02/15] minor fixes --- .../org/apache/beam/sdk/io/iceberg/AddFiles.java | 14 ++++++++------ .../iceberg/AddFilesSchemaTransformProvider.java | 7 +++---- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/AddFiles.java b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/AddFiles.java index 324e5e80e049..9ed3a52cfff6 100644 --- a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/AddFiles.java +++ b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/AddFiles.java @@ -22,6 +22,7 @@ import org.apache.beam.sdk.schemas.SchemaRegistry; import org.apache.beam.sdk.schemas.transforms.SchemaTransform; import org.apache.beam.sdk.transforms.DoFn; +import org.apache.beam.sdk.transforms.Filter; import org.apache.beam.sdk.transforms.GroupIntoBatches; 
import org.apache.beam.sdk.transforms.MapElements; import org.apache.beam.sdk.transforms.ParDo; @@ -77,7 +78,6 @@ public AddFiles( @Nullable String locationPrefix, @Nullable Integer numFilesTrigger, @Nullable Duration intervalTrigger) { - System.out.println("got catalog config: " + catalogConfig); this.catalogConfig = catalogConfig; this.tableIdentifier = tableIdentifier; this.intervalTrigger = intervalTrigger != null ? intervalTrigger : DEFAULT_TRIGGER_INTERVAL; @@ -91,12 +91,13 @@ public PCollectionRowTuple expand(PCollectionRowTuple input) { Schema inputSchema = filePaths.getSchema(); Preconditions.checkState( inputSchema.getFieldCount() == 1 - && inputSchema.getField(0).getType().getTypeName().equals(Schema.TypeName.STRING) - && !inputSchema.getField(0).getType().getNullable(), - "Incoming Row Schema contain only one (required) field of type String."); + && inputSchema.getField(0).getType().getTypeName().equals(Schema.TypeName.STRING), + "Incoming Row Schema must contain only one field of type String. Instead, got schema: %s", + inputSchema); PCollectionTuple dataFiles = filePaths + .apply("Filter empty paths", Filter.by(row -> row.getString(0) != null)) .apply( "ExtractPaths", MapElements.into(TypeDescriptors.strings()) @@ -112,7 +113,7 @@ public PCollectionRowTuple expand(PCollectionRowTuple input) { } catch (Exception e) { throw new RuntimeException(e); } - dataFiles.get(ERRORS).setRowSchema(ERROR_SCHEMA); + PCollection snapshots = dataFiles .get(DATA_FILES) @@ -125,7 +126,8 @@ public PCollectionRowTuple expand(PCollectionRowTuple input) { .apply("AddFilesToIceberg", ParDo.of(new AddFilesDoFn(catalogConfig, tableIdentifier))) .setRowSchema(SnapshotInfo.getSchema()); - return PCollectionRowTuple.of("snapshots", snapshots); + return PCollectionRowTuple.of( + "snapshots", snapshots, "errors", dataFiles.get(ERRORS).setRowSchema(ERROR_SCHEMA)); } static class ConvertToDataFile extends DoFn { diff --git a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/AddFilesSchemaTransformProvider.java b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/AddFilesSchemaTransformProvider.java index b402902cba1d..8322667ac656 100644 --- a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/AddFilesSchemaTransformProvider.java +++ b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/AddFilesSchemaTransformProvider.java @@ -17,8 +17,11 @@ */ package org.apache.beam.sdk.io.iceberg; +import static org.apache.beam.sdk.io.iceberg.AddFilesSchemaTransformProvider.Configuration; + import com.google.auto.service.AutoService; import com.google.auto.value.AutoValue; +import java.util.Map; import org.apache.beam.sdk.schemas.AutoValueSchema; import org.apache.beam.sdk.schemas.annotations.DefaultSchema; import org.apache.beam.sdk.schemas.annotations.SchemaFieldDescription; @@ -27,10 +30,6 @@ import org.checkerframework.checker.nullness.qual.Nullable; import org.joda.time.Duration; -import java.util.Map; - -import static org.apache.beam.sdk.io.iceberg.AddFilesSchemaTransformProvider.Configuration; - @AutoService(SchemaTransformProvider.class) public class AddFilesSchemaTransformProvider extends TypedSchemaTransformProvider { @Override From 49936abbd61b481a553c0057ce9653cfa4a8984c Mon Sep 17 00:00:00 2001 From: Ahmed Abualsaud Date: Thu, 5 Mar 2026 12:28:59 -0500 Subject: [PATCH 03/15] add tests --- .../apache/beam/sdk/io/iceberg/AddFiles.java | 83 +++-- .../AddFilesSchemaTransformProvider.java | 8 +- .../beam/sdk/io/iceberg/AddFilesTest.java | 332 
++++++++++++++++++ 3 files changed, 388 insertions(+), 35 deletions(-) create mode 100644 sdks/java/io/iceberg/src/test/java/org/apache/beam/sdk/io/iceberg/AddFilesTest.java diff --git a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/AddFiles.java b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/AddFiles.java index 9ed3a52cfff6..caf966d9a819 100644 --- a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/AddFiles.java +++ b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/AddFiles.java @@ -17,6 +17,15 @@ */ package org.apache.beam.sdk.io.iceberg; +import static org.apache.beam.sdk.io.iceberg.AddFiles.ConvertToDataFile.DATA_FILES; +import static org.apache.beam.sdk.io.iceberg.AddFiles.ConvertToDataFile.ERRORS; +import static org.apache.beam.sdk.io.iceberg.AddFiles.ConvertToDataFile.ERROR_SCHEMA; +import static org.apache.beam.sdk.metrics.Metrics.counter; +import static org.apache.beam.sdk.util.Preconditions.checkStateNotNull; + +import java.util.Map; +import java.util.concurrent.ConcurrentHashMap; +import org.apache.beam.sdk.metrics.Counter; import org.apache.beam.sdk.schemas.Schema; import org.apache.beam.sdk.schemas.SchemaCoder; import org.apache.beam.sdk.schemas.SchemaRegistry; @@ -37,8 +46,7 @@ import org.apache.beam.sdk.values.TupleTagList; import org.apache.beam.sdk.values.TypeDescriptors; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Splitter; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.Lists; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Strings; import org.apache.iceberg.AppendFiles; import org.apache.iceberg.DataFile; import org.apache.iceberg.DataFiles; @@ -55,22 +63,24 @@ import org.checkerframework.checker.nullness.qual.MonotonicNonNull; import org.checkerframework.checker.nullness.qual.Nullable; import org.joda.time.Duration; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; -import java.util.List; - -import static org.apache.beam.sdk.io.iceberg.AddFiles.ConvertToDataFile.DATA_FILES; -import static org.apache.beam.sdk.io.iceberg.AddFiles.ConvertToDataFile.ERRORS; -import static org.apache.beam.sdk.io.iceberg.AddFiles.ConvertToDataFile.ERROR_SCHEMA; -import static org.apache.beam.sdk.util.Preconditions.checkStateNotNull; - +/** + * A transform that takes in a stream of file paths, converts them to Iceberg {@link DataFile}s with + * partition metadata and metrics, then commits them to an Iceberg {@link Table}. 
+ */ public class AddFiles extends SchemaTransform { + private static final Duration DEFAULT_TRIGGER_INTERVAL = Duration.standardMinutes(10); + private static final Counter numFilesAdded = counter(AddFiles.class, "numFilesAdded"); + private static final Counter numErrorFiles = counter(AddFiles.class, "numErrorFiles"); + private static final Logger LOG = LoggerFactory.getLogger(AddFiles.class); + private static final int DEFAULT_FILES_TRIGGER = 100_000; private final IcebergCatalogConfig catalogConfig; private final String tableIdentifier; private final Duration intervalTrigger; private final int numFilesTrigger; private final @Nullable String locationPrefix; - private static final int DEFAULT_FILES_TRIGGER = 100_000; - private static final Duration DEFAULT_TRIGGER_INTERVAL = Duration.standardMinutes(10); public AddFiles( IcebergCatalogConfig catalogConfig, @@ -87,6 +97,15 @@ public AddFiles( @Override public PCollectionRowTuple expand(PCollectionRowTuple input) { + LOG.info( + "AddFiles configured to commit after accumulating {} files, or after {} seconds.", + numFilesTrigger, + intervalTrigger.getStandardSeconds()); + if (!Strings.isNullOrEmpty(locationPrefix)) { + LOG.info( + "AddFiles configured to build partition metadata after the prefix: '{}'", locationPrefix); + } + PCollection filePaths = input.getSinglePCollection(); Schema inputSchema = filePaths.getSchema(); Preconditions.checkState( @@ -123,7 +142,9 @@ public PCollectionRowTuple expand(PCollectionRowTuple input) { GroupIntoBatches.ofSize(numFilesTrigger) .withMaxBufferingDuration(intervalTrigger)) .apply("DropKey", Values.create()) - .apply("AddFilesToIceberg", ParDo.of(new AddFilesDoFn(catalogConfig, tableIdentifier))) + .apply( + "CommitFilesToIceberg", + ParDo.of(new CommitFilesDoFn(catalogConfig, tableIdentifier))) .setRowSchema(SnapshotInfo.getSchema()); return PCollectionRowTuple.of( @@ -138,7 +159,7 @@ static class ConvertToDataFile extends DoFn { public static final Schema ERROR_SCHEMA = Schema.builder().addStringField("file").addStringField("error").build(); private final @Nullable String prefix; - private static volatile @MonotonicNonNull Table sharedTable; + private static final Map tableCache = new ConcurrentHashMap<>(); private boolean isPartitioned = false; public ConvertToDataFile( @@ -150,30 +171,27 @@ public ConvertToDataFile( @Setup public void start() { - if (sharedTable == null) { - synchronized (ConvertToDataFile.class) { - if (sharedTable == null) { - sharedTable = catalogConfig.catalog().loadTable(TableIdentifier.parse(identifier)); - } - } - } - isPartitioned = sharedTable.spec().isPartitioned(); + Table table = + tableCache.computeIfAbsent( + identifier, (id) -> catalogConfig.catalog().loadTable(TableIdentifier.parse(id))); + isPartitioned = table.spec().isPartitioned(); Preconditions.checkArgument( !isPartitioned || prefix != null, "A location prefix must be specified when adding files to a partitioned table."); } private static final String UNKNOWN_FORMAT_ERROR = "Could not determine the file's format"; - private static final String PREFIX_ERROR = "File did not start with the specified prefix"; + private static final String PREFIX_ERROR = "File path did not start with the specified prefix"; @ProcessElement public void process(@Element String filePath, MultiOutputReceiver output) { - final Table table = checkStateNotNull(sharedTable); + final Table table = checkStateNotNull(tableCache.get(identifier)); boolean isPartitioned = table.spec().isPartitioned(); if (isPartitioned && 
!filePath.startsWith(checkStateNotNull(prefix))) { output .get(ERRORS) .output(Row.withSchema(ERROR_SCHEMA).addValues(filePath, PREFIX_ERROR).build()); + numErrorFiles.inc(); return; } @@ -185,6 +203,7 @@ public void process(@Element String filePath, MultiOutputReceiver output) { output .get(ERRORS) .output(Row.withSchema(ERROR_SCHEMA).addValues(filePath, UNKNOWN_FORMAT_ERROR).build()); + numErrorFiles.inc(); return; } Metrics metrics = getFileMetrics(inputFile, format, MetricsConfig.forTable(table)); @@ -207,20 +226,18 @@ private String getPartitionPath(String filePath) { return ""; } String partitionPath = filePath.substring(checkStateNotNull(prefix).length()); - List components = Lists.newArrayList(Splitter.on('/').split(partitionPath)); - if (!components.isEmpty()) { - components = components.subList(0, components.size() - 1); - } - return String.join("/", components); + int lastSlashIndex = partitionPath.lastIndexOf('/'); + + return lastSlashIndex > 0 ? partitionPath.substring(0, lastSlashIndex) : ""; } } - static class AddFilesDoFn extends DoFn, Row> { + static class CommitFilesDoFn extends DoFn, Row> { private final IcebergCatalogConfig catalogConfig; private final String identifier; - private @MonotonicNonNull Table table = null; + private transient @MonotonicNonNull Table table = null; - public AddFilesDoFn(IcebergCatalogConfig catalogConfig, String identifier) { + public CommitFilesDoFn(IcebergCatalogConfig catalogConfig, String identifier) { this.catalogConfig = catalogConfig; this.identifier = identifier; } @@ -237,13 +254,17 @@ public void process(@Element Iterable files, OutputReceive } Table table = checkStateNotNull(this.table); + int numFiles = 0; AppendFiles appendFiles = table.newAppend(); for (SerializableDataFile file : files) { DataFile df = file.createDataFile(table.specs()); appendFiles.appendFile(df); + numFiles++; } appendFiles.commit(); + numFilesAdded.inc(numFiles); + Snapshot snapshot = table.currentSnapshot(); output.output(SnapshotInfo.fromSnapshot(snapshot).toRow()); } diff --git a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/AddFilesSchemaTransformProvider.java b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/AddFilesSchemaTransformProvider.java index 8322667ac656..ed3ae5bded5a 100644 --- a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/AddFilesSchemaTransformProvider.java +++ b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/AddFilesSchemaTransformProvider.java @@ -77,10 +77,10 @@ public static Builder builder() { @SchemaFieldDescription( "The prefix shared among all partitions. For example, a data file may have the following" - + " location:\n" - + "'file:/Users/user/Documents/tmp/namespace/table_name/data/id=13/name=beam/data_file.parquet'\n\n" - + "The provided prefix should go up until the partition information:\n" - + "'file:/Users/user/Documents/tmp/namespace/table_name/data/'.\n" + + " location:%n" + + "'gs://bucket/namespace/table/data/id=13/name=beam/data_file.parquet'%n%n" + + "The provided prefix should go up until the partition information:%n" + + "'gs://bucket/namespace/table/data/'.%n" + "Required if the table is partitioned. 
") public abstract @Nullable String getLocationPrefix(); diff --git a/sdks/java/io/iceberg/src/test/java/org/apache/beam/sdk/io/iceberg/AddFilesTest.java b/sdks/java/io/iceberg/src/test/java/org/apache/beam/sdk/io/iceberg/AddFilesTest.java new file mode 100644 index 000000000000..a2f48a64d482 --- /dev/null +++ b/sdks/java/io/iceberg/src/test/java/org/apache/beam/sdk/io/iceberg/AddFilesTest.java @@ -0,0 +1,332 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.sdk.io.iceberg; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; + +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Comparator; +import java.util.List; +import java.util.stream.Collectors; +import org.apache.beam.sdk.schemas.Schema; +import org.apache.beam.sdk.testing.PAssert; +import org.apache.beam.sdk.testing.TestPipeline; +import org.apache.beam.sdk.testing.TestStream; +import org.apache.beam.sdk.transforms.Create; +import org.apache.beam.sdk.values.PCollection; +import org.apache.beam.sdk.values.PCollectionRowTuple; +import org.apache.beam.sdk.values.Row; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableMap; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.Iterables; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.Lists; +import org.apache.hadoop.conf.Configuration; +import org.apache.iceberg.DataFile; +import org.apache.iceberg.Files; +import org.apache.iceberg.PartitionKey; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.Snapshot; +import org.apache.iceberg.StructLike; +import org.apache.iceberg.Table; +import org.apache.iceberg.catalog.TableIdentifier; +import org.apache.iceberg.data.GenericRecord; +import org.apache.iceberg.data.Record; +import org.apache.iceberg.data.parquet.GenericParquetWriter; +import org.apache.iceberg.hadoop.HadoopCatalog; +import org.apache.iceberg.io.DataWriter; +import org.apache.iceberg.parquet.Parquet; +import org.apache.iceberg.types.Types; +import org.checkerframework.checker.nullness.qual.Nullable; +import org.joda.time.Duration; +import org.junit.Before; +import org.junit.ClassRule; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; +import org.junit.rules.TestName; +import org.junit.runner.RunWith; +import org.junit.runners.JUnit4; + +@RunWith(JUnit4.class) +public class AddFilesTest { + @Rule public TemporaryFolder temp = new TemporaryFolder(); + private String root; + @Rule public TestPipeline pipeline = TestPipeline.create(); + + private HadoopCatalog catalog; + private TableIdentifier tableId; + private final org.apache.iceberg.Schema icebergSchema = + new 
org.apache.iceberg.Schema( + Types.NestedField.required(1, "id", Types.IntegerType.get()), + Types.NestedField.required(2, "name", Types.StringType.get()), + Types.NestedField.required(3, "age", Types.IntegerType.get())); + private final PartitionSpec spec = + PartitionSpec.builderFor(icebergSchema).identity("age").truncate("name", 3).build(); + private final PartitionKey wrapper = new PartitionKey(spec, icebergSchema); + private IcebergCatalogConfig catalogConfig; + @ClassRule public static final TemporaryFolder TEMPORARY_FOLDER = new TemporaryFolder(); + @Rule public TestName testName = new TestName(); + + @Rule + public transient TestDataWarehouse warehouse = new TestDataWarehouse(TEMPORARY_FOLDER, "default"); + + private static final Schema INPUT_SCHEMA = Schema.builder().addStringField("filepath").build(); + + @Before + public void setup() throws Exception { + // Root for existing data files: + root = temp.getRoot().getAbsolutePath() + "/"; + + // Set up a local Hadoop Catalog using the temporary folder + catalog = new HadoopCatalog(new Configuration(), warehouse.location); + tableId = TableIdentifier.of("default", testName.getMethodName()); + + // Create an unpartitioned table by default, and full table metrics enabled + catalog.createTable( + tableId, + icebergSchema, + PartitionSpec.unpartitioned(), + ImmutableMap.of("write.metadata.metrics.default", "full")); + + catalogConfig = + IcebergCatalogConfig.builder() + .setCatalogProperties( + ImmutableMap.of("type", "hadoop", "warehouse", warehouse.location)) + .build(); + } + + @Test + public void testPartitionedFilesAreCommitted() throws Exception { + testFilesAreCommittedToIceberg(true); + } + + @Test + public void testUnPartitionedFilesAreCommitted() throws Exception { + testFilesAreCommittedToIceberg(false); + } + + public void testFilesAreCommittedToIceberg(boolean isPartitioned) throws Exception { + if (isPartitioned) { + // recreate table with the partition spec + catalog.dropTable(tableId); + catalog.createTable( + tableId, icebergSchema, spec, ImmutableMap.of("write.metadata.metrics.default", "full")); + } + + // 1. Generate two local Parquet file. + // Include Hive-like partition path if testing partition case + String partitionPath1 = isPartitioned ? "age=20/name_trunc=Mar/" : ""; + String file1 = root + partitionPath1 + "data1.parquet"; + wrapper.wrap(record(-1, "Mar", 20)); + DataWriter writer = createWriter(file1, isPartitioned ? wrapper.copy() : null); + writer.write(record(1, "Mark", 20)); + writer.write(record(2, "Martin", 20)); + writer.close(); + + String partitionPath2 = isPartitioned ? "age=25/name_trunc=Sam/" : ""; + String file2 = root + partitionPath2 + "data2.parquet"; + wrapper.wrap(record(-1, "Sam", 25)); + DataWriter writer2 = createWriter(file2, isPartitioned ? wrapper.copy() : null); + writer2.write(record(3, "Samantha", 25)); + writer2.write(record(4, "Sammy", 25)); + writer2.close(); + + // 2. Setup the input PCollection + Row row1 = Row.withSchema(INPUT_SCHEMA).addValue(file1).build(); + Row row2 = Row.withSchema(INPUT_SCHEMA).addValue(file2).build(); + PCollection inputFiles = + pipeline.apply("Create Input", Create.of(row1, row2).withRowSchema(INPUT_SCHEMA)); + + // 3. Apply the transform (Trigger aggressively for testing) + PCollectionRowTuple output = + PCollectionRowTuple.of("input", inputFiles) + .apply( + new AddFiles( + catalogConfig, + tableId.toString(), + isPartitioned ? root : null, + 2, // trigger at 2 files + Duration.standardSeconds(1))); + + // 4. 
Validate PCollection Outputs + PAssert.that(output.get("errors")).empty(); + + // 5. Run the pipeline + pipeline.run().waitUntilFinish(); + + // 6. Validate the Iceberg Table state directly + Table table = catalog.loadTable(tableId); + + // Check that we have exactly 1 snapshot with 2 files + assertEquals(1, Iterables.size(table.snapshots())); + + List addedFiles = + Lists.newArrayList(table.currentSnapshot().addedDataFiles(table.io())); + assertEquals(2, addedFiles.size()); + + // Verify file paths + assertTrue(addedFiles.stream().anyMatch(df -> df.location().contains("data1.parquet"))); + assertTrue(addedFiles.stream().anyMatch(df -> df.location().contains("data2.parquet"))); + + // check metrics metadata is preserved + DataFile writtenDf1 = writer.toDataFile(); + DataFile writtenDf2 = writer2.toDataFile(); + DataFile addedDf1 = + Iterables.getOnlyElement( + addedFiles.stream() + .filter(df -> df.location().contains("data1.parquet")) + .collect(Collectors.toList())); + DataFile addedDf2 = + Iterables.getOnlyElement( + addedFiles.stream() + .filter(df -> df.location().contains("data2.parquet")) + .collect(Collectors.toList())); + + assertEquals(writtenDf1.lowerBounds(), addedDf1.lowerBounds()); + assertEquals(writtenDf1.upperBounds(), addedDf1.upperBounds()); + assertEquals(writtenDf2.lowerBounds(), addedDf2.lowerBounds()); + assertEquals(writtenDf2.upperBounds(), addedDf2.upperBounds()); + + // check partition metadata is preserved + assertEquals(writtenDf1.partition(), addedDf1.partition()); + assertEquals(writtenDf2.partition(), addedDf2.partition()); + } + + @Test + public void testStreamingAdds() throws IOException { + List paths = new ArrayList<>(); + for (int i = 0; i < 100; i++) { + String file = String.format("%sdata_%s.parquet", root, i); + DataWriter writer = createWriter(file); + writer.write(record(1, "SomeName", 30)); + writer.close(); + paths.add(Row.withSchema(INPUT_SCHEMA).addValue(file).build()); + } + + PCollection files = + pipeline.apply( + TestStream.create(INPUT_SCHEMA) + .addElements( + paths.get(0), paths.subList(1, 15).toArray(new Row[] {})) // should commit twice + .advanceProcessingTime(Duration.standardSeconds(10)) + .addElements( + paths.get(15), + paths.subList(16, 40).toArray(new Row[] {})) // should commit 3 times + .advanceProcessingTime(Duration.standardSeconds(10)) + .addElements( + paths.get(40), + paths.subList(41, 45).toArray(new Row[] {})) // should commit once + .advanceWatermarkToInfinity()); + + PCollectionRowTuple.of("input", files) + .apply( + new AddFiles( + catalogConfig, + tableId.toString(), + null, + 10, // trigger at 10 files + Duration.standardSeconds(5))); + pipeline.run().waitUntilFinish(); + + Table table = catalog.loadTable(tableId); + + List snapshots = Lists.newArrayList(table.snapshots()); + snapshots.sort(Comparator.comparingLong(Snapshot::timestampMillis)); + + assertEquals(6, snapshots.size()); + assertEquals(10, Iterables.size(snapshots.get(0).addedDataFiles(table.io()))); + assertEquals(5, Iterables.size(snapshots.get(1).addedDataFiles(table.io()))); + assertEquals(10, Iterables.size(snapshots.get(2).addedDataFiles(table.io()))); + assertEquals(10, Iterables.size(snapshots.get(3).addedDataFiles(table.io()))); + assertEquals(5, Iterables.size(snapshots.get(4).addedDataFiles(table.io()))); + assertEquals(5, Iterables.size(snapshots.get(5).addedDataFiles(table.io()))); + } + + @Test + public void testUnknownFormatErrors() throws Exception { + // Create a dummy text file (unsupported extension) + File txtFile = 
temp.newFile("unsupported.txt"); + txtFile.createNewFile(); + + Row badRow = Row.withSchema(INPUT_SCHEMA).addValue(txtFile.getAbsolutePath()).build(); + PCollection inputFiles = + pipeline.apply("Create Input", Create.of(badRow).withRowSchema(INPUT_SCHEMA)); + + AddFiles addFiles = new AddFiles(catalogConfig, tableId.toString(), null, 1, null); + PCollectionRowTuple outputTuple = PCollectionRowTuple.of("input", inputFiles).apply(addFiles); + + // Validate the file ended up in the errors PCollection with the correct schema + PAssert.that(outputTuple.get("errors")) + .containsInAnyOrder( + Row.withSchema(AddFiles.ConvertToDataFile.ERROR_SCHEMA) + .addValues(txtFile.getAbsolutePath(), "Could not determine the file's format") + .build()); + + pipeline.run().waitUntilFinish(); + } + + @Test + public void testPartitionPrefixErrors() throws Exception { + // Drop unpartitioned table and create a partitioned one + catalog.dropTable(tableId); + PartitionSpec spec = PartitionSpec.builderFor(icebergSchema).identity("name").build(); + catalog.createTable(tableId, icebergSchema, spec); + + String file1 = root + "data1.parquet"; + wrapper.wrap(record(-1, "And", 30)); + DataWriter writer = createWriter(file1, wrapper.copy()); + writer.write(record(1, "Andrew", 30)); + writer.close(); + + Row row1 = Row.withSchema(INPUT_SCHEMA).addValue(file1).build(); + PCollection inputFiles = + pipeline.apply("Create Input", Create.of(row1).withRowSchema(INPUT_SCHEMA)); + + // Notice locationPrefix is "some/prefix/" but the absolute path doesn't start with it + AddFiles addFiles = new AddFiles(catalogConfig, tableId.toString(), "some/prefix/", 1, null); + PCollectionRowTuple outputTuple = PCollectionRowTuple.of("input", inputFiles).apply(addFiles); + + PAssert.that(outputTuple.get("errors")) + .containsInAnyOrder( + Row.withSchema(AddFiles.ConvertToDataFile.ERROR_SCHEMA) + .addValues(file1, "File path did not start with the specified prefix") + .build()); + + pipeline.run().waitUntilFinish(); + } + + private DataWriter createWriter(String file) throws IOException { + return createWriter(file, null); + } + + private DataWriter createWriter(String file, @Nullable StructLike partition) + throws IOException { + return Parquet.writeData(Files.localOutput(file)) + .schema(icebergSchema) + .withSpec(partition != null ? 
spec : PartitionSpec.unpartitioned()) + .withPartition(partition) + .createWriterFunc(GenericParquetWriter::create) + .build(); + } + + private Record record(int id, String name, int age) { + return GenericRecord.create(icebergSchema).copy("id", id, "name", name, "age", age); + } +} From 3b2e86dbfcd2909ad5f1cce89945e87511b860da Mon Sep 17 00:00:00 2001 From: Ahmed Abualsaud Date: Sat, 7 Mar 2026 15:09:05 -0500 Subject: [PATCH 04/15] create table if needed; determine partition spec from metrics --- sdks/java/io/iceberg/build.gradle | 4 +- .../apache/beam/sdk/io/iceberg/AddFiles.java | 337 +++++++++++++++--- .../AddFilesSchemaTransformProvider.java | 29 +- .../beam/sdk/io/iceberg/PartitionUtils.java | 5 +- .../beam/sdk/io/iceberg/AddFilesTest.java | 164 +++++++-- .../apache/beam/sdk/io/parquet/ParquetIO.java | 4 +- 6 files changed, 458 insertions(+), 85 deletions(-) diff --git a/sdks/java/io/iceberg/build.gradle b/sdks/java/io/iceberg/build.gradle index c188e4f63853..b185a962415f 100644 --- a/sdks/java/io/iceberg/build.gradle +++ b/sdks/java/io/iceberg/build.gradle @@ -51,6 +51,8 @@ dependencies { implementation library.java.joda_time implementation "org.apache.parquet:parquet-column:$parquet_version" implementation "org.apache.parquet:parquet-hadoop:$parquet_version" + implementation "org.apache.parquet:parquet-common:$parquet_version" + implementation project(":sdks:java:io:parquet") implementation "org.apache.orc:orc-core:$orc_version" implementation "org.apache.iceberg:iceberg-core:$iceberg_version" implementation "org.apache.iceberg:iceberg-api:$iceberg_version" @@ -74,11 +76,11 @@ dependencies { testImplementation library.java.bigdataoss_gcsio testImplementation library.java.bigdataoss_util_hadoop testImplementation "org.apache.parquet:parquet-avro:$parquet_version" - testImplementation "org.apache.parquet:parquet-common:$parquet_version" testImplementation "org.apache.iceberg:iceberg-data:$iceberg_version" testImplementation project(path: ":sdks:java:core", configuration: "shadowTest") testImplementation project(":sdks:java:extensions:google-cloud-platform-core") testImplementation library.java.junit + testImplementation library.java.hamcrest // Hive catalog test dependencies testImplementation project(path: ":sdks:java:io:iceberg:hive") diff --git a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/AddFiles.java b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/AddFiles.java index caf966d9a819..afdc92ba186d 100644 --- a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/AddFiles.java +++ b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/AddFiles.java @@ -19,13 +19,24 @@ import static org.apache.beam.sdk.io.iceberg.AddFiles.ConvertToDataFile.DATA_FILES; import static org.apache.beam.sdk.io.iceberg.AddFiles.ConvertToDataFile.ERRORS; -import static org.apache.beam.sdk.io.iceberg.AddFiles.ConvertToDataFile.ERROR_SCHEMA; import static org.apache.beam.sdk.metrics.Metrics.counter; import static org.apache.beam.sdk.util.Preconditions.checkStateNotNull; +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.channels.SeekableByteChannel; +import java.util.List; import java.util.Map; -import java.util.concurrent.ConcurrentHashMap; +import java.util.Objects; +import java.util.UUID; +import java.util.stream.Collectors; +import org.apache.beam.sdk.io.Compression; +import org.apache.beam.sdk.io.FileSystems; +import org.apache.beam.sdk.io.fs.MatchResult; +import org.apache.beam.sdk.io.fs.ResourceId; +import 
org.apache.beam.sdk.io.parquet.ParquetIO.ReadFiles.BeamParquetInputFile; import org.apache.beam.sdk.metrics.Counter; +import org.apache.beam.sdk.options.PipelineOptionsFactory; import org.apache.beam.sdk.schemas.Schema; import org.apache.beam.sdk.schemas.SchemaCoder; import org.apache.beam.sdk.schemas.SchemaRegistry; @@ -36,8 +47,9 @@ import org.apache.beam.sdk.transforms.MapElements; import org.apache.beam.sdk.transforms.ParDo; import org.apache.beam.sdk.transforms.Redistribute; -import org.apache.beam.sdk.transforms.Values; import org.apache.beam.sdk.transforms.WithKeys; +import org.apache.beam.sdk.transforms.windowing.PaneInfo; +import org.apache.beam.sdk.values.KV; import org.apache.beam.sdk.values.PCollection; import org.apache.beam.sdk.values.PCollectionRowTuple; import org.apache.beam.sdk.values.PCollectionTuple; @@ -47,19 +59,30 @@ import org.apache.beam.sdk.values.TypeDescriptors; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Strings; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.Iterables; import org.apache.iceberg.AppendFiles; import org.apache.iceberg.DataFile; import org.apache.iceberg.DataFiles; import org.apache.iceberg.FileFormat; import org.apache.iceberg.Metrics; import org.apache.iceberg.MetricsConfig; +import org.apache.iceberg.PartitionField; +import org.apache.iceberg.PartitionKey; +import org.apache.iceberg.PartitionSpec; import org.apache.iceberg.Snapshot; import org.apache.iceberg.Table; import org.apache.iceberg.avro.Avro; import org.apache.iceberg.catalog.TableIdentifier; +import org.apache.iceberg.exceptions.AlreadyExistsException; import org.apache.iceberg.io.InputFile; import org.apache.iceberg.orc.OrcMetrics; +import org.apache.iceberg.parquet.ParquetSchemaUtil; import org.apache.iceberg.parquet.ParquetUtil; +import org.apache.iceberg.transforms.Transform; +import org.apache.iceberg.types.Conversions; +import org.apache.iceberg.types.Type; +import org.apache.parquet.hadoop.ParquetFileReader; +import org.apache.parquet.schema.MessageType; import org.checkerframework.checker.nullness.qual.MonotonicNonNull; import org.checkerframework.checker.nullness.qual.Nullable; import org.joda.time.Duration; @@ -76,20 +99,29 @@ public class AddFiles extends SchemaTransform { private static final Counter numErrorFiles = counter(AddFiles.class, "numErrorFiles"); private static final Logger LOG = LoggerFactory.getLogger(AddFiles.class); private static final int DEFAULT_FILES_TRIGGER = 100_000; + static final Schema ERROR_SCHEMA = + Schema.builder().addStringField("file").addStringField("error").build(); private final IcebergCatalogConfig catalogConfig; private final String tableIdentifier; private final Duration intervalTrigger; private final int numFilesTrigger; private final @Nullable String locationPrefix; + private final @Nullable List partitionFields; + private final @Nullable Map tableProps; + private final String jobId = UUID.randomUUID().toString(); public AddFiles( IcebergCatalogConfig catalogConfig, String tableIdentifier, @Nullable String locationPrefix, + @Nullable List partitionFields, + @Nullable Map tableProps, @Nullable Integer numFilesTrigger, @Nullable Duration intervalTrigger) { this.catalogConfig = catalogConfig; this.tableIdentifier = tableIdentifier; + this.partitionFields = partitionFields; + this.tableProps = tableProps; this.intervalTrigger = intervalTrigger != null ? 
intervalTrigger : DEFAULT_TRIGGER_INTERVAL; this.numFilesTrigger = numFilesTrigger != null ? numFilesTrigger : DEFAULT_FILES_TRIGGER; this.locationPrefix = locationPrefix; @@ -106,8 +138,7 @@ public PCollectionRowTuple expand(PCollectionRowTuple input) { "AddFiles configured to build partition metadata after the prefix: '{}'", locationPrefix); } - PCollection filePaths = input.getSinglePCollection(); - Schema inputSchema = filePaths.getSchema(); + Schema inputSchema = input.getSinglePCollection().getSchema(); Preconditions.checkState( inputSchema.getFieldCount() == 1 && inputSchema.getField(0).getType().getTypeName().equals(Schema.TypeName.STRING), @@ -115,7 +146,8 @@ public PCollectionRowTuple expand(PCollectionRowTuple input) { inputSchema); PCollectionTuple dataFiles = - filePaths + input + .getSinglePCollection() .apply("Filter empty paths", Filter.by(row -> row.getString(0) != null)) .apply( "ExtractPaths", @@ -124,7 +156,13 @@ public PCollectionRowTuple expand(PCollectionRowTuple input) { .apply(Redistribute.arbitrarily()) .apply( "ConvertToDataFiles", - ParDo.of(new ConvertToDataFile(catalogConfig, tableIdentifier, locationPrefix)) + ParDo.of( + new ConvertToDataFile( + catalogConfig, + tableIdentifier, + locationPrefix, + partitionFields, + tableProps)) .withOutputTags(DATA_FILES, TupleTagList.of(ERRORS))); SchemaCoder sdfSchema; try { @@ -141,10 +179,9 @@ public PCollectionRowTuple expand(PCollectionRowTuple input) { .apply( GroupIntoBatches.ofSize(numFilesTrigger) .withMaxBufferingDuration(intervalTrigger)) - .apply("DropKey", Values.create()) .apply( "CommitFilesToIceberg", - ParDo.of(new CommitFilesDoFn(catalogConfig, tableIdentifier))) + ParDo.of(new CommitFilesDoFn(catalogConfig, tableIdentifier, jobId))) .setRowSchema(SnapshotInfo.getSchema()); return PCollectionRowTuple.of( @@ -156,59 +193,84 @@ static class ConvertToDataFile extends DoFn { private final String identifier; public static final TupleTag ERRORS = new TupleTag<>(); public static final TupleTag DATA_FILES = new TupleTag<>(); - public static final Schema ERROR_SCHEMA = - Schema.builder().addStringField("file").addStringField("error").build(); private final @Nullable String prefix; - private static final Map tableCache = new ConcurrentHashMap<>(); - private boolean isPartitioned = false; + private final @Nullable List partitionFields; + private final @Nullable Map tableProps; + private transient @MonotonicNonNull Table table; public ConvertToDataFile( - IcebergCatalogConfig catalogConfig, String identifier, @Nullable String prefix) { + IcebergCatalogConfig catalogConfig, + String identifier, + @Nullable String prefix, + @Nullable List partitionFields, + @Nullable Map tableProps) { this.catalogConfig = catalogConfig; this.identifier = identifier; this.prefix = prefix; + this.partitionFields = partitionFields; + this.tableProps = tableProps; } - @Setup - public void start() { - Table table = - tableCache.computeIfAbsent( - identifier, (id) -> catalogConfig.catalog().loadTable(TableIdentifier.parse(id))); - isPartitioned = table.spec().isPartitioned(); - Preconditions.checkArgument( - !isPartitioned || prefix != null, - "A location prefix must be specified when adding files to a partitioned table."); - } - - private static final String UNKNOWN_FORMAT_ERROR = "Could not determine the file's format"; private static final String PREFIX_ERROR = "File path did not start with the specified prefix"; + private static final String UNKNOWN_FORMAT_ERROR = "Could not determine the file's format"; + static final String 
UNKNOWN_PARTITION_ERROR = "Could not determine the file's partition"; @ProcessElement - public void process(@Element String filePath, MultiOutputReceiver output) { - final Table table = checkStateNotNull(tableCache.get(identifier)); - boolean isPartitioned = table.spec().isPartitioned(); - if (isPartitioned && !filePath.startsWith(checkStateNotNull(prefix))) { + public void process(@Element String filePath, MultiOutputReceiver output) throws IOException { + FileFormat format; + try { + format = inferFormat(filePath); + } catch (UnknownFormatException e) { output .get(ERRORS) - .output(Row.withSchema(ERROR_SCHEMA).addValues(filePath, PREFIX_ERROR).build()); + .output(Row.withSchema(ERROR_SCHEMA).addValues(filePath, UNKNOWN_FORMAT_ERROR).build()); numErrorFiles.inc(); return; } - InputFile inputFile = table.io().newInputFile(filePath); - FileFormat format; - try { - format = inferFormat(inputFile.location()); - } catch (UnknownFormat e) { + if (table == null) { + table = getOrCreateTable(getSchema(filePath, format)); + } + + // Check if the file path contains the provided prefix + if (table.spec().isPartitioned() + && !Strings.isNullOrEmpty(prefix) + && !filePath.startsWith(checkStateNotNull(prefix))) { output .get(ERRORS) - .output(Row.withSchema(ERROR_SCHEMA).addValues(filePath, UNKNOWN_FORMAT_ERROR).build()); + .output(Row.withSchema(ERROR_SCHEMA).addValues(filePath, PREFIX_ERROR).build()); numErrorFiles.inc(); return; } + + InputFile inputFile = table.io().newInputFile(filePath); + Metrics metrics = getFileMetrics(inputFile, format, MetricsConfig.forTable(table)); - String partitionPath = getPartitionPath(filePath); + // Figure out which partition this DataFile should go to + String partitionPath; + if (table.spec().isUnpartitioned()) { + partitionPath = ""; + } else if (!Strings.isNullOrEmpty(prefix)) { + // option 1: use directory structure to determine partition + // Note: we don't validate the DataFile content here + partitionPath = getPartitionFromFilePath(filePath); + } else { + try { + // option 2: examine DataFile min/max statistics to determine partition + partitionPath = getPartitionFromMetrics(metrics, inputFile); + } catch (UnknownPartitionException e) { + output + .get(ERRORS) + .output( + Row.withSchema(ERROR_SCHEMA) + .addValues(filePath, UNKNOWN_PARTITION_ERROR + ": " + e.getMessage()) + .build()); + numErrorFiles.inc(); + return; + } + } + DataFile df = DataFiles.builder(table.spec()) .withPath(filePath) @@ -221,8 +283,50 @@ public void process(@Element String filePath, MultiOutputReceiver output) { output.get(DATA_FILES).output(SerializableDataFile.from(df, partitionPath)); } - private String getPartitionPath(String filePath) { - if (!isPartitioned) { + private T transformValue(Transform transform, Type type, ByteBuffer bytes) { + S value = Conversions.fromByteBuffer(type, bytes); + return transform.bind(type).apply(value); + } + + private Table getOrCreateTable(org.apache.iceberg.Schema schema) { + PartitionSpec spec = PartitionUtils.toPartitionSpec(partitionFields, schema); + try { + return tableProps == null + ? catalogConfig.catalog().createTable(TableIdentifier.parse(identifier), schema, spec) + : catalogConfig + .catalog() + .createTable(TableIdentifier.parse(identifier), schema, spec, tableProps); + } catch (AlreadyExistsException e) { // if table already exists, just load it + return catalogConfig.catalog().loadTable(TableIdentifier.parse(identifier)); + } + } + + /** + * We don't have a table yet, so we don't know which FileIO to use to read these files. 
Instead, + * we use Beam's FileSystem utilities to read the file and extract its schema to create the + * table + */ + private static org.apache.iceberg.Schema getSchema(String filePath, FileFormat format) + throws IOException { + Preconditions.checkArgument( + format.equals(FileFormat.PARQUET), "Table creation is only supported for Parquet files."); + FileSystems.registerFileSystemsOnce(PipelineOptionsFactory.create()); + + MatchResult result = FileSystems.match(filePath); + ResourceId resourceId = Iterables.getOnlyElement(result.metadata()).resourceId(); + Compression compression = Compression.detect(checkStateNotNull(resourceId.getFilename())); + SeekableByteChannel channel = + (SeekableByteChannel) compression.readDecompressed(FileSystems.open(resourceId)); + BeamParquetInputFile file = new BeamParquetInputFile(channel); + + try (ParquetFileReader reader = ParquetFileReader.open(file)) { + MessageType messageType = reader.getFooter().getFileMetaData().getSchema(); + return ParquetSchemaUtil.convert(messageType); + } + } + + private String getPartitionFromFilePath(String filePath) { + if (checkStateNotNull(table).spec().isUnpartitioned()) { return ""; } String partitionPath = filePath.substring(checkStateNotNull(prefix).length()); @@ -230,43 +334,166 @@ private String getPartitionPath(String filePath) { return lastSlashIndex > 0 ? partitionPath.substring(0, lastSlashIndex) : ""; } + + /** + * Examines the min/max values of each partition column to determine the destination partition. + * + *

If the transformed min/max values are not equal for any given column, we won't be able to + * determine the partition. In this case, we output the DataFile to the DLQ, because assigning + * an incorrect partition may lead to it being hidden from some queries. + */ + private String getPartitionFromMetrics(Metrics metrics, InputFile inputFile) + throws UnknownPartitionException { + Table table = checkStateNotNull(this.table); + List fields = table.spec().fields(); + List sourceIds = + fields.stream().map(PartitionField::sourceId).collect(Collectors.toList()); + Metrics partitionMetrics; + // Check if metrics already includes partition columns (this is configured by table + // properties): + if (metrics.lowerBounds().keySet().containsAll(sourceIds) + && metrics.upperBounds().keySet().containsAll(sourceIds)) { + partitionMetrics = metrics; + } else { + // Otherwise, recollect metrics and ensure it includes all partition fields. + // Note: we don't attach these additional metrics to the DataFile because we can't assume + // that's in the user's best interest. + // Some tables are very wide and users may not want to store excessive metadata. + List sourceNames = + fields.stream() + .map(pf -> checkStateNotNull(table.schema().idToName().get(pf.sourceId()))) + .collect(Collectors.toList()); + Map configProps = + sourceNames.stream() + .collect(Collectors.toMap(s -> "write.metadata.metrics.column." + s, s -> "full")); + MetricsConfig configWithPartitionFields = MetricsConfig.fromProperties(configProps); + partitionMetrics = + getFileMetrics(inputFile, inferFormat(inputFile.location()), configWithPartitionFields); + } + + PartitionKey pk = new PartitionKey(table.spec(), table.schema()); + + for (int i = 0; i < fields.size(); i++) { + PartitionField field = fields.get(i); + Type type = table.schema().findType(field.sourceId()); + Transform transform = field.transform(); + + // Make a best effort estimate by comparing the lower and upper transformed values. + // If the transformed values are equal, assume that the DataFile's data safely + // aligns with the same partition. + // TODO(ahmedabu98): is comparing min/max safe enough? + // or should we compare ALL the records in a DF? + ByteBuffer lowerBytes = partitionMetrics.lowerBounds().get(field.sourceId()); + ByteBuffer upperBytes = partitionMetrics.upperBounds().get(field.sourceId()); + if (lowerBytes == null && upperBytes == null) { + pk.set(i, null); + continue; + } else if (lowerBytes == null || upperBytes == null) { + throw new UnknownPartitionException("Only one of the min/max was was null"); + } + Object lowerTransformedValue = transformValue(transform, type, lowerBytes); + Object upperTransformedValue = transformValue(transform, type, upperBytes); + + if (!Objects.deepEquals(lowerTransformedValue, upperTransformedValue)) { + // The DataFile contains values that align to different partitions, so we cannot + // safely determine a partition. + // If we commit the DataFile with an incorrect partition, downstream queries may + // completely ignore it (due to Iceberg's smart partition scan-planning). + // We also cannot commit the DataFile with a "null" partition, because that will + // also get skipped by most queries. + // The safe thing to do is to output the file to DLQ. 
+ throw new UnknownPartitionException("Min and max transformed values were not equal"); + } + + pk.set(i, lowerTransformedValue); + } + + return pk.toPath(); + } } - static class CommitFilesDoFn extends DoFn, Row> { + static class CommitFilesDoFn extends DoFn>, Row> { private final IcebergCatalogConfig catalogConfig; private final String identifier; private transient @MonotonicNonNull Table table = null; + private final String jobId; + private static final String COMMIT_ID_KEY = "beam.add-files-commit-id"; - public CommitFilesDoFn(IcebergCatalogConfig catalogConfig, String identifier) { + public CommitFilesDoFn(IcebergCatalogConfig catalogConfig, String identifier, String jobId) { this.catalogConfig = catalogConfig; this.identifier = identifier; + this.jobId = jobId; } - @Setup + @StartBundle public void start() { - table = catalogConfig.catalog().loadTable(TableIdentifier.parse(identifier)); + if (table == null) { + table = catalogConfig.catalog().loadTable(TableIdentifier.parse(identifier)); + } } @ProcessElement - public void process(@Element Iterable files, OutputReceiver output) { - if (!files.iterator().hasNext()) { + public void process( + @Element KV> files, + PaneInfo pane, + OutputReceiver output) { + String commitId = commitHash(files.getValue()); + Table table = checkStateNotNull(this.table); + if (shouldSkip(commitId)) { return; } - Table table = checkStateNotNull(this.table); int numFiles = 0; AppendFiles appendFiles = table.newAppend(); - for (SerializableDataFile file : files) { + for (SerializableDataFile file : files.getValue()) { DataFile df = file.createDataFile(table.specs()); appendFiles.appendFile(df); numFiles++; } - + appendFiles.set(COMMIT_ID_KEY, commitId); appendFiles.commit(); - numFilesAdded.inc(numFiles); Snapshot snapshot = table.currentSnapshot(); output.output(SnapshotInfo.fromSnapshot(snapshot).toRow()); + numFilesAdded.inc(numFiles); + } + + private String commitHash(Iterable files) { + int hash = 0; + for (SerializableDataFile file : files) { + hash = 31 * hash + file.getPath().hashCode(); + } + return String.valueOf(Math.abs(hash)); + } + + /** + * If the process call fails immediately after committing files, but before registering with the + * runner, then runner will retry committing the same batch of files, possibly leading to data + * duplication. + * + *

To mitigate, we create a unique ID per commit and store it in the snapshot summary. We + * skip the pane's batch of files if we see a snapshot with the same unique ID. + */ + private boolean shouldSkip(String commitUID) { + Table table = checkStateNotNull(this.table); + table.refresh(); + + // check the last 10 snapshots to see if it contains the commit ID + int i = 0; + @Nullable Snapshot current = table.currentSnapshot(); + while (current != null && i < 10) { + Map summary = current.summary(); + if (summary != null && commitUID.equals(summary.get(COMMIT_ID_KEY))) { + return true; // commit already happened, we should skip + } + if (current.parentId() == null) { + break; + } + current = table.snapshot(current.parentId()); + i++; + } + + return false; } } @@ -295,8 +522,14 @@ public static FileFormat inferFormat(String path) { return FileFormat.AVRO; } - throw new UnknownFormat(); + throw new UnknownFormatException(); } - static class UnknownFormat extends IllegalArgumentException {} + static class UnknownFormatException extends IllegalArgumentException {} + + static class UnknownPartitionException extends IllegalStateException { + UnknownPartitionException(String msg) { + super(msg); + } + } } diff --git a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/AddFilesSchemaTransformProvider.java b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/AddFilesSchemaTransformProvider.java index ed3ae5bded5a..7e745468f969 100644 --- a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/AddFilesSchemaTransformProvider.java +++ b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/AddFilesSchemaTransformProvider.java @@ -21,6 +21,7 @@ import com.google.auto.service.AutoService; import com.google.auto.value.AutoValue; +import java.util.List; import java.util.Map; import org.apache.beam.sdk.schemas.AutoValueSchema; import org.apache.beam.sdk.schemas.annotations.DefaultSchema; @@ -40,6 +41,8 @@ public AddFiles from(Configuration configuration) { configuration.getIcebergCatalog(), configuration.getTable(), configuration.getLocationPrefix(), + configuration.getPartitionFields(), + configuration.getTableProperties(), configuration.getAppendBatchSize(), frequency != null ? Duration.standardSeconds(frequency) : null); } @@ -81,9 +84,29 @@ public static Builder builder() { + "'gs://bucket/namespace/table/data/id=13/name=beam/data_file.parquet'%n%n" + "The provided prefix should go up until the partition information:%n" + "'gs://bucket/namespace/table/data/'.%n" - + "Required if the table is partitioned. ") + + "If not provided, will try determining each DataFile's partition from its metrics metadata.") public abstract @Nullable String getLocationPrefix(); + @SchemaFieldDescription( + "Fields used to create a partition spec that is applied when tables are created. 
For a field 'foo', " + + "the available partition transforms are:\n\n" + + "- `foo`\n" + + "- `truncate(foo, N)`\n" + + "- `bucket(foo, N)`\n" + + "- `hour(foo)`\n" + + "- `day(foo)`\n" + + "- `month(foo)`\n" + + "- `year(foo)`\n" + + "- `void(foo)`\n\n" + + "For more information on partition transforms, please visit https://iceberg.apache.org/spec/#partition-transforms.") + public abstract @Nullable List getPartitionFields(); + + @SchemaFieldDescription( + "Iceberg table properties to be set on the table when it is created.\n" + + "For more information on table properties," + + " please visit https://iceberg.apache.org/docs/latest/configuration/#table-properties.") + public abstract @Nullable Map getTableProperties(); + @AutoValue.Builder public abstract static class Builder { public abstract Builder setTable(String table); @@ -98,6 +121,10 @@ public abstract static class Builder { public abstract Builder setLocationPrefix(String prefix); + public abstract Builder setPartitionFields(List fields); + + public abstract Builder setTableProperties(Map props); + public abstract Configuration build(); } diff --git a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/PartitionUtils.java b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/PartitionUtils.java index 2b3117f8bf84..805cc0672940 100644 --- a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/PartitionUtils.java +++ b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/PartitionUtils.java @@ -68,10 +68,13 @@ class PartitionUtils { static PartitionSpec toPartitionSpec( @Nullable List fields, org.apache.beam.sdk.schemas.Schema beamSchema) { + return toPartitionSpec(fields, IcebergUtils.beamSchemaToIcebergSchema(beamSchema)); + } + + static PartitionSpec toPartitionSpec(@Nullable List fields, Schema schema) { if (fields == null) { return PartitionSpec.unpartitioned(); } - Schema schema = IcebergUtils.beamSchemaToIcebergSchema(beamSchema); PartitionSpec.Builder builder = PartitionSpec.builderFor(schema); for (String field : fields) { diff --git a/sdks/java/io/iceberg/src/test/java/org/apache/beam/sdk/io/iceberg/AddFilesTest.java b/sdks/java/io/iceberg/src/test/java/org/apache/beam/sdk/io/iceberg/AddFilesTest.java index a2f48a64d482..b53ab3b943a1 100644 --- a/sdks/java/io/iceberg/src/test/java/org/apache/beam/sdk/io/iceberg/AddFilesTest.java +++ b/sdks/java/io/iceberg/src/test/java/org/apache/beam/sdk/io/iceberg/AddFilesTest.java @@ -17,14 +17,20 @@ */ package org.apache.beam.sdk.io.iceberg; +import static org.apache.beam.sdk.util.Preconditions.checkStateNotNull; +import static org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions.checkState; +import static org.hamcrest.MatcherAssert.assertThat; +import static org.hamcrest.Matchers.hasEntry; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertTrue; import java.io.File; import java.io.IOException; import java.util.ArrayList; +import java.util.Arrays; import java.util.Comparator; import java.util.List; +import java.util.Map; import java.util.stream.Collectors; import org.apache.beam.sdk.schemas.Schema; import org.apache.beam.sdk.testing.PAssert; @@ -40,6 +46,7 @@ import org.apache.hadoop.conf.Configuration; import org.apache.iceberg.DataFile; import org.apache.iceberg.Files; +import org.apache.iceberg.PartitionData; import org.apache.iceberg.PartitionKey; import org.apache.iceberg.PartitionSpec; import org.apache.iceberg.Snapshot; @@ -77,9 +84,11 @@ public class AddFilesTest { 
Types.NestedField.required(1, "id", Types.IntegerType.get()), Types.NestedField.required(2, "name", Types.StringType.get()), Types.NestedField.required(3, "age", Types.IntegerType.get())); - private final PartitionSpec spec = - PartitionSpec.builderFor(icebergSchema).identity("age").truncate("name", 3).build(); + private final List partitionFields = Arrays.asList("age", "truncate(name, 3)"); + private final PartitionSpec spec = PartitionUtils.toPartitionSpec(partitionFields, icebergSchema); private final PartitionKey wrapper = new PartitionKey(spec, icebergSchema); + private final Map tableProps = + ImmutableMap.of("write.metadata.metrics.default", "full", "foo", "bar"); private IcebergCatalogConfig catalogConfig; @ClassRule public static final TemporaryFolder TEMPORARY_FOLDER = new TemporaryFolder(); @Rule public TestName testName = new TestName(); @@ -94,17 +103,10 @@ public void setup() throws Exception { // Root for existing data files: root = temp.getRoot().getAbsolutePath() + "/"; - // Set up a local Hadoop Catalog using the temporary folder + // Set up a local Hadoop Catalog catalog = new HadoopCatalog(new Configuration(), warehouse.location); tableId = TableIdentifier.of("default", testName.getMethodName()); - // Create an unpartitioned table by default, and full table metrics enabled - catalog.createTable( - tableId, - icebergSchema, - PartitionSpec.unpartitioned(), - ImmutableMap.of("write.metadata.metrics.default", "full")); - catalogConfig = IcebergCatalogConfig.builder() .setCatalogProperties( @@ -113,23 +115,16 @@ public void setup() throws Exception { } @Test - public void testPartitionedFilesAreCommitted() throws Exception { - testFilesAreCommittedToIceberg(true); + public void testAddPartitionedFiles() throws Exception { + testAddFilesWithPartitionPath(true); } @Test - public void testUnPartitionedFilesAreCommitted() throws Exception { - testFilesAreCommittedToIceberg(false); + public void testAddUnPartitionedFiles() throws Exception { + testAddFilesWithPartitionPath(false); } - public void testFilesAreCommittedToIceberg(boolean isPartitioned) throws Exception { - if (isPartitioned) { - // recreate table with the partition spec - catalog.dropTable(tableId); - catalog.createTable( - tableId, icebergSchema, spec, ImmutableMap.of("write.metadata.metrics.default", "full")); - } - + public void testAddFilesWithPartitionPath(boolean isPartitioned) throws Exception { // 1. Generate two local Parquet file. // Include Hive-like partition path if testing partition case String partitionPath1 = isPartitioned ? "age=20/name_trunc=Mar/" : ""; @@ -162,8 +157,10 @@ public void testFilesAreCommittedToIceberg(boolean isPartitioned) throws Excepti catalogConfig, tableId.toString(), isPartitioned ? root : null, + isPartitioned ? partitionFields : null, + tableProps, 2, // trigger at 2 files - Duration.standardSeconds(1))); + Duration.standardSeconds(10))); // 4. Validate PCollection Outputs PAssert.that(output.get("errors")).empty(); @@ -171,8 +168,10 @@ public void testFilesAreCommittedToIceberg(boolean isPartitioned) throws Excepti // 5. Run the pipeline pipeline.run().waitUntilFinish(); - // 6. Validate the Iceberg Table state directly + // 6. Validate the Iceberg Table was created with the correct spec and properties Table table = catalog.loadTable(tableId); + tableProps.forEach((key, value) -> assertThat(table.properties(), hasEntry(key, value))); + assertEquals(isPartitioned ? 
spec : PartitionSpec.unpartitioned(), table.spec()); // Check that we have exactly 1 snapshot with 2 files assertEquals(1, Iterables.size(table.snapshots())); @@ -209,6 +208,111 @@ public void testFilesAreCommittedToIceberg(boolean isPartitioned) throws Excepti assertEquals(writtenDf2.partition(), addedDf2.partition()); } + @Test + public void testAddFilesWithPartitionFromMetrics() throws IOException { + // 1. Generate local Parquet files with no directory structure. + String file1 = root + "data1.parquet"; + DataWriter writer = createWriter(file1); + writer.write(record(1, "Mark", 20)); + writer.write(record(2, "Martin", 20)); + writer.close(); + PartitionData expectedPartition1 = new PartitionData(spec.partitionType()); + expectedPartition1.set(0, 20); + expectedPartition1.set(1, "Mar"); + + String file2 = root + "data2.parquet"; + DataWriter writer2 = createWriter(file2); + writer2.write(record(3, "Samantha", 25)); + writer2.write(record(4, "Sammy", 25)); + writer2.close(); + PartitionData expectedPartition2 = new PartitionData(spec.partitionType()); + expectedPartition2.set(0, 25); + expectedPartition2.set(1, "Sam"); + + // Also create a "bad" DataFile, containing values that correspond to different partitions + // This file should get output to the DLQ, because we cannot determine its partition + String file3 = root + "data3.parquet"; + DataWriter writer3 = createWriter(file3); + writer3.write(record(5, "Johnny", 25)); + writer3.write(record(6, "Yaseen", 32)); + writer3.close(); + + // 2. Setup the input PCollection + Row row1 = Row.withSchema(INPUT_SCHEMA).addValue(file1).build(); + Row row2 = Row.withSchema(INPUT_SCHEMA).addValue(file2).build(); + Row row3 = Row.withSchema(INPUT_SCHEMA).addValue(file3).build(); + PCollection inputFiles = + pipeline.apply("Create Input", Create.of(row1, row2, row3).withRowSchema(INPUT_SCHEMA)); + + // 3. Apply the transform (Trigger aggressively for testing) + PCollectionRowTuple output = + PCollectionRowTuple.of("input", inputFiles) + .apply( + new AddFiles( + catalogConfig, + tableId.toString(), + null, // no prefix, so determine partition from DF metrics + partitionFields, + tableProps, + 2, // trigger at 2 files + Duration.standardSeconds(10))); + + // 4. There should be an error for File3, because its partition could not be determined + PAssert.that(output.get("errors")) + .satisfies( + errorRows -> { + Row errorRow = Iterables.getOnlyElement(errorRows); + checkState( + errorRow.getSchema().equals(AddFiles.ERROR_SCHEMA) + && file3.equals(errorRow.getString(0)) + && checkStateNotNull(errorRow.getString(1)) + .startsWith(AddFiles.ConvertToDataFile.UNKNOWN_PARTITION_ERROR)); + return null; + }); + + // 5. Run the pipeline + pipeline.run().waitUntilFinish(); + + // 6. 
Validate the Iceberg Table was created with the correct spec and properties + Table table = catalog.loadTable(tableId); + tableProps.forEach((key, value) -> assertThat(table.properties(), hasEntry(key, value))); + assertEquals(spec, table.spec()); + + // Check that we have exactly 1 snapshot with 2 files + assertEquals(1, Iterables.size(table.snapshots())); + + List addedFiles = + Lists.newArrayList(table.currentSnapshot().addedDataFiles(table.io())); + assertEquals(2, addedFiles.size()); + + // Verify file paths + assertTrue(addedFiles.stream().anyMatch(df -> df.location().contains("data1.parquet"))); + assertTrue(addedFiles.stream().anyMatch(df -> df.location().contains("data2.parquet"))); + + // check metrics metadata is preserved + DataFile writtenDf1 = writer.toDataFile(); + DataFile writtenDf2 = writer2.toDataFile(); + DataFile addedDf1 = + Iterables.getOnlyElement( + addedFiles.stream() + .filter(df -> df.location().contains("data1.parquet")) + .collect(Collectors.toList())); + DataFile addedDf2 = + Iterables.getOnlyElement( + addedFiles.stream() + .filter(df -> df.location().contains("data2.parquet")) + .collect(Collectors.toList())); + + assertEquals(writtenDf1.lowerBounds(), addedDf1.lowerBounds()); + assertEquals(writtenDf1.upperBounds(), addedDf1.upperBounds()); + assertEquals(writtenDf2.lowerBounds(), addedDf2.lowerBounds()); + assertEquals(writtenDf2.upperBounds(), addedDf2.upperBounds()); + + // check partition metadata is preserved + assertEquals(expectedPartition1, addedDf1.partition()); + assertEquals(expectedPartition2, addedDf2.partition()); + } + @Test public void testStreamingAdds() throws IOException { List paths = new ArrayList<>(); @@ -241,6 +345,8 @@ public void testStreamingAdds() throws IOException { catalogConfig, tableId.toString(), null, + null, + null, 10, // trigger at 10 files Duration.standardSeconds(5))); pipeline.run().waitUntilFinish(); @@ -261,6 +367,7 @@ public void testStreamingAdds() throws IOException { @Test public void testUnknownFormatErrors() throws Exception { + catalog.createTable(tableId, icebergSchema); // Create a dummy text file (unsupported extension) File txtFile = temp.newFile("unsupported.txt"); txtFile.createNewFile(); @@ -269,13 +376,13 @@ public void testUnknownFormatErrors() throws Exception { PCollection inputFiles = pipeline.apply("Create Input", Create.of(badRow).withRowSchema(INPUT_SCHEMA)); - AddFiles addFiles = new AddFiles(catalogConfig, tableId.toString(), null, 1, null); + AddFiles addFiles = new AddFiles(catalogConfig, tableId.toString(), null, null, null, 1, null); PCollectionRowTuple outputTuple = PCollectionRowTuple.of("input", inputFiles).apply(addFiles); // Validate the file ended up in the errors PCollection with the correct schema PAssert.that(outputTuple.get("errors")) .containsInAnyOrder( - Row.withSchema(AddFiles.ConvertToDataFile.ERROR_SCHEMA) + Row.withSchema(AddFiles.ERROR_SCHEMA) .addValues(txtFile.getAbsolutePath(), "Could not determine the file's format") .build()); @@ -300,12 +407,13 @@ public void testPartitionPrefixErrors() throws Exception { pipeline.apply("Create Input", Create.of(row1).withRowSchema(INPUT_SCHEMA)); // Notice locationPrefix is "some/prefix/" but the absolute path doesn't start with it - AddFiles addFiles = new AddFiles(catalogConfig, tableId.toString(), "some/prefix/", 1, null); + AddFiles addFiles = + new AddFiles(catalogConfig, tableId.toString(), "some/prefix/", null, null, 1, null); PCollectionRowTuple outputTuple = PCollectionRowTuple.of("input", inputFiles).apply(addFiles); 
PAssert.that(outputTuple.get("errors")) .containsInAnyOrder( - Row.withSchema(AddFiles.ConvertToDataFile.ERROR_SCHEMA) + Row.withSchema(AddFiles.ERROR_SCHEMA) .addValues(file1, "File path did not start with the specified prefix") .build()); diff --git a/sdks/java/io/parquet/src/main/java/org/apache/beam/sdk/io/parquet/ParquetIO.java b/sdks/java/io/parquet/src/main/java/org/apache/beam/sdk/io/parquet/ParquetIO.java index 24c18f382817..e175f646d599 100644 --- a/sdks/java/io/parquet/src/main/java/org/apache/beam/sdk/io/parquet/ParquetIO.java +++ b/sdks/java/io/parquet/src/main/java/org/apache/beam/sdk/io/parquet/ParquetIO.java @@ -970,10 +970,10 @@ public Progress getProgress() { } } - private static class BeamParquetInputFile implements InputFile { + public static class BeamParquetInputFile implements InputFile { private final SeekableByteChannel seekableByteChannel; - BeamParquetInputFile(SeekableByteChannel seekableByteChannel) { + public BeamParquetInputFile(SeekableByteChannel seekableByteChannel) { this.seekableByteChannel = seekableByteChannel; } From f507d0acdaefd10515fccbc66ba2140186d2b9f0 Mon Sep 17 00:00:00 2001 From: Ahmed Abualsaud Date: Sat, 7 Mar 2026 15:43:28 -0500 Subject: [PATCH 05/15] spotless --- sdks/java/io/iceberg/build.gradle | 1 - .../apache/beam/sdk/io/iceberg/AddFiles.java | 32 +++++++++---------- 2 files changed, 16 insertions(+), 17 deletions(-) diff --git a/sdks/java/io/iceberg/build.gradle b/sdks/java/io/iceberg/build.gradle index b185a962415f..9cc8a295c7df 100644 --- a/sdks/java/io/iceberg/build.gradle +++ b/sdks/java/io/iceberg/build.gradle @@ -51,7 +51,6 @@ dependencies { implementation library.java.joda_time implementation "org.apache.parquet:parquet-column:$parquet_version" implementation "org.apache.parquet:parquet-hadoop:$parquet_version" - implementation "org.apache.parquet:parquet-common:$parquet_version" implementation project(":sdks:java:io:parquet") implementation "org.apache.orc:orc-core:$orc_version" implementation "org.apache.iceberg:iceberg-core:$iceberg_version" diff --git a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/AddFiles.java b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/AddFiles.java index afdc92ba186d..77b72307b3ee 100644 --- a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/AddFiles.java +++ b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/AddFiles.java @@ -17,19 +17,6 @@ */ package org.apache.beam.sdk.io.iceberg; -import static org.apache.beam.sdk.io.iceberg.AddFiles.ConvertToDataFile.DATA_FILES; -import static org.apache.beam.sdk.io.iceberg.AddFiles.ConvertToDataFile.ERRORS; -import static org.apache.beam.sdk.metrics.Metrics.counter; -import static org.apache.beam.sdk.util.Preconditions.checkStateNotNull; - -import java.io.IOException; -import java.nio.ByteBuffer; -import java.nio.channels.SeekableByteChannel; -import java.util.List; -import java.util.Map; -import java.util.Objects; -import java.util.UUID; -import java.util.stream.Collectors; import org.apache.beam.sdk.io.Compression; import org.apache.beam.sdk.io.FileSystems; import org.apache.beam.sdk.io.fs.MatchResult; @@ -89,6 +76,20 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.channels.SeekableByteChannel; +import java.util.List; +import java.util.Map; +import java.util.Objects; +import java.util.UUID; +import java.util.stream.Collectors; + +import static 
org.apache.beam.sdk.io.iceberg.AddFiles.ConvertToDataFile.DATA_FILES; +import static org.apache.beam.sdk.io.iceberg.AddFiles.ConvertToDataFile.ERRORS; +import static org.apache.beam.sdk.metrics.Metrics.counter; +import static org.apache.beam.sdk.util.Preconditions.checkStateNotNull; + /** * A transform that takes in a stream of file paths, converts them to Iceberg {@link DataFile}s with * partition metadata and metrics, then commits them to an Iceberg {@link Table}. @@ -283,9 +284,8 @@ public void process(@Element String filePath, MultiOutputReceiver output) throws output.get(DATA_FILES).output(SerializableDataFile.from(df, partitionPath)); } - private T transformValue(Transform transform, Type type, ByteBuffer bytes) { - S value = Conversions.fromByteBuffer(type, bytes); - return transform.bind(type).apply(value); + private T transformValue(Transform transform, Type type, ByteBuffer bytes) { + return transform.bind(type).apply(Conversions.fromByteBuffer(type, bytes)); } private Table getOrCreateTable(org.apache.iceberg.Schema schema) { From 904599690c74caa2fca0c490a384ac0025b30e9e Mon Sep 17 00:00:00 2001 From: Ahmed Abualsaud Date: Tue, 10 Mar 2026 11:23:38 -0400 Subject: [PATCH 06/15] add batch route as well --- .../apache/beam/sdk/io/iceberg/AddFiles.java | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/AddFiles.java b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/AddFiles.java index 77b72307b3ee..aa2619d8b8fb 100644 --- a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/AddFiles.java +++ b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/AddFiles.java @@ -30,6 +30,7 @@ import org.apache.beam.sdk.schemas.transforms.SchemaTransform; import org.apache.beam.sdk.transforms.DoFn; import org.apache.beam.sdk.transforms.Filter; +import org.apache.beam.sdk.transforms.GroupByKey; import org.apache.beam.sdk.transforms.GroupIntoBatches; import org.apache.beam.sdk.transforms.MapElements; import org.apache.beam.sdk.transforms.ParDo; @@ -89,6 +90,7 @@ import static org.apache.beam.sdk.io.iceberg.AddFiles.ConvertToDataFile.ERRORS; import static org.apache.beam.sdk.metrics.Metrics.counter; import static org.apache.beam.sdk.util.Preconditions.checkStateNotNull; +import static org.apache.beam.sdk.values.PCollection.IsBounded.BOUNDED; /** * A transform that takes in a stream of file paths, converts them to Iceberg {@link DataFile}s with @@ -172,14 +174,21 @@ public PCollectionRowTuple expand(PCollectionRowTuple input) { throw new RuntimeException(e); } - PCollection snapshots = + PCollection> keyedFiles = dataFiles .get(DATA_FILES) .setCoder(sdfSchema) - .apply("AddStaticKey", WithKeys.of((Void) null)) - .apply( + .apply("AddStaticKey", WithKeys.of((Void) null)); + + PCollection>> groupedFiles = + keyedFiles.isBounded().equals(BOUNDED) + ? 
keyedFiles.apply(GroupByKey.create()) + : keyedFiles.apply( GroupIntoBatches.ofSize(numFilesTrigger) - .withMaxBufferingDuration(intervalTrigger)) + .withMaxBufferingDuration(intervalTrigger)); + + PCollection snapshots = + groupedFiles .apply( "CommitFilesToIceberg", ParDo.of(new CommitFilesDoFn(catalogConfig, tableIdentifier, jobId))) From d4a0db1c27d8e54c12d5adaede453e72e98fe34b Mon Sep 17 00:00:00 2001 From: Ahmed Abualsaud Date: Tue, 10 Mar 2026 11:34:29 -0400 Subject: [PATCH 07/15] spotless --- .../apache/beam/sdk/io/iceberg/AddFiles.java | 29 +++++++++---------- 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/AddFiles.java b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/AddFiles.java index aa2619d8b8fb..3919de759d2e 100644 --- a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/AddFiles.java +++ b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/AddFiles.java @@ -17,6 +17,20 @@ */ package org.apache.beam.sdk.io.iceberg; +import static org.apache.beam.sdk.io.iceberg.AddFiles.ConvertToDataFile.DATA_FILES; +import static org.apache.beam.sdk.io.iceberg.AddFiles.ConvertToDataFile.ERRORS; +import static org.apache.beam.sdk.metrics.Metrics.counter; +import static org.apache.beam.sdk.util.Preconditions.checkStateNotNull; +import static org.apache.beam.sdk.values.PCollection.IsBounded.BOUNDED; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.channels.SeekableByteChannel; +import java.util.List; +import java.util.Map; +import java.util.Objects; +import java.util.UUID; +import java.util.stream.Collectors; import org.apache.beam.sdk.io.Compression; import org.apache.beam.sdk.io.FileSystems; import org.apache.beam.sdk.io.fs.MatchResult; @@ -77,21 +91,6 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.io.IOException; -import java.nio.ByteBuffer; -import java.nio.channels.SeekableByteChannel; -import java.util.List; -import java.util.Map; -import java.util.Objects; -import java.util.UUID; -import java.util.stream.Collectors; - -import static org.apache.beam.sdk.io.iceberg.AddFiles.ConvertToDataFile.DATA_FILES; -import static org.apache.beam.sdk.io.iceberg.AddFiles.ConvertToDataFile.ERRORS; -import static org.apache.beam.sdk.metrics.Metrics.counter; -import static org.apache.beam.sdk.util.Preconditions.checkStateNotNull; -import static org.apache.beam.sdk.values.PCollection.IsBounded.BOUNDED; - /** * A transform that takes in a stream of file paths, converts them to Iceberg {@link DataFile}s with * partition metadata and metrics, then commits them to an Iceberg {@link Table}. 
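For orientation, the class Javadoc above summarizes the end-to-end behavior; a minimal usage sketch against the transform as it stands at this point in the series might look like the following (the catalog config, table identifier, file path, and trigger values are illustrative assumptions, and imports mirror those already used in AddFilesTest):

  Pipeline pipeline = Pipeline.create();
  Schema pathSchema = Schema.builder().addStringField("filepath").build();
  PCollection<Row> paths =
      pipeline.apply(
          Create.of(Row.withSchema(pathSchema).addValue("/data/part-00000.parquet").build())
              .withRowSchema(pathSchema));

  PCollectionRowTuple result =
      PCollectionRowTuple.of("input", paths)
          .apply(
              new AddFiles(
                  catalogConfig,       // an IcebergCatalogConfig built elsewhere (assumed)
                  "default.my_table",  // hypothetical table identifier
                  null,                // no location prefix: partitions inferred from file metrics
                  null,                // no partition fields: table treated as unpartitioned
                  null,                // no extra table properties
                  100,                 // commit once 100 files have accumulated...
                  Duration.standardMinutes(5))); // ...or after 5 minutes, whichever comes first

  PCollection<Row> snapshots = result.get("snapshots"); // one Row per Iceberg commit
  PCollection<Row> errorFiles = result.get("errors");   // files that could not be converted
  pipeline.run();

The "errors" output carries rows with the offending file path and a reason string (ERROR_SCHEMA), while "snapshots" surfaces a SnapshotInfo row for each successful commit.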
From e9e55b15f7a8659bf4c6bf0765efc9c6413a527c Mon Sep 17 00:00:00 2001 From: Ahmed Abualsaud Date: Thu, 12 Mar 2026 16:31:50 -0400 Subject: [PATCH 08/15] extract SchemaTransform logic out --- .../apache/beam/sdk/io/iceberg/AddFiles.java | 24 +---- .../AddFilesSchemaTransformProvider.java | 58 ++++++++-- .../beam/sdk/io/iceberg/AddFilesTest.java | 100 ++++++++---------- 3 files changed, 94 insertions(+), 88 deletions(-) diff --git a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/AddFiles.java b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/AddFiles.java index 3919de759d2e..e2be7236fabf 100644 --- a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/AddFiles.java +++ b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/AddFiles.java @@ -41,12 +41,10 @@ import org.apache.beam.sdk.schemas.Schema; import org.apache.beam.sdk.schemas.SchemaCoder; import org.apache.beam.sdk.schemas.SchemaRegistry; -import org.apache.beam.sdk.schemas.transforms.SchemaTransform; import org.apache.beam.sdk.transforms.DoFn; -import org.apache.beam.sdk.transforms.Filter; import org.apache.beam.sdk.transforms.GroupByKey; import org.apache.beam.sdk.transforms.GroupIntoBatches; -import org.apache.beam.sdk.transforms.MapElements; +import org.apache.beam.sdk.transforms.PTransform; import org.apache.beam.sdk.transforms.ParDo; import org.apache.beam.sdk.transforms.Redistribute; import org.apache.beam.sdk.transforms.WithKeys; @@ -58,7 +56,6 @@ import org.apache.beam.sdk.values.Row; import org.apache.beam.sdk.values.TupleTag; import org.apache.beam.sdk.values.TupleTagList; -import org.apache.beam.sdk.values.TypeDescriptors; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Strings; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.Iterables; @@ -95,12 +92,12 @@ * A transform that takes in a stream of file paths, converts them to Iceberg {@link DataFile}s with * partition metadata and metrics, then commits them to an Iceberg {@link Table}. 
*/ -public class AddFiles extends SchemaTransform { +public class AddFiles extends PTransform, PCollectionRowTuple> { private static final Duration DEFAULT_TRIGGER_INTERVAL = Duration.standardMinutes(10); private static final Counter numFilesAdded = counter(AddFiles.class, "numFilesAdded"); private static final Counter numErrorFiles = counter(AddFiles.class, "numErrorFiles"); private static final Logger LOG = LoggerFactory.getLogger(AddFiles.class); - private static final int DEFAULT_FILES_TRIGGER = 100_000; + private static final int DEFAULT_FILES_TRIGGER = 1_000; static final Schema ERROR_SCHEMA = Schema.builder().addStringField("file").addStringField("error").build(); private final IcebergCatalogConfig catalogConfig; @@ -130,7 +127,7 @@ public AddFiles( } @Override - public PCollectionRowTuple expand(PCollectionRowTuple input) { + public PCollectionRowTuple expand(PCollection input) { LOG.info( "AddFiles configured to commit after accumulating {} files, or after {} seconds.", numFilesTrigger, @@ -140,21 +137,8 @@ public PCollectionRowTuple expand(PCollectionRowTuple input) { "AddFiles configured to build partition metadata after the prefix: '{}'", locationPrefix); } - Schema inputSchema = input.getSinglePCollection().getSchema(); - Preconditions.checkState( - inputSchema.getFieldCount() == 1 - && inputSchema.getField(0).getType().getTypeName().equals(Schema.TypeName.STRING), - "Incoming Row Schema must contain only one field of type String. Instead, got schema: %s", - inputSchema); - PCollectionTuple dataFiles = input - .getSinglePCollection() - .apply("Filter empty paths", Filter.by(row -> row.getString(0) != null)) - .apply( - "ExtractPaths", - MapElements.into(TypeDescriptors.strings()) - .via(row -> checkStateNotNull(row.getString(0)))) .apply(Redistribute.arbitrarily()) .apply( "ConvertToDataFiles", diff --git a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/AddFilesSchemaTransformProvider.java b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/AddFilesSchemaTransformProvider.java index 7e745468f969..37b2ca6834d2 100644 --- a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/AddFilesSchemaTransformProvider.java +++ b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/AddFilesSchemaTransformProvider.java @@ -18,33 +18,32 @@ package org.apache.beam.sdk.io.iceberg; import static org.apache.beam.sdk.io.iceberg.AddFilesSchemaTransformProvider.Configuration; +import static org.apache.beam.sdk.util.Preconditions.checkStateNotNull; import com.google.auto.service.AutoService; import com.google.auto.value.AutoValue; import java.util.List; import java.util.Map; import org.apache.beam.sdk.schemas.AutoValueSchema; +import org.apache.beam.sdk.schemas.Schema; import org.apache.beam.sdk.schemas.annotations.DefaultSchema; import org.apache.beam.sdk.schemas.annotations.SchemaFieldDescription; +import org.apache.beam.sdk.schemas.transforms.SchemaTransform; import org.apache.beam.sdk.schemas.transforms.SchemaTransformProvider; import org.apache.beam.sdk.schemas.transforms.TypedSchemaTransformProvider; +import org.apache.beam.sdk.transforms.Filter; +import org.apache.beam.sdk.transforms.MapElements; +import org.apache.beam.sdk.values.PCollectionRowTuple; +import org.apache.beam.sdk.values.TypeDescriptors; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions; import org.checkerframework.checker.nullness.qual.Nullable; import org.joda.time.Duration; @AutoService(SchemaTransformProvider.class) public class 
AddFilesSchemaTransformProvider extends TypedSchemaTransformProvider { @Override - public AddFiles from(Configuration configuration) { - @Nullable Integer frequency = configuration.getTriggeringFrequencySeconds(); - - return new AddFiles( - configuration.getIcebergCatalog(), - configuration.getTable(), - configuration.getLocationPrefix(), - configuration.getPartitionFields(), - configuration.getTableProperties(), - configuration.getAppendBatchSize(), - frequency != null ? Duration.standardSeconds(frequency) : null); + public AddFilesSchemaTransform from(Configuration configuration) { + return new AddFilesSchemaTransform(configuration); } @Override @@ -135,4 +134,41 @@ public IcebergCatalogConfig getIcebergCatalog() { .build(); } } + + public static class AddFilesSchemaTransform extends SchemaTransform { + private final Configuration configuration; + + public AddFilesSchemaTransform(Configuration configuration) { + this.configuration = configuration; + } + + @Override + public PCollectionRowTuple expand(PCollectionRowTuple input) { + Schema inputSchema = input.getSinglePCollection().getSchema(); + Preconditions.checkState( + inputSchema.getFieldCount() == 1 + && inputSchema.getField(0).getType().getTypeName().equals(Schema.TypeName.STRING), + "Incoming Row Schema must contain only one field of type String. Instead, got schema: %s", + inputSchema); + + @Nullable Integer frequency = configuration.getTriggeringFrequencySeconds(); + + return input + .getSinglePCollection() + .apply("Filter empty paths", Filter.by(row -> row.getString(0) != null)) + .apply( + "ExtractPaths", + MapElements.into(TypeDescriptors.strings()) + .via(row -> checkStateNotNull(row.getString(0)))) + .apply( + new AddFiles( + configuration.getIcebergCatalog(), + configuration.getTable(), + configuration.getLocationPrefix(), + configuration.getPartitionFields(), + configuration.getTableProperties(), + configuration.getAppendBatchSize(), + frequency != null ? Duration.standardSeconds(frequency) : null)); + } + } } diff --git a/sdks/java/io/iceberg/src/test/java/org/apache/beam/sdk/io/iceberg/AddFilesTest.java b/sdks/java/io/iceberg/src/test/java/org/apache/beam/sdk/io/iceberg/AddFilesTest.java index b53ab3b943a1..7e1f23d1fb87 100644 --- a/sdks/java/io/iceberg/src/test/java/org/apache/beam/sdk/io/iceberg/AddFilesTest.java +++ b/sdks/java/io/iceberg/src/test/java/org/apache/beam/sdk/io/iceberg/AddFilesTest.java @@ -32,7 +32,7 @@ import java.util.List; import java.util.Map; import java.util.stream.Collectors; -import org.apache.beam.sdk.schemas.Schema; +import org.apache.beam.sdk.coders.StringUtf8Coder; import org.apache.beam.sdk.testing.PAssert; import org.apache.beam.sdk.testing.TestPipeline; import org.apache.beam.sdk.testing.TestStream; @@ -96,8 +96,6 @@ public class AddFilesTest { @Rule public transient TestDataWarehouse warehouse = new TestDataWarehouse(TEMPORARY_FOLDER, "default"); - private static final Schema INPUT_SCHEMA = Schema.builder().addStringField("filepath").build(); - @Before public void setup() throws Exception { // Root for existing data files: @@ -144,23 +142,19 @@ public void testAddFilesWithPartitionPath(boolean isPartitioned) throws Exceptio writer2.close(); // 2. 
Setup the input PCollection - Row row1 = Row.withSchema(INPUT_SCHEMA).addValue(file1).build(); - Row row2 = Row.withSchema(INPUT_SCHEMA).addValue(file2).build(); - PCollection inputFiles = - pipeline.apply("Create Input", Create.of(row1, row2).withRowSchema(INPUT_SCHEMA)); + PCollection inputFiles = pipeline.apply("Create Input", Create.of(file1, file2)); // 3. Apply the transform (Trigger aggressively for testing) PCollectionRowTuple output = - PCollectionRowTuple.of("input", inputFiles) - .apply( - new AddFiles( - catalogConfig, - tableId.toString(), - isPartitioned ? root : null, - isPartitioned ? partitionFields : null, - tableProps, - 2, // trigger at 2 files - Duration.standardSeconds(10))); + inputFiles.apply( + new AddFiles( + catalogConfig, + tableId.toString(), + isPartitioned ? root : null, + isPartitioned ? partitionFields : null, + tableProps, + 2, // trigger at 2 files + Duration.standardSeconds(10))); // 4. Validate PCollection Outputs PAssert.that(output.get("errors")).empty(); @@ -238,24 +232,19 @@ public void testAddFilesWithPartitionFromMetrics() throws IOException { writer3.close(); // 2. Setup the input PCollection - Row row1 = Row.withSchema(INPUT_SCHEMA).addValue(file1).build(); - Row row2 = Row.withSchema(INPUT_SCHEMA).addValue(file2).build(); - Row row3 = Row.withSchema(INPUT_SCHEMA).addValue(file3).build(); - PCollection inputFiles = - pipeline.apply("Create Input", Create.of(row1, row2, row3).withRowSchema(INPUT_SCHEMA)); + PCollection inputFiles = pipeline.apply("Create Input", Create.of(file1, file2, file3)); // 3. Apply the transform (Trigger aggressively for testing) PCollectionRowTuple output = - PCollectionRowTuple.of("input", inputFiles) - .apply( - new AddFiles( - catalogConfig, - tableId.toString(), - null, // no prefix, so determine partition from DF metrics - partitionFields, - tableProps, - 2, // trigger at 2 files - Duration.standardSeconds(10))); + inputFiles.apply( + new AddFiles( + catalogConfig, + tableId.toString(), + null, // no prefix, so determine partition from DF metrics + partitionFields, + tableProps, + 2, // trigger at 2 files + Duration.standardSeconds(10))); // 4. 
There should be an error for File3, because its partition could not be determined PAssert.that(output.get("errors")) @@ -315,40 +304,40 @@ && checkStateNotNull(errorRow.getString(1)) @Test public void testStreamingAdds() throws IOException { - List paths = new ArrayList<>(); + List paths = new ArrayList<>(); for (int i = 0; i < 100; i++) { String file = String.format("%sdata_%s.parquet", root, i); DataWriter writer = createWriter(file); writer.write(record(1, "SomeName", 30)); writer.close(); - paths.add(Row.withSchema(INPUT_SCHEMA).addValue(file).build()); + paths.add(file); } - PCollection files = + PCollection files = pipeline.apply( - TestStream.create(INPUT_SCHEMA) + TestStream.create(StringUtf8Coder.of()) .addElements( - paths.get(0), paths.subList(1, 15).toArray(new Row[] {})) // should commit twice + paths.get(0), + paths.subList(1, 15).toArray(new String[] {})) // should commit twice .advanceProcessingTime(Duration.standardSeconds(10)) .addElements( paths.get(15), - paths.subList(16, 40).toArray(new Row[] {})) // should commit 3 times + paths.subList(16, 40).toArray(new String[] {})) // should commit 3 times .advanceProcessingTime(Duration.standardSeconds(10)) .addElements( paths.get(40), - paths.subList(41, 45).toArray(new Row[] {})) // should commit once + paths.subList(41, 45).toArray(new String[] {})) // should commit once .advanceWatermarkToInfinity()); - PCollectionRowTuple.of("input", files) - .apply( - new AddFiles( - catalogConfig, - tableId.toString(), - null, - null, - null, - 10, // trigger at 10 files - Duration.standardSeconds(5))); + files.apply( + new AddFiles( + catalogConfig, + tableId.toString(), + null, + null, + null, + 10, // trigger at 10 files + Duration.standardSeconds(5))); pipeline.run().waitUntilFinish(); Table table = catalog.loadTable(tableId); @@ -372,12 +361,11 @@ public void testUnknownFormatErrors() throws Exception { File txtFile = temp.newFile("unsupported.txt"); txtFile.createNewFile(); - Row badRow = Row.withSchema(INPUT_SCHEMA).addValue(txtFile.getAbsolutePath()).build(); - PCollection inputFiles = - pipeline.apply("Create Input", Create.of(badRow).withRowSchema(INPUT_SCHEMA)); + PCollection inputFiles = + pipeline.apply("Create Input", Create.of(txtFile.getAbsolutePath())); AddFiles addFiles = new AddFiles(catalogConfig, tableId.toString(), null, null, null, 1, null); - PCollectionRowTuple outputTuple = PCollectionRowTuple.of("input", inputFiles).apply(addFiles); + PCollectionRowTuple outputTuple = inputFiles.apply(addFiles); // Validate the file ended up in the errors PCollection with the correct schema PAssert.that(outputTuple.get("errors")) @@ -402,14 +390,12 @@ public void testPartitionPrefixErrors() throws Exception { writer.write(record(1, "Andrew", 30)); writer.close(); - Row row1 = Row.withSchema(INPUT_SCHEMA).addValue(file1).build(); - PCollection inputFiles = - pipeline.apply("Create Input", Create.of(row1).withRowSchema(INPUT_SCHEMA)); + PCollection inputFiles = pipeline.apply("Create Input", Create.of(file1)); // Notice locationPrefix is "some/prefix/" but the absolute path doesn't start with it AddFiles addFiles = new AddFiles(catalogConfig, tableId.toString(), "some/prefix/", null, null, 1, null); - PCollectionRowTuple outputTuple = PCollectionRowTuple.of("input", inputFiles).apply(addFiles); + PCollectionRowTuple outputTuple = inputFiles.apply(addFiles); PAssert.that(outputTuple.get("errors")) .containsInAnyOrder( From e983676a5e50fd65de6ed81323e7580a24b2eee5 Mon Sep 17 00:00:00 2001 From: Ahmed Abualsaud Date: Mon, 23 Mar 
2026 17:52:28 -0400 Subject: [PATCH 09/15] add integration tests --- .../IO_Iceberg_Integration_Tests.json | 2 +- sdks/java/io/iceberg/build.gradle | 3 + .../apache/beam/sdk/io/iceberg/AddFiles.java | 187 ++++--- .../beam/sdk/io/iceberg/AddFilesIT.java | 475 ++++++++++++++++++ 4 files changed, 604 insertions(+), 63 deletions(-) create mode 100644 sdks/java/io/iceberg/src/test/java/org/apache/beam/sdk/io/iceberg/AddFilesIT.java diff --git a/.github/trigger_files/IO_Iceberg_Integration_Tests.json b/.github/trigger_files/IO_Iceberg_Integration_Tests.json index 34a6e02150e7..b73af5e61a43 100644 --- a/.github/trigger_files/IO_Iceberg_Integration_Tests.json +++ b/.github/trigger_files/IO_Iceberg_Integration_Tests.json @@ -1,4 +1,4 @@ { "comment": "Modify this file in a trivial way to cause this test suite to run.", - "modification": 4 + "modification": 1 } diff --git a/sdks/java/io/iceberg/build.gradle b/sdks/java/io/iceberg/build.gradle index 78da2697f765..5661449ffc2a 100644 --- a/sdks/java/io/iceberg/build.gradle +++ b/sdks/java/io/iceberg/build.gradle @@ -80,6 +80,8 @@ dependencies { testImplementation project(":sdks:java:extensions:google-cloud-platform-core") testImplementation library.java.junit testImplementation library.java.hamcrest + testImplementation project(path: ":sdks:java:extensions:avro") + testImplementation 'org.awaitility:awaitility:4.2.0' // Hive catalog test dependencies testImplementation project(path: ":sdks:java:io:iceberg:hive") @@ -96,6 +98,7 @@ dependencies { testImplementation project(path: ":sdks:java:io:iceberg:bqms", configuration: "shadow") testImplementation project(":sdks:java:io:google-cloud-platform") testImplementation library.java.google_api_services_bigquery + testImplementation 'com.google.cloud:google-cloud-storage' testImplementation library.java.google_auth_library_oauth2_http testRuntimeOnly library.java.slf4j_jdk14 diff --git a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/AddFiles.java b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/AddFiles.java index e2be7236fabf..9d97771bc8a8 100644 --- a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/AddFiles.java +++ b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/AddFiles.java @@ -26,29 +26,32 @@ import java.io.IOException; import java.nio.ByteBuffer; import java.nio.channels.SeekableByteChannel; +import java.nio.charset.StandardCharsets; +import java.util.ArrayList; +import java.util.Collections; import java.util.List; import java.util.Map; import java.util.Objects; -import java.util.UUID; import java.util.stream.Collectors; +import java.util.stream.Stream; +import org.apache.beam.sdk.coders.VarLongCoder; import org.apache.beam.sdk.io.Compression; import org.apache.beam.sdk.io.FileSystems; -import org.apache.beam.sdk.io.fs.MatchResult; import org.apache.beam.sdk.io.fs.ResourceId; import org.apache.beam.sdk.io.parquet.ParquetIO.ReadFiles.BeamParquetInputFile; import org.apache.beam.sdk.metrics.Counter; -import org.apache.beam.sdk.options.PipelineOptionsFactory; import org.apache.beam.sdk.schemas.Schema; import org.apache.beam.sdk.schemas.SchemaCoder; import org.apache.beam.sdk.schemas.SchemaRegistry; +import org.apache.beam.sdk.state.StateSpec; +import org.apache.beam.sdk.state.StateSpecs; +import org.apache.beam.sdk.state.ValueState; import org.apache.beam.sdk.transforms.DoFn; import org.apache.beam.sdk.transforms.GroupByKey; import org.apache.beam.sdk.transforms.GroupIntoBatches; import org.apache.beam.sdk.transforms.PTransform; 
import org.apache.beam.sdk.transforms.ParDo; -import org.apache.beam.sdk.transforms.Redistribute; import org.apache.beam.sdk.transforms.WithKeys; -import org.apache.beam.sdk.transforms.windowing.PaneInfo; import org.apache.beam.sdk.values.KV; import org.apache.beam.sdk.values.PCollection; import org.apache.beam.sdk.values.PCollectionRowTuple; @@ -59,6 +62,8 @@ import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Strings; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.Iterables; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.hash.Hasher; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.hash.Hashing; import org.apache.iceberg.AppendFiles; import org.apache.iceberg.DataFile; import org.apache.iceberg.DataFiles; @@ -71,9 +76,12 @@ import org.apache.iceberg.Snapshot; import org.apache.iceberg.Table; import org.apache.iceberg.avro.Avro; +import org.apache.iceberg.catalog.SupportsNamespaces; import org.apache.iceberg.catalog.TableIdentifier; import org.apache.iceberg.exceptions.AlreadyExistsException; import org.apache.iceberg.io.InputFile; +import org.apache.iceberg.mapping.MappingUtil; +import org.apache.iceberg.mapping.NameMapping; import org.apache.iceberg.orc.OrcMetrics; import org.apache.iceberg.parquet.ParquetSchemaUtil; import org.apache.iceberg.parquet.ParquetUtil; @@ -81,6 +89,8 @@ import org.apache.iceberg.types.Conversions; import org.apache.iceberg.types.Type; import org.apache.parquet.hadoop.ParquetFileReader; +import org.apache.parquet.hadoop.metadata.FileMetaData; +import org.apache.parquet.hadoop.metadata.ParquetMetadata; import org.apache.parquet.schema.MessageType; import org.checkerframework.checker.nullness.qual.MonotonicNonNull; import org.checkerframework.checker.nullness.qual.Nullable; @@ -107,7 +117,6 @@ public class AddFiles extends PTransform, PCollectionRowTupl private final @Nullable String locationPrefix; private final @Nullable List partitionFields; private final @Nullable Map tableProps; - private final String jobId = UUID.randomUUID().toString(); public AddFiles( IcebergCatalogConfig catalogConfig, @@ -138,18 +147,16 @@ public PCollectionRowTuple expand(PCollection input) { } PCollectionTuple dataFiles = - input - .apply(Redistribute.arbitrarily()) - .apply( - "ConvertToDataFiles", - ParDo.of( - new ConvertToDataFile( - catalogConfig, - tableIdentifier, - locationPrefix, - partitionFields, - tableProps)) - .withOutputTags(DATA_FILES, TupleTagList.of(ERRORS))); + input.apply( + "ConvertToDataFiles", + ParDo.of( + new ConvertToDataFile( + catalogConfig, + tableIdentifier, + locationPrefix, + partitionFields, + tableProps)) + .withOutputTags(DATA_FILES, TupleTagList.of(ERRORS))); SchemaCoder sdfSchema; try { sdfSchema = SchemaRegistry.createDefault().getSchemaCoder(SerializableDataFile.class); @@ -174,7 +181,7 @@ public PCollectionRowTuple expand(PCollection input) { groupedFiles .apply( "CommitFilesToIceberg", - ParDo.of(new CommitFilesDoFn(catalogConfig, tableIdentifier, jobId))) + ParDo.of(new CommitFilesDoFn(catalogConfig, tableIdentifier))) .setRowSchema(SnapshotInfo.getSchema()); return PCollectionRowTuple.of( @@ -238,7 +245,9 @@ public void process(@Element String filePath, MultiOutputReceiver output) throws InputFile inputFile = table.io().newInputFile(filePath); - Metrics metrics = getFileMetrics(inputFile, format, MetricsConfig.forTable(table)); + Metrics metrics = + getFileMetrics( + 
inputFile, format, MetricsConfig.forTable(table), MappingUtil.create(table.schema())); // Figure out which partition this DataFile should go to String partitionPath; @@ -302,16 +311,7 @@ private static org.apache.iceberg.Schema getSchema(String filePath, FileFormat f throws IOException { Preconditions.checkArgument( format.equals(FileFormat.PARQUET), "Table creation is only supported for Parquet files."); - FileSystems.registerFileSystemsOnce(PipelineOptionsFactory.create()); - - MatchResult result = FileSystems.match(filePath); - ResourceId resourceId = Iterables.getOnlyElement(result.metadata()).resourceId(); - Compression compression = Compression.detect(checkStateNotNull(resourceId.getFilename())); - SeekableByteChannel channel = - (SeekableByteChannel) compression.readDecompressed(FileSystems.open(resourceId)); - BeamParquetInputFile file = new BeamParquetInputFile(channel); - - try (ParquetFileReader reader = ParquetFileReader.open(file)) { + try (ParquetFileReader reader = ParquetFileReader.open(getParquetInputFile(filePath))) { MessageType messageType = reader.getFooter().getFileMetaData().getSchema(); return ParquetSchemaUtil.convert(messageType); } @@ -335,14 +335,13 @@ private String getPartitionFromFilePath(String filePath) { * an incorrect partition may lead to it being hidden from some queries. */ private String getPartitionFromMetrics(Metrics metrics, InputFile inputFile) - throws UnknownPartitionException { + throws UnknownPartitionException, IOException { Table table = checkStateNotNull(this.table); List fields = table.spec().fields(); List sourceIds = fields.stream().map(PartitionField::sourceId).collect(Collectors.toList()); Metrics partitionMetrics; - // Check if metrics already includes partition columns (this is configured by table - // properties): + // Check if metrics already includes partition columns (configured by table properties): if (metrics.lowerBounds().keySet().containsAll(sourceIds) && metrics.upperBounds().keySet().containsAll(sourceIds)) { partitionMetrics = metrics; @@ -360,7 +359,11 @@ private String getPartitionFromMetrics(Metrics metrics, InputFile inputFile) .collect(Collectors.toMap(s -> "write.metadata.metrics.column." + s, s -> "full")); MetricsConfig configWithPartitionFields = MetricsConfig.fromProperties(configProps); partitionMetrics = - getFileMetrics(inputFile, inferFormat(inputFile.location()), configWithPartitionFields); + getFileMetrics( + inputFile, + inferFormat(inputFile.location()), + configWithPartitionFields, + MappingUtil.create(table.schema())); } PartitionKey pk = new PartitionKey(table.spec(), table.schema()); @@ -373,15 +376,16 @@ private String getPartitionFromMetrics(Metrics metrics, InputFile inputFile) // Make a best effort estimate by comparing the lower and upper transformed values. // If the transformed values are equal, assume that the DataFile's data safely // aligns with the same partition. - // TODO(ahmedabu98): is comparing min/max safe enough? + // TODO(ahmedabu98): is comparing min/max safe enough for bucketing transform? // or should we compare ALL the records in a DF? 
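          // Illustrative example (not part of the original change): with a truncate(name, 3)
          // partition transform, a file whose "name" column spans min="Mark" / max="Martin"
          // transforms both bounds to "Mar", so the whole file can safely be assigned to that
          // partition; a file spanning min="Johnny" / max="Yaseen" transforms to "Joh" / "Yas",
          // so its partition cannot be determined and it is routed to the error output instead.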
ByteBuffer lowerBytes = partitionMetrics.lowerBounds().get(field.sourceId()); ByteBuffer upperBytes = partitionMetrics.upperBounds().get(field.sourceId()); if (lowerBytes == null && upperBytes == null) { - pk.set(i, null); continue; } else if (lowerBytes == null || upperBytes == null) { - throw new UnknownPartitionException("Only one of the min/max was was null"); + throw new UnknownPartitionException( + "Only one of the min/max was null, for field " + + table.schema().findColumnName(field.sourceId())); } Object lowerTransformedValue = transformValue(transform, type, lowerBytes); Object upperTransformedValue = transformValue(transform, type, upperBytes); @@ -404,17 +408,37 @@ private String getPartitionFromMetrics(Metrics metrics, InputFile inputFile) } } + /** + * A stateful {@link DoFn} that commits batches of files to an Iceberg table. + * + *

Addresses two primary concerns: + * + *

    + *
  • Concurrency: Being stateful on a dummy {@code Void} key forces the runner to + * process batches sequentially, preventing concurrent commit conflicts on the Iceberg + * table. + *
  • Idempotency: Prevents duplicate commits during bundle failures by calculating a + * deterministic hash for the file set. This ID is stored in the Iceberg {@code Snapshot} + * summary, under the key {@code "beam.add-files-commit-id"}. Before committing, the DoFn + * traverses backwards through recent snapshots to check if the current batch's ID is + * already present. + *
+ * + *
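+ * <p>For illustration, the look-back described above amounts to the sketch below (the
+ * authoritative logic is {@code shouldSkip} further down; {@code commitId} and {@code table}
+ * are assumed to be in scope):
+ *
+ * <pre>{@code
+ * Snapshot current = table.currentSnapshot();
+ * while (current != null) {
+ *   Map<String, String> summary = current.summary();
+ *   if (summary != null && commitId.equals(summary.get("beam.add-files-commit-id"))) {
+ *     return true; // this batch was already committed; skip it
+ *   }
+ *   Long parentId = current.parentId();
+ *   current = parentId == null ? null : table.snapshot(parentId);
+ * }
+ * return false;
+ * }</pre>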

Outputs the resulting Iceberg {@link Snapshot} information. + */ static class CommitFilesDoFn extends DoFn>, Row> { private final IcebergCatalogConfig catalogConfig; private final String identifier; private transient @MonotonicNonNull Table table = null; - private final String jobId; private static final String COMMIT_ID_KEY = "beam.add-files-commit-id"; - public CommitFilesDoFn(IcebergCatalogConfig catalogConfig, String identifier, String jobId) { + @StateId("lastCommitTimestamp") + private final StateSpec> lastCommitTimestamp = + StateSpecs.value(VarLongCoder.of()); + + public CommitFilesDoFn(IcebergCatalogConfig catalogConfig, String identifier) { this.catalogConfig = catalogConfig; this.identifier = identifier; - this.jobId = jobId; } @StartBundle @@ -427,53 +451,62 @@ public void start() { @ProcessElement public void process( @Element KV> files, - PaneInfo pane, + @AlwaysFetched @StateId("lastCommitTimestamp") ValueState lastCommitTimestamp, OutputReceiver output) { String commitId = commitHash(files.getValue()); Table table = checkStateNotNull(this.table); - if (shouldSkip(commitId)) { + table.refresh(); + + if (shouldSkip(commitId, lastCommitTimestamp.read())) { return; } int numFiles = 0; - AppendFiles appendFiles = table.newAppend(); + AppendFiles appendFiles = table.newFastAppend(); for (SerializableDataFile file : files.getValue()) { DataFile df = file.createDataFile(table.specs()); appendFiles.appendFile(df); numFiles++; } appendFiles.set(COMMIT_ID_KEY, commitId); + LOG.info("Committing {} files, with commit ID: {}", numFiles, commitId); appendFiles.commit(); Snapshot snapshot = table.currentSnapshot(); output.output(SnapshotInfo.fromSnapshot(snapshot).toRow()); + lastCommitTimestamp.write(snapshot.timestampMillis()); numFilesAdded.inc(numFiles); } private String commitHash(Iterable files) { - int hash = 0; + Hasher hasher = Hashing.sha256().newHasher(); + + // Extract, sort, and hash to ensure deterministic output + List paths = new ArrayList<>(); for (SerializableDataFile file : files) { - hash = 31 * hash + file.getPath().hashCode(); + paths.add(file.getPath()); + } + Collections.sort(paths); + + for (String path : paths) { + hasher.putString(path, StandardCharsets.UTF_8); } - return String.valueOf(Math.abs(hash)); + return hasher.hash().toString(); } /** - * If the process call fails immediately after committing files, but before registering with the - * runner, then runner will retry committing the same batch of files, possibly leading to data - * duplication. - * - *

To mitigate, we create a unique ID per commit and store it in the snapshot summary. We - * skip the pane's batch of files if we see a snapshot with the same unique ID. + * Performs a look-back through Iceberg table history to determine if this specific batch of + * files has already been successfully committed. */ - private boolean shouldSkip(String commitUID) { + private boolean shouldSkip(String commitUID, @Nullable Long lastCommitTimestamp) { + if (lastCommitTimestamp == null) { + return false; + } Table table = checkStateNotNull(this.table); - table.refresh(); - // check the last 10 snapshots to see if it contains the commit ID - int i = 0; + // check past snapshots to see if they contain the commit ID @Nullable Snapshot current = table.currentSnapshot(); - while (current != null && i < 10) { + while (current != null && current.timestampMillis() > lastCommitTimestamp) { Map summary = current.summary(); if (summary != null && commitUID.equals(summary.get(COMMIT_ID_KEY))) { return true; // commit already happened, we should skip @@ -482,7 +515,6 @@ private boolean shouldSkip(String commitUID) { break; } current = table.snapshot(current.parentId()); - i++; } return false; @@ -490,12 +522,23 @@ private boolean shouldSkip(String commitUID) { } @SuppressWarnings("argument") - public static Metrics getFileMetrics(InputFile file, FileFormat format, MetricsConfig config) { + public static Metrics getFileMetrics( + InputFile file, FileFormat format, MetricsConfig config, NameMapping mapping) + throws IOException { switch (format) { case PARQUET: - return ParquetUtil.fileMetrics(file, config); + try (ParquetFileReader reader = + ParquetFileReader.open(getParquetInputFile(file.location()))) { + ParquetMetadata footer = reader.getFooter(); + MessageType originalMessageType = footer.getFileMetaData().getSchema(); + if (!ParquetSchemaUtil.hasIds(originalMessageType)) { + footer = getFooterWithTypeIds(originalMessageType, footer, mapping); + } + + return ParquetUtil.footerMetrics(footer, Stream.empty(), config, mapping); + } case ORC: - return OrcMetrics.fromInputFile(file, config); + return OrcMetrics.fromInputFile(file, config, mapping); case AVRO: return new Metrics(Avro.rowCount(file), null, null, null, null); default: @@ -503,6 +546,7 @@ public static Metrics getFileMetrics(InputFile file, FileFormat format, MetricsC } } + /** Tries to infer other file formats. Defaults to Parquet. 
*/ public static FileFormat inferFormat(String path) { String lowerPath = path.toLowerCase(); @@ -512,9 +556,28 @@ public static FileFormat inferFormat(String path) { return FileFormat.ORC; } else if (lowerPath.endsWith(".avro")) { return FileFormat.AVRO; + } else { + throw new UnknownFormatException(); } + } + + static ParquetMetadata getFooterWithTypeIds( + MessageType originalMessageType, ParquetMetadata footer, NameMapping mapping) { + originalMessageType = ParquetSchemaUtil.applyNameMapping(originalMessageType, mapping); + FileMetaData oldFileMeta = footer.getFileMetaData(); + FileMetaData newFileMeta = + new FileMetaData( + originalMessageType, oldFileMeta.getKeyValueMetaData(), oldFileMeta.getCreatedBy()); + return new ParquetMetadata(newFileMeta, footer.getBlocks()); + } - throw new UnknownFormatException(); + static org.apache.parquet.io.InputFile getParquetInputFile(String filePath) throws IOException { + ResourceId resourceId = + Iterables.getOnlyElement(FileSystems.match(filePath).metadata()).resourceId(); + Compression compression = Compression.detect(checkStateNotNull(resourceId.getFilename())); + SeekableByteChannel channel = + (SeekableByteChannel) compression.readDecompressed(FileSystems.open(resourceId)); + return new BeamParquetInputFile(channel); } static class UnknownFormatException extends IllegalArgumentException {} diff --git a/sdks/java/io/iceberg/src/test/java/org/apache/beam/sdk/io/iceberg/AddFilesIT.java b/sdks/java/io/iceberg/src/test/java/org/apache/beam/sdk/io/iceberg/AddFilesIT.java new file mode 100644 index 000000000000..f4633ce53aa3 --- /dev/null +++ b/sdks/java/io/iceberg/src/test/java/org/apache/beam/sdk/io/iceberg/AddFilesIT.java @@ -0,0 +1,475 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.beam.sdk.io.iceberg; + +import static java.lang.String.format; +import static org.apache.beam.sdk.io.FileIO.Write.defaultNaming; +import static org.apache.beam.sdk.io.iceberg.IcebergUtils.beamSchemaToIcebergSchema; +import static org.apache.beam.sdk.util.Preconditions.checkStateNotNull; +import static org.apache.beam.sdk.values.TypeDescriptors.strings; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; + +import com.google.api.services.storage.model.StorageObject; +import com.google.cloud.storage.Blob; +import com.google.cloud.storage.Notification; +import com.google.cloud.storage.NotificationInfo; +import com.google.cloud.storage.Storage; +import com.google.cloud.storage.StorageException; +import com.google.cloud.storage.StorageOptions; +import java.io.IOException; +import java.util.Arrays; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.concurrent.TimeoutException; +import java.util.stream.Collectors; +import java.util.stream.IntStream; +import org.apache.avro.generic.GenericRecord; +import org.apache.beam.runners.direct.DirectOptions; +import org.apache.beam.sdk.Pipeline; +import org.apache.beam.sdk.PipelineResult; +import org.apache.beam.sdk.coders.StringUtf8Coder; +import org.apache.beam.sdk.extensions.avro.coders.AvroCoder; +import org.apache.beam.sdk.extensions.avro.schemas.utils.AvroUtils; +import org.apache.beam.sdk.extensions.gcp.options.GcpOptions; +import org.apache.beam.sdk.extensions.gcp.options.GcsOptions; +import org.apache.beam.sdk.extensions.gcp.util.GcsUtil; +import org.apache.beam.sdk.io.FileIO; +import org.apache.beam.sdk.io.gcp.pubsub.PubsubClient; +import org.apache.beam.sdk.io.gcp.pubsub.PubsubGrpcClient; +import org.apache.beam.sdk.io.gcp.pubsub.PubsubIO; +import org.apache.beam.sdk.io.gcp.pubsub.TestPubsubOptions; +import org.apache.beam.sdk.io.parquet.ParquetIO; +import org.apache.beam.sdk.managed.Managed; +import org.apache.beam.sdk.schemas.Schema; +import org.apache.beam.sdk.testing.PAssert; +import org.apache.beam.sdk.testing.TestPipeline; +import org.apache.beam.sdk.transforms.Create; +import org.apache.beam.sdk.transforms.Deduplicate; +import org.apache.beam.sdk.transforms.Filter; +import org.apache.beam.sdk.transforms.JsonToRow; +import org.apache.beam.sdk.transforms.MapElements; +import org.apache.beam.sdk.values.PCollection; +import org.apache.beam.sdk.values.PCollectionRowTuple; +import org.apache.beam.sdk.values.Row; +import org.apache.beam.sdk.values.TypeDescriptor; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableMap; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.Iterables; +import org.apache.hadoop.util.Lists; +import org.apache.iceberg.DataFile; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.Snapshot; +import org.apache.iceberg.Table; +import org.apache.iceberg.catalog.Namespace; +import org.apache.iceberg.catalog.TableIdentifier; +import org.apache.iceberg.rest.RESTCatalog; +import org.apache.iceberg.util.Pair; +import org.awaitility.Awaitility; +import org.checkerframework.checker.nullness.qual.Nullable; +import org.joda.time.Duration; +import org.junit.After; +import org.junit.Before; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TestName; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Integration tests for {@link AddFiles}, using Pubsub notifications to 
pass newly created file + * paths downstream to add to an Iceberg table. + */ +public class AddFilesIT { + private static final Logger LOG = LoggerFactory.getLogger(AddFilesIT.class); + + private static final String WAREHOUSE = "gs://managed-iceberg-biglake-its"; + private static final String PROJECT = + TestPipeline.testingPipelineOptions().as(GcpOptions.class).getProject(); + @Rule public TestName testName = new TestName(); + private final String notificationsTopic = + format( + "projects/%s/topics/%s-%s-%s", + PROJECT, + AddFilesIT.class.getSimpleName(), + testName.getMethodName(), + System.currentTimeMillis()); + private static final Schema NOTIFICATION_SCHEMA = + Schema.builder() + .addStringField("bucket") + .addStringField("name") + .addStringField("kind") + .build(); + private static final Map BIGLAKE_PROPS = + Map.of( + "type", "rest", + "uri", "https://biglake.googleapis.com/iceberg/v1/restcatalog", + "warehouse", WAREHOUSE, + "header.x-goog-user-project", PROJECT, + "rest.auth.type", "google", + "io-impl", "org.apache.iceberg.gcp.gcs.GCSFileIO", + "rest-metrics-reporting-enabled", "false"); + private Storage storage; + private PubsubClient pubsub; + private Notification notification; + private final String namespace = getClass().getSimpleName(); + private String srcTableName; + private String destTableName; + private TableIdentifier srcTableId; + private TableIdentifier destTableId; + private long salt; + private String dirName; + private static final Schema ROW_SCHEMA = + Schema.builder().addInt64Field("id").addStringField("name").addInt32Field("age").build(); + private static final List PARTITION_FIELDS = + Arrays.asList("bucket(id, 16)", "truncate(name, 8)", "age"); + private static final PartitionSpec SPEC = + PartitionUtils.toPartitionSpec(PARTITION_FIELDS, ROW_SCHEMA); + private static final Map TABLE_PROPS = ImmutableMap.of("foo", "bar"); + private final RESTCatalog catalog = new RESTCatalog(); + + @Before + public void setup() throws IOException { + storage = StorageOptions.newBuilder().build().getService(); + pubsub = + PubsubGrpcClient.FACTORY.newClient( + null, null, TestPipeline.testingPipelineOptions().as(TestPubsubOptions.class)); + + pubsub.createTopic(PubsubClient.topicPathFromPath(notificationsTopic)); + + NotificationInfo notificationInfo = + NotificationInfo.newBuilder(notificationsTopic) + .setEventTypes(NotificationInfo.EventType.OBJECT_FINALIZE) + .setPayloadFormat(NotificationInfo.PayloadFormat.JSON_API_V1) + .build(); + try { + notification = storage.createNotification(WAREHOUSE.replace("gs://", ""), notificationInfo); + } catch (StorageException e) { + if (e.getMessage().contains("Too many overlapping notifications")) { + List existing = storage.listNotifications(WAREHOUSE.replace("gs://", "")); + LOG.warn( + "Too many notifications on bucket {}: {}. 
Deleting existing notifications to make room: {}", + WAREHOUSE, + e, + existing.stream() + .map(NotificationInfo::getNotificationId) + .collect(Collectors.toList())); + existing.forEach( + n -> storage.deleteNotification(WAREHOUSE.replace("gs://", ""), n.getNotificationId())); + + // try creating it again + notification = storage.createNotification(WAREHOUSE.replace("gs://", ""), notificationInfo); + } else { + throw e; + } + } + + salt = System.currentTimeMillis(); + dirName = format("%s-%s/%s", getClass().getSimpleName(), salt, testName.getMethodName()); + srcTableName = "src_" + testName.getMethodName() + "_" + salt; + destTableName = "dest_" + testName.getMethodName() + "_" + salt; + srcTableId = TableIdentifier.of(namespace, srcTableName); + destTableId = TableIdentifier.of(namespace, destTableName); + + catalog.initialize("test_catalog", BIGLAKE_PROPS); + cleanupCatalog(); + catalog.createNamespace(Namespace.of(namespace)); + } + + private void cleanupCatalog() { + Namespace ns = Namespace.of(namespace); + if (catalog.namespaceExists(ns)) { + catalog.listTables(ns).forEach(catalog::dropTable); + catalog.dropNamespace(ns); + } + } + + @After + public void cleanup() { + try { + pubsub.deleteTopic(PubsubClient.topicPathFromPath(notificationsTopic)); + pubsub.close(); + } catch (Exception e) { + LOG.warn("Failed to clean up PubSub", e); + } + + try { + storage.deleteNotification(WAREHOUSE.replace("gs://", ""), notification.getNotificationId()); + storage.close(); + } catch (Exception e) { + LOG.warn("Failed to clean up GCS notifications", e); + } + + try { + cleanupCatalog(); + } catch (Exception e) { + LOG.warn("Failed to clean up Iceberg catalog", e); + } + + try { + Iterable blobs = + storage + .list(WAREHOUSE.replace("gs://", ""), Storage.BlobListOption.prefix(dirName)) + .getValues(); + blobs.forEach(b -> storage.delete(b.getBlobId())); + } catch (Exception e) { + LOG.warn("Failed to clean up GCS bucket", e); + } + } + + @Test + public void testAddFilesFromSrcIcebergTable() + throws IOException, InterruptedException, TimeoutException { + // first create a source iceberg table + catalog.createTable(srcTableId, beamSchemaToIcebergSchema(ROW_SCHEMA), SPEC); + + String filter = format("%s/%s/data/", namespace, srcTableName); + + // build AddFiles pipeline and let it run in the background + PipelineResult addFilesPipeline = startAddFilesListener(filter); + + // before writing, confirm the destination table still does not exist + assertFalse(catalog.tableExists(destTableId)); + + // write some rows to the source table + LOG.info("Writing records to the source table"); + List rows = + IntStream.range(0, 20) + .mapToObj( + i -> Row.withSchema(ROW_SCHEMA).addValues((long) i, "name_" + i, i + 30).build()) + .collect(Collectors.toList()); + + Pipeline q = Pipeline.create(); + q.apply(Create.of(rows)) + .setRowSchema(ROW_SCHEMA) + .apply( + Managed.write(Managed.ICEBERG) + .withConfig( + ImmutableMap.of( + "table", srcTableId.toString(), "catalog_properties", BIGLAKE_PROPS))); + q.run().waitUntilFinish(); + + // check that the destination table has been created + Awaitility.await() + .atMost(java.time.Duration.ofMinutes(5)) + .pollInterval(java.time.Duration.ofSeconds(10)) + .until(() -> catalog.tableExists(destTableId)); + LOG.info("Destination table has been created"); + + LOG.info("Checking if all source files have been registered in the destination table"); + Awaitility.await() + .atMost(java.time.Duration.ofMinutes(2)) + .pollInterval(java.time.Duration.ofSeconds(5)) + .until(() -> 
checkTableFiles() != null); + LOG.info("Destination table has registered all source files."); + Pair, Map> srcAndDestfiles = + checkStateNotNull(checkTableFiles()); + + for (Map.Entry srcFile : srcAndDestfiles.first().entrySet()) { + String location = srcFile.getKey(); + DataFile destFile = + checkStateNotNull( + srcAndDestfiles.second().get(location), + "Source file '%s' was not registered in the destination table", + location); + + // check that partition metadata was preserved + assertEquals(destFile.partition(), srcFile.getValue().partition()); + } + + // safe to cancel the AddFiles pipeline now + LOG.info("Canceling AddFiles listener."); + addFilesPipeline.cancel(); + + // check all records are there + Pipeline s = Pipeline.create(); + PCollection destRows = + s.apply( + Managed.read(Managed.ICEBERG) + .withConfig( + ImmutableMap.of( + "table", destTableId.toString(), "catalog_properties", BIGLAKE_PROPS))) + .getSinglePCollection(); + PAssert.that(destRows).containsInAnyOrder(rows); + s.run().waitUntilFinish(); + } + + /** + * Fetch all added files in both tables. Return null if some files have not yet propagated to + * destination table + */ + private @Nullable Pair, Map> checkTableFiles() { + Table destTable = catalog.loadTable(destTableId); + Table srcTable = catalog.loadTable(srcTableId); + + Map srcFiles = new HashMap<>(); + Map destFiles = new HashMap<>(); + for (Snapshot snapshot : srcTable.snapshots()) { + snapshot.addedDataFiles(srcTable.io()).forEach(df -> srcFiles.put(df.location(), df)); + } + for (Snapshot snapshot : destTable.snapshots()) { + snapshot.addedDataFiles(destTable.io()).forEach(df -> destFiles.put(df.location(), df)); + } + + LOG.info( + "Number of source files: {}, Number of registered destination files: {}", + srcFiles.size(), + destFiles.size()); + + if (srcFiles.size() != destFiles.size()) { + Set onlyInSrc = new HashSet<>(srcFiles.keySet()); + onlyInSrc.removeAll(destFiles.keySet()); + LOG.info("Missing source files: {}", onlyInSrc); + return null; + } + + return Pair.of(srcFiles, destFiles); + } + + @Test + public void testAddParquetFiles() throws InterruptedException, TimeoutException, IOException { + // start with a table that does not exist + + String parquetDir = format("%s/%s/", WAREHOUSE, dirName); + String tempDir = format("%s/%s-tmp/", WAREHOUSE, dirName); + + // let the add files pipeline run in the background + PipelineResult addFilesPipeline = startAddFilesListener(dirName); + + // before writing, confirm the destination table still does not exist + assertFalse(catalog.tableExists(destTableId)); + + // write some parquet files + LOG.info("Writing records to the parquet dir"); + List rows = + IntStream.range(0, 20) + .mapToObj( + i -> Row.withSchema(ROW_SCHEMA).addValues((long) i, "name_" + i, i + 30).build()) + .collect(Collectors.toList()); + + Pipeline q = Pipeline.create(); + org.apache.avro.Schema avroSchema = AvroUtils.toAvroSchema(ROW_SCHEMA); + q.apply(Create.of(rows)) + .setRowSchema(ROW_SCHEMA) + .apply( + MapElements.into(TypeDescriptor.of(GenericRecord.class)) + .via(AvroUtils.getRowToGenericRecordFunction(avroSchema))) + .setCoder(AvroCoder.of(avroSchema)) + .apply( + FileIO.writeDynamic() + .by( + record -> + format("%s-%s-%s", record.get("id"), record.get("name"), record.get("age"))) + .via(ParquetIO.sink(avroSchema)) + .withNaming(name -> defaultNaming(name, ".parquet")) + .withTempDirectory(tempDir) + .to(parquetDir) + .withDestinationCoder(StringUtf8Coder.of())); + q.run().waitUntilFinish(); + + GcsUtil gcsUtil = 
TestPipeline.testingPipelineOptions().as(GcsOptions.class).getGcsUtil(); + // String prefix = format("%s/", dirName); + Iterable objects = + gcsUtil.listObjects(WAREHOUSE.replace("gs://", ""), dirName, null).getItems(); + List writtenFilePaths = + Lists.newArrayList(objects).stream() + .map(o -> format("gs://%s/%s", o.getBucket(), o.getName())) + .collect(Collectors.toList()); + LOG.info("Written file paths: {}", writtenFilePaths); + + // check that the destination table has been created + Awaitility.await() + .atMost(java.time.Duration.ofMinutes(1)) + .pollInterval(java.time.Duration.ofSeconds(5)) + .until(() -> catalog.tableExists(destTableId)); + LOG.info("Destination table has been created"); + + LOG.info("Checking if all source files have been registered in the destination table"); + Awaitility.await() + .atMost(java.time.Duration.ofMinutes(2)) + .pollInterval(java.time.Duration.ofSeconds(5)) + .until(() -> checkTableHasRegisteredParquetFiles(writtenFilePaths)); + LOG.info( + "Destination table has registered all source files ({} files).", writtenFilePaths.size()); + + // safe to cancel the AddFiles pipeline now + LOG.info("Canceling AddFiles listener."); + addFilesPipeline.cancel(); + + // check all records are there + Pipeline s = Pipeline.create(); + PCollection destRows = + s.apply( + Managed.read(Managed.ICEBERG) + .withConfig( + ImmutableMap.of( + "table", destTableId.toString(), "catalog_properties", BIGLAKE_PROPS))) + .getSinglePCollection(); + PAssert.that(destRows).containsInAnyOrder(rows); + s.run().waitUntilFinish(); + } + + private boolean checkTableHasRegisteredParquetFiles(List parquetFiles) { + Table destTable = catalog.loadTable(destTableId); + + int numRegisteredFiles = 0; + for (Snapshot snapshot : destTable.snapshots()) { + numRegisteredFiles += Iterables.size(snapshot.addedDataFiles(destTable.io())); + } + LOG.info( + "Number of source files: {}, Number of registered destination files: {}", + parquetFiles.size(), + numRegisteredFiles); + return numRegisteredFiles == parquetFiles.size(); + } + + private PipelineResult startAddFilesListener(String filter) throws InterruptedException { + DirectOptions options = TestPipeline.testingPipelineOptions().as(DirectOptions.class); + options.setBlockOnRun(false); + Pipeline p = Pipeline.create(options); + + PCollectionRowTuple tuple = + p.apply(PubsubIO.readStrings().fromTopic(notificationsTopic)) + .apply(JsonToRow.withSchema(NOTIFICATION_SCHEMA)) + .apply(Filter.by(row -> row.getString("name").contains(filter))) + .apply( + MapElements.into(strings()) + .via( + row -> + format("gs://%s/%s", row.getString("bucket"), row.getString("name")))) + .apply(Deduplicate.values()) + .apply( + new AddFiles( + IcebergCatalogConfig.builder().setCatalogProperties(BIGLAKE_PROPS).build(), + namespace + "." + destTableName, + null, + PARTITION_FIELDS, + TABLE_PROPS, + 10, + Duration.standardSeconds(10))); + PAssert.that(tuple.get("errors")).empty(); + PipelineResult result = p.run(); + + LOG.info( + "Started running the AddFiles listener pipeline. 
Waiting for 10s to allow it enough time to setup"); + Thread.sleep(10_000); + return result; + } +} From f5ba17ed65c0781451f05161730555c81157b8d4 Mon Sep 17 00:00:00 2001 From: Ahmed Abualsaud Date: Mon, 23 Mar 2026 18:09:04 -0400 Subject: [PATCH 10/15] spotless --- .../src/main/java/org/apache/beam/sdk/io/iceberg/AddFiles.java | 1 - 1 file changed, 1 deletion(-) diff --git a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/AddFiles.java b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/AddFiles.java index 9d97771bc8a8..55d716ef3f8e 100644 --- a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/AddFiles.java +++ b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/AddFiles.java @@ -76,7 +76,6 @@ import org.apache.iceberg.Snapshot; import org.apache.iceberg.Table; import org.apache.iceberg.avro.Avro; -import org.apache.iceberg.catalog.SupportsNamespaces; import org.apache.iceberg.catalog.TableIdentifier; import org.apache.iceberg.exceptions.AlreadyExistsException; import org.apache.iceberg.io.InputFile; From f5ace5c145b25bbd4aa9323bba1ca1aa0e8edfee Mon Sep 17 00:00:00 2001 From: Ahmed Abualsaud Date: Mon, 23 Mar 2026 18:18:13 -0400 Subject: [PATCH 11/15] spotless --- sdks/java/io/iceberg/build.gradle | 1 + 1 file changed, 1 insertion(+) diff --git a/sdks/java/io/iceberg/build.gradle b/sdks/java/io/iceberg/build.gradle index 5661449ffc2a..36795dfb3dcd 100644 --- a/sdks/java/io/iceberg/build.gradle +++ b/sdks/java/io/iceberg/build.gradle @@ -75,6 +75,7 @@ dependencies { testImplementation library.java.bigdataoss_gcsio testImplementation library.java.bigdataoss_util_hadoop testImplementation "org.apache.parquet:parquet-avro:$parquet_version" + testImplementation "org.apache.parquet:parquet-common:$parquet_version" testImplementation "org.apache.iceberg:iceberg-data:$iceberg_version" testImplementation project(path: ":sdks:java:core", configuration: "shadowTest") testImplementation project(":sdks:java:extensions:google-cloud-platform-core") From 15df0e74d33599ee79e0e4f71423ae7e325b3aca Mon Sep 17 00:00:00 2001 From: Ahmed Abualsaud Date: Mon, 23 Mar 2026 19:08:44 -0400 Subject: [PATCH 12/15] fix deps --- sdks/java/io/iceberg/build.gradle | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdks/java/io/iceberg/build.gradle b/sdks/java/io/iceberg/build.gradle index 36795dfb3dcd..bbd55fee2fc8 100644 --- a/sdks/java/io/iceberg/build.gradle +++ b/sdks/java/io/iceberg/build.gradle @@ -51,6 +51,7 @@ dependencies { implementation library.java.joda_time implementation "org.apache.parquet:parquet-column:$parquet_version" implementation "org.apache.parquet:parquet-hadoop:$parquet_version" + implementation "org.apache.parquet:parquet-common:$parquet_version" implementation project(":sdks:java:io:parquet") implementation "org.apache.orc:orc-core:$orc_version" implementation "org.apache.iceberg:iceberg-core:$iceberg_version" @@ -75,7 +76,6 @@ dependencies { testImplementation library.java.bigdataoss_gcsio testImplementation library.java.bigdataoss_util_hadoop testImplementation "org.apache.parquet:parquet-avro:$parquet_version" - testImplementation "org.apache.parquet:parquet-common:$parquet_version" testImplementation "org.apache.iceberg:iceberg-data:$iceberg_version" testImplementation project(path: ":sdks:java:core", configuration: "shadowTest") testImplementation project(":sdks:java:extensions:google-cloud-platform-core") From 1f9272fbf65035a2a54103d32eb26faf73329088 Mon Sep 17 00:00:00 2001 From: Ahmed Abualsaud Date: Tue, 
24 Mar 2026 18:12:51 -0400 Subject: [PATCH 13/15] add a few more tests; add error handling output --- .../apache/beam/sdk/io/iceberg/AddFiles.java | 123 +++++++-- .../AddFilesSchemaTransformProvider.java | 48 ++-- .../apache/beam/sdk/io/iceberg/ReadUtils.java | 30 +++ .../beam/sdk/io/iceberg/AddFilesIT.java | 119 ++++++--- .../beam/sdk/io/iceberg/AddFilesTest.java | 235 +++++++++++++++++- 5 files changed, 486 insertions(+), 69 deletions(-) diff --git a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/AddFiles.java b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/AddFiles.java index 55d716ef3f8e..6e58d12ff57b 100644 --- a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/AddFiles.java +++ b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/AddFiles.java @@ -29,9 +29,11 @@ import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.Collections; +import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Objects; +import java.util.concurrent.Semaphore; import java.util.stream.Collectors; import java.util.stream.Stream; import org.apache.beam.sdk.coders.VarLongCoder; @@ -77,7 +79,9 @@ import org.apache.iceberg.Table; import org.apache.iceberg.avro.Avro; import org.apache.iceberg.catalog.TableIdentifier; +import org.apache.iceberg.data.Record; import org.apache.iceberg.exceptions.AlreadyExistsException; +import org.apache.iceberg.io.CloseableIterable; import org.apache.iceberg.io.InputFile; import org.apache.iceberg.mapping.MappingUtil; import org.apache.iceberg.mapping.NameMapping; @@ -87,6 +91,8 @@ import org.apache.iceberg.transforms.Transform; import org.apache.iceberg.types.Conversions; import org.apache.iceberg.types.Type; +import org.apache.iceberg.types.TypeUtil; +import org.apache.iceberg.types.Types; import org.apache.parquet.hadoop.ParquetFileReader; import org.apache.parquet.hadoop.metadata.FileMetaData; import org.apache.parquet.hadoop.metadata.ParquetMetadata; @@ -102,6 +108,8 @@ * partition metadata and metrics, then commits them to an Iceberg {@link Table}. 
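+ *
+ * <p>A minimal usage sketch (argument values are placeholders; the shape mirrors the
+ * integration test in this patch):
+ *
+ * <pre>{@code
+ * PCollection<String> filePaths = ...;
+ * PCollectionRowTuple result =
+ *     filePaths.apply(
+ *         new AddFiles(
+ *             catalogConfig,                  // IcebergCatalogConfig
+ *             "namespace.table",              // destination table identifier
+ *             null,                           // optional location prefix
+ *             partitionFields,                // e.g. ["bucket(id, 16)", "age"], or null
+ *             tableProps,                     // optional table properties, or null
+ *             1000,                           // commit after this many files...
+ *             Duration.standardMinutes(10))); // ...or after this much time
+ * result.get("snapshots"); // committed snapshot metadata
+ * result.get("errors");    // files that could not be added
+ * }</pre>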
*/ public class AddFiles extends PTransform, PCollectionRowTuple> { + static final String OUTPUT_TAG = "snapshots"; + static final String ERROR_TAG = "errors"; private static final Duration DEFAULT_TRIGGER_INTERVAL = Duration.standardMinutes(10); private static final Counter numFilesAdded = counter(AddFiles.class, "numFilesAdded"); private static final Counter numErrorFiles = counter(AddFiles.class, "numErrorFiles"); @@ -184,7 +192,7 @@ public PCollectionRowTuple expand(PCollection input) { .setRowSchema(SnapshotInfo.getSchema()); return PCollectionRowTuple.of( - "snapshots", snapshots, "errors", dataFiles.get(ERRORS).setRowSchema(ERROR_SCHEMA)); + OUTPUT_TAG, snapshots, ERROR_TAG, dataFiles.get(ERRORS).setRowSchema(ERROR_SCHEMA)); } static class ConvertToDataFile extends DoFn { @@ -196,6 +204,8 @@ static class ConvertToDataFile extends DoFn { private final @Nullable List partitionFields; private final @Nullable Map tableProps; private transient @MonotonicNonNull Table table; + private static final int MAX_READERS = 5; + private static final Semaphore ACTIVE_READERS = new Semaphore(MAX_READERS); public ConvertToDataFile( IcebergCatalogConfig catalogConfig, @@ -210,12 +220,13 @@ public ConvertToDataFile( this.tableProps = tableProps; } - private static final String PREFIX_ERROR = "File path did not start with the specified prefix"; + static final String PREFIX_ERROR = "File path did not start with the specified prefix"; private static final String UNKNOWN_FORMAT_ERROR = "Could not determine the file's format"; - static final String UNKNOWN_PARTITION_ERROR = "Could not determine the file's partition"; + static final String UNKNOWN_PARTITION_ERROR = "Could not determine the file's partition: "; @ProcessElement - public void process(@Element String filePath, MultiOutputReceiver output) throws IOException { + public void process(@Element String filePath, MultiOutputReceiver output) + throws IOException, InterruptedException { FileFormat format; try { format = inferFormat(filePath); @@ -259,13 +270,13 @@ public void process(@Element String filePath, MultiOutputReceiver output) throws } else { try { // option 2: examine DataFile min/max statistics to determine partition - partitionPath = getPartitionFromMetrics(metrics, inputFile); + partitionPath = getPartitionFromMetrics(metrics, inputFile, table); } catch (UnknownPartitionException e) { output .get(ERRORS) .output( Row.withSchema(ERROR_SCHEMA) - .addValues(filePath, UNKNOWN_PARTITION_ERROR + ": " + e.getMessage()) + .addValues(filePath, UNKNOWN_PARTITION_ERROR + e.getMessage()) .build()); numErrorFiles.inc(); return; @@ -284,10 +295,14 @@ public void process(@Element String filePath, MultiOutputReceiver output) throws output.get(DATA_FILES).output(SerializableDataFile.from(df, partitionPath)); } - private T transformValue(Transform transform, Type type, ByteBuffer bytes) { + static T transformValue(Transform transform, Type type, ByteBuffer bytes) { return transform.bind(type).apply(Conversions.fromByteBuffer(type, bytes)); } + private static T transformValue(Transform transform, Type type, Object value) { + return transform.bind(type).apply((W) value); + } + private Table getOrCreateTable(org.apache.iceberg.Schema schema) { PartitionSpec spec = PartitionUtils.toPartitionSpec(partitionFields, schema); try { @@ -330,12 +345,19 @@ private String getPartitionFromFilePath(String filePath) { * Examines the min/max values of each partition column to determine the destination partition. * *

If the transformed min/max values are not equal for any given column, we won't be able to - * determine the partition. In this case, we output the DataFile to the DLQ, because assigning - * an incorrect partition may lead to it being hidden from some queries. + * determine the partition. We also cannot fall back to a "null" partition, because that will + * also get skipped by most queries. + * + *

The Bucket partition transform is an exceptional case because it is not monotonic, meaning + * it's not enough to just compare the min and max values. There may be a middle value somewhere + * that gets hashed to a different value. For this transform, we'll need to read all the values + * in the column to ensure they all get transformed to the same partition value (see the + * illustration below). + * + *
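+ * <p>A small illustration (the values mirror the unit test added in this patch, and {@code field}
+ * stands in for the bucket {@link PartitionField}): with {@code bucket(id, 2)} on a long column,
+ * the min and max may agree while a middle value does not:
+ *
+ * <pre>{@code
+ * SerializableFunction<Long, Integer> bucket =
+ *     (SerializableFunction<Long, Integer>) field.transform().bind(Types.LongType.get());
+ * bucket.apply(1L);  // bucket 0 (the file's min)
+ * bucket.apply(10L); // bucket 0 (the file's max)
+ * bucket.apply(5L);  // bucket 1 -- comparing only min/max would miss this
+ * }</pre>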

In these cases, we output the DataFile to the DLQ, because assigning an incorrect + * partition may lead to it being completely ignored by downstream queries. */ - private String getPartitionFromMetrics(Metrics metrics, InputFile inputFile) - throws UnknownPartitionException, IOException { - Table table = checkStateNotNull(this.table); + static String getPartitionFromMetrics(Metrics metrics, InputFile inputFile, Table table) + throws UnknownPartitionException, IOException, InterruptedException { List fields = table.spec().fields(); List sourceIds = fields.stream().map(PartitionField::sourceId).collect(Collectors.toList()); @@ -351,7 +373,7 @@ private String getPartitionFromMetrics(Metrics metrics, InputFile inputFile) // Some tables are very wide and users may not want to store excessive metadata. List sourceNames = fields.stream() - .map(pf -> checkStateNotNull(table.schema().idToName().get(pf.sourceId()))) + .map(pf -> table.schema().findColumnName(pf.sourceId())) .collect(Collectors.toList()); Map configProps = sourceNames.stream() @@ -367,16 +389,28 @@ private String getPartitionFromMetrics(Metrics metrics, InputFile inputFile) PartitionKey pk = new PartitionKey(table.spec(), table.schema()); + HashMap bucketPartitions = new HashMap<>(); for (int i = 0; i < fields.size(); i++) { PartitionField field = fields.get(i); + Transform transform = field.transform(); + if (transform.toString().contains("bucket[")) { + bucketPartitions.put(i, field); + } + } + + // first, read only metadata for the non-bucket partition types + for (int i = 0; i < fields.size(); i++) { + PartitionField field = fields.get(i); + // skip bucket partitions (we will process them below) + if (bucketPartitions.containsKey(i)) { + continue; + } Type type = table.schema().findType(field.sourceId()); Transform transform = field.transform(); // Make a best effort estimate by comparing the lower and upper transformed values. // If the transformed values are equal, assume that the DataFile's data safely // aligns with the same partition. - // TODO(ahmedabu98): is comparing min/max safe enough for bucketing transform? - // or should we compare ALL the records in a DF? ByteBuffer lowerBytes = partitionMetrics.lowerBounds().get(field.sourceId()); ByteBuffer upperBytes = partitionMetrics.upperBounds().get(field.sourceId()); if (lowerBytes == null && upperBytes == null) { @@ -392,17 +426,64 @@ private String getPartitionFromMetrics(Metrics metrics, InputFile inputFile) if (!Objects.deepEquals(lowerTransformedValue, upperTransformedValue)) { // The DataFile contains values that align to different partitions, so we cannot // safely determine a partition. - // If we commit the DataFile with an incorrect partition, downstream queries may - // completely ignore it (due to Iceberg's smart partition scan-planning). - // We also cannot commit the DataFile with a "null" partition, because that will - // also get skipped by most queries. - // The safe thing to do is to output the file to DLQ. 
- throw new UnknownPartitionException("Min and max transformed values were not equal"); + throw new UnknownPartitionException( + "Min and max transformed values were not equal, for column: " + field.name()); } pk.set(i, lowerTransformedValue); } + // bucket transform needs extra processing (see java doc above) + if (!bucketPartitions.isEmpty()) { + // Optimize by only reading bucket-transformed columns into memory + org.apache.iceberg.Schema bucketCols = + TypeUtil.select( + table.schema(), + bucketPartitions.values().stream() + .map(PartitionField::sourceId) + .collect(Collectors.toSet())); + + // Keep one instance of transformed value per column. Use this to compare against each + // record's transformed value. + // Values in the same columns must yield the same transformed value, otherwise we cannot + // determine a partition + // from this file. + Map transformedValues = new HashMap<>(); + + // Do a one-time read of the file and compare all bucket-transformed columns + ACTIVE_READERS.acquire(); + try (CloseableIterable reader = ReadUtils.createReader(inputFile, bucketCols)) { + for (Record record : reader) { + for (Map.Entry entry : bucketPartitions.entrySet()) { + int partitionIndex = entry.getKey(); + PartitionField partitionField = entry.getValue(); + Transform transform = partitionField.transform(); + Types.NestedField field = table.schema().findField(partitionField.sourceId()); + Object value = record.getField(field.name()); + + // set initial transformed value for this column + @Nullable Object transformedValue = transformedValues.get(partitionIndex); + Object currentTransformedValue = transformValue(transform, field.type(), value); + if (transformedValue == null) { + transformedValues.put(partitionIndex, checkStateNotNull(currentTransformedValue)); + continue; + } + + if (!Objects.deepEquals(currentTransformedValue, transformedValue)) { + throw new UnknownPartitionException( + "Found records with conflicting transformed values, for column: " + + field.name()); + } + } + } + } finally { + ACTIVE_READERS.release(); + } + + for (Map.Entry partitionCol : transformedValues.entrySet()) { + pk.set(partitionCol.getKey(), partitionCol.getValue()); + } + } return pk.toPath(); } } diff --git a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/AddFilesSchemaTransformProvider.java b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/AddFilesSchemaTransformProvider.java index 37b2ca6834d2..a04853c8ad96 100644 --- a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/AddFilesSchemaTransformProvider.java +++ b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/AddFilesSchemaTransformProvider.java @@ -17,6 +17,8 @@ */ package org.apache.beam.sdk.io.iceberg; +import static org.apache.beam.sdk.io.iceberg.AddFiles.ERROR_TAG; +import static org.apache.beam.sdk.io.iceberg.AddFiles.OUTPUT_TAG; import static org.apache.beam.sdk.io.iceberg.AddFilesSchemaTransformProvider.Configuration; import static org.apache.beam.sdk.util.Preconditions.checkStateNotNull; @@ -31,6 +33,7 @@ import org.apache.beam.sdk.schemas.transforms.SchemaTransform; import org.apache.beam.sdk.schemas.transforms.SchemaTransformProvider; import org.apache.beam.sdk.schemas.transforms.TypedSchemaTransformProvider; +import org.apache.beam.sdk.schemas.transforms.providers.ErrorHandling; import org.apache.beam.sdk.transforms.Filter; import org.apache.beam.sdk.transforms.MapElements; import org.apache.beam.sdk.values.PCollectionRowTuple; @@ -106,6 +109,9 @@ public static Builder 
builder() { + " please visit https://iceberg.apache.org/docs/latest/configuration/#table-properties.") public abstract @Nullable Map getTableProperties(); + @SchemaFieldDescription("This option specifies whether and where to output unwritable rows.") + public abstract @Nullable ErrorHandling getErrorHandling(); + @AutoValue.Builder public abstract static class Builder { public abstract Builder setTable(String table); @@ -124,6 +130,8 @@ public abstract static class Builder { public abstract Builder setTableProperties(Map props); + public abstract Builder setErrorHandling(ErrorHandling errorHandling); + public abstract Configuration build(); } @@ -153,22 +161,30 @@ public PCollectionRowTuple expand(PCollectionRowTuple input) { @Nullable Integer frequency = configuration.getTriggeringFrequencySeconds(); - return input - .getSinglePCollection() - .apply("Filter empty paths", Filter.by(row -> row.getString(0) != null)) - .apply( - "ExtractPaths", - MapElements.into(TypeDescriptors.strings()) - .via(row -> checkStateNotNull(row.getString(0)))) - .apply( - new AddFiles( - configuration.getIcebergCatalog(), - configuration.getTable(), - configuration.getLocationPrefix(), - configuration.getPartitionFields(), - configuration.getTableProperties(), - configuration.getAppendBatchSize(), - frequency != null ? Duration.standardSeconds(frequency) : null)); + PCollectionRowTuple result = + input + .getSinglePCollection() + .apply("Filter empty paths", Filter.by(row -> row.getString(0) != null)) + .apply( + "ExtractPaths", + MapElements.into(TypeDescriptors.strings()) + .via(row -> checkStateNotNull(row.getString(0)))) + .apply( + new AddFiles( + configuration.getIcebergCatalog(), + configuration.getTable(), + configuration.getLocationPrefix(), + configuration.getPartitionFields(), + configuration.getTableProperties(), + configuration.getAppendBatchSize(), + frequency != null ? 
Duration.standardSeconds(frequency) : null)); + + PCollectionRowTuple output = PCollectionRowTuple.of("snapshots", result.get(OUTPUT_TAG)); + ErrorHandling errorHandling = configuration.getErrorHandling(); + if (errorHandling != null) { + output = output.and(errorHandling.getOutput(), result.get(ERROR_TAG)); + } + return output; } } } diff --git a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/ReadUtils.java b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/ReadUtils.java index a4d95ca249b4..e7f50882f433 100644 --- a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/ReadUtils.java +++ b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/ReadUtils.java @@ -46,9 +46,11 @@ import org.apache.iceberg.encryption.EncryptedInputFile; import org.apache.iceberg.expressions.Evaluator; import org.apache.iceberg.expressions.Expression; +import org.apache.iceberg.expressions.Expressions; import org.apache.iceberg.hadoop.HadoopInputFile; import org.apache.iceberg.io.CloseableIterable; import org.apache.iceberg.io.InputFile; +import org.apache.iceberg.mapping.MappingUtil; import org.apache.iceberg.mapping.NameMapping; import org.apache.iceberg.mapping.NameMappingParser; import org.apache.iceberg.parquet.ParquetReader; @@ -112,6 +114,34 @@ static ParquetReader createReader(FileScanTask task, Table table, Schema true); } + static ParquetReader createReader(InputFile inputFile, Schema schema) { + ParquetReadOptions.Builder optionsBuilder; + if (inputFile instanceof HadoopInputFile) { + // remove read properties already set that may conflict with this read + Configuration conf = new Configuration(((HadoopInputFile) inputFile).getConf()); + for (String property : READ_PROPERTIES_TO_REMOVE) { + conf.unset(property); + } + optionsBuilder = HadoopReadOptions.builder(conf); + } else { + optionsBuilder = ParquetReadOptions.builder(); + } + optionsBuilder = + optionsBuilder + .withRange(0, inputFile.getLength()) + .withMaxAllocationInBytes(MAX_FILE_BUFFER_SIZE); + + return new ParquetReader<>( + inputFile, + schema, + optionsBuilder.build(), + fileSchema -> GenericParquetReaders.buildReader(schema, fileSchema), + MappingUtil.create(schema), + Expressions.alwaysTrue(), + false, + true); + } + static Map constantsMap( FileScanTask task, BiFunction converter, diff --git a/sdks/java/io/iceberg/src/test/java/org/apache/beam/sdk/io/iceberg/AddFilesIT.java b/sdks/java/io/iceberg/src/test/java/org/apache/beam/sdk/io/iceberg/AddFilesIT.java index f4633ce53aa3..a67707e3dbb7 100644 --- a/sdks/java/io/iceberg/src/test/java/org/apache/beam/sdk/io/iceberg/AddFilesIT.java +++ b/sdks/java/io/iceberg/src/test/java/org/apache/beam/sdk/io/iceberg/AddFilesIT.java @@ -24,6 +24,7 @@ import static org.apache.beam.sdk.values.TypeDescriptors.strings; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; import com.google.api.services.storage.model.StorageObject; import com.google.cloud.storage.Blob; @@ -143,6 +144,11 @@ public class AddFilesIT { private static final PartitionSpec SPEC = PartitionUtils.toPartitionSpec(PARTITION_FIELDS, ROW_SCHEMA); private static final Map TABLE_PROPS = ImmutableMap.of("foo", "bar"); + private static final List TEST_ROWS = + IntStream.range(0, 20) + .mapToObj( + i -> Row.withSchema(ROW_SCHEMA).addValues((long) i, "name_" + i, i + 30).build()) + .collect(Collectors.toList()); private final RESTCatalog catalog = new RESTCatalog(); @Before @@ -235,8 +241,8 @@ public 
void cleanup() { } @Test - public void testAddFilesFromSrcIcebergTable() - throws IOException, InterruptedException, TimeoutException { + public void testStreamingImportFromExistingIcebergTable() + throws IOException, InterruptedException { // first create a source iceberg table catalog.createTable(srcTableId, beamSchemaToIcebergSchema(ROW_SCHEMA), SPEC); @@ -250,14 +256,8 @@ public void testAddFilesFromSrcIcebergTable() // write some rows to the source table LOG.info("Writing records to the source table"); - List rows = - IntStream.range(0, 20) - .mapToObj( - i -> Row.withSchema(ROW_SCHEMA).addValues((long) i, "name_" + i, i + 30).build()) - .collect(Collectors.toList()); - Pipeline q = Pipeline.create(); - q.apply(Create.of(rows)) + q.apply(Create.of(TEST_ROWS)) .setRowSchema(ROW_SCHEMA) .apply( Managed.write(Managed.ICEBERG) @@ -299,16 +299,7 @@ public void testAddFilesFromSrcIcebergTable() addFilesPipeline.cancel(); // check all records are there - Pipeline s = Pipeline.create(); - PCollection destRows = - s.apply( - Managed.read(Managed.ICEBERG) - .withConfig( - ImmutableMap.of( - "table", destTableId.toString(), "catalog_properties", BIGLAKE_PROPS))) - .getSinglePCollection(); - PAssert.that(destRows).containsInAnyOrder(rows); - s.run().waitUntilFinish(); + checkRecordsInDestinationTable(); } /** @@ -344,7 +335,8 @@ public void testAddFilesFromSrcIcebergTable() } @Test - public void testAddParquetFiles() throws InterruptedException, TimeoutException, IOException { + public void testStreamingParquetImport() + throws InterruptedException, TimeoutException, IOException { // start with a table that does not exist String parquetDir = format("%s/%s/", WAREHOUSE, dirName); @@ -358,15 +350,9 @@ public void testAddParquetFiles() throws InterruptedException, TimeoutException, // write some parquet files LOG.info("Writing records to the parquet dir"); - List rows = - IntStream.range(0, 20) - .mapToObj( - i -> Row.withSchema(ROW_SCHEMA).addValues((long) i, "name_" + i, i + 30).build()) - .collect(Collectors.toList()); - Pipeline q = Pipeline.create(); org.apache.avro.Schema avroSchema = AvroUtils.toAvroSchema(ROW_SCHEMA); - q.apply(Create.of(rows)) + q.apply(Create.of(TEST_ROWS)) .setRowSchema(ROW_SCHEMA) .apply( MapElements.into(TypeDescriptor.of(GenericRecord.class)) @@ -385,7 +371,7 @@ record -> q.run().waitUntilFinish(); GcsUtil gcsUtil = TestPipeline.testingPipelineOptions().as(GcsOptions.class).getGcsUtil(); - // String prefix = format("%s/", dirName); + Iterable objects = gcsUtil.listObjects(WAREHOUSE.replace("gs://", ""), dirName, null).getItems(); List writtenFilePaths = @@ -414,6 +400,81 @@ record -> addFilesPipeline.cancel(); // check all records are there + checkRecordsInDestinationTable(); + } + + @Test + public void testBatchParquetImport() throws IOException { + // start with a table that does not exist + + String parquetDir = format("%s/%s/", WAREHOUSE, dirName); + String tempDir = format("%s/%s-tmp/", WAREHOUSE, dirName); + + // write some parquet files + LOG.info("Writing records to the parquet dir"); + Pipeline q = Pipeline.create(); + org.apache.avro.Schema avroSchema = AvroUtils.toAvroSchema(ROW_SCHEMA); + q.apply(Create.of(TEST_ROWS)) + .setRowSchema(ROW_SCHEMA) + .apply( + MapElements.into(TypeDescriptor.of(GenericRecord.class)) + .via(AvroUtils.getRowToGenericRecordFunction(avroSchema))) + .setCoder(AvroCoder.of(avroSchema)) + .apply( + FileIO.writeDynamic() + .by( + record -> + format("%s-%s-%s", record.get("id"), record.get("name"), record.get("age"))) + 
.via(ParquetIO.sink(avroSchema)) + .withNaming(name -> defaultNaming(name, ".parquet")) + .withTempDirectory(tempDir) + .to(parquetDir) + .withDestinationCoder(StringUtf8Coder.of())); + q.run().waitUntilFinish(); + + GcsUtil gcsUtil = TestPipeline.testingPipelineOptions().as(GcsOptions.class).getGcsUtil(); + + Iterable objects = + gcsUtil.listObjects(WAREHOUSE.replace("gs://", ""), dirName, null).getItems(); + List writtenFilePaths = + Lists.newArrayList(objects).stream() + .map(o -> format("gs://%s/%s", o.getBucket(), o.getName())) + .collect(Collectors.toList()); + LOG.info("Written file paths: {}", writtenFilePaths); + + // before adding, confirm the destination table still does not exist + assertFalse(catalog.tableExists(destTableId)); + + // run batch AddFiles + Pipeline p = Pipeline.create(); + PCollectionRowTuple tuple = + p.apply(Create.of(writtenFilePaths)) + .apply( + new AddFiles( + IcebergCatalogConfig.builder().setCatalogProperties(BIGLAKE_PROPS).build(), + namespace + "." + destTableName, + null, + PARTITION_FIELDS, + TABLE_PROPS, + 10, + Duration.standardSeconds(10))); + PAssert.that(tuple.get("errors")).empty(); + p.run().waitUntilFinish(); + + // check that the destination table has been created + assertTrue(catalog.tableExists(destTableId)); + LOG.info("Destination table has been created"); + + LOG.info("Checking if all source files have been registered in the destination table"); + assertTrue(checkTableHasRegisteredParquetFiles(writtenFilePaths)); + LOG.info( + "Destination table has registered all source files ({} files).", writtenFilePaths.size()); + + // check all records are there + checkRecordsInDestinationTable(); + } + + private void checkRecordsInDestinationTable() { Pipeline s = Pipeline.create(); PCollection destRows = s.apply( @@ -422,7 +483,7 @@ record -> ImmutableMap.of( "table", destTableId.toString(), "catalog_properties", BIGLAKE_PROPS))) .getSinglePCollection(); - PAssert.that(destRows).containsInAnyOrder(rows); + PAssert.that(destRows).containsInAnyOrder(TEST_ROWS); s.run().waitUntilFinish(); } diff --git a/sdks/java/io/iceberg/src/test/java/org/apache/beam/sdk/io/iceberg/AddFilesTest.java b/sdks/java/io/iceberg/src/test/java/org/apache/beam/sdk/io/iceberg/AddFilesTest.java index 7e1f23d1fb87..3433b0f07362 100644 --- a/sdks/java/io/iceberg/src/test/java/org/apache/beam/sdk/io/iceberg/AddFilesTest.java +++ b/sdks/java/io/iceberg/src/test/java/org/apache/beam/sdk/io/iceberg/AddFilesTest.java @@ -17,15 +17,21 @@ */ package org.apache.beam.sdk.io.iceberg; +import static org.apache.beam.sdk.io.iceberg.AddFiles.ConvertToDataFile.PREFIX_ERROR; +import static org.apache.beam.sdk.io.iceberg.AddFiles.ConvertToDataFile.UNKNOWN_PARTITION_ERROR; +import static org.apache.beam.sdk.io.iceberg.AddFiles.ConvertToDataFile.getPartitionFromMetrics; import static org.apache.beam.sdk.util.Preconditions.checkStateNotNull; import static org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions.checkState; import static org.hamcrest.MatcherAssert.assertThat; import static org.hamcrest.Matchers.hasEntry; import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertThrows; import static org.junit.Assert.assertTrue; import java.io.File; import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.CharBuffer; import java.util.ArrayList; import java.util.Arrays; import java.util.Comparator; @@ -45,8 +51,12 @@ import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.Lists; import 
org.apache.hadoop.conf.Configuration; import org.apache.iceberg.DataFile; +import org.apache.iceberg.FileFormat; import org.apache.iceberg.Files; +import org.apache.iceberg.Metrics; +import org.apache.iceberg.MetricsConfig; import org.apache.iceberg.PartitionData; +import org.apache.iceberg.PartitionField; import org.apache.iceberg.PartitionKey; import org.apache.iceberg.PartitionSpec; import org.apache.iceberg.Snapshot; @@ -58,14 +68,19 @@ import org.apache.iceberg.data.parquet.GenericParquetWriter; import org.apache.iceberg.hadoop.HadoopCatalog; import org.apache.iceberg.io.DataWriter; +import org.apache.iceberg.io.InputFile; +import org.apache.iceberg.mapping.MappingUtil; import org.apache.iceberg.parquet.Parquet; +import org.apache.iceberg.types.Conversions; import org.apache.iceberg.types.Types; +import org.apache.iceberg.util.SerializableFunction; import org.checkerframework.checker.nullness.qual.Nullable; import org.joda.time.Duration; import org.junit.Before; import org.junit.ClassRule; import org.junit.Rule; import org.junit.Test; +import org.junit.rules.ExpectedException; import org.junit.rules.TemporaryFolder; import org.junit.rules.TestName; import org.junit.runner.RunWith; @@ -92,6 +107,7 @@ public class AddFilesTest { private IcebergCatalogConfig catalogConfig; @ClassRule public static final TemporaryFolder TEMPORARY_FOLDER = new TemporaryFolder(); @Rule public TestName testName = new TestName(); + @Rule public ExpectedException thrown = ExpectedException.none(); @Rule public transient TestDataWarehouse warehouse = new TestDataWarehouse(TEMPORARY_FOLDER, "default"); @@ -255,7 +271,7 @@ public void testAddFilesWithPartitionFromMetrics() throws IOException { errorRow.getSchema().equals(AddFiles.ERROR_SCHEMA) && file3.equals(errorRow.getString(0)) && checkStateNotNull(errorRow.getString(1)) - .startsWith(AddFiles.ConvertToDataFile.UNKNOWN_PARTITION_ERROR)); + .startsWith(UNKNOWN_PARTITION_ERROR)); return null; }); @@ -397,15 +413,228 @@ public void testPartitionPrefixErrors() throws Exception { new AddFiles(catalogConfig, tableId.toString(), "some/prefix/", null, null, 1, null); PCollectionRowTuple outputTuple = inputFiles.apply(addFiles); + PAssert.that(outputTuple.get("errors")) + .containsInAnyOrder( + Row.withSchema(AddFiles.ERROR_SCHEMA).addValues(file1, PREFIX_ERROR).build()); + + pipeline.run().waitUntilFinish(); + } + + @Test + public void testRecognizesBucketPartitionMismatch() throws IOException { + catalog.dropTable(tableId); + + String file1 = root + "data1.parquet"; + wrapper.wrap(record(-1, "And", 30)); + DataWriter writer = createWriter(file1, wrapper.copy()); + writer.write(record(1, "Andrew", 30)); + writer.write(record(5, "Sally", 30)); + writer.write(record(10, "Ahmed", 30)); + writer.close(); + + // 1 (min) and 10 (max) will transform to bucket=0 + // 5 (some middle value) transforms to bucket=1 + // we should pick up on this and pass this file to DLQ + // to prove the transform value mapping, below is a sanity check: + List partitionFields = Arrays.asList("bucket(id, 2)", "age"); + PartitionSpec spec = PartitionUtils.toPartitionSpec(partitionFields, icebergSchema); + PartitionField bucketPartition = spec.fields().get(0); + assertEquals("id_bucket", bucketPartition.name()); + assertTrue(bucketPartition.transform().toString().contains("bucket[")); + SerializableFunction transformFunc = + (SerializableFunction) + bucketPartition.transform().bind(Types.LongType.get()); + assertEquals(0, (int) transformFunc.apply(1L)); + assertEquals(1, (int) 
+    assertEquals(1, (int) transformFunc.apply(5L));
+    assertEquals(0, (int) transformFunc.apply(10L));
+
+    AddFiles addFiles =
+        new AddFiles(catalogConfig, tableId.toString(), null, partitionFields, null, 1, null);
+    PCollection inputFiles = pipeline.apply("Create Input", Create.of(file1));
+    PCollectionRowTuple outputTuple = inputFiles.apply(addFiles);
+
     PAssert.that(outputTuple.get("errors"))
         .containsInAnyOrder(
             Row.withSchema(AddFiles.ERROR_SCHEMA)
-                .addValues(file1, "File path did not start with the specified prefix")
+                .addValues(
+                    file1,
+                    UNKNOWN_PARTITION_ERROR
+                        + "Found records with conflicting transformed values, for column: id")
                 .build());
 
     pipeline.run().waitUntilFinish();
   }
 
+  @Test
+  public void testGetPartitionFromMetrics() throws IOException, InterruptedException {
+    PartitionSpec partitionSpec =
+        PartitionSpec.builderFor(icebergSchema)
+            .bucket("id", 2)
+            .truncate("name", 4)
+            .identity("age")
+            .build();
+
+    List testCases =
+        Arrays.asList(
+            PartitionTestCase.of(
+                root + "data_1.parquet",
+                record(1, "aaaa", 10),
+                Arrays.asList(
+                    record(1, "aaaa123", 10),
+                    record(10, "aaaa789", 10),
+                    record(100, "aaaa456", 10)),
+                Arrays.asList(1, CharBuffer.wrap("aaaa123"), 10),
+                Arrays.asList(100, CharBuffer.wrap("aaaa789"), 10),
+                "id_bucket=0/name_trunc=aaaa/age=10"),
+            PartitionTestCase.of(
+                root + "data_2.parquet",
+                record(1, "bbbb", 30),
+                Arrays.asList(
+                    record(5, "bbbb789", 30),
+                    record(55, "bbbb456", 30),
+                    record(500, "bbbb123", 30)),
+                Arrays.asList(5, CharBuffer.wrap("bbbb123"), 30),
+                Arrays.asList(500, CharBuffer.wrap("bbbb789"), 30),
+                "id_bucket=1/name_trunc=bbbb/age=30"));
+
+    PartitionKey pk = new PartitionKey(partitionSpec, icebergSchema);
+    MetricsConfig metricsConfig = MetricsConfig.fromProperties(tableProps);
+    Table table = catalog.createTable(tableId, icebergSchema, partitionSpec);
+
+    for (PartitionTestCase caze : testCases) {
+      List<Record> records = caze.records;
+      String fileName = caze.fileName;
+      pk.wrap(caze.partition);
+      DataWriter writer = createWriter(fileName, pk.copy());
+
+      for (Record record : records) {
+        writer.write(record);
+      }
+      writer.close();
+      InputFile file = table.io().newInputFile(fileName);
+
+      Metrics metrics =
+          AddFiles.getFileMetrics(
+              file, FileFormat.PARQUET, metricsConfig, MappingUtil.create(icebergSchema));
+      for (int i = 0; i < partitionSpec.fields().size(); i++) {
+        PartitionField partitionField = partitionSpec.fields().get(i);
+        Types.NestedField field = icebergSchema.findField(partitionField.sourceId());
+        ByteBuffer lowerBytes = metrics.lowerBounds().get(field.fieldId());
+        ByteBuffer upperBytes = metrics.upperBounds().get(field.fieldId());
+
+        Object lower = Conversions.fromByteBuffer(field.type(), lowerBytes);
+        Object upper = Conversions.fromByteBuffer(field.type(), upperBytes);
+
+        assertEquals(caze.expectedLower.get(i), lower);
+        assertEquals(caze.expectedUpper.get(i), upper);
+      }
+
+      String partitionPath = getPartitionFromMetrics(metrics, file, table);
+      assertEquals(caze.expectedPartition, partitionPath);
+    }
+  }
+
+  @Test
+  public void testThrowPartitionMismatchError() throws IOException, InterruptedException {
+    PartitionSpec partitionSpec =
+        PartitionSpec.builderFor(icebergSchema)
+            .bucket("id", 2)
+            .truncate("name", 4)
+            .identity("age")
+            .build();
+
+    List testCases =
+        Arrays.asList(
+            PartitionTestCase.of(
+                root + "data_1.parquet",
+                record(1, "aaaa", 10),
+                Arrays.asList(
+                    record(1, "aaaa123", 10), record(10, "abab", 10), record(100, "aaaa789", 10)),
+                Arrays.asList(1, CharBuffer.wrap("aaaa123"), 10),
Arrays.asList(100, CharBuffer.wrap("abab"), 10), + "error"), + PartitionTestCase.of( + root + "data_2.parquet", + record(1, "bbbb", 30), + Arrays.asList( + record(5, "bbbb", 30), record(55, "bbbb", 30), record(500, "bbbb", 50)), + Arrays.asList(5, CharBuffer.wrap("bbbb"), 30), + Arrays.asList(500, CharBuffer.wrap("bbbb"), 50), + "error")); + + PartitionKey pk = new PartitionKey(partitionSpec, icebergSchema); + MetricsConfig metricsConfig = MetricsConfig.fromProperties(tableProps); + Table table = catalog.createTable(tableId, icebergSchema, partitionSpec); + + for (PartitionTestCase caze : testCases) { + List records = caze.records; + String fileName = caze.fileName; + pk.wrap(caze.partition); + DataWriter writer = createWriter(fileName, pk.copy()); + + for (Record record : records) { + writer.write(record); + } + writer.close(); + InputFile file = table.io().newInputFile(fileName); + + Metrics metrics = + AddFiles.getFileMetrics( + file, FileFormat.PARQUET, metricsConfig, MappingUtil.create(icebergSchema)); + // check that lower/upper stats are still fetched correctly + for (int i = 0; i < partitionSpec.fields().size(); i++) { + PartitionField partitionField = partitionSpec.fields().get(i); + Types.NestedField field = icebergSchema.findField(partitionField.sourceId()); + ByteBuffer lowerBytes = metrics.lowerBounds().get(field.fieldId()); + ByteBuffer upperBytes = metrics.upperBounds().get(field.fieldId()); + + Object lower = Conversions.fromByteBuffer(field.type(), lowerBytes); + Object upper = Conversions.fromByteBuffer(field.type(), upperBytes); + + assertEquals(caze.expectedLower.get(i), lower); + assertEquals(caze.expectedUpper.get(i), upper); + } + + assertThrows( + AddFiles.UnknownPartitionException.class, + () -> getPartitionFromMetrics(metrics, file, table)); + } + } + + static class PartitionTestCase { + String fileName; + StructLike partition; + List records; + List expectedLower; + List expectedUpper; + String expectedPartition; + + PartitionTestCase( + String fileName, + StructLike partition, + List records, + List expectedLower, + List expectedUpper, + String expectedPartition) { + this.fileName = fileName; + this.partition = partition; + this.records = records; + this.expectedLower = expectedLower; + this.expectedUpper = expectedUpper; + this.expectedPartition = expectedPartition; + } + + static PartitionTestCase of( + String fileName, + StructLike partition, + List records, + List expectedLower, + List expectedUpper, + String expectedPartition) { + return new PartitionTestCase( + fileName, partition, records, expectedLower, expectedUpper, expectedPartition); + } + } + private DataWriter createWriter(String file) throws IOException { return createWriter(file, null); } From e7f949d6dde14afb4dbf49193437f0a4bdebd37e Mon Sep 17 00:00:00 2001 From: Ahmed Abualsaud Date: Tue, 24 Mar 2026 18:16:10 -0400 Subject: [PATCH 14/15] clarify comments --- .../java/org/apache/beam/sdk/io/iceberg/AddFilesTest.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sdks/java/io/iceberg/src/test/java/org/apache/beam/sdk/io/iceberg/AddFilesTest.java b/sdks/java/io/iceberg/src/test/java/org/apache/beam/sdk/io/iceberg/AddFilesTest.java index 3433b0f07362..287b9140e000 100644 --- a/sdks/java/io/iceberg/src/test/java/org/apache/beam/sdk/io/iceberg/AddFilesTest.java +++ b/sdks/java/io/iceberg/src/test/java/org/apache/beam/sdk/io/iceberg/AddFilesTest.java @@ -434,8 +434,8 @@ public void testRecognizesBucketPartitionMismatch() throws IOException { // 1 (min) and 10 (max) will 
     // 1 (min) and 10 (max) will transform to bucket=0
     // 5 (some middle value) transforms to bucket=1
-    // we should pick up on this and pass this file to DLQ
-    // to prove the transform value mapping, below is a sanity check:
+    // To prove this transform value mapping^, below is a sanity check.
+    // We should recognize that we cannot assign a partition to such a file, and pass it to DLQ.
     List partitionFields = Arrays.asList("bucket(id, 2)", "age");
     PartitionSpec spec = PartitionUtils.toPartitionSpec(partitionFields, icebergSchema);
     PartitionField bucketPartition = spec.fields().get(0);

From a4e4ec956f179dd2ea8c7baca9773ac7ebd10e16 Mon Sep 17 00:00:00 2001
From: Ahmed Abualsaud
Date: Tue, 24 Mar 2026 18:18:51 -0400
Subject: [PATCH 15/15] add comment

---
 .../main/java/org/apache/beam/sdk/io/iceberg/AddFiles.java | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/AddFiles.java b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/AddFiles.java
index 6e58d12ff57b..4a164700099a 100644
--- a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/AddFiles.java
+++ b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/AddFiles.java
@@ -204,7 +204,8 @@ static class ConvertToDataFile extends DoFn {
     private final @Nullable List partitionFields;
     private final @Nullable Map tableProps;
     private transient @MonotonicNonNull Table table;
-    private static final int MAX_READERS = 5;
+    // Limit open readers to avoid blowing up memory on one worker
+    private static final int MAX_READERS = 10;
     private static final Semaphore ACTIVE_READERS = new Semaphore(MAX_READERS);
 
     public ConvertToDataFile(
@@ -354,7 +355,7 @@ private String getPartitionFromFilePath(String filePath) {
      * in the column ensure they all get transformed to the same partition value.
      *
      * In these cases, we output the DataFile to the DLQ, because assigning an incorrect
-     * partition may lead to it being completely ignored by downstream queries.
+     * partition may lead to it being incorrectly ignored by downstream queries.
      */
     static String getPartitionFromMetrics(Metrics metrics, InputFile inputFile, Table table)
        throws UnknownPartitionException, IOException, InterruptedException {
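
Note (illustrative sketch, not part of the patch): the DLQ behavior documented above exists because file-level
metrics only expose a column's lower and upper bounds, and a bucket transform can map both bounds to the same
partition value while an interior value maps elsewhere. The snippet below mirrors the calls already made in
testRecognizesBucketPartitionMismatch and assumes the same icebergSchema with a required long "id" column; the
bucket assignments shown (1 and 10 -> bucket 0, 5 -> bucket 1) are the ones asserted in that test.

    // Bind Iceberg's bucket(id, 2) transform, as the test does.
    PartitionSpec spec = PartitionSpec.builderFor(icebergSchema).bucket("id", 2).build();
    SerializableFunction<Long, Integer> bucketFn =
        (SerializableFunction<Long, Integer>)
            spec.fields().get(0).transform().bind(Types.LongType.get());

    // File metrics only expose the column's bounds (1 and 10 for the test file).
    int lowerBucket = bucketFn.apply(1L);   // 0
    int upperBucket = bucketFn.apply(10L);  // 0
    int hiddenBucket = bucketFn.apply(5L);  // 1 -- only discoverable by reading the file

    // Both bounds agree on bucket 0, yet an interior record lands in bucket 1, so a
    // metrics-only check cannot safely pick one partition; AddFiles routes such a file
    // to the "errors" output instead of guessing.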