From 5d58750841d3a35a352fcf99d315f10626399e71 Mon Sep 17 00:00:00 2001 From: Rahil C <32500120+rahil-c@users.noreply.github.com> Date: Fri, 22 Mar 2024 17:58:14 -0400 Subject: [PATCH 01/25] Spec: Fix REST pagination requirements based on new feedback (#9917) Co-authored-by: Rahil Chertara --- open-api/rest-catalog-open-api.py | 6 +++--- open-api/rest-catalog-open-api.yaml | 26 ++++++++++++++++++++------ 2 files changed, 23 insertions(+), 9 deletions(-) diff --git a/open-api/rest-catalog-open-api.py b/open-api/rest-catalog-open-api.py index 77dcad9cfba6..7bd97b69885f 100644 --- a/open-api/rest-catalog-open-api.py +++ b/open-api/rest-catalog-open-api.py @@ -78,9 +78,9 @@ class Namespace(BaseModel): class PageToken(BaseModel): - __root__: str = Field( - ..., - description='An opaque token which allows clients to make use of pagination for a list API (e.g. ListTables). Clients will initiate the first paginated request by sending an empty `pageToken` e.g. `GET /tables?pageToken` or `GET /tables?pageToken=` signaling to the service that the response should be paginated.\nServers that support pagination will recognize `pageToken` and return a `next-page-token` in response if there are more results available. After the initial request, it is expected that the value of `next-page-token` from the last response is used in the subsequent request. Servers that do not support pagination will ignore `next-page-token` and return all results.', + __root__: Optional[str] = Field( + None, + description='An opaque token that allows clients to make use of pagination for list APIs (e.g. ListTables). Clients may initiate the first paginated request by sending an empty query parameter `pageToken` to the server.\nServers that support pagination should identify the `pageToken` parameter and return a `next-page-token` in the response if there are more results available. 
After the initial request, the value of `next-page-token` from each response must be used as the `pageToken` parameter value for the next request. The server must return `null` value for the `next-page-token` in the last response.\nServers that support pagination must return all results in a single response with the value of `next-page-token` set to `null` if the query parameter `pageToken` is not set in the request.\nServers that do not support pagination should ignore the `pageToken` parameter and return all results in a single response. The `next-page-token` must be omitted from the response.\nClients must interpret either `null` or missing response value of `next-page-token` as the end of the listing results.', ) diff --git a/open-api/rest-catalog-open-api.yaml b/open-api/rest-catalog-open-api.yaml index 77aabc834adb..161d5e0fcff8 100644 --- a/open-api/rest-catalog-open-api.yaml +++ b/open-api/rest-catalog-open-api.yaml @@ -1610,14 +1610,28 @@ components: PageToken: description: - An opaque token which allows clients to make use of pagination for a list API (e.g. ListTables). - Clients will initiate the first paginated request by sending an empty `pageToken` e.g. `GET /tables?pageToken` or `GET /tables?pageToken=` - signaling to the service that the response should be paginated. + An opaque token that allows clients to make use of pagination for list APIs + (e.g. ListTables). Clients may initiate the first paginated request by sending an empty + query parameter `pageToken` to the server. + + Servers that support pagination should identify the `pageToken` parameter and return a + `next-page-token` in the response if there are more results available. After the initial + request, the value of `next-page-token` from each response must be used as the `pageToken` + parameter value for the next request. The server must return `null` value for the + `next-page-token` in the last response. 
+ + Servers that support pagination must return all results in a single response with the value + of `next-page-token` set to `null` if the query parameter `pageToken` is not set in the + request. + + Servers that do not support pagination should ignore the `pageToken` parameter and return + all results in a single response. The `next-page-token` must be omitted from the response. + + Clients must interpret either `null` or missing response value of `next-page-token` as + the end of the listing results. - Servers that support pagination will recognize `pageToken` and return a `next-page-token` in response if there are more results available. - After the initial request, it is expected that the value of `next-page-token` from the last response is used in the subsequent request. - Servers that do not support pagination will ignore `next-page-token` and return all results. type: string + nullable: true TableIdentifier: type: object From 33838d5a4870e654829e551a1a2770ff82ac94ab Mon Sep 17 00:00:00 2001 From: Fokko Driesprong Date: Sat, 23 Mar 2024 11:37:10 +0100 Subject: [PATCH 02/25] docs: Add links checker (#9965) * docs: Add links checker * Comments * Fix broken paths * Fix moar links * Last few --- .github/workflows/docs-check-links.yml | 40 +++++++++++++++++++++++++ README.md | 7 ++--- docs/docs/configuration.md | 8 ++--- docs/docs/daft.md | 2 +- docs/docs/flink-actions.md | 2 +- docs/docs/flink-connector.md | 6 ++-- docs/docs/flink-ddl.md | 2 +- docs/docs/flink-queries.md | 2 +- docs/docs/flink-writes.md | 10 +++---- docs/docs/flink.md | 35 +++++++++++----------- docs/docs/spark-configuration.md | 4 +-- docs/docs/spark-ddl.md | 16 +++++----- docs/docs/spark-getting-started.md | 31 +++++++++---------- docs/docs/spark-procedures.md | 8 ++--- docs/docs/spark-queries.md | 4 +-- docs/docs/spark-structured-streaming.md | 10 +++---- docs/docs/spark-writes.md | 10 +++---- format/spec.md | 2 +- site/README.md | 2 +- site/docs/blogs.md | 11 ++++--- 
site/docs/how-to-release.md | 2 +- site/docs/multi-engine-support.md | 12 ++++++++ site/docs/releases.md | 7 +++-- site/docs/spark-quickstart.md | 1 + site/docs/vendors.md | 9 ++++-- site/link-checker-config.json | 23 ++++++++++++++ 26 files changed, 172 insertions(+), 94 deletions(-) create mode 100644 .github/workflows/docs-check-links.yml create mode 100644 site/link-checker-config.json diff --git a/.github/workflows/docs-check-links.yml b/.github/workflows/docs-check-links.yml new file mode 100644 index 000000000000..9de842813dc3 --- /dev/null +++ b/.github/workflows/docs-check-links.yml @@ -0,0 +1,40 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# + +name: Check Markdown docs links + +on: + push: + paths: + - docs/** + - site/** + branches: + - 'main' + pull_request: + workflow_dispatch: + +jobs: + markdown-link-check: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: gaurav-nelson/github-action-markdown-link-check@v1 + with: + config-file: 'site/link-checker-config.json' + use-verbose-mode: yes diff --git a/README.md b/README.md index 3ba4f74f871a..8d36b212b953 100644 --- a/README.md +++ b/README.md @@ -17,7 +17,7 @@ - under the License. 
--> -![Iceberg](https://iceberg.apache.org/docs/latest/img/Iceberg-logo.png) +![Iceberg](https://iceberg.apache.org/assets/images/Iceberg-logo.svg) [![](https://github.com/apache/iceberg/actions/workflows/java-ci.yml/badge.svg)](https://github.com/apache/iceberg/actions/workflows/java-ci.yml) [![Slack](https://img.shields.io/badge/chat-on%20Slack-brightgreen.svg)](https://apache-iceberg.slack.com/) @@ -37,11 +37,8 @@ The core Java library is located in this repository and is the reference impleme [Documentation][iceberg-docs] is available for all libraries and integrations. -Current work is tracked in the [roadmap][roadmap]. - [iceberg-docs]: https://iceberg.apache.org/docs/latest/ -[iceberg-spec]: https://iceberg.apache.org/spec -[roadmap]: https://iceberg.apache.org/roadmap/ +[iceberg-spec]: https://iceberg.apache.org/spec/ ## Collaboration diff --git a/docs/docs/configuration.md b/docs/docs/configuration.md index d5e33529c0dd..ec7af06ca7f9 100644 --- a/docs/docs/configuration.md +++ b/docs/docs/configuration.md @@ -108,9 +108,9 @@ Iceberg tables support table properties to configure table behavior, like the de Reserved table properties are only used to control behaviors when creating or updating a table. The value of these properties are not persisted as a part of the table metadata. -| Property | Default | Description | -| -------------- | -------- | ------------------------------------------------------------- | -| format-version | 2 | Table's format version (can be 1 or 2) as defined in the [Spec](../../../spec/#format-versioning). Defaults to 2 since version 1.4.0. | +| Property | Default | Description | +| -------------- | -------- |--------------------------------------------------------------------------------------------------------------------------------------| +| format-version | 2 | Table's format version (can be 1 or 2) as defined in the [Spec](../../spec.md#format-versioning). Defaults to 2 since version 1.4.0. 
| ### Compatibility flags @@ -131,7 +131,7 @@ Iceberg catalogs support using catalog properties to configure catalog behaviors | clients | 2 | client pool size | | cache-enabled | true | Whether to cache catalog entries | | cache.expiration-interval-ms | 30000 | How long catalog entries are locally cached, in milliseconds; 0 disables caching, negative values disable expiration | -| metrics-reporter-impl | org.apache.iceberg.metrics.LoggingMetricsReporter | Custom `MetricsReporter` implementation to use in a catalog. See the [Metrics reporting](../metrics-reporting.md) section for additional details | +| metrics-reporter-impl | org.apache.iceberg.metrics.LoggingMetricsReporter | Custom `MetricsReporter` implementation to use in a catalog. See the [Metrics reporting](metrics-reporting.md) section for additional details | `HadoopCatalog` and `HiveCatalog` can access the properties in their constructors. Any other custom catalog can access the properties by implementing `Catalog.initialize(catalogName, catalogProperties)`. diff --git a/docs/docs/daft.md b/docs/docs/daft.md index da78b7eb6ccd..71030e394957 100644 --- a/docs/docs/daft.md +++ b/docs/docs/daft.md @@ -20,7 +20,7 @@ title: "Daft" # Daft -[Daft](www.getdaft.io) is a distributed query engine written in Python and Rust, two fast-growing ecosystems in the data engineering and machine learning industry. +[Daft](https://www.getdaft.io/) is a distributed query engine written in Python and Rust, two fast-growing ecosystems in the data engineering and machine learning industry. It exposes its flavor of the familiar [Python DataFrame API](https://www.getdaft.io/projects/docs/en/latest/api_docs/dataframe.html) which is a common abstraction over querying tables of data in the Python data ecosystem. 
diff --git a/docs/docs/flink-actions.md b/docs/docs/flink-actions.md index 4e54732c3b1a..c058795fd079 100644 --- a/docs/docs/flink-actions.md +++ b/docs/docs/flink-actions.md @@ -20,7 +20,7 @@ title: "Flink Actions" ## Rewrite files action -Iceberg provides API to rewrite small files into large files by submitting Flink batch jobs. The behavior of this Flink action is the same as Spark's [rewriteDataFiles](../maintenance.md#compact-data-files). +Iceberg provides API to rewrite small files into large files by submitting Flink batch jobs. The behavior of this Flink action is the same as Spark's [rewriteDataFiles](maintenance.md#compact-data-files). ```java import org.apache.iceberg.flink.actions.Actions; diff --git a/docs/docs/flink-connector.md b/docs/docs/flink-connector.md index 260a5c581493..025e9aee92ea 100644 --- a/docs/docs/flink-connector.md +++ b/docs/docs/flink-connector.md @@ -29,13 +29,13 @@ To create the table in Flink SQL by using SQL syntax `CREATE TABLE test (..) WIT * `connector`: Use the constant `iceberg`. * `catalog-name`: User-specified catalog name. It's required because the connector don't have any default value. * `catalog-type`: `hive` or `hadoop` for built-in catalogs (defaults to `hive`), or left unset for custom catalog implementations using `catalog-impl`. -* `catalog-impl`: The fully-qualified class name of a custom catalog implementation. Must be set if `catalog-type` is unset. See also [custom catalog](../flink.md#adding-catalogs) for more details. +* `catalog-impl`: The fully-qualified class name of a custom catalog implementation. Must be set if `catalog-type` is unset. See also [custom catalog](flink.md#adding-catalogs) for more details. * `catalog-database`: The iceberg database name in the backend catalog, use the current flink database name by default. * `catalog-table`: The iceberg table name in the backend catalog. Default to use the table name in the flink `CREATE TABLE` sentence. ## Table managed in Hive catalog. 
-Before executing the following SQL, please make sure you've configured the Flink SQL client correctly according to the [quick start documentation](../flink.md). +Before executing the following SQL, please make sure you've configured the Flink SQL client correctly according to the [quick start documentation](flink.md). The following SQL will create a Flink table in the current Flink catalog, which maps to the iceberg table `default_database.flink_table` managed in iceberg catalog. @@ -138,4 +138,4 @@ SELECT * FROM flink_table; 3 rows in set ``` -For more details, please refer to the Iceberg [Flink documentation](../flink.md). +For more details, please refer to the Iceberg [Flink documentation](flink.md). diff --git a/docs/docs/flink-ddl.md b/docs/docs/flink-ddl.md index 681a018865aa..c2b3051fde8d 100644 --- a/docs/docs/flink-ddl.md +++ b/docs/docs/flink-ddl.md @@ -150,7 +150,7 @@ Table create commands support the commonly used [Flink create clauses](https://n * `PARTITION BY (column1, column2, ...)` to configure partitioning, Flink does not yet support hidden partitioning. * `COMMENT 'table document'` to set a table description. -* `WITH ('key'='value', ...)` to set [table configuration](../configuration.md) which will be stored in Iceberg table properties. +* `WITH ('key'='value', ...)` to set [table configuration](configuration.md) which will be stored in Iceberg table properties. Currently, it does not support computed column and watermark definition etc. diff --git a/docs/docs/flink-queries.md b/docs/docs/flink-queries.md index 036d95a4953a..431a5554f248 100644 --- a/docs/docs/flink-queries.md +++ b/docs/docs/flink-queries.md @@ -75,7 +75,7 @@ SET table.exec.iceberg.use-flip27-source = true; ### Reading branches and tags with SQL Branch and tags can be read via SQL by specifying options. 
For more details -refer to [Flink Configuration](../flink-configuration.md#read-options) +refer to [Flink Configuration](flink-configuration.md#read-options) ```sql --- Read from branch b1 diff --git a/docs/docs/flink-writes.md b/docs/docs/flink-writes.md index c41b367deaed..ef1e602c8212 100644 --- a/docs/docs/flink-writes.md +++ b/docs/docs/flink-writes.md @@ -67,7 +67,7 @@ Iceberg supports `UPSERT` based on the primary key when writing data into v2 tab ) with ('format-version'='2', 'write.upsert.enabled'='true'); ``` -2. Enabling `UPSERT` mode using `upsert-enabled` in the [write options](#write-options) provides more flexibility than a table level config. Note that you still need to use v2 table format and specify the [primary key](../flink-ddl.md/#primary-key) or [identifier fields](../../spec.md#identifier-field-ids) when creating the table. +2. Enabling `UPSERT` mode using `upsert-enabled` in the [write options](#write-options) provides more flexibility than a table level config. Note that you still need to use v2 table format and specify the [primary key](flink-ddl.md/#primary-key) or [identifier fields](../../spec.md#identifier-field-ids) when creating the table. ```sql INSERT INTO tableName /*+ OPTIONS('upsert-enabled'='true') */ @@ -185,7 +185,7 @@ FlinkSink.builderFor( ### Branch Writes Writing to branches in Iceberg tables is also supported via the `toBranch` API in `FlinkSink` -For more information on branches please refer to [branches](../branching.md). +For more information on branches please refer to [branches](branching.md). ```java FlinkSink.forRowData(input) .tableLoader(tableLoader) @@ -262,13 +262,13 @@ INSERT INTO tableName /*+ OPTIONS('upsert-enabled'='true') */ ... 
``` -Check out all the options here: [write-options](../flink-configuration.md#write-options) +Check out all the options here: [write-options](flink-configuration.md#write-options) ## Notes Flink streaming write jobs rely on snapshot summary to keep the last committed checkpoint ID, and -store uncommitted data as temporary files. Therefore, [expiring snapshots](../maintenance.md#expire-snapshots) -and [deleting orphan files](../maintenance.md#delete-orphan-files) could possibly corrupt +store uncommitted data as temporary files. Therefore, [expiring snapshots](maintenance.md#expire-snapshots) +and [deleting orphan files](maintenance.md#delete-orphan-files) could possibly corrupt the state of the Flink job. To avoid that, make sure to keep the last snapshot created by the Flink job (which can be identified by the `flink.job-id` property in the summary), and only delete orphan files that are old enough. diff --git a/docs/docs/flink.md b/docs/docs/flink.md index 7f27a280eb91..b8ab694ad9bc 100644 --- a/docs/docs/flink.md +++ b/docs/docs/flink.md @@ -22,22 +22,22 @@ title: "Flink Getting Started" Apache Iceberg supports both [Apache Flink](https://flink.apache.org/)'s DataStream API and Table API. See the [Multi-Engine Support](../../multi-engine-support.md#apache-flink) page for the integration of Apache Flink. 
-| Feature support | Flink | Notes | -| ----------------------------------------------------------- |-------|----------------------------------------------------------------------------------------| -| [SQL create catalog](../flink-ddl.md#create-catalog) | ✔️ | | -| [SQL create database](../flink-ddl.md#create-database) | ✔️ | | -| [SQL create table](../flink-ddl.md#create-table) | ✔️ | | -| [SQL create table like](../flink-ddl.md#create-table-like) | ✔️ | | -| [SQL alter table](../flink-ddl.md#alter-table) | ✔️ | Only support altering table properties, column and partition changes are not supported | -| [SQL drop_table](../flink-ddl.md#drop-table) | ✔️ | | -| [SQL select](../flink-queries.md#reading-with-sql) | ✔️ | Support both streaming and batch mode | -| [SQL insert into](../flink-writes.md#insert-into) | ✔️ ️ | Support both streaming and batch mode | -| [SQL insert overwrite](../flink-writes.md#insert-overwrite) | ✔️ ️ | | -| [DataStream read](../flink-queries.md#reading-with-datastream) | ✔️ ️ | | -| [DataStream append](../flink-writes.md#appending-data) | ✔️ ️ | | -| [DataStream overwrite](../flink-writes.md#overwrite-data) | ✔️ ️ | | -| [Metadata tables](../flink-queries.md#inspecting-tables) | ✔️ | | -| [Rewrite files action](../flink-actions.md#rewrite-files-action) | ✔️ ️ | | +| Feature support | Flink | Notes | +| -------------------------------------------------------- |-------|----------------------------------------------------------------------------------------| +| [SQL create catalog](flink-ddl.md#create-catalog) | ✔️ | | +| [SQL create database](flink-ddl.md#create-database) | ✔️ | | +| [SQL create table](flink-ddl.md#create-table) | ✔️ | | +| [SQL create table like](flink-ddl.md#create-table-like) | ✔️ | | +| [SQL alter table](flink-ddl.md#alter-table) | ✔️ | Only support altering table properties, column and partition changes are not supported | +| [SQL drop_table](flink-ddl.md#drop-table) | ✔️ | | +| [SQL 
select](flink-queries.md#reading-with-sql) | ✔️ | Support both streaming and batch mode | +| [SQL insert into](flink-writes.md#insert-into) | ✔️ ️ | Support both streaming and batch mode | +| [SQL insert overwrite](flink-writes.md#insert-overwrite) | ✔️ ️ | | +| [DataStream read](flink-queries.md#reading-with-datastream) | ✔️ ️ | | +| [DataStream append](flink-writes.md#appending-data) | ✔️ ️ | | +| [DataStream overwrite](flink-writes.md#overwrite-data) | ✔️ ️ | | +| [Metadata tables](flink-queries.md#inspecting-tables) | ✔️ | | +| [Rewrite files action](flink-actions.md#rewrite-files-action) | ✔️ ️ | | ## Preparation when using Flink SQL Client @@ -69,6 +69,7 @@ export HADOOP_CLASSPATH=`$HADOOP_HOME/bin/hadoop classpath` ./bin/start-cluster.sh ``` + Start the Flink SQL client. There is a separate `flink-runtime` module in the Iceberg project to generate a bundled jar, which could be loaded by Flink SQL client directly. To build the `flink-runtime` bundled jar manually, build the `iceberg` project, and it will generate the jar under `/flink-runtime/build/libs`. Or download the `flink-runtime` jar from the [Apache repository](https://repo.maven.apache.org/maven2/org/apache/iceberg/iceberg-flink-runtime-1.16/{{ icebergVersion }}/). ```bash @@ -271,7 +272,7 @@ env.execute("Test Iceberg DataStream"); ### Branch Writes Writing to branches in Iceberg tables is also supported via the `toBranch` API in `FlinkSink` -For more information on branches please refer to [branches](../branching.md). +For more information on branches please refer to [branches](branching.md). ```java FlinkSink.forRowData(input) .tableLoader(tableLoader) diff --git a/docs/docs/spark-configuration.md b/docs/docs/spark-configuration.md index 5e9c6e5d1147..6ac4f1e9c82a 100644 --- a/docs/docs/spark-configuration.md +++ b/docs/docs/spark-configuration.md @@ -78,7 +78,7 @@ Both catalogs are configured using properties nested under the catalog name. 
Com | spark.sql.catalog._catalog-name_.table-default._propertyKey_ | | Default Iceberg table property value for property key _propertyKey_, which will be set on tables created by this catalog if not overridden | | spark.sql.catalog._catalog-name_.table-override._propertyKey_ | | Enforced Iceberg table property value for property key _propertyKey_, which cannot be overridden by user | -Additional properties can be found in common [catalog configuration](../configuration.md#catalog-properties). +Additional properties can be found in common [catalog configuration](configuration.md#catalog-properties). ### Using catalogs @@ -185,7 +185,7 @@ df.write | fanout-enabled | false | Overrides this table's write.spark.fanout.enabled | | check-ordering | true | Checks if input schema and table schema are same | | isolation-level | null | Desired isolation level for Dataframe overwrite operations. `null` => no checks (for idempotent writes), `serializable` => check for concurrent inserts or deletes in destination partitions, `snapshot` => checks for concurrent deletes in destination partitions. | -| validate-from-snapshot-id | null | If isolation level is set, id of base snapshot from which to check concurrent write conflicts into a table. Should be the snapshot before any reads from the table. Can be obtained via [Table API](../api.md#table-metadata) or [Snapshots table](../spark-queries.md#snapshots). If null, the table's oldest known snapshot is used. | +| validate-from-snapshot-id | null | If isolation level is set, id of base snapshot from which to check concurrent write conflicts into a table. Should be the snapshot before any reads from the table. Can be obtained via [Table API](api.md#table-metadata) or [Snapshots table](spark-queries.md#snapshots). If null, the table's oldest known snapshot is used. 
| | compression-codec | Table write.(fileformat).compression-codec | Overrides this table's compression codec for this write | | compression-level | Table write.(fileformat).compression-level | Overrides this table's compression level for Parquet and Avro tables for this write | | compression-strategy | Table write.orc.compression-strategy | Overrides this table's compression strategy for ORC tables for this write | diff --git a/docs/docs/spark-ddl.md b/docs/docs/spark-ddl.md index e1376ddcf667..8b30710997c8 100644 --- a/docs/docs/spark-ddl.md +++ b/docs/docs/spark-ddl.md @@ -33,14 +33,14 @@ CREATE TABLE prod.db.sample ( USING iceberg; ``` -Iceberg will convert the column type in Spark to corresponding Iceberg type. Please check the section of [type compatibility on creating table](../spark-getting-started.md#spark-type-to-iceberg-type) for details. +Iceberg will convert the column type in Spark to corresponding Iceberg type. Please check the section of [type compatibility on creating table](spark-getting-started.md#spark-type-to-iceberg-type) for details. Table create commands, including CTAS and RTAS, support the full range of Spark create clauses, including: * `PARTITIONED BY (partition-expressions)` to configure partitioning * `LOCATION '(fully-qualified-uri)'` to set the table location * `COMMENT 'table documentation'` to set a table description -* `TBLPROPERTIES ('key'='value', ...)` to set [table configuration](../configuration.md) +* `TBLPROPERTIES ('key'='value', ...)` to set [table configuration](configuration.md) Create commands may also set the default format with the `USING` clause. This is only supported for `SparkCatalog` because Spark handles the `USING` clause differently for the built-in catalog. @@ -59,7 +59,7 @@ USING iceberg PARTITIONED BY (category); ``` -The `PARTITIONED BY` clause supports transform expressions to create [hidden partitions](../partitioning.md). 
+The `PARTITIONED BY` clause supports transform expressions to create [hidden partitions](partitioning.md). ```sql CREATE TABLE prod.db.sample ( @@ -86,7 +86,7 @@ Note: Old syntax of `years(ts)`, `months(ts)`, `days(ts)` and `hours(ts)` are al ## `CREATE TABLE ... AS SELECT` -Iceberg supports CTAS as an atomic operation when using a [`SparkCatalog`](../spark-configuration.md#catalog-configuration). CTAS is supported, but is not atomic when using [`SparkSessionCatalog`](../spark-configuration.md#replacing-the-session-catalog). +Iceberg supports CTAS as an atomic operation when using a [`SparkCatalog`](spark-configuration.md#catalog-configuration). CTAS is supported, but is not atomic when using [`SparkSessionCatalog`](spark-configuration.md#replacing-the-session-catalog). ```sql CREATE TABLE prod.db.sample @@ -106,7 +106,7 @@ AS SELECT ... ## `REPLACE TABLE ... AS SELECT` -Iceberg supports RTAS as an atomic operation when using a [`SparkCatalog`](../spark-configuration.md#catalog-configuration). RTAS is supported, but is not atomic when using [`SparkSessionCatalog`](../spark-configuration.md#replacing-the-session-catalog). +Iceberg supports RTAS as an atomic operation when using a [`SparkCatalog`](spark-configuration.md#catalog-configuration). RTAS is supported, but is not atomic when using [`SparkSessionCatalog`](spark-configuration.md#replacing-the-session-catalog). Atomic table replacement creates a new snapshot with the results of the `SELECT` query, but keeps table history. 
@@ -168,7 +168,7 @@ Iceberg has full `ALTER TABLE` support in Spark 3, including: * Widening the type of `int`, `float`, and `decimal` fields * Making required columns optional -In addition, [SQL extensions](../spark-configuration.md#sql-extensions) can be used to add support for partition evolution and setting a table's write order +In addition, [SQL extensions](spark-configuration.md#sql-extensions) can be used to add support for partition evolution and setting a table's write order ### `ALTER TABLE ... RENAME TO` @@ -184,7 +184,7 @@ ALTER TABLE prod.db.sample SET TBLPROPERTIES ( ); ``` -Iceberg uses table properties to control table behavior. For a list of available properties, see [Table configuration](../configuration.md). +Iceberg uses table properties to control table behavior. For a list of available properties, see [Table configuration](configuration.md). `UNSET` is used to remove properties: @@ -325,7 +325,7 @@ ALTER TABLE prod.db.sample DROP COLUMN point.z; ## `ALTER TABLE` SQL extensions -These commands are available in Spark 3 when using Iceberg [SQL extensions](../spark-configuration.md#sql-extensions). +These commands are available in Spark 3 when using Iceberg [SQL extensions](spark-configuration.md#sql-extensions). ### `ALTER TABLE ... ADD PARTITION FIELD` diff --git a/docs/docs/spark-getting-started.md b/docs/docs/spark-getting-started.md index 72642cc6e14f..2bcdbd23eb1e 100644 --- a/docs/docs/spark-getting-started.md +++ b/docs/docs/spark-getting-started.md @@ -35,12 +35,13 @@ spark-shell --packages org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:{{ iceb ``` !!! info + If you want to include Iceberg in your Spark installation, add the [`iceberg-spark-runtime-3.5_2.12` Jar](https://search.maven.org/remotecontent?filepath=org/apache/iceberg/iceberg-spark-runtime-3.5_2.12/{{ icebergVersion }}/iceberg-spark-runtime-3.5_2.12-{{ icebergVersion }}.jar) to Spark's `jars` folder. 
### Adding catalogs -Iceberg comes with [catalogs](../spark-configuration.md#catalogs) that enable SQL commands to manage tables and load them by name. Catalogs are configured using properties under `spark.sql.catalog.(catalog_name)`. +Iceberg comes with [catalogs](spark-configuration.md#catalogs) that enable SQL commands to manage tables and load them by name. Catalogs are configured using properties under `spark.sql.catalog.(catalog_name)`. This command creates a path-based catalog named `local` for tables under `$PWD/warehouse` and adds support for Iceberg tables to Spark's built-in catalog: @@ -56,7 +57,7 @@ spark-sql --packages org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:{{ iceber ### Creating a table -To create your first Iceberg table in Spark, use the `spark-sql` shell or `spark.sql(...)` to run a [`CREATE TABLE`](../spark-ddl.md#create-table) command: +To create your first Iceberg table in Spark, use the `spark-sql` shell or `spark.sql(...)` to run a [`CREATE TABLE`](spark-ddl.md#create-table) command: ```sql -- local is the path-based catalog defined above @@ -65,21 +66,21 @@ CREATE TABLE local.db.table (id bigint, data string) USING iceberg; Iceberg catalogs support the full range of SQL DDL commands, including: -* [`CREATE TABLE ... PARTITIONED BY`](../spark-ddl.md#create-table) -* [`CREATE TABLE ... AS SELECT`](../spark-ddl.md#create-table-as-select) -* [`ALTER TABLE`](../spark-ddl.md#alter-table) -* [`DROP TABLE`](../spark-ddl.md#drop-table) +* [`CREATE TABLE ... PARTITIONED BY`](spark-ddl.md#create-table) +* [`CREATE TABLE ... 
AS SELECT`](spark-ddl.md#create-table-as-select) +* [`ALTER TABLE`](spark-ddl.md#alter-table) +* [`DROP TABLE`](spark-ddl.md#drop-table) ### Writing -Once your table is created, insert data using [`INSERT INTO`](../spark-writes.md#insert-into): +Once your table is created, insert data using [`INSERT INTO`](spark-writes.md#insert-into): ```sql INSERT INTO local.db.table VALUES (1, 'a'), (2, 'b'), (3, 'c'); INSERT INTO local.db.table SELECT id, data FROM source WHERE length(data) = 1; ``` -Iceberg also adds row-level SQL updates to Spark, [`MERGE INTO`](../spark-writes.md#merge-into) and [`DELETE FROM`](../spark-writes.md#delete-from): +Iceberg also adds row-level SQL updates to Spark, [`MERGE INTO`](spark-writes.md#merge-into) and [`DELETE FROM`](spark-writes.md#delete-from): ```sql MERGE INTO local.db.target t USING (SELECT * FROM updates) u ON t.id = u.id @@ -87,7 +88,7 @@ WHEN MATCHED THEN UPDATE SET t.count = t.count + u.count WHEN NOT MATCHED THEN INSERT *; ``` -Iceberg supports writing DataFrames using the new [v2 DataFrame write API](../spark-writes.md#writing-with-dataframes): +Iceberg supports writing DataFrames using the new [v2 DataFrame write API](spark-writes.md#writing-with-dataframes): ```scala spark.table("source").select("id", "data") @@ -106,7 +107,7 @@ FROM local.db.table GROUP BY data; ``` -SQL is also the recommended way to [inspect tables](../spark-queries.md#inspecting-tables). To view all snapshots in a table, use the `snapshots` metadata table: +SQL is also the recommended way to [inspect tables](spark-queries.md#inspecting-tables). 
To view all snapshots in a table, use the `snapshots` metadata table: ```sql SELECT * FROM local.db.table.snapshots; ``` @@ -121,7 +122,7 @@ SELECT * FROM local.db.table.snapshots; +-------------------------+----------------+-----------+-----------+----------------------------------------------------+-----+ ``` -[DataFrame reads](../spark-queries.md#querying-with-dataframes) are supported and can now reference tables by name using `spark.table`: +[DataFrame reads](spark-queries.md#querying-with-dataframes) are supported and can now reference tables by name using `spark.table`: ```scala val df = spark.table("local.db.table") @@ -192,7 +193,7 @@ This type conversion table describes how Iceberg types are converted to the Spar Next, you can learn more about Iceberg tables in Spark: -* [DDL commands](../spark-ddl.md): `CREATE`, `ALTER`, and `DROP` -* [Querying data](../spark-queries.md): `SELECT` queries and metadata tables -* [Writing data](../spark-writes.md): `INSERT INTO` and `MERGE INTO` -* [Maintaining tables](../spark-procedures.md) with stored procedures +* [DDL commands](spark-ddl.md): `CREATE`, `ALTER`, and `DROP` +* [Querying data](spark-queries.md): `SELECT` queries and metadata tables +* [Writing data](spark-writes.md): `INSERT INTO` and `MERGE INTO` +* [Maintaining tables](spark-procedures.md) with stored procedures diff --git a/docs/docs/spark-procedures.md b/docs/docs/spark-procedures.md index 7dc0d1a2aab7..dc439c04c855 100644 --- a/docs/docs/spark-procedures.md +++ b/docs/docs/spark-procedures.md @@ -20,7 +20,7 @@ title: "Procedures" # Spark Procedures -To use Iceberg in Spark, first configure [Spark catalogs](../spark-configuration.md). Stored procedures are only available when using [Iceberg SQL extensions](../spark-configuration.md#sql-extensions) in Spark 3. +To use Iceberg in Spark, first configure [Spark catalogs](spark-configuration.md). 
Stored procedures are only available when using [Iceberg SQL extensions](spark-configuration.md#sql-extensions) in Spark 3. ## Usage @@ -272,7 +272,7 @@ the `expire_snapshots` procedure will never remove files which are still require | `stream_results` | | boolean | When true, deletion files will be sent to Spark driver by RDD partition (by default, all the files will be sent to Spark driver). This option is recommended to set to `true` to prevent Spark driver OOM from large file size | | `snapshot_ids` | | array of long | Array of snapshot IDs to expire. | -If `older_than` and `retain_last` are omitted, the table's [expiration properties](../configuration.md#table-behavior-properties) will be used. +If `older_than` and `retain_last` are omitted, the table's [expiration properties](configuration.md#table-behavior-properties) will be used. Snapshots that are still referenced by branches or tags won't be removed. By default, branches and tags never expire, but their retention policy can be changed with the table property `history.expire.max-ref-age-ms`. The `main` branch never expires. #### Output @@ -357,7 +357,7 @@ Iceberg can compact data files in parallel using Spark with the `rewriteDataFile | `partial-progress.max-commits` | 10 | Maximum amount of commits that this rewrite is allowed to produce if partial progress is enabled | | `use-starting-sequence-number` | true | Use the sequence number of the snapshot at compaction start time instead of that of the newly produced snapshot | | `rewrite-job-order` | none | Force the rewrite job order based on the value.
  • If rewrite-job-order=bytes-asc, then rewrite the smallest job groups first.
  • If rewrite-job-order=bytes-desc, then rewrite the largest job groups first.
  • If rewrite-job-order=files-asc, then rewrite the job groups with the least files first.
  • If rewrite-job-order=files-desc, then rewrite the job groups with the most files first.
  • If rewrite-job-order=none, then rewrite job groups in the order they were planned (no specific ordering).
| -| `target-file-size-bytes` | 536870912 (512 MB, default value of `write.target-file-size-bytes` from [table properties](../configuration.md#write-properties)) | Target output file size | +| `target-file-size-bytes` | 536870912 (512 MB, default value of `write.target-file-size-bytes` from [table properties](configuration.md#write-properties)) | Target output file size | | `min-file-size-bytes` | 75% of target file size | Files under this threshold will be considered for rewriting regardless of any other criteria | | `max-file-size-bytes` | 180% of target file size | Files with sizes above this threshold will be considered for rewriting regardless of any other criteria | | `min-input-files` | 5 | Any file group exceeding this number of files will be rewritten regardless of other criteria | @@ -480,7 +480,7 @@ Dangling deletes are always filtered out during rewriting. | `partial-progress.enabled` | false | Enable committing groups of files prior to the entire rewrite completing | | `partial-progress.max-commits` | 10 | Maximum amount of commits that this rewrite is allowed to produce if partial progress is enabled | | `rewrite-job-order` | none | Force the rewrite job order based on the value.
  • If rewrite-job-order=bytes-asc, then rewrite the smallest job groups first.
  • If rewrite-job-order=bytes-desc, then rewrite the largest job groups first.
  • If rewrite-job-order=files-asc, then rewrite the job groups with the least files first.
  • If rewrite-job-order=files-desc, then rewrite the job groups with the most files first.
  • If rewrite-job-order=none, then rewrite job groups in the order they were planned (no specific ordering).
| -| `target-file-size-bytes` | 67108864 (64MB, default value of `write.delete.target-file-size-bytes` from [table properties](../configuration.md#write-properties)) | Target output file size | +| `target-file-size-bytes` | 67108864 (64MB, default value of `write.delete.target-file-size-bytes` from [table properties](configuration.md#write-properties)) | Target output file size | | `min-file-size-bytes` | 75% of target file size | Files under this threshold will be considered for rewriting regardless of any other criteria | | `max-file-size-bytes` | 180% of target file size | Files with sizes above this threshold will be considered for rewriting regardless of any other criteria | | `min-input-files` | 5 | Any file group exceeding this number of files will be rewritten regardless of other criteria | diff --git a/docs/docs/spark-queries.md b/docs/docs/spark-queries.md index 092ed6b1d636..536c136d7e55 100644 --- a/docs/docs/spark-queries.md +++ b/docs/docs/spark-queries.md @@ -20,11 +20,11 @@ title: "Queries" # Spark Queries -To use Iceberg in Spark, first configure [Spark catalogs](../spark-configuration.md). Iceberg uses Apache Spark's DataSourceV2 API for data source and catalog implementations. +To use Iceberg in Spark, first configure [Spark catalogs](spark-configuration.md). Iceberg uses Apache Spark's DataSourceV2 API for data source and catalog implementations. ## Querying with SQL -In Spark 3, tables use identifiers that include a [catalog name](../spark-configuration.md#using-catalogs). +In Spark 3, tables use identifiers that include a [catalog name](spark-configuration.md#using-catalogs). 
```sql SELECT * FROM prod.db.table; -- catalog: prod, namespace: db, table: table diff --git a/docs/docs/spark-structured-streaming.md b/docs/docs/spark-structured-streaming.md index 50799042073f..0ac753808d9e 100644 --- a/docs/docs/spark-structured-streaming.md +++ b/docs/docs/spark-structured-streaming.md @@ -68,7 +68,7 @@ Iceberg supports `append` and `complete` output modes: * `append`: appends the rows of every micro-batch to the table * `complete`: replaces the table contents every micro-batch -Prior to starting the streaming query, ensure you created the table. Refer to the [SQL create table](../spark-ddl.md#create-table) documentation to learn how to create the Iceberg table. +Prior to starting the streaming query, ensure you created the table. Refer to the [SQL create table](spark-ddl.md#create-table) documentation to learn how to create the Iceberg table. Iceberg doesn't support experimental [continuous processing](https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html#continuous-processing), as it doesn't provide the interface to "commit" the output. @@ -76,7 +76,7 @@ Iceberg doesn't support experimental [continuous processing](https://spark.apach Iceberg requires sorting data by partition per task prior to writing the data. In Spark tasks are split by Spark partition. against partitioned table. For batch queries you're encouraged to do explicit sort to fulfill the requirement -(see [here](../spark-writes.md#writing-distribution-modes)), but the approach would bring additional latency as +(see [here](spark-writes.md#writing-distribution-modes)), but the approach would bring additional latency as repartition and sort are considered as heavy operations for streaming workload. To avoid additional latency, you can enable fanout writer to eliminate the requirement. @@ -107,13 +107,13 @@ documents how to configure the interval. ### Expire old snapshots -Each batch written to a table produces a new snapshot. 
Iceberg tracks snapshots in table metadata until they are expired. Snapshots accumulate quickly with frequent commits, so it is highly recommended that tables written by streaming queries are [regularly maintained](../maintenance.md#expire-snapshots). [Snapshot expiration](../spark-procedures.md#expire_snapshots) is the procedure of removing the metadata and any data files that are no longer needed. By default, the procedure will expire the snapshots older than five days. +Each batch written to a table produces a new snapshot. Iceberg tracks snapshots in table metadata until they are expired. Snapshots accumulate quickly with frequent commits, so it is highly recommended that tables written by streaming queries are [regularly maintained](maintenance.md#expire-snapshots). [Snapshot expiration](spark-procedures.md#expire_snapshots) is the procedure of removing the metadata and any data files that are no longer needed. By default, the procedure will expire the snapshots older than five days. ### Compacting data files -The amount of data written from a streaming process is typically small, which can cause the table metadata to track lots of small files. [Compacting small files into larger files](../maintenance.md#compact-data-files) reduces the metadata needed by the table, and increases query efficiency. Iceberg and Spark [comes with the `rewrite_data_files` procedure](../spark-procedures.md#rewrite_data_files). +The amount of data written from a streaming process is typically small, which can cause the table metadata to track lots of small files. [Compacting small files into larger files](maintenance.md#compact-data-files) reduces the metadata needed by the table, and increases query efficiency. Iceberg and Spark [come with the `rewrite_data_files` procedure](spark-procedures.md#rewrite_data_files). 
### Rewrite manifests To optimize write latency on a streaming workload, Iceberg can write the new snapshot with a "fast" append that does not automatically compact manifests. -This could lead lots of small manifest files. Iceberg can [rewrite the number of manifest files to improve query performance](../maintenance.md#rewrite-manifests). Iceberg and Spark [come with the `rewrite_manifests` procedure](../spark-procedures.md#rewrite_manifests). +This could lead to lots of small manifest files. Iceberg can [rewrite the number of manifest files to improve query performance](maintenance.md#rewrite-manifests). Iceberg and Spark [come with the `rewrite_manifests` procedure](spark-procedures.md#rewrite_manifests). diff --git a/docs/docs/spark-writes.md b/docs/docs/spark-writes.md index efc15e7e35fc..626dee6c96e6 100644 --- a/docs/docs/spark-writes.md +++ b/docs/docs/spark-writes.md @@ -20,9 +20,9 @@ title: "Writes" # Spark Writes -To use Iceberg in Spark, first configure [Spark catalogs](../spark-configuration.md). +To use Iceberg in Spark, first configure [Spark catalogs](spark-configuration.md). -Some plans are only available when using [Iceberg SQL extensions](../spark-configuration.md#sql-extensions) in Spark 3. +Some plans are only available when using [Iceberg SQL extensions](spark-configuration.md#sql-extensions) in Spark 3. Iceberg uses Apache Spark's DataSourceV2 API for data source and catalog implementations. Spark DSv2 is an evolving API with different levels of support in Spark versions: @@ -200,7 +200,7 @@ Branch writes can also be performed as part of a write-audit-publish (WAP) workf Note WAP branch and branch identifier cannot both be specified. Also, the branch must exist before performing the write. The operation does **not** create the branch if it does not exist. -For more information on branches please refer to [branches](../branching.md). +For more information on branches please refer to [branches](branching.md). 
```sql -- INSERT (1,' a') (2, 'b') into the audit branch. @@ -364,7 +364,7 @@ There are 3 options for `write.distribution-mode` This mode does not request any shuffles or sort to be performed automatically by Spark. Because no work is done automatically by Spark, the data must be *manually* sorted by partition value. The data must be sorted either within each spark task, or globally within the entire dataset. A global sort will minimize the number of output files. -A sort can be avoided by using the Spark [write fanout](../spark-configuration.md#write-options) property but this will cause all +A sort can be avoided by using the Spark [write fanout](spark-configuration.md#write-options) property but this will cause all file handles to remain open until each write task has completed. * `hash` - This mode is the new default and requests that Spark uses a hash-based exchange to shuffle the incoming write data before writing. @@ -385,7 +385,7 @@ sort-order. Further division and coalescing of tasks may take place because of When writing data to Iceberg with Spark, it's important to note that Spark cannot write a file larger than a Spark task and a file cannot span an Iceberg partition boundary. This means although Iceberg will always roll over a file -when it grows to [`write.target-file-size-bytes`](../configuration.md#write-properties), but unless the Spark task is +when it grows to [`write.target-file-size-bytes`](configuration.md#write-properties), but unless the Spark task is large enough that will not happen. The size of the file created on disk will also be much smaller than the Spark task since the on disk data will be both compressed and in columnar format as opposed to Spark's uncompressed row representation. 
This means a 100 megabyte Spark task will create a file much smaller than 100 megabytes even if that diff --git a/format/spec.md b/format/spec.md index 397057a97456..ab6f3494830c 100644 --- a/format/spec.md +++ b/format/spec.md @@ -57,7 +57,7 @@ In addition to row-level deletes, version 2 makes some requirements stricter for ## Overview -![Iceberg snapshot structure](assets/images/iceberg-metadata.png) +![Iceberg snapshot structure](https://iceberg.apache.org/assets/images/iceberg-metadata.png) This table format tracks individual data files in a table instead of directories. This allows writers to create data files in-place and only adds files to the table in an explicit commit. diff --git a/site/README.md b/site/README.md index 6cb4f4907ebf..b1f9310e8bf8 100644 --- a/site/README.md +++ b/site/README.md @@ -74,7 +74,6 @@ The docs are built, run, and released using [make](https://www.gnu.org/software/ > [clean](dev/clean.sh): Clean the local site. > [deploy](dev/deploy.sh): Clean, build, and deploy the Iceberg docs site. > help: Show help for each of the Makefile recipes. -> [release](dev/release.sh): Release the current `/docs` as `ICEBERG_VERSION` (`make release ICEBERG_VERSION=`). > [serve](dev/serve.sh): Clean, build, and run the site locally. To scaffold the versioned docs and build the project, run the `build` recipe. @@ -103,6 +102,7 @@ This step will generate the staged source code which blends into the original so └─.asf.yaml ``` + To run this, run the `serve` recipe, which runs the `build` recipe and calls `mkdocs serve`. This will run locally at . ``` make serve diff --git a/site/docs/blogs.md b/site/docs/blogs.md index 746eef97d663..1714ce50405d 100644 --- a/site/docs/blogs.md +++ b/site/docs/blogs.md @@ -22,6 +22,7 @@ title: "Blogs" Here is a list of company blogs that talk about Iceberg. The blogs are ordered from most recent to oldest. 
+ ### [The Apache Iceberg Lakehouse: The Great Data Equalizer](https://amdatalakehouse.substack.com/p/the-apache-iceberg-lakehouse-the) **Date**: March 6th, 2024, **Company**: Dremio @@ -42,6 +43,7 @@ Here is a list of company blogs that talk about Iceberg. The blogs are ordered f **Author**: [Alex Merced](https://www.linkedin.com/in/alexmerced/) + ### [What is the Data Lakehouse and the Role of Apache Iceberg, Nessie and Dremio?](https://amdatalakehouse.substack.com/p/the-apache-iceberg-lakehouse-the) **Date**: February 21st, 2024, **Company**: Dremio @@ -147,6 +149,7 @@ Here is a list of company blogs that talk about Iceberg. The blogs are ordered f **Author**: [Dipankar Mazumdar](https://www.linkedin.com/in/dipankar-mazumdar/) + ### [Iceberg Tables: Catalog Support Now Available](https://www.snowflake.com/blog/iceberg-tables-catalog-support-available-now/) **Date**: March 29th, 2023, **Company**: Snowflake @@ -362,6 +365,7 @@ Here is a list of company blogs that talk about Iceberg. The blogs are ordered f **Author**: [Sam Redai](https://www.linkedin.com/in/sredai/), [Kyle Bendickson](https://www.linkedin.com/in/kylebendickson/) + ### [Expanding the Data Cloud with Apache Iceberg](https://www.snowflake.com/blog/expanding-the-data-cloud-with-apache-iceberg/) **Date**: January 21st, 2022, **Company**: Snowflake @@ -377,11 +381,6 @@ Here is a list of company blogs that talk about Iceberg. 
The blogs are ordered f **Author**: [Sam Redai](https://www.linkedin.com/in/sredai/) -### [Using Flink CDC to synchronize data from MySQL sharding tables and build real-time data lake](https://ververica.github.io/flink-cdc-connectors/master/content/quickstart/build-real-time-data-lake-tutorial.html) -**Date**: November 11th, 2021, **Company**: Ververica, Alibaba Cloud - -**Author**: [Yuxia Luo](https://github.com/luoyuxia), [Jark Wu](https://github.com/wuchong), [Zheng Hu](https://www.linkedin.com/in/zheng-hu-37017683/) - ### [Metadata Indexing in Iceberg](https://tabular.io/blog/iceberg-metadata-indexing/) **Date**: October 10th, 2021, **Company**: Tabular @@ -450,7 +449,7 @@ Here is a list of company blogs that talk about Iceberg. The blogs are ordered f ### [High Throughput Ingestion with Iceberg](https://medium.com/adobetech/high-throughput-ingestion-with-iceberg-ccf7877a413f) **Date**: Dec 22nd, 2020, **Company**: Adobe -**Author**: [Andrei Ionescu](http://linkedin.com/in/andreiionescu), [Shone Sadler](https://www.linkedin.com/in/shonesadler/), [Anil Malkani](https://www.linkedin.com/in/anil-malkani-52861a/) +**Author**: [Andrei Ionescu](https://www.linkedin.com/in/andreiionescu), [Shone Sadler](https://www.linkedin.com/in/shonesadler/), [Anil Malkani](https://www.linkedin.com/in/anil-malkani-52861a/) ### [Optimizing data warehouse storage](https://netflixtechblog.com/optimizing-data-warehouse-storage-7b94a48fdcbe) **Date**: Dec 21st, 2020, **Company**: Netflix diff --git a/site/docs/how-to-release.md b/site/docs/how-to-release.md index de3bcf958c67..f65b9d6c3941 100644 --- a/site/docs/how-to-release.md +++ b/site/docs/how-to-release.md @@ -376,7 +376,7 @@ The last step is to update the `main` branch in `iceberg-docs` to set the latest A PR needs to be published in the `iceberg-docs` repository with the following changes: 1. Update variable `latestVersions.iceberg` to the new release version in `landing-page/config.toml` 2. 
Update variable `latestVersions.iceberg` to the new release version and -`versions.nessie` to the version of `org.projectnessie.nessie:*` from [versions.props](https://github.com/apache/iceberg/blob/master/versions.props) in `docs/config.toml` +`versions.nessie` to the version of `org.projectnessie.nessie:*` from [mkdocs.yml](https://github.com/apache/iceberg/blob/main/site/mkdocs.yml) in `docs/config.toml` 3. Update list `versions` with the new release in `landing-page/config.toml` 4. Update list `versions` with the new release in `docs/config.toml` 5. Mark the current latest release notes to past releases under `landing-page/content/common/release-notes.md` diff --git a/site/docs/multi-engine-support.md b/site/docs/multi-engine-support.md index b0667361ef5c..ce4de4bdc1b8 100644 --- a/site/docs/multi-engine-support.md +++ b/site/docs/multi-engine-support.md @@ -59,6 +59,8 @@ Each engine version undergoes the following lifecycle stages: ### Apache Spark + + | Version | Lifecycle Stage | Initial Iceberg Support | Latest Iceberg Support | Latest Runtime Jar | | ---------- | ------------------ | ----------------------- |------------------------| ------------------ | | 2.4 | End of Life | 0.7.0-incubating | 1.2.1 | [iceberg-spark-runtime-2.4](https://search.maven.org/remotecontent?filepath=org/apache/iceberg/iceberg-spark-runtime-2.4/1.2.1/iceberg-spark-runtime-2.4-1.2.1.jar) | @@ -69,6 +71,8 @@ Each engine version undergoes the following lifecycle stages: | 3.4 | Maintained | 1.3.0 | {{ icebergVersion }} | [iceberg-spark-runtime-3.4_2.12](https://search.maven.org/remotecontent?filepath=org/apache/iceberg/iceberg-spark-runtime-3.4_2.12/{{ icebergVersion }}/iceberg-spark-runtime-3.4_2.12-{{ icebergVersion }}.jar) | | 3.5 | Maintained | 1.4.0 | {{ icebergVersion }} | [iceberg-spark-runtime-3.5_2.12](https://search.maven.org/remotecontent?filepath=org/apache/iceberg/iceberg-spark-runtime-3.5_2.12/{{ icebergVersion }}/iceberg-spark-runtime-3.5_2.12-{{ icebergVersion 
}}.jar) | + + * [1] Spark 3.1 shares the same runtime jar `iceberg-spark3-runtime` with Spark 3.0 before Iceberg 0.13.0 ### Apache Flink @@ -76,6 +80,8 @@ Each engine version undergoes the following lifecycle stages: Based on the guideline of the Flink community, only the latest 2 minor versions are actively maintained. Users should continuously upgrade their Flink version to stay up-to-date. + + | Version | Lifecycle Stage | Initial Iceberg Support | Latest Iceberg Support | Latest Runtime Jar | | ------- | --------------- | ----------------------- |------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | 1.11 | End of Life | 0.9.0 | 0.12.1 | [iceberg-flink-runtime](https://search.maven.org/remotecontent?filepath=org/apache/iceberg/iceberg-flink-runtime/0.12.1/iceberg-flink-runtime-0.12.1.jar) | @@ -87,15 +93,21 @@ Users should continuously upgrade their Flink version to stay up-to-date. 
| 1.17 | Maintained | 1.3.0 | {{ icebergVersion }} | [iceberg-flink-runtime-1.17](https://search.maven.org/remotecontent?filepath=org/apache/iceberg/iceberg-flink-runtime-1.17/{{ icebergVersion }}/iceberg-flink-runtime-1.17-{{ icebergVersion }}.jar) | | 1.18 | Maintained | 1.5.0 | {{ icebergVersion }} | [iceberg-flink-runtime-1.18](https://search.maven.org/remotecontent?filepath=org/apache/iceberg/iceberg-flink-runtime-1.18/{{ icebergVersion }}/iceberg-flink-runtime-1.18-{{ icebergVersion }}.jar) | + + * [3] Flink 1.12 shares the same runtime jar `iceberg-flink-runtime` with Flink 1.11 before Iceberg 0.13.0 ### Apache Hive + + | Version | Recommended minor version | Lifecycle Stage | Initial Iceberg Support | Latest Iceberg Support | Latest Runtime Jar | | -------------- | ------------------------- | ----------------- | ----------------------- | ---------------------- | ------------------ | | 2 | 2.3.8 | Maintained | 0.8.0-incubating | {{ icebergVersion }} | [iceberg-hive-runtime](https://search.maven.org/remotecontent?filepath=org/apache/iceberg/iceberg-hive-runtime/{{ icebergVersion }}/iceberg-hive-runtime-{{ icebergVersion }}.jar) | | 3 | 3.1.2 | Maintained | 0.10.0 | {{ icebergVersion }} | [iceberg-hive-runtime](https://search.maven.org/remotecontent?filepath=org/apache/iceberg/iceberg-hive-runtime/{{ icebergVersion }}/iceberg-hive-runtime-{{ icebergVersion }}.jar) | + + ## Developer Guide ### Maintaining existing engine versions diff --git a/site/docs/releases.md b/site/docs/releases.md index 89c963f25a0b..09b7f439385b 100644 --- a/site/docs/releases.md +++ b/site/docs/releases.md @@ -20,6 +20,8 @@ title: "Releases" ## Downloads + + The latest version of Iceberg is [{{ icebergVersion }}](https://github.com/apache/iceberg/releases/tag/apache-iceberg-{{ icebergVersion }}). 
* [{{ icebergVersion }} source tar.gz](https://www.apache.org/dyn/closer.cgi/iceberg/apache-iceberg-{{ icebergVersion }}/apache-iceberg-{{ icebergVersion }}.tar.gz) -- [signature](https://downloads.apache.org/iceberg/apache-iceberg-{{ icebergVersion }}/apache-iceberg-{{ icebergVersion }}.tar.gz.asc) -- [sha512](https://downloads.apache.org/iceberg/apache-iceberg-{{ icebergVersion }}/apache-iceberg-{{ icebergVersion }}.tar.gz.sha512) @@ -34,6 +36,8 @@ The latest version of Iceberg is [{{ icebergVersion }}](https://github.com/apach * [{{ icebergVersion }} gcp-bundle Jar](https://search.maven.org/remotecontent?filepath=org/apache/iceberg/iceberg-gcp-bundle/{{ icebergVersion }}/iceberg-gcp-bundle-{{ icebergVersion }}.jar) * [{{ icebergVersion }} azure-bundle Jar](https://search.maven.org/remotecontent?filepath=org/apache/iceberg/iceberg-azure-bundle/{{ icebergVersion }}/iceberg-azure-bundle-{{ icebergVersion }}.jar) + + To use Iceberg in Spark or Flink, download the runtime JAR for your engine version and add it to the jars folder of your installation. To use Iceberg in Hive 2 or Hive 3, download the Hive runtime JAR and add it to Hive using `ADD JAR`. 
@@ -970,6 +974,3 @@ A more exhaustive list of changes is available under the [0.10.0 release milesto ### 0.7.0 * Git tag: [apache-iceberg-0.7.0-incubating](https://github.com/apache/iceberg/releases/tag/apache-iceberg-0.7.0-incubating) -* [0.7.0-incubating source tar.gz](https://www.apache.org/dyn/closer.cgi/incubator/iceberg/apache-iceberg-0.7.0-incubating/apache-iceberg-0.7.0-incubating.tar.gz) -- [signature](https://dist.apache.org/repos/dist/release/incubator/iceberg/apache-iceberg-0.7.0-incubating/apache-iceberg-0.7.0-incubating.tar.gz.asc) -- [sha512](https://dist.apache.org/repos/dist/release/incubator/iceberg/apache-iceberg-0.7.0-incubating/apache-iceberg-0.7.0-incubating.tar.gz.sha512) -* [0.7.0-incubating Spark 2.4 runtime Jar](https://search.maven.org/remotecontent?filepath=org/apache/iceberg/iceberg-spark-runtime/0.7.0-incubating/iceberg-spark-runtime-0.7.0-incubating.jar) - diff --git a/site/docs/spark-quickstart.md b/site/docs/spark-quickstart.md index 9601bcbdb0f8..5a940009f9a3 100644 --- a/site/docs/spark-quickstart.md +++ b/site/docs/spark-quickstart.md @@ -335,6 +335,7 @@ If you already have a Spark environment, you can add Iceberg, using the `--packa If you want to include Iceberg in your Spark installation, add the Iceberg Spark runtime to Spark's `jars` folder. You can download the runtime by visiting to the [Releases](releases.md) page. + [spark-runtime-jar]: https://search.maven.org/remotecontent?filepath=org/apache/iceberg/iceberg-spark-runtime-3.5_2.12/{{ icebergVersion }}/iceberg-spark-runtime-3.5_2.12-{{ icebergVersion }}.jar #### Learn More diff --git a/site/docs/vendors.md b/site/docs/vendors.md index 7609dcdf1903..d549219e5c10 100644 --- a/site/docs/vendors.md +++ b/site/docs/vendors.md @@ -58,10 +58,13 @@ IOMETE is a fully-managed ready to use, batteries included Data Platform. IOMETE PuppyGraph is a cloud-native graph analytics engine that enables users to query one or more relational data stores as a unified graph model. 
This eliminates the overhead of deploying and maintaining a siloed graph database system, with no ETL required. [PuppyGraph’s native Apache Iceberg integration](https://docs.puppygraph.com/user-manual/getting-started/iceberg) adds native graph capabilities to your existing data lake in an easy and performant way. -### [Snowflake](http://snowflake.com/) -[Snowflake](https://www.snowflake.com/en/) is a single, cross-cloud platform that enables every organization to mobilize their data with Snowflake’s Data Cloud. Snowflake supports Apache Iceberg by offering [Snowflake-managed Iceberg Tables](https://docs.snowflake.com/en/user-guide/tables-iceberg#use-snowflake-as-the-iceberg-catalog) for full DML as well as [externally managed Iceberg Tables with catalog integrations](https://docs.snowflake.com/en/user-guide/tables-iceberg#use-a-catalog-integration) for read-only access. + +### [Snowflake](https://snowflake.com/) -### [Starburst](http://starburst.io) + +[Snowflake](https://www.snowflake.com/) is a single, cross-cloud platform that enables every organization to mobilize their data with Snowflake’s Data Cloud. Snowflake supports Apache Iceberg by offering [Snowflake-managed Iceberg Tables](https://docs.snowflake.com/en/user-guide/tables-iceberg#use-snowflake-as-the-iceberg-catalog) for full DML as well as [externally managed Iceberg Tables with catalog integrations](https://docs.snowflake.com/en/user-guide/tables-iceberg#use-a-catalog-integration) for read-only access. + +### [Starburst](https://starburst.io) Starburst is a commercial offering for the [Trino query engine](https://trino.io). Trino is a distributed MPP SQL query engine that can query data in Iceberg at interactive speeds. Trino also enables you to join Iceberg tables with an [array of other systems](https://trino.io/docs/current/connector.html). 
Starburst offers both an [enterprise deployment](https://www.starburst.io/platform/starburst-enterprise/) and a [fully managed service](https://www.starburst.io/platform/starburst-galaxy/) to make managing and scaling Trino a flawless experience. Starburst also provides customer support and houses many of the original contributors to the open-source project that know Trino best. Learn more about [the Starburst Iceberg connector](https://docs.starburst.io/latest/connector/iceberg.html). diff --git a/site/link-checker-config.json b/site/link-checker-config.json new file mode 100644 index 000000000000..8eed0c163404 --- /dev/null +++ b/site/link-checker-config.json @@ -0,0 +1,23 @@ +{ + "ignorePatterns": [ + { + "pattern": "^https://www.linkedin.com/" + }, + { + "pattern": "^https://mvnrepository.com/" + }, + { + "pattern": "^../../javadoc" + } + ], + "replacementPatterns": [ + { + "pattern": "^docs/latest/", + "replacement": "{{BASEURL}}/docs/docs/" + }, + { + "pattern": "^../../", + "replacement": "{{BASEURL}}/site/docs/" + } + ] +} \ No newline at end of file From 857590f7fa8b04212e9ace2efe5f1378ede4f96f Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sun, 24 Mar 2024 06:14:58 +0100 Subject: [PATCH 03/25] Build: Bump mkdocs-material from 9.5.14 to 9.5.15 (#10031) Bumps [mkdocs-material](https://github.com/squidfunk/mkdocs-material) from 9.5.14 to 9.5.15. - [Release notes](https://github.com/squidfunk/mkdocs-material/releases) - [Changelog](https://github.com/squidfunk/mkdocs-material/blob/master/CHANGELOG) - [Commits](https://github.com/squidfunk/mkdocs-material/compare/9.5.14...9.5.15) --- updated-dependencies: - dependency-name: mkdocs-material dependency-type: direct:production update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- site/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/site/requirements.txt b/site/requirements.txt index e1efc7cbb2f9..e170b507fb44 100644 --- a/site/requirements.txt +++ b/site/requirements.txt @@ -17,7 +17,7 @@ mkdocs-awesome-pages-plugin==2.9.2 mkdocs-macros-plugin==1.0.5 -mkdocs-material==9.5.14 +mkdocs-material==9.5.15 mkdocs-material-extensions==1.3.1 mkdocs-monorepo-plugin @ git+https://github.com/bitsondatadev/mkdocs-monorepo-plugin@url-fix mkdocs-redirects==1.2.1 From 8311f052caaf1bcf003c81976e227dee94012950 Mon Sep 17 00:00:00 2001 From: Alex Merced Date: Mon, 25 Mar 2024 04:41:45 -0400 Subject: [PATCH 04/25] Docs: Fix link to blog post (#10028) Co-authored-by: Fokko Driesprong --- site/docs/blogs.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/site/docs/blogs.md b/site/docs/blogs.md index 1714ce50405d..cf4e3254981b 100644 --- a/site/docs/blogs.md +++ b/site/docs/blogs.md @@ -44,7 +44,7 @@ Here is a list of company blogs that talk about Iceberg. 
The blogs are ordered f **Author**: [Alex Merced](https://www.linkedin.com/in/alexmerced/) -### [What is the Data Lakehouse and the Role of Apache Iceberg, Nessie and Dremio?](https://amdatalakehouse.substack.com/p/the-apache-iceberg-lakehouse-the) +### [What is the Data Lakehouse and the Role of Apache Iceberg, Nessie and Dremio?](https://amdatalakehouse.substack.com/p/what-is-the-data-lakehouse-and-the) **Date**: February 21st, 2024, **Company**: Dremio **Author**: [Alex Merced](https://www.linkedin.com/in/alexmerced/) From 49a66348f0284076fbbc9a1d57ce5c146a6efc60 Mon Sep 17 00:00:00 2001 From: Tom Tanaka <43331405+tomtongue@users.noreply.github.com> Date: Tue, 26 Mar 2024 00:23:30 +0900 Subject: [PATCH 05/25] Core: Migrate tests to JUnit5 (#10027) --- .../java/org/apache/iceberg/TestBase.java | 9 + .../org/apache/iceberg/TestMetricsModes.java | 144 ++--- .../apache/iceberg/TestMicroBatchBuilder.java | 105 ++-- .../apache/iceberg/TestRemoveSnapshots.java | 586 ++++++++---------- .../apache/iceberg/TestRewriteManifests.java | 319 +++++----- .../java/org/apache/iceberg/TestRowDelta.java | 439 +++++-------- .../iceberg/TestSchemaAndMappingUpdate.java | 14 +- .../org/apache/iceberg/TestSchemaUpdate.java | 22 +- .../iceberg/TestSequenceNumberForV2Table.java | 45 +- .../apache/iceberg/TestSingleValueParser.java | 50 +- .../org/apache/iceberg/TestSortOrder.java | 247 ++++---- .../apache/iceberg/TestSortOrderParser.java | 39 +- .../org/apache/iceberg/TestSplitPlanning.java | 110 ++-- .../iceberg/TestV1ToV2RowDeltaDelete.java | 161 +++-- .../org/apache/iceberg/V2TableTestBase.java | 10 +- 15 files changed, 1029 insertions(+), 1271 deletions(-) diff --git a/core/src/test/java/org/apache/iceberg/TestBase.java b/core/src/test/java/org/apache/iceberg/TestBase.java index 10aa57abf6f3..dc7bf0e8d8cb 100644 --- a/core/src/test/java/org/apache/iceberg/TestBase.java +++ b/core/src/test/java/org/apache/iceberg/TestBase.java @@ -211,6 +211,15 @@ List listManifestFiles(File 
tableDirToList) { && Files.getFileExtension(name).equalsIgnoreCase("avro"))); } + List listManifestLists(File tableDirToList) { + return Lists.newArrayList( + new File(tableDirToList, "metadata") + .listFiles( + (dir, name) -> + name.startsWith("snap") + && Files.getFileExtension(name).equalsIgnoreCase("avro"))); + } + public static long countAllMetadataFiles(File tableDir) { return Arrays.stream(new File(tableDir, "metadata").listFiles()) .filter(f -> f.isFile()) diff --git a/core/src/test/java/org/apache/iceberg/TestMetricsModes.java b/core/src/test/java/org/apache/iceberg/TestMetricsModes.java index 564eb03ec04b..31ae459df506 100644 --- a/core/src/test/java/org/apache/iceberg/TestMetricsModes.java +++ b/core/src/test/java/org/apache/iceberg/TestMetricsModes.java @@ -19,9 +19,15 @@ package org.apache.iceberg; import static org.apache.iceberg.types.Types.NestedField.required; +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; import java.io.File; import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.Arrays; +import java.util.List; import java.util.Map; import org.apache.iceberg.MetricsModes.Counts; import org.apache.iceberg.MetricsModes.Full; @@ -29,56 +35,48 @@ import org.apache.iceberg.MetricsModes.Truncate; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; import org.apache.iceberg.types.Types; -import org.assertj.core.api.Assertions; -import org.junit.After; -import org.junit.Assert; -import org.junit.Rule; -import org.junit.Test; -import org.junit.rules.TemporaryFolder; -import org.junit.runner.RunWith; -import org.junit.runners.Parameterized; - -@RunWith(Parameterized.class) -public class TestMetricsModes { +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.TestTemplate; +import org.junit.jupiter.api.extension.ExtendWith; +import org.junit.jupiter.api.io.TempDir; - private final int 
formatVersion; +@ExtendWith(ParameterizedTestExtension.class) +public class TestMetricsModes { - @Parameterized.Parameters(name = "formatVersion = {0}") - public static Object[] parameters() { - return new Object[] {1, 2}; - } + @Parameter private int formatVersion; - public TestMetricsModes(int formatVersion) { - this.formatVersion = formatVersion; + @Parameters(name = "formatVersion = {0}") + protected static List parameters() { + return Arrays.asList(1, 2); } - @Rule public TemporaryFolder temp = new TemporaryFolder(); + @TempDir private Path temp; - @After + @AfterEach public void after() { TestTables.clearTables(); } - @Test + @TestTemplate public void testMetricsModeParsing() { - Assert.assertEquals(None.get(), MetricsModes.fromString("none")); - Assert.assertEquals(None.get(), MetricsModes.fromString("nOnE")); - Assert.assertEquals(Counts.get(), MetricsModes.fromString("counts")); - Assert.assertEquals(Counts.get(), MetricsModes.fromString("coUntS")); - Assert.assertEquals(Truncate.withLength(1), MetricsModes.fromString("truncate(1)")); - Assert.assertEquals(Truncate.withLength(10), MetricsModes.fromString("truNcAte(10)")); - Assert.assertEquals(Full.get(), MetricsModes.fromString("full")); - Assert.assertEquals(Full.get(), MetricsModes.fromString("FULL")); + assertThat(MetricsModes.fromString("none")).isEqualTo(None.get()); + assertThat(MetricsModes.fromString("nOnE")).isEqualTo(None.get()); + assertThat(MetricsModes.fromString("counts")).isEqualTo(Counts.get()); + assertThat(MetricsModes.fromString("coUntS")).isEqualTo(Counts.get()); + assertThat(MetricsModes.fromString("truncate(1)")).isEqualTo(Truncate.withLength(1)); + assertThat(MetricsModes.fromString("truNcAte(10)")).isEqualTo(Truncate.withLength(10)); + assertThat(MetricsModes.fromString("full")).isEqualTo(Full.get()); + assertThat(MetricsModes.fromString("FULL")).isEqualTo(Full.get()); } - @Test + @TestTemplate public void testInvalidTruncationLength() { - Assertions.assertThatThrownBy(() -> 
MetricsModes.fromString("truncate(0)")) + assertThatThrownBy(() -> MetricsModes.fromString("truncate(0)")) .isInstanceOf(IllegalArgumentException.class) .hasMessage("Truncate length should be positive"); } - @Test + @TestTemplate public void testInvalidColumnModeValue() { Map properties = ImmutableMap.of( @@ -88,13 +86,12 @@ public void testInvalidColumnModeValue() { "troncate(5)"); MetricsConfig config = MetricsConfig.fromProperties(properties); - Assert.assertEquals( - "Invalid mode should be defaulted to table default (full)", - MetricsModes.Full.get(), - config.columnMode("col")); + assertThat(config.columnMode("col")) + .as("Invalid mode should be defaulted to table default (full)") + .isEqualTo(MetricsModes.Full.get()); } - @Test + @TestTemplate public void testInvalidDefaultColumnModeValue() { Map properties = ImmutableMap.of( @@ -104,16 +101,15 @@ public void testInvalidDefaultColumnModeValue() { "troncate(5)"); MetricsConfig config = MetricsConfig.fromProperties(properties); - Assert.assertEquals( - "Invalid mode should be defaulted to library default (truncate(16))", - MetricsModes.Truncate.withLength(16), - config.columnMode("col")); + assertThat(config.columnMode("col")) + .as("Invalid mode should be defaulted to library default (truncate(16))") + .isEqualTo(MetricsModes.Truncate.withLength(16)); } - @Test + @TestTemplate public void testMetricsConfigSortedColsDefault() throws Exception { - File tableDir = temp.newFolder(); - tableDir.delete(); // created by table create + File tableDir = Files.createTempDirectory(temp, "junit").toFile(); + assertThat(tableDir.delete()).isTrue(); Schema schema = new Schema( @@ -133,26 +129,24 @@ public void testMetricsConfigSortedColsDefault() throws Exception { .commit(); MetricsConfig config = MetricsConfig.forTable(testTable); - Assert.assertEquals( - "Non-sorted existing column should not be overridden", - Counts.get(), - config.columnMode("col1")); - Assert.assertEquals( - "Sorted column defaults should not 
override user specified config", - None.get(), - config.columnMode("col2")); - Assert.assertEquals( - "Unspecified sorted column should use default", - Truncate.withLength(16), - config.columnMode("col3")); - Assert.assertEquals( - "Unspecified normal column should use default", Counts.get(), config.columnMode("col4")); + assertThat(config.columnMode("col1")) + .as("Non-sorted existing column should not be overridden") + .isEqualTo(Counts.get()); + assertThat(config.columnMode("col2")) + .as("Sorted column defaults should not override user specified config") + .isEqualTo(None.get()); + assertThat(config.columnMode("col3")) + .as("Unspecified sorted column should use default") + .isEqualTo(Truncate.withLength(16)); + assertThat(config.columnMode("col4")) + .as("Unspecified normal column should use default") + .isEqualTo(Counts.get()); } - @Test + @TestTemplate public void testMetricsConfigSortedColsDefaultByInvalid() throws Exception { - File tableDir = temp.newFolder(); - tableDir.delete(); // created by table create + File tableDir = Files.createTempDirectory(temp, "junit").toFile(); + assertThat(tableDir.delete()).isTrue(); Schema schema = new Schema( @@ -171,17 +165,15 @@ public void testMetricsConfigSortedColsDefaultByInvalid() throws Exception { .commit(); MetricsConfig config = MetricsConfig.forTable(testTable); - Assert.assertEquals( - "Non-sorted existing column should not be overridden by sorted column", - Full.get(), - config.columnMode("col1")); - Assert.assertEquals( - "Original default applies as user entered invalid mode for sorted column", - Counts.get(), - config.columnMode("col2")); + assertThat(config.columnMode("col1")) + .as("Non-sorted existing column should not be overridden by sorted column") + .isEqualTo(Full.get()); + assertThat(config.columnMode("col2")) + .as("Original default applies as user entered invalid mode for sorted column") + .isEqualTo(Counts.get()); } - @Test + @TestTemplate public void 
testMetricsConfigInferredDefaultModeLimit() throws IOException { Schema schema = new Schema( @@ -189,8 +181,8 @@ public void testMetricsConfigInferredDefaultModeLimit() throws IOException { required(2, "col2", Types.IntegerType.get()), required(3, "col3", Types.IntegerType.get())); - File tableDir = temp.newFolder(); - Assert.assertTrue(tableDir.delete()); + File tableDir = Files.createTempDirectory(temp, "junit").toFile(); + assertThat(tableDir.delete()).isTrue(); Table table = TestTables.create( @@ -209,10 +201,8 @@ public void testMetricsConfigInferredDefaultModeLimit() throws IOException { MetricsConfig config = MetricsConfig.forTable(table); - Assert.assertEquals( - "Should use default mode for col1", Truncate.withLength(16), config.columnMode("col1")); - Assert.assertEquals( - "Should use default mode for col2", Truncate.withLength(16), config.columnMode("col2")); - Assert.assertEquals("Should use None for col3", None.get(), config.columnMode("col3")); + assertThat(config.columnMode("col1")).isEqualTo(Truncate.withLength(16)); + assertThat(config.columnMode("col2")).isEqualTo(Truncate.withLength(16)); + assertThat(config.columnMode("col3")).isEqualTo(None.get()); } } diff --git a/core/src/test/java/org/apache/iceberg/TestMicroBatchBuilder.java b/core/src/test/java/org/apache/iceberg/TestMicroBatchBuilder.java index deb6e7c8ad32..733bb0bb38fd 100644 --- a/core/src/test/java/org/apache/iceberg/TestMicroBatchBuilder.java +++ b/core/src/test/java/org/apache/iceberg/TestMicroBatchBuilder.java @@ -18,34 +18,31 @@ */ package org.apache.iceberg; +import static org.assertj.core.api.Assertions.assertThat; + +import java.util.Arrays; import java.util.Collections; import java.util.List; import org.apache.iceberg.MicroBatches.MicroBatch; import org.apache.iceberg.relocated.com.google.common.collect.Iterables; import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.junit.Assert; -import org.junit.Before; -import org.junit.Test; -import 
org.junit.runner.RunWith; -import org.junit.runners.Parameterized; - -@RunWith(Parameterized.class) -public class TestMicroBatchBuilder extends TableTestBase { - @Parameterized.Parameters(name = "formatVersion = {0}") - public static Object[] parameters() { - return new Object[] {1, 2}; - } - - public TestMicroBatchBuilder(int formatVersion) { - super(formatVersion); +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.TestTemplate; +import org.junit.jupiter.api.extension.ExtendWith; + +@ExtendWith(ParameterizedTestExtension.class) +public class TestMicroBatchBuilder extends TestBase { + @Parameters(name = "formatVersion = {0}") + protected static List parameters() { + return Arrays.asList(1, 2); } - @Before + @BeforeEach public void setupTableProperties() { table.updateProperties().set(TableProperties.MANIFEST_MIN_MERGE_COUNT, "3").commit(); } - @Test + @TestTemplate public void testGenerateMicroBatch() { add(table.newAppend(), files("A", "B", "C", "D", "E")); @@ -53,42 +50,42 @@ public void testGenerateMicroBatch() { MicroBatches.from(table.snapshot(1L), table.io()) .specsById(table.specs()) .generate(0, 6, Long.MAX_VALUE, true); - Assert.assertEquals(batch.snapshotId(), 1L); - Assert.assertEquals(batch.startFileIndex(), 0); - Assert.assertEquals(batch.endFileIndex(), 5); - Assert.assertEquals(batch.sizeInBytes(), 50); - Assert.assertTrue(batch.lastIndexOfSnapshot()); + assertThat(batch.snapshotId()).isEqualTo(1L); + assertThat(batch.startFileIndex()).isEqualTo(0); + assertThat(batch.endFileIndex()).isEqualTo(5); + assertThat(batch.sizeInBytes()).isEqualTo(50); + assertThat(batch.lastIndexOfSnapshot()).isTrue(); filesMatch(Lists.newArrayList("A", "B", "C", "D", "E"), filesToScan(batch.tasks())); MicroBatch batch1 = MicroBatches.from(table.snapshot(1L), table.io()) .specsById(table.specs()) .generate(0, 1, 15L, true); - Assert.assertEquals(batch1.endFileIndex(), 1); - Assert.assertEquals(batch1.sizeInBytes(), 10); - 
Assert.assertFalse(batch1.lastIndexOfSnapshot()); + assertThat(batch1.endFileIndex()).isEqualTo(1); + assertThat(batch1.sizeInBytes()).isEqualTo(10); + assertThat(batch1.lastIndexOfSnapshot()).isFalse(); filesMatch(Lists.newArrayList("A"), filesToScan(batch1.tasks())); MicroBatch batch2 = MicroBatches.from(table.snapshot(1L), table.io()) .specsById(table.specs()) .generate(batch1.endFileIndex(), 4, 30L, true); - Assert.assertEquals(batch2.endFileIndex(), 4); - Assert.assertEquals(batch2.sizeInBytes(), 30); - Assert.assertFalse(batch2.lastIndexOfSnapshot()); + assertThat(batch2.endFileIndex()).isEqualTo(4); + assertThat(batch2.sizeInBytes()).isEqualTo(30); + assertThat(batch2.lastIndexOfSnapshot()).isFalse(); filesMatch(Lists.newArrayList("B", "C", "D"), filesToScan(batch2.tasks())); MicroBatch batch3 = MicroBatches.from(table.snapshot(1L), table.io()) .specsById(table.specs()) .generate(batch2.endFileIndex(), 5, 50L, true); - Assert.assertEquals(batch3.endFileIndex(), 5); - Assert.assertEquals(batch3.sizeInBytes(), 10); - Assert.assertTrue(batch3.lastIndexOfSnapshot()); + assertThat(batch3.endFileIndex()).isEqualTo(5); + assertThat(batch3.sizeInBytes()).isEqualTo(10); + assertThat(batch3.lastIndexOfSnapshot()).isTrue(); filesMatch(Lists.newArrayList("E"), filesToScan(batch3.tasks())); } - @Test + @TestTemplate public void testGenerateMicroBatchWithSmallTargetSize() { add(table.newAppend(), files("A", "B", "C", "D", "E")); @@ -96,57 +93,57 @@ public void testGenerateMicroBatchWithSmallTargetSize() { MicroBatches.from(table.snapshot(1L), table.io()) .specsById(table.specs()) .generate(0, 1, 10L, true); - Assert.assertEquals(batch.snapshotId(), 1L); - Assert.assertEquals(batch.startFileIndex(), 0); - Assert.assertEquals(batch.endFileIndex(), 1); - Assert.assertEquals(batch.sizeInBytes(), 10); - Assert.assertFalse(batch.lastIndexOfSnapshot()); + assertThat(batch.snapshotId()).isEqualTo(1L); + assertThat(batch.startFileIndex()).isEqualTo(0); + 
assertThat(batch.endFileIndex()).isEqualTo(1); + assertThat(batch.sizeInBytes()).isEqualTo(10); + assertThat(batch.lastIndexOfSnapshot()).isFalse(); filesMatch(Lists.newArrayList("A"), filesToScan(batch.tasks())); MicroBatch batch1 = MicroBatches.from(table.snapshot(1L), table.io()) .specsById(table.specs()) .generate(batch.endFileIndex(), 2, 5L, true); - Assert.assertEquals(batch1.endFileIndex(), 2); - Assert.assertEquals(batch1.sizeInBytes(), 10); + assertThat(batch1.endFileIndex()).isEqualTo(2); + assertThat(batch1.sizeInBytes()).isEqualTo(10); filesMatch(Lists.newArrayList("B"), filesToScan(batch1.tasks())); - Assert.assertFalse(batch1.lastIndexOfSnapshot()); + assertThat(batch1.lastIndexOfSnapshot()).isFalse(); MicroBatch batch2 = MicroBatches.from(table.snapshot(1L), table.io()) .specsById(table.specs()) .generate(batch1.endFileIndex(), 3, 10L, true); - Assert.assertEquals(batch2.endFileIndex(), 3); - Assert.assertEquals(batch2.sizeInBytes(), 10); + assertThat(batch2.endFileIndex()).isEqualTo(3); + assertThat(batch2.sizeInBytes()).isEqualTo(10); filesMatch(Lists.newArrayList("C"), filesToScan(batch2.tasks())); - Assert.assertFalse(batch2.lastIndexOfSnapshot()); + assertThat(batch2.lastIndexOfSnapshot()).isFalse(); MicroBatch batch3 = MicroBatches.from(table.snapshot(1L), table.io()) .specsById(table.specs()) .generate(batch2.endFileIndex(), 4, 10L, true); - Assert.assertEquals(batch3.endFileIndex(), 4); - Assert.assertEquals(batch3.sizeInBytes(), 10); + assertThat(batch3.endFileIndex()).isEqualTo(4); + assertThat(batch3.sizeInBytes()).isEqualTo(10); filesMatch(Lists.newArrayList("D"), filesToScan(batch3.tasks())); - Assert.assertFalse(batch3.lastIndexOfSnapshot()); + assertThat(batch3.lastIndexOfSnapshot()).isFalse(); MicroBatch batch4 = MicroBatches.from(table.snapshot(1L), table.io()) .specsById(table.specs()) .generate(batch3.endFileIndex(), 5, 5L, true); - Assert.assertEquals(batch4.endFileIndex(), 5); - Assert.assertEquals(batch4.sizeInBytes(), 10); + 
assertThat(batch4.endFileIndex()).isEqualTo(5); + assertThat(batch4.sizeInBytes()).isEqualTo(10); filesMatch(Lists.newArrayList("E"), filesToScan(batch4.tasks())); - Assert.assertTrue(batch4.lastIndexOfSnapshot()); + assertThat(batch4.lastIndexOfSnapshot()).isTrue(); MicroBatch batch5 = MicroBatches.from(table.snapshot(1L), table.io()) .specsById(table.specs()) .generate(batch4.endFileIndex(), 5, 5L, true); - Assert.assertEquals(batch5.endFileIndex(), 5); - Assert.assertEquals(batch5.sizeInBytes(), 0); - Assert.assertTrue(Iterables.isEmpty(batch5.tasks())); - Assert.assertTrue(batch5.lastIndexOfSnapshot()); + assertThat(batch5.endFileIndex()).isEqualTo(5); + assertThat(batch5.sizeInBytes()).isEqualTo(0); + assertThat(batch5.tasks()).isEmpty(); + assertThat(batch5.lastIndexOfSnapshot()).isTrue(); } private static DataFile file(String name) { @@ -190,6 +187,6 @@ private static List filesToScan(Iterable tasks) { private static void filesMatch(List expected, List actual) { Collections.sort(expected); Collections.sort(actual); - Assert.assertEquals(expected, actual); + assertThat(actual).isEqualTo(expected); } } diff --git a/core/src/test/java/org/apache/iceberg/TestRemoveSnapshots.java b/core/src/test/java/org/apache/iceberg/TestRemoveSnapshots.java index 11e2daca6ab0..6a47a24fb494 100644 --- a/core/src/test/java/org/apache/iceberg/TestRemoveSnapshots.java +++ b/core/src/test/java/org/apache/iceberg/TestRemoveSnapshots.java @@ -18,11 +18,16 @@ */ package org.apache.iceberg; +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; +import static org.assertj.core.api.Assumptions.assumeThat; + import java.io.File; import java.io.IOException; import java.io.UncheckedIOException; import java.nio.ByteBuffer; import java.nio.charset.StandardCharsets; +import java.util.Arrays; import java.util.List; import java.util.Set; import java.util.UUID; @@ -42,30 +47,21 @@ import 
org.apache.iceberg.relocated.com.google.common.collect.Iterables; import org.apache.iceberg.relocated.com.google.common.collect.Lists; import org.apache.iceberg.relocated.com.google.common.collect.Sets; -import org.assertj.core.api.Assertions; -import org.junit.Assert; -import org.junit.Assume; -import org.junit.Test; -import org.junit.runner.RunWith; -import org.junit.runners.Parameterized; - -@RunWith(Parameterized.class) -public class TestRemoveSnapshots extends TableTestBase { - private final boolean incrementalCleanup; - - @Parameterized.Parameters(name = "formatVersion = {0}, incrementalCleanup = {1}") - public static Object[] parameters() { - return new Object[][] { - new Object[] {1, true}, - new Object[] {2, true}, - new Object[] {1, false}, - new Object[] {2, false} - }; - } - - public TestRemoveSnapshots(int formatVersion, boolean incrementalCleanup) { - super(formatVersion); - this.incrementalCleanup = incrementalCleanup; +import org.junit.jupiter.api.TestTemplate; +import org.junit.jupiter.api.extension.ExtendWith; + +@ExtendWith(ParameterizedTestExtension.class) +public class TestRemoveSnapshots extends TestBase { + @Parameter(index = 1) + private boolean incrementalCleanup; + + @Parameters(name = "formatVersion = {0}, incrementalCleanup = {1}") + protected static List parameters() { + return Arrays.asList( + new Object[] {1, true}, + new Object[] {2, true}, + new Object[] {1, false}, + new Object[] {2, false}); } private long waitUntilAfter(long timestampMillis) { @@ -76,7 +72,7 @@ private long waitUntilAfter(long timestampMillis) { return current; } - @Test + @TestTemplate public void testExpireOlderThan() { table.newAppend().appendFile(FILE_A).commit(); @@ -94,35 +90,26 @@ public void testExpireOlderThan() { removeSnapshots(table).expireOlderThan(tAfterCommits).deleteWith(deletedFiles::add).commit(); - Assert.assertEquals( - "Expire should not change current snapshot", - snapshotId, - table.currentSnapshot().snapshotId()); - Assert.assertNull( - 
"Expire should remove the oldest snapshot", table.snapshot(firstSnapshot.snapshotId())); - Assert.assertEquals( - "Should remove only the expired manifest list location", - Sets.newHashSet(firstSnapshot.manifestListLocation()), - deletedFiles); + assertThat(table.currentSnapshot().snapshotId()).isEqualTo(snapshotId); + assertThat(table.snapshot(firstSnapshot.snapshotId())).isNull(); + assertThat(deletedFiles).containsExactly(firstSnapshot.manifestListLocation()); } - @Test + @TestTemplate public void testExpireOlderThanWithDelete() { table.newAppend().appendFile(FILE_A).commit(); Snapshot firstSnapshot = table.currentSnapshot(); - Assert.assertEquals( - "Should create one manifest", 1, firstSnapshot.allManifests(table.io()).size()); + assertThat(firstSnapshot.allManifests(table.io())).hasSize(1); waitUntilAfter(table.currentSnapshot().timestampMillis()); table.newDelete().deleteFile(FILE_A).commit(); Snapshot secondSnapshot = table.currentSnapshot(); - Assert.assertEquals( - "Should create replace manifest with a rewritten manifest", - 1, - secondSnapshot.allManifests(table.io()).size()); + assertThat(secondSnapshot.allManifests(table.io())) + .as("Should create replace manifest with a rewritten manifest") + .hasSize(1); table.newAppend().appendFile(FILE_B).commit(); @@ -136,34 +123,29 @@ public void testExpireOlderThanWithDelete() { removeSnapshots(table).expireOlderThan(tAfterCommits).deleteWith(deletedFiles::add).commit(); - Assert.assertEquals( - "Expire should not change current snapshot", - snapshotId, - table.currentSnapshot().snapshotId()); - Assert.assertNull( - "Expire should remove the oldest snapshot", table.snapshot(firstSnapshot.snapshotId())); - Assert.assertNull( - "Expire should remove the second oldest snapshot", - table.snapshot(secondSnapshot.snapshotId())); - - Assert.assertEquals( - "Should remove expired manifest lists and deleted data file", - Sets.newHashSet( - firstSnapshot.manifestListLocation(), // snapshot expired - firstSnapshot - 
.allManifests(table.io()) - .get(0) - .path(), // manifest was rewritten for delete - secondSnapshot.manifestListLocation(), // snapshot expired - secondSnapshot - .allManifests(table.io()) - .get(0) - .path(), // manifest contained only deletes, was dropped - FILE_A.path()), // deleted - deletedFiles); + assertThat(table.currentSnapshot().snapshotId()).isEqualTo(snapshotId); + assertThat(table.snapshot(firstSnapshot.snapshotId())).isNull(); + assertThat(table.snapshot(secondSnapshot.snapshotId())).isNull(); + + assertThat(deletedFiles) + .as("Should remove expired manifest lists and deleted data file") + .isEqualTo( + Sets.newHashSet( + firstSnapshot.manifestListLocation(), // snapshot expired + firstSnapshot + .allManifests(table.io()) + .get(0) + .path(), // manifest was rewritten for delete + secondSnapshot.manifestListLocation(), // snapshot expired + secondSnapshot + .allManifests(table.io()) + .get(0) + .path(), // manifest contained only deletes, was dropped + FILE_A.path() // deleted + )); } - @Test + @TestTemplate public void testExpireOlderThanWithDeleteInMergedManifests() { // merge every commit table.updateProperties().set(TableProperties.MANIFEST_MIN_MERGE_COUNT, "0").commit(); @@ -171,8 +153,7 @@ public void testExpireOlderThanWithDeleteInMergedManifests() { table.newAppend().appendFile(FILE_A).appendFile(FILE_B).commit(); Snapshot firstSnapshot = table.currentSnapshot(); - Assert.assertEquals( - "Should create one manifest", 1, firstSnapshot.allManifests(table.io()).size()); + assertThat(firstSnapshot.allManifests(table.io())).hasSize(1); waitUntilAfter(table.currentSnapshot().timestampMillis()); @@ -182,10 +163,9 @@ public void testExpireOlderThanWithDeleteInMergedManifests() { .commit(); Snapshot secondSnapshot = table.currentSnapshot(); - Assert.assertEquals( - "Should replace manifest with a rewritten manifest", - 1, - secondSnapshot.allManifests(table.io()).size()); + assertThat(secondSnapshot.allManifests(table.io())) + .as("Should replace 
manifest with a rewritten manifest") + .hasSize(1); table .newFastAppend() // do not merge to keep the last snapshot's manifest valid @@ -202,30 +182,25 @@ public void testExpireOlderThanWithDeleteInMergedManifests() { removeSnapshots(table).expireOlderThan(tAfterCommits).deleteWith(deletedFiles::add).commit(); - Assert.assertEquals( - "Expire should not change current snapshot", - snapshotId, - table.currentSnapshot().snapshotId()); - Assert.assertNull( - "Expire should remove the oldest snapshot", table.snapshot(firstSnapshot.snapshotId())); - Assert.assertNull( - "Expire should remove the second oldest snapshot", - table.snapshot(secondSnapshot.snapshotId())); - - Assert.assertEquals( - "Should remove expired manifest lists and deleted data file", - Sets.newHashSet( - firstSnapshot.manifestListLocation(), // snapshot expired - firstSnapshot - .allManifests(table.io()) - .get(0) - .path(), // manifest was rewritten for delete - secondSnapshot.manifestListLocation(), // snapshot expired - FILE_A.path()), // deleted - deletedFiles); + assertThat(table.currentSnapshot().snapshotId()).isEqualTo(snapshotId); + assertThat(table.snapshot(firstSnapshot.snapshotId())).isNull(); + assertThat(table.snapshot(secondSnapshot.snapshotId())).isNull(); + + assertThat(deletedFiles) + .as("Should remove expired manifest lists and deleted data file") + .isEqualTo( + Sets.newHashSet( + firstSnapshot.manifestListLocation(), // snapshot expired + firstSnapshot + .allManifests(table.io()) + .get(0) + .path(), // manifest was rewritten for delete + secondSnapshot.manifestListLocation(), // snapshot expired + FILE_A.path() // deleted + )); } - @Test + @TestTemplate public void testExpireOlderThanWithRollback() { // merge every commit table.updateProperties().set(TableProperties.MANIFEST_MIN_MERGE_COUNT, "0").commit(); @@ -233,8 +208,7 @@ public void testExpireOlderThanWithRollback() { table.newAppend().appendFile(FILE_A).appendFile(FILE_B).commit(); Snapshot firstSnapshot = 
table.currentSnapshot(); - Assert.assertEquals( - "Should create one manifest", 1, firstSnapshot.allManifests(table.io()).size()); + assertThat(firstSnapshot.allManifests(table.io())).hasSize(1); waitUntilAfter(table.currentSnapshot().timestampMillis()); @@ -244,8 +218,7 @@ public void testExpireOlderThanWithRollback() { Set secondSnapshotManifests = Sets.newHashSet(secondSnapshot.allManifests(table.io())); secondSnapshotManifests.removeAll(firstSnapshot.allManifests(table.io())); - Assert.assertEquals( - "Should add one new manifest for append", 1, secondSnapshotManifests.size()); + assertThat(secondSnapshotManifests).hasSize(1); table.manageSnapshots().rollbackTo(firstSnapshot.snapshotId()).commit(); @@ -257,32 +230,30 @@ public void testExpireOlderThanWithRollback() { removeSnapshots(table).expireOlderThan(tAfterCommits).deleteWith(deletedFiles::add).commit(); - Assert.assertEquals( - "Expire should not change current snapshot", - snapshotId, - table.currentSnapshot().snapshotId()); - Assert.assertNotNull( - "Expire should keep the oldest snapshot, current", - table.snapshot(firstSnapshot.snapshotId())); - Assert.assertNull( - "Expire should remove the orphaned snapshot", table.snapshot(secondSnapshot.snapshotId())); - - Assert.assertEquals( - "Should remove expired manifest lists and reverted appended data file", - Sets.newHashSet( - secondSnapshot.manifestListLocation(), // snapshot expired - Iterables.getOnlyElement(secondSnapshotManifests) - .path()), // manifest is no longer referenced - deletedFiles); + assertThat(table.currentSnapshot().snapshotId()).isEqualTo(snapshotId); + assertThat(table.snapshot(firstSnapshot.snapshotId())) + .as("Expire should keep the oldest snapshot, current") + .isNotNull(); + assertThat(table.snapshot(secondSnapshot.snapshotId())) + .as("Expire should remove the orphaned snapshot") + .isNull(); + + assertThat(deletedFiles) + .as("Should remove expired manifest lists and reverted appended data file") + .isEqualTo( + 
Sets.newHashSet( + secondSnapshot.manifestListLocation(), // snapshot expired + Iterables.getOnlyElement(secondSnapshotManifests) + .path()) // manifest is no longer referenced + ); } - @Test + @TestTemplate public void testExpireOlderThanWithRollbackAndMergedManifests() { table.newAppend().appendFile(FILE_A).commit(); Snapshot firstSnapshot = table.currentSnapshot(); - Assert.assertEquals( - "Should create one manifest", 1, firstSnapshot.allManifests(table.io()).size()); + assertThat(firstSnapshot.allManifests(table.io())).hasSize(1); waitUntilAfter(table.currentSnapshot().timestampMillis()); @@ -292,8 +263,7 @@ public void testExpireOlderThanWithRollbackAndMergedManifests() { Set secondSnapshotManifests = Sets.newHashSet(secondSnapshot.allManifests(table.io())); secondSnapshotManifests.removeAll(firstSnapshot.allManifests(table.io())); - Assert.assertEquals( - "Should add one new manifest for append", 1, secondSnapshotManifests.size()); + assertThat(secondSnapshotManifests).hasSize(1); table.manageSnapshots().rollbackTo(firstSnapshot.snapshotId()).commit(); @@ -305,27 +275,28 @@ public void testExpireOlderThanWithRollbackAndMergedManifests() { removeSnapshots(table).expireOlderThan(tAfterCommits).deleteWith(deletedFiles::add).commit(); - Assert.assertEquals( - "Expire should not change current snapshot", - snapshotId, - table.currentSnapshot().snapshotId()); - Assert.assertNotNull( - "Expire should keep the oldest snapshot, current", - table.snapshot(firstSnapshot.snapshotId())); - Assert.assertNull( - "Expire should remove the orphaned snapshot", table.snapshot(secondSnapshot.snapshotId())); - - Assert.assertEquals( - "Should remove expired manifest lists and reverted appended data file", - Sets.newHashSet( - secondSnapshot.manifestListLocation(), // snapshot expired - Iterables.getOnlyElement(secondSnapshotManifests) - .path(), // manifest is no longer referenced - FILE_B.path()), // added, but rolled back - deletedFiles); + 
assertThat(table.currentSnapshot().snapshotId()).isEqualTo(snapshotId); + assertThat(table.snapshot(firstSnapshot.snapshotId())) + .as("Expire should keep the oldest snapshot, current") + .isNotNull(); + assertThat(table.snapshot(secondSnapshot.snapshotId())) + .as("Expire should remove the orphaned snapshot") + .isNull(); + + assertThat(deletedFiles) + .as("Should remove expired manifest lists and reverted appended data file") + .isEqualTo( + Sets.newHashSet( + secondSnapshot.manifestListLocation(), // snapshot expired + secondSnapshotManifests.stream() + .findFirst() + .get() + .path(), // manifest is no longer referenced + FILE_B.path()) // added, but rolled back + ); } - @Test + @TestTemplate public void testRetainLastWithExpireOlderThan() { long t0 = System.currentTimeMillis(); table @@ -361,13 +332,11 @@ public void testRetainLastWithExpireOlderThan() { // Retain last 2 snapshots removeSnapshots(table).expireOlderThan(t3).retainLast(2).commit(); - Assert.assertEquals( - "Should have two snapshots.", 2, Lists.newArrayList(table.snapshots()).size()); - Assert.assertEquals( - "First snapshot should not present.", null, table.snapshot(firstSnapshotId)); + assertThat(table.snapshots()).hasSize(2); + assertThat(table.snapshot(firstSnapshotId)).isNull(); } - @Test + @TestTemplate public void testRetainLastWithExpireById() { long t0 = System.currentTimeMillis(); table @@ -403,13 +372,11 @@ public void testRetainLastWithExpireById() { // Retain last 3 snapshots, but explicitly remove the first snapshot removeSnapshots(table).expireSnapshotId(firstSnapshotId).retainLast(3).commit(); - Assert.assertEquals( - "Should have two snapshots.", 2, Lists.newArrayList(table.snapshots()).size()); - Assert.assertEquals( - "First snapshot should not present.", null, table.snapshot(firstSnapshotId)); + assertThat(table.snapshots()).hasSize(2); + assertThat(table.snapshot(firstSnapshotId)).isNull(); } - @Test + @TestTemplate public void testRetainNAvailableSnapshotsWithTransaction() 
{ long t0 = System.currentTimeMillis(); table @@ -442,23 +409,19 @@ public void testRetainNAvailableSnapshotsWithTransaction() { t3 = System.currentTimeMillis(); } - Assert.assertEquals( - "Should be 3 manifest lists", 3, listManifestLists(table.location()).size()); + assertThat(listManifestFiles(new File(table.location()))).hasSize(3); // Retain last 2 snapshots, which means 1 is deleted. Transaction tx = table.newTransaction(); removeSnapshots(tx.table()).expireOlderThan(t3).retainLast(2).commit(); tx.commitTransaction(); - Assert.assertEquals( - "Should have two snapshots.", 2, Lists.newArrayList(table.snapshots()).size()); - Assert.assertEquals( - "First snapshot should not present.", null, table.snapshot(firstSnapshotId)); - Assert.assertEquals( - "Should be 2 manifest lists", 2, listManifestLists(table.location()).size()); + assertThat(table.snapshots()).hasSize(2); + assertThat(table.snapshot(firstSnapshotId)).isNull(); + assertThat(listManifestLists(new File(table.location()))).hasSize(2); } - @Test + @TestTemplate public void testRetainLastWithTooFewSnapshots() { long t0 = System.currentTimeMillis(); table @@ -486,15 +449,11 @@ public void testRetainLastWithTooFewSnapshots() { // Retain last 3 snapshots removeSnapshots(table).expireOlderThan(t2).retainLast(3).commit(); - Assert.assertEquals( - "Should have two snapshots", 2, Lists.newArrayList(table.snapshots()).size()); - Assert.assertEquals( - "First snapshot should still present", - firstSnapshotId, - table.snapshot(firstSnapshotId).snapshotId()); + assertThat(table.snapshots()).hasSize(2); + assertThat(table.snapshot(firstSnapshotId).snapshotId()).isEqualTo(firstSnapshotId); } - @Test + @TestTemplate public void testRetainNLargerThanCurrentSnapshots() { // Append 3 files table @@ -532,11 +491,10 @@ public void testRetainNLargerThanCurrentSnapshots() { removeSnapshots(tx.table()).expireOlderThan(t3).retainLast(4).commit(); tx.commitTransaction(); - Assert.assertEquals( - "Should have three snapshots.", 
3, Lists.newArrayList(table.snapshots()).size()); + assertThat(table.snapshots()).hasSize(3); } - @Test + @TestTemplate public void testRetainLastKeepsExpiringSnapshot() { long t0 = System.currentTimeMillis(); table @@ -582,13 +540,11 @@ public void testRetainLastKeepsExpiringSnapshot() { // Retain last 2 snapshots and expire older than t3 removeSnapshots(table).expireOlderThan(secondSnapshot.timestampMillis()).retainLast(2).commit(); - Assert.assertEquals( - "Should have three snapshots.", 3, Lists.newArrayList(table.snapshots()).size()); - Assert.assertNotNull( - "Second snapshot should present.", table.snapshot(secondSnapshot.snapshotId())); + assertThat(table.snapshots()).hasSize(3); + assertThat(table.snapshot(secondSnapshot.snapshotId())).isNotNull(); } - @Test + @TestTemplate public void testExpireOlderThanMultipleCalls() { long t0 = System.currentTimeMillis(); table @@ -628,13 +584,11 @@ public void testExpireOlderThanMultipleCalls() { .expireOlderThan(thirdSnapshot.timestampMillis()) .commit(); - Assert.assertEquals( - "Should have one snapshots.", 1, Lists.newArrayList(table.snapshots()).size()); - Assert.assertNull( - "Second snapshot should not present.", table.snapshot(secondSnapshot.snapshotId())); + assertThat(table.snapshots()).hasSize(1); + assertThat(table.snapshot(secondSnapshot.snapshotId())).isNull(); } - @Test + @TestTemplate public void testRetainLastMultipleCalls() { long t0 = System.currentTimeMillis(); table @@ -670,20 +624,18 @@ public void testRetainLastMultipleCalls() { // Retain last 2 snapshots and expire older than t3 removeSnapshots(table).expireOlderThan(t3).retainLast(2).retainLast(1).commit(); - Assert.assertEquals( - "Should have one snapshots.", 1, Lists.newArrayList(table.snapshots()).size()); - Assert.assertNull( - "Second snapshot should not present.", table.snapshot(secondSnapshot.snapshotId())); + assertThat(table.snapshots()).hasSize(1); + assertThat(table.snapshot(secondSnapshot.snapshotId())).isNull(); } - @Test + 
@TestTemplate public void testRetainZeroSnapshots() { - Assertions.assertThatThrownBy(() -> removeSnapshots(table).retainLast(0).commit()) + assertThatThrownBy(() -> removeSnapshots(table).retainLast(0).commit()) .isInstanceOf(IllegalArgumentException.class) .hasMessage("Number of snapshots to retain must be at least 1, cannot be: 0"); } - @Test + @TestTemplate public void testScanExpiredManifestInValidSnapshotAppend() { table.newAppend().appendFile(FILE_A).appendFile(FILE_B).commit(); @@ -700,10 +652,10 @@ public void testScanExpiredManifestInValidSnapshotAppend() { removeSnapshots(table).expireOlderThan(t3).deleteWith(deletedFiles::add).commit(); - Assert.assertTrue("FILE_A should be deleted", deletedFiles.contains(FILE_A.path().toString())); + assertThat(deletedFiles).contains(FILE_A.path().toString()); } - @Test + @TestTemplate public void testScanExpiredManifestInValidSnapshotFastAppend() { table .updateProperties() @@ -726,10 +678,10 @@ public void testScanExpiredManifestInValidSnapshotFastAppend() { removeSnapshots(table).expireOlderThan(t3).deleteWith(deletedFiles::add).commit(); - Assert.assertTrue("FILE_A should be deleted", deletedFiles.contains(FILE_A.path().toString())); + assertThat(deletedFiles).contains(FILE_A.path().toString()); } - @Test + @TestTemplate public void dataFilesCleanup() throws IOException { table.newFastAppend().appendFile(FILE_A).commit(); @@ -763,11 +715,11 @@ public void dataFilesCleanup() throws IOException { removeSnapshots(table).expireOlderThan(t4).deleteWith(deletedFiles::add).commit(); - Assert.assertTrue("FILE_A should be deleted", deletedFiles.contains(FILE_A.path().toString())); - Assert.assertTrue("FILE_B should be deleted", deletedFiles.contains(FILE_B.path().toString())); + assertThat(deletedFiles).contains(FILE_A.path().toString()); + assertThat(deletedFiles).contains(FILE_B.path().toString()); } - @Test + @TestTemplate public void dataFilesCleanupWithParallelTasks() throws IOException { 
table.newFastAppend().appendFile(FILE_A).commit(); @@ -833,17 +785,18 @@ public void dataFilesCleanupWithParallelTasks() throws IOException { // Verifies that the delete methods ran in the threads created by the provided ExecutorService // ThreadFactory - Assert.assertEquals( - deleteThreads, - Sets.newHashSet( - "remove-snapshot-0", "remove-snapshot-1", "remove-snapshot-2", "remove-snapshot-3")); - - Assert.assertTrue("FILE_A should be deleted", deletedFiles.contains(FILE_A.path().toString())); - Assert.assertTrue("FILE_B should be deleted", deletedFiles.contains(FILE_B.path().toString())); - Assert.assertTrue("Thread should be created in provided pool", planThreadsIndex.get() > 0); + assertThat(deleteThreads) + .containsExactly( + "remove-snapshot-3", "remove-snapshot-2", "remove-snapshot-1", "remove-snapshot-0"); + + assertThat(deletedFiles).contains(FILE_A.path().toString()); + assertThat(deletedFiles).contains(FILE_B.path().toString()); + assertThat(planThreadsIndex.get()) + .as("Thread should be created in provided pool") + .isGreaterThan(0); } - @Test + @TestTemplate public void noDataFileCleanup() throws IOException { table.newFastAppend().appendFile(FILE_A).commit(); @@ -866,14 +819,14 @@ public void noDataFileCleanup() throws IOException { .deleteWith(deletedFiles::add) .commit(); - Assert.assertTrue("No files should have been deleted", deletedFiles.isEmpty()); + assertThat(deletedFiles).isEmpty(); } /** * Test on table below, and expiring the staged commit `B` using `expireOlderThan` API. 
Table: A - * C ` B (staged) */ - @Test + @TestTemplate public void testWithExpiringDanglingStageCommit() { // `A` commit table.newAppend().appendFile(FILE_A).commit(); @@ -918,18 +871,17 @@ public void testWithExpiringDanglingStageCommit() { expectedDeletes.add(file.path()); } }); - Assert.assertSame( - "Files deleted count should be expected", expectedDeletes.size(), deletedFiles.size()); + assertThat(deletedFiles).isEqualTo(expectedDeletes); // Take the diff expectedDeletes.removeAll(deletedFiles); - Assert.assertTrue("Exactly same files should be deleted", expectedDeletes.isEmpty()); + assertThat(expectedDeletes).isEmpty(); } /** * Expire cherry-pick the commit as shown below, when `B` is in table's current state Table: A - B * - C <--current snapshot `- D (source=B) */ - @Test + @TestTemplate public void testWithCherryPickTableSnapshot() { // `A` commit table.newAppend().appendFile(FILE_A).commit(); @@ -938,7 +890,7 @@ public void testWithCherryPickTableSnapshot() { // `B` commit Set deletedAFiles = Sets.newHashSet(); table.newOverwrite().addFile(FILE_B).deleteFile(FILE_A).deleteWith(deletedAFiles::add).commit(); - Assert.assertTrue("No files should be physically deleted", deletedAFiles.isEmpty()); + assertThat(deletedAFiles).isEmpty(); // pick the snapshot 'B` Snapshot snapshotB = readMetadata().currentSnapshot(); @@ -971,7 +923,7 @@ public void testWithCherryPickTableSnapshot() { i.addedDataFiles(table.io()) .forEach( item -> { - Assert.assertFalse(deletedFiles.contains(item.path().toString())); + assertThat(deletedFiles).doesNotContain(item.path().toString()); }); }); } @@ -980,7 +932,7 @@ public void testWithCherryPickTableSnapshot() { * Test on table below, and expiring `B` which is not in current table state. 
1) Expire `B` 2) All * commit Table: A - C - D (B) ` B (staged) */ - @Test + @TestTemplate public void testWithExpiringStagedThenCherrypick() { // `A` commit table.newAppend().appendFile(FILE_A).commit(); @@ -1016,7 +968,7 @@ public void testWithExpiringStagedThenCherrypick() { i.addedDataFiles(table.io()) .forEach( item -> { - Assert.assertFalse(deletedFiles.contains(item.path().toString())); + assertThat(deletedFiles).doesNotContain(item.path().toString()); }); }); @@ -1033,23 +985,23 @@ public void testWithExpiringStagedThenCherrypick() { i.addedDataFiles(table.io()) .forEach( item -> { - Assert.assertFalse(deletedFiles.contains(item.path().toString())); + assertThat(deletedFiles).doesNotContain(item.path().toString()); }); }); } - @Test + @TestTemplate public void testExpireSnapshotsWhenGarbageCollectionDisabled() { table.updateProperties().set(TableProperties.GC_ENABLED, "false").commit(); table.newAppend().appendFile(FILE_A).commit(); - Assertions.assertThatThrownBy(() -> table.expireSnapshots()) + assertThatThrownBy(() -> table.expireSnapshots()) .isInstanceOf(ValidationException.class) .hasMessageStartingWith("Cannot expire snapshots: GC is disabled"); } - @Test + @TestTemplate public void testExpireWithDefaultRetainLast() { table.newAppend().appendFile(FILE_A).commit(); @@ -1057,7 +1009,7 @@ public void testExpireWithDefaultRetainLast() { table.newAppend().appendFile(FILE_C).commit(); - Assert.assertEquals("Expected 3 snapshots", 3, Iterables.size(table.snapshots())); + assertThat(table.snapshots()).hasSize(3); table.updateProperties().set(TableProperties.MIN_SNAPSHOTS_TO_KEEP, "3").commit(); @@ -1070,13 +1022,12 @@ public void testExpireWithDefaultRetainLast() { .deleteWith(deletedFiles::add) .commit(); - Assert.assertEquals( - "Should not change current snapshot", snapshotBeforeExpiration, table.currentSnapshot()); - Assert.assertEquals("Should keep 3 snapshots", 3, Iterables.size(table.snapshots())); - Assert.assertTrue("Should not delete data", 
deletedFiles.isEmpty()); + assertThat(table.currentSnapshot()).isEqualTo(snapshotBeforeExpiration); + assertThat(table.snapshots()).hasSize(3); + assertThat(deletedFiles).isEmpty(); } - @Test + @TestTemplate public void testExpireWithDefaultSnapshotAge() { table.newAppend().appendFile(FILE_A).commit(); Snapshot firstSnapshot = table.currentSnapshot(); @@ -1093,7 +1044,7 @@ public void testExpireWithDefaultSnapshotAge() { waitUntilAfter(thirdSnapshot.timestampMillis()); - Assert.assertEquals("Expected 3 snapshots", 3, Iterables.size(table.snapshots())); + assertThat(table.snapshots()).hasSize(3); table.updateProperties().set(TableProperties.MAX_SNAPSHOT_AGE_MS, "1").commit(); @@ -1102,19 +1053,16 @@ public void testExpireWithDefaultSnapshotAge() { // rely solely on default configs removeSnapshots(table).deleteWith(deletedFiles::add).commit(); - Assert.assertEquals( - "Should not change current snapshot", thirdSnapshot, table.currentSnapshot()); - Assert.assertEquals("Should keep 1 snapshot", 1, Iterables.size(table.snapshots())); - Assert.assertEquals( - "Should remove expired manifest lists", - Sets.newHashSet( - firstSnapshot.manifestListLocation(), secondSnapshot.manifestListLocation()), - deletedFiles); + assertThat(table.currentSnapshot()).isEqualTo(thirdSnapshot); + assertThat(table.snapshots()).hasSize(1); + assertThat(deletedFiles) + .containsExactlyInAnyOrder( + firstSnapshot.manifestListLocation(), secondSnapshot.manifestListLocation()); } - @Test + @TestTemplate public void testExpireWithDeleteFiles() { - Assume.assumeTrue("Delete files only supported in V2 spec", formatVersion == 2); + assumeThat(formatVersion).as("Delete files only supported in V2 spec").isEqualTo(2); // Data Manifest => File_A table.newAppend().appendFile(FILE_A).commit(); @@ -1124,10 +1072,8 @@ public void testExpireWithDeleteFiles() { // Delete Manifest => FILE_A_DELETES table.newRowDelta().addDeletes(FILE_A_DELETES).commit(); Snapshot secondSnapshot = table.currentSnapshot(); - 
Assert.assertEquals( - "Should have 1 data manifest", 1, secondSnapshot.dataManifests(table.io()).size()); - Assert.assertEquals( - "Should have 1 delete manifest", 1, secondSnapshot.deleteManifests(table.io()).size()); + assertThat(secondSnapshot.dataManifests(table.io())).hasSize(1); + assertThat(secondSnapshot.deleteManifests(table.io())).hasSize(1); // FILE_A and FILE_A_DELETES move into "DELETED" state table @@ -1142,8 +1088,7 @@ public void testExpireWithDeleteFiles() { thirdSnapshot.allManifests(table.io()).stream() .filter(ManifestFile::hasDeletedFiles) .collect(Collectors.toSet()); - Assert.assertEquals( - "Should have two manifests of deleted files", 2, manifestOfDeletedFiles.size()); + assertThat(manifestOfDeletedFiles).hasSize(2); // Need one more commit before manifests of files of DELETED state get cleared from current // snapshot. @@ -1155,24 +1100,24 @@ public void testExpireWithDeleteFiles() { Set deletedFiles = Sets.newHashSet(); removeSnapshots(table).expireOlderThan(fourthSnapshotTs).deleteWith(deletedFiles::add).commit(); - Assert.assertEquals( - "Should remove old delete files and delete file manifests", - ImmutableSet.builder() - .add(FILE_A.path()) - .add(FILE_A_DELETES.path()) - .add(firstSnapshot.manifestListLocation()) - .add(secondSnapshot.manifestListLocation()) - .add(thirdSnapshot.manifestListLocation()) - .addAll(manifestPaths(secondSnapshot, table.io())) - .addAll( - manifestOfDeletedFiles.stream() - .map(ManifestFile::path) - .collect(Collectors.toList())) - .build(), - deletedFiles); + assertThat(deletedFiles) + .as("Should remove old delete files and delete file manifests") + .isEqualTo( + ImmutableSet.builder() + .add(FILE_A.path()) + .add(FILE_A_DELETES.path()) + .add(firstSnapshot.manifestListLocation()) + .add(secondSnapshot.manifestListLocation()) + .add(thirdSnapshot.manifestListLocation()) + .addAll(manifestPaths(secondSnapshot, table.io())) + .addAll( + manifestOfDeletedFiles.stream() + .map(ManifestFile::path) + 
.collect(Collectors.toList())) + .build()); } - @Test + @TestTemplate public void testTagExpiration() { table.newAppend().appendFile(FILE_A).commit(); @@ -1194,12 +1139,12 @@ public void testTagExpiration() { removeSnapshots(table).cleanExpiredFiles(false).commit(); - Assert.assertNull(table.ops().current().ref("tag")); - Assert.assertNotNull(table.ops().current().ref("branch")); - Assert.assertNotNull(table.ops().current().ref(SnapshotRef.MAIN_BRANCH)); + assertThat(table.ops().current().ref("tag")).isNull(); + assertThat(table.ops().current().ref("branch")).isNotNull(); + assertThat(table.ops().current().ref(SnapshotRef.MAIN_BRANCH)).isNotNull(); } - @Test + @TestTemplate public void testBranchExpiration() { table.newAppend().appendFile(FILE_A).commit(); @@ -1221,12 +1166,12 @@ public void testBranchExpiration() { removeSnapshots(table).cleanExpiredFiles(false).commit(); - Assert.assertNull(table.ops().current().ref("branch")); - Assert.assertNotNull(table.ops().current().ref("tag")); - Assert.assertNotNull(table.ops().current().ref(SnapshotRef.MAIN_BRANCH)); + assertThat(table.ops().current().ref("branch")).isNull(); + assertThat(table.ops().current().ref("tag")).isNotNull(); + assertThat(table.ops().current().ref(SnapshotRef.MAIN_BRANCH)).isNotNull(); } - @Test + @TestTemplate public void testMultipleRefsAndCleanExpiredFilesFailsForIncrementalCleanup() { table.newAppend().appendFile(FILE_A).commit(); table.newDelete().deleteFile(FILE_A).commit(); @@ -1234,7 +1179,7 @@ public void testMultipleRefsAndCleanExpiredFilesFailsForIncrementalCleanup() { waitUntilAfter(table.currentSnapshot().timestampMillis()); RemoveSnapshots removeSnapshots = (RemoveSnapshots) table.expireSnapshots(); - Assertions.assertThatThrownBy( + assertThatThrownBy( () -> removeSnapshots .withIncrementalCleanup(true) @@ -1245,7 +1190,7 @@ public void testMultipleRefsAndCleanExpiredFilesFailsForIncrementalCleanup() { .hasMessage("Cannot incrementally clean files for tables with more than 1 
ref"); } - @Test + @TestTemplate public void testExpireWithStatisticsFiles() throws IOException { table.newAppend().appendFile(FILE_A).commit(); String statsFileLocation1 = statsFileLocation(table.location()); @@ -1266,24 +1211,24 @@ public void testExpireWithStatisticsFiles() throws IOException { statsFileLocation2, table.io()); commitStats(table, statisticsFile2); - Assert.assertEquals("Should have 2 statistics file", 2, table.statisticsFiles().size()); + assertThat(table.statisticsFiles()).hasSize(2); long tAfterCommits = waitUntilAfter(table.currentSnapshot().timestampMillis()); removeSnapshots(table).expireOlderThan(tAfterCommits).commit(); // only the current snapshot and its stats file should be retained - Assert.assertEquals("Should keep 1 snapshot", 1, Iterables.size(table.snapshots())); - Assertions.assertThat(table.statisticsFiles()) + assertThat(table.snapshots()).hasSize(1); + assertThat(table.statisticsFiles()) .hasSize(1) .extracting(StatisticsFile::snapshotId) .as("Should contain only the statistics file of snapshot2") .isEqualTo(Lists.newArrayList(statisticsFile2.snapshotId())); - Assertions.assertThat(new File(statsFileLocation1)).doesNotExist(); - Assertions.assertThat(new File(statsFileLocation2)).exists(); + assertThat(new File(statsFileLocation1)).doesNotExist(); + assertThat(new File(statsFileLocation2)).exists(); } - @Test + @TestTemplate public void testExpireWithStatisticsFilesWithReuse() throws IOException { table.newAppend().appendFile(FILE_A).commit(); String statsFileLocation1 = statsFileLocation(table.location()); @@ -1303,24 +1248,24 @@ public void testExpireWithStatisticsFilesWithReuse() throws IOException { reuseStatsFile(table.currentSnapshot().snapshotId(), statisticsFile1); commitStats(table, statisticsFile2); - Assert.assertEquals("Should have 2 statistics file", 2, table.statisticsFiles().size()); + assertThat(table.statisticsFiles()).hasSize(2); long tAfterCommits = waitUntilAfter(table.currentSnapshot().timestampMillis()); 
removeSnapshots(table).expireOlderThan(tAfterCommits).commit(); // only the current snapshot and its stats file (reused from previous snapshot) should be // retained - Assert.assertEquals("Should keep 1 snapshot", 1, Iterables.size(table.snapshots())); - Assertions.assertThat(table.statisticsFiles()) + assertThat(table.snapshots()).hasSize(1); + assertThat(table.statisticsFiles()) .hasSize(1) .extracting(StatisticsFile::snapshotId) .as("Should contain only the statistics file of snapshot2") .isEqualTo(Lists.newArrayList(statisticsFile2.snapshotId())); // the reused stats file should exist. - Assertions.assertThat(new File(statsFileLocation1)).exists(); + assertThat(new File(statsFileLocation1)).exists(); } - @Test + @TestTemplate public void testExpireWithPartitionStatisticsFiles() throws IOException { table.newAppend().appendFile(FILE_A).commit(); String statsFileLocation1 = statsFileLocation(table.location()); @@ -1335,25 +1280,24 @@ public void testExpireWithPartitionStatisticsFiles() throws IOException { writePartitionStatsFile( table.currentSnapshot().snapshotId(), statsFileLocation2, table.io()); commitPartitionStats(table, statisticsFile2); - Assert.assertEquals( - "Should have 2 partition statistics file", 2, table.partitionStatisticsFiles().size()); + assertThat(table.partitionStatisticsFiles()).hasSize(2); long tAfterCommits = waitUntilAfter(table.currentSnapshot().timestampMillis()); removeSnapshots(table).expireOlderThan(tAfterCommits).commit(); // only the current snapshot and its stats file should be retained - Assert.assertEquals("Should keep 1 snapshot", 1, Iterables.size(table.snapshots())); - Assertions.assertThat(table.partitionStatisticsFiles()) + assertThat(table.snapshots()).hasSize(1); + assertThat(table.partitionStatisticsFiles()) .hasSize(1) .extracting(PartitionStatisticsFile::snapshotId) .as("Should contain only the statistics file of snapshot2") .isEqualTo(Lists.newArrayList(statisticsFile2.snapshotId())); - Assertions.assertThat(new 
File(statsFileLocation1)).doesNotExist(); - Assertions.assertThat(new File(statsFileLocation2)).exists(); + assertThat(new File(statsFileLocation1)).doesNotExist(); + assertThat(new File(statsFileLocation2)).exists(); } - @Test + @TestTemplate public void testExpireWithPartitionStatisticsFilesWithReuse() throws IOException { table.newAppend().appendFile(FILE_A).commit(); String statsFileLocation1 = statsFileLocation(table.location()); @@ -1370,25 +1314,24 @@ public void testExpireWithPartitionStatisticsFilesWithReuse() throws IOException reusePartitionStatsFile(table.currentSnapshot().snapshotId(), statisticsFile1); commitPartitionStats(table, statisticsFile2); - Assert.assertEquals( - "Should have 2 partition statistics file", 2, table.partitionStatisticsFiles().size()); + assertThat(table.partitionStatisticsFiles()).hasSize(2); long tAfterCommits = waitUntilAfter(table.currentSnapshot().timestampMillis()); removeSnapshots(table).expireOlderThan(tAfterCommits).commit(); // only the current snapshot and its stats file (reused from previous snapshot) should be // retained - Assert.assertEquals("Should keep 1 snapshot", 1, Iterables.size(table.snapshots())); - Assertions.assertThat(table.partitionStatisticsFiles()) + assertThat(table.snapshots()).hasSize(1); + assertThat(table.partitionStatisticsFiles()) .hasSize(1) .extracting(PartitionStatisticsFile::snapshotId) .as("Should contain only the statistics file of snapshot2") .isEqualTo(Lists.newArrayList(statisticsFile2.snapshotId())); // the reused stats file should exist. 
- Assertions.assertThat(new File(statsFileLocation1)).exists(); + assertThat(new File(statsFileLocation1)).exists(); } - @Test + @TestTemplate public void testFailRemovingSnapshotWhenStillReferencedByBranch() { table.newAppend().appendFile(FILE_A).commit(); @@ -1400,13 +1343,12 @@ public void testFailRemovingSnapshotWhenStillReferencedByBranch() { table.manageSnapshots().createBranch("branch", snapshotId).commit(); - Assertions.assertThatThrownBy( - () -> removeSnapshots(table).expireSnapshotId(snapshotId).commit()) + assertThatThrownBy(() -> removeSnapshots(table).expireSnapshotId(snapshotId).commit()) .isInstanceOf(IllegalArgumentException.class) .hasMessage("Cannot expire 2. Still referenced by refs: [branch]"); } - @Test + @TestTemplate public void testFailRemovingSnapshotWhenStillReferencedByTag() { table.newAppend().appendFile(FILE_A).commit(); @@ -1417,13 +1359,12 @@ public void testFailRemovingSnapshotWhenStillReferencedByTag() { // commit another snapshot so the first one isn't referenced by main table.newAppend().appendFile(FILE_B).commit(); - Assertions.assertThatThrownBy( - () -> removeSnapshots(table).expireSnapshotId(snapshotId).commit()) + assertThatThrownBy(() -> removeSnapshots(table).expireSnapshotId(snapshotId).commit()) .isInstanceOf(IllegalArgumentException.class) .hasMessage("Cannot expire 1. 
Still referenced by refs: [tag]"); } - @Test + @TestTemplate public void testRetainUnreferencedSnapshotsWithinExpirationAge() { table.newAppend().appendFile(FILE_A).commit(); @@ -1436,10 +1377,10 @@ public void testRetainUnreferencedSnapshotsWithinExpirationAge() { removeSnapshots(table).expireOlderThan(expireTimestampSnapshotA).commit(); - Assert.assertEquals(2, table.ops().current().snapshots().size()); + assertThat(table.ops().current().snapshots()).hasSize(2); } - @Test + @TestTemplate public void testUnreferencedSnapshotParentOfTag() { table.newAppend().appendFile(FILE_A).commit(); @@ -1468,12 +1409,13 @@ public void testUnreferencedSnapshotParentOfTag() { .cleanExpiredFiles(false) .commit(); - Assert.assertNull( - "Should remove unreferenced snapshot beneath a tag", table.snapshot(expiredSnapshotId)); - Assert.assertEquals(2, table.ops().current().snapshots().size()); + assertThat(table.snapshot(expiredSnapshotId)) + .as("Should remove unreferenced snapshot beneath a tag") + .isNull(); + assertThat(table.ops().current().snapshots()).hasSize(2); } - @Test + @TestTemplate public void testSnapshotParentOfBranchNotUnreferenced() { // similar to testUnreferencedSnapshotParentOfTag, but checks that branch history is not // considered unreferenced @@ -1505,11 +1447,13 @@ public void testSnapshotParentOfBranchNotUnreferenced() { .cleanExpiredFiles(false) .commit(); - Assert.assertNotNull("Should not remove snapshot beneath a branch", table.snapshot(snapshotId)); - Assert.assertEquals(3, table.ops().current().snapshots().size()); + assertThat(table.snapshot(snapshotId)) + .as("Should not remove snapshot beneath a branch") + .isNotNull(); + assertThat(table.ops().current().snapshots()).hasSize(3); } - @Test + @TestTemplate public void testMinSnapshotsToKeepMultipleBranches() { table.newAppend().appendFile(FILE_A).commit(); long initialSnapshotId = table.currentSnapshot().snapshotId(); @@ -1520,7 +1464,7 @@ public void testMinSnapshotsToKeepMultipleBranches() { long 
branchSnapshotId = append.apply().snapshotId(); append.commit(); - Assert.assertEquals("Should have 3 snapshots", 3, Iterables.size(table.snapshots())); + assertThat(table.snapshots()).hasSize(3); long maxSnapshotAgeMs = 1; long expirationTime = System.currentTimeMillis() + maxSnapshotAgeMs; @@ -1543,20 +1487,18 @@ public void testMinSnapshotsToKeepMultipleBranches() { waitUntilAfter(expirationTime); table.expireSnapshots().cleanExpiredFiles(false).commit(); - Assert.assertEquals( - "Should have 3 snapshots (none removed)", 3, Iterables.size(table.snapshots())); + assertThat(table.snapshots()).hasSize(3); // stop retaining snapshots from the branch table.manageSnapshots().setMinSnapshotsToKeep("branch", 1).commit(); removeSnapshots(table).cleanExpiredFiles(false).commit(); - Assert.assertEquals( - "Should have 2 snapshots (initial removed)", 2, Iterables.size(table.snapshots())); - Assert.assertNull(table.ops().current().snapshot(initialSnapshotId)); + assertThat(table.snapshots()).hasSize(2); + assertThat(table.ops().current().snapshot(initialSnapshotId)).isNull(); } - @Test + @TestTemplate public void testMaxSnapshotAgeMultipleBranches() { table.newAppend().appendFile(FILE_A).commit(); long initialSnapshotId = table.currentSnapshot().snapshotId(); @@ -1580,7 +1522,7 @@ public void testMaxSnapshotAgeMultipleBranches() { long branchSnapshotId = append.apply().snapshotId(); append.commit(); - Assert.assertEquals("Should have 3 snapshots", 3, Iterables.size(table.snapshots())); + assertThat(table.snapshots()).hasSize(3); // retain all snapshots on branch (including the initial snapshot) table @@ -1592,20 +1534,18 @@ public void testMaxSnapshotAgeMultipleBranches() { removeSnapshots(table).cleanExpiredFiles(false).commit(); - Assert.assertEquals( - "Should have 3 snapshots (none removed)", 3, Iterables.size(table.snapshots())); + assertThat(table.snapshots()).hasSize(3); // allow the initial snapshot to age off from branch 
table.manageSnapshots().setMaxSnapshotAgeMs("branch", ageMs).commit(); table.expireSnapshots().cleanExpiredFiles(false).commit(); - Assert.assertEquals( - "Should have 2 snapshots (initial removed)", 2, Iterables.size(table.snapshots())); - Assert.assertNull(table.ops().current().snapshot(initialSnapshotId)); + assertThat(table.snapshots()).hasSize(2); + assertThat(table.ops().current().snapshot(initialSnapshotId)).isNull(); } - @Test + @TestTemplate public void testRetainFilesOnRetainedBranches() { // Append a file to main and test branch String testBranch = "test-branch"; @@ -1628,8 +1568,8 @@ public void testRetainFilesOnRetainedBranches() { expectedDeletes.addAll(manifestPaths(deletionA, table.io())); table.expireSnapshots().expireOlderThan(tAfterCommits).deleteWith(deletedFiles::add).commit(); - Assert.assertEquals(2, Iterables.size(table.snapshots())); - Assert.assertEquals(expectedDeletes, deletedFiles); + assertThat(table.snapshots()).hasSize(2); + assertThat(deletedFiles).isEqualTo(expectedDeletes); // Delete A on test branch table.newDelete().deleteFile(FILE_A).toBranch(testBranch).commit(); @@ -1655,8 +1595,8 @@ public void testRetainFilesOnRetainedBranches() { expectedDeletes.addAll(manifestPaths(branchDelete, table.io())); expectedDeletes.add(FILE_A.path().toString()); - Assert.assertEquals(2, Iterables.size(table.snapshots())); - Assert.assertEquals(expectedDeletes, deletedFiles); + assertThat(table.snapshots()).hasSize(2); + assertThat(deletedFiles).isEqualTo(expectedDeletes); } private Set manifestPaths(Snapshot snapshot, FileIO io) { diff --git a/core/src/test/java/org/apache/iceberg/TestRewriteManifests.java b/core/src/test/java/org/apache/iceberg/TestRewriteManifests.java index 8cc7e440686d..ef42fc1793da 100644 --- a/core/src/test/java/org/apache/iceberg/TestRewriteManifests.java +++ b/core/src/test/java/org/apache/iceberg/TestRewriteManifests.java @@ -21,6 +21,7 @@ import static org.apache.iceberg.TableProperties.MANIFEST_MERGE_ENABLED; import 
static org.apache.iceberg.TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED; import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; import static org.assertj.core.api.Assumptions.assumeThat; import static org.mockito.Mockito.spy; import static org.mockito.Mockito.when; @@ -40,24 +41,17 @@ import org.apache.iceberg.io.FileIO; import org.apache.iceberg.relocated.com.google.common.collect.Iterables; import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.assertj.core.api.Assertions; -import org.junit.Assert; -import org.junit.Test; -import org.junit.runner.RunWith; -import org.junit.runners.Parameterized; - -@RunWith(Parameterized.class) -public class TestRewriteManifests extends TableTestBase { - @Parameterized.Parameters(name = "formatVersion = {0}") - public static Object[] parameters() { - return new Object[] {1, 2}; +import org.junit.jupiter.api.TestTemplate; +import org.junit.jupiter.api.extension.ExtendWith; + +@ExtendWith(ParameterizedTestExtension.class) +public class TestRewriteManifests extends TestBase { + @Parameters(name = "formatVersion = {0}") + protected static List parameters() { + return Arrays.asList(1, 2); } - public TestRewriteManifests(int formatVersion) { - super(formatVersion); - } - - @Test + @TestTemplate public void testRewriteManifestsAppendedDirectly() throws IOException { Table table = load(); @@ -70,18 +64,18 @@ public void testRewriteManifestsAppendedDirectly() throws IOException { table.newFastAppend().appendManifest(newManifest).commit(); long appendId = table.currentSnapshot().snapshotId(); - Assert.assertEquals(1, table.currentSnapshot().allManifests(table.io()).size()); + assertThat(table.currentSnapshot().allManifests(table.io())).hasSize(1); table.rewriteManifests().clusterBy(file -> "").commit(); List manifests = table.currentSnapshot().allManifests(table.io()); - Assert.assertEquals(1, manifests.size()); + 
assertThat(manifests).hasSize(1); validateManifestEntries( manifests.get(0), ids(appendId), files(FILE_A), statuses(ManifestEntry.Status.EXISTING)); } - @Test + @TestTemplate public void testRewriteManifestsWithScanExecutor() throws IOException { Table table = load(); @@ -93,7 +87,7 @@ public void testRewriteManifestsWithScanExecutor() throws IOException { table.newFastAppend().appendManifest(newManifest).commit(); - Assert.assertEquals(1, table.currentSnapshot().allManifests(table.io()).size()); + assertThat(table.currentSnapshot().allManifests(table.io())).hasSize(1); AtomicInteger scanThreadsIndex = new AtomicInteger(0); table .rewriteManifests() @@ -111,11 +105,13 @@ public void testRewriteManifestsWithScanExecutor() throws IOException { .commit(); List manifests = table.currentSnapshot().allManifests(table.io()); - Assert.assertEquals(1, manifests.size()); - Assert.assertTrue("Thread should be created in provided pool", scanThreadsIndex.get() > 0); + assertThat(manifests).hasSize(1); + assertThat(scanThreadsIndex.get()) + .as("Thread should be created in provided pool") + .isGreaterThan(0); } - @Test + @TestTemplate public void testRewriteManifestsGeneratedAndAppendedDirectly() throws IOException { Table table = load(); @@ -131,12 +127,12 @@ public void testRewriteManifestsGeneratedAndAppendedDirectly() throws IOExceptio table.newFastAppend().appendFile(FILE_B).commit(); long fileAppendId = table.currentSnapshot().snapshotId(); - Assert.assertEquals(2, table.currentSnapshot().allManifests(table.io()).size()); + assertThat(table.currentSnapshot().allManifests(table.io())).hasSize(2); table.rewriteManifests().clusterBy(file -> "").commit(); List manifests = table.currentSnapshot().allManifests(table.io()); - Assert.assertEquals("Manifests must be merged into 1", 1, manifests.size()); + assertThat(manifests).hasSize(1); // get the correct file order List files; @@ -158,20 +154,20 @@ public void testRewriteManifestsGeneratedAndAppendedDirectly() throws IOExceptio 
statuses(ManifestEntry.Status.EXISTING, ManifestEntry.Status.EXISTING)); } - @Test + @TestTemplate public void testReplaceManifestsSeparate() { Table table = load(); table.newFastAppend().appendFile(FILE_A).appendFile(FILE_B).commit(); long appendId = table.currentSnapshot().snapshotId(); - Assert.assertEquals(1, table.currentSnapshot().allManifests(table.io()).size()); + assertThat(table.currentSnapshot().allManifests(table.io())).hasSize(1); // cluster by path will split the manifest into two table.rewriteManifests().clusterBy(file -> file.path()).commit(); List manifests = table.currentSnapshot().allManifests(table.io()); - Assert.assertEquals(2, manifests.size()); + assertThat(manifests).hasSize(2); manifests.sort(Comparator.comparing(ManifestFile::path)); validateManifestEntries( @@ -180,7 +176,7 @@ public void testReplaceManifestsSeparate() { manifests.get(1), ids(appendId), files(FILE_B), statuses(ManifestEntry.Status.EXISTING)); } - @Test + @TestTemplate public void testReplaceManifestsConsolidate() throws IOException { Table table = load(); @@ -189,14 +185,14 @@ public void testReplaceManifestsConsolidate() throws IOException { table.newFastAppend().appendFile(FILE_B).commit(); long appendIdB = table.currentSnapshot().snapshotId(); - Assert.assertEquals(2, table.currentSnapshot().allManifests(table.io()).size()); + assertThat(table.currentSnapshot().allManifests(table.io())).hasSize(2); // cluster by constant will combine manifests into one table.rewriteManifests().clusterBy(file -> "file").commit(); List manifests = table.currentSnapshot().allManifests(table.io()); - Assert.assertEquals(1, manifests.size()); + assertThat(manifests).hasSize(1); // get the file order correct List files; @@ -218,7 +214,7 @@ public void testReplaceManifestsConsolidate() throws IOException { statuses(ManifestEntry.Status.EXISTING, ManifestEntry.Status.EXISTING)); } - @Test + @TestTemplate public void testReplaceManifestsWithFilter() throws IOException { Table table = load(); 
@@ -231,7 +227,7 @@ public void testReplaceManifestsWithFilter() throws IOException { table.newFastAppend().appendFile(FILE_C).commit(); long appendIdC = table.currentSnapshot().snapshotId(); - Assert.assertEquals(3, table.currentSnapshot().allManifests(table.io()).size()); + assertThat(table.currentSnapshot().allManifests(table.io())).hasSize(3); // keep the file A manifest, combine the other two @@ -249,7 +245,7 @@ public void testReplaceManifestsWithFilter() throws IOException { .commit(); List manifests = table.currentSnapshot().allManifests(table.io()); - Assert.assertEquals(2, manifests.size()); + assertThat(manifests).hasSize(2); // get the file order correct List files; @@ -273,13 +269,13 @@ public void testReplaceManifestsWithFilter() throws IOException { manifests.get(1), ids(appendIdA), files(FILE_A), statuses(ManifestEntry.Status.ADDED)); } - @Test + @TestTemplate public void testReplaceManifestsMaxSize() { Table table = load(); table.newFastAppend().appendFile(FILE_A).appendFile(FILE_B).commit(); long appendId = table.currentSnapshot().snapshotId(); - Assert.assertEquals(1, table.currentSnapshot().allManifests(table.io()).size()); + assertThat(table.currentSnapshot().allManifests(table.io())).hasSize(1); // cluster by constant will combine manifests into one but small target size will create one per // entry @@ -288,7 +284,7 @@ public void testReplaceManifestsMaxSize() { rewriteManifests.clusterBy(file -> "file").commit(); List manifests = table.currentSnapshot().allManifests(table.io()); - Assert.assertEquals(2, manifests.size()); + assertThat(manifests).hasSize(2); manifests.sort(Comparator.comparing(ManifestFile::path)); validateManifestEntries( @@ -297,7 +293,7 @@ public void testReplaceManifestsMaxSize() { manifests.get(1), ids(appendId), files(FILE_B), statuses(ManifestEntry.Status.EXISTING)); } - @Test + @TestTemplate public void testConcurrentRewriteManifest() throws IOException { Table table = load(); 
table.newFastAppend().appendFile(FILE_A).commit(); @@ -323,14 +319,14 @@ public void testConcurrentRewriteManifest() throws IOException { }) .commit(); - Assert.assertEquals(2, table.currentSnapshot().allManifests(table.io()).size()); + assertThat(table.currentSnapshot().allManifests(table.io())).hasSize(2); // commit the rewrite manifests in progress - this should perform a full rewrite as the manifest // with file B is no longer part of the snapshot rewrite.commit(); List manifests = table.currentSnapshot().allManifests(table.io()); - Assert.assertEquals(1, manifests.size()); + assertThat(manifests).hasSize(1); // get the file order correct List files; @@ -352,7 +348,7 @@ public void testConcurrentRewriteManifest() throws IOException { statuses(ManifestEntry.Status.EXISTING, ManifestEntry.Status.EXISTING)); } - @Test + @TestTemplate public void testAppendDuringRewriteManifest() { Table table = load(); table.newFastAppend().appendFile(FILE_A).commit(); @@ -366,7 +362,7 @@ public void testAppendDuringRewriteManifest() { table.newFastAppend().appendFile(FILE_B).commit(); long appendIdB = table.currentSnapshot().snapshotId(); - Assert.assertEquals(2, table.currentSnapshot().allManifests(table.io()).size()); + assertThat(table.currentSnapshot().allManifests(table.io())).hasSize(2); // commit the rewrite manifests in progress rewrite.commit(); @@ -376,7 +372,7 @@ public void testAppendDuringRewriteManifest() { // have a single cluster key, rewritten one should be the first in the list List manifests = table.currentSnapshot().allManifests(table.io()); - Assert.assertEquals(2, manifests.size()); + assertThat(manifests).hasSize(2); validateManifestEntries( manifests.get(0), ids(appendIdA), files(FILE_A), statuses(ManifestEntry.Status.EXISTING)); @@ -384,7 +380,7 @@ public void testAppendDuringRewriteManifest() { manifests.get(1), ids(appendIdB), files(FILE_B), statuses(ManifestEntry.Status.ADDED)); } - @Test + @TestTemplate public void testRewriteManifestDuringAppend() { 
Table table = load(); table.newFastAppend().appendFile(FILE_A).commit(); @@ -397,14 +393,14 @@ public void testRewriteManifestDuringAppend() { // rewrite the manifests - only affects the first table.rewriteManifests().clusterBy(file -> "file").commit(); - Assert.assertEquals(1, table.currentSnapshot().allManifests(table.io()).size()); + assertThat(table.currentSnapshot().allManifests(table.io())).hasSize(1); // commit the append in progress append.commit(); long appendIdB = table.currentSnapshot().snapshotId(); List manifests = table.currentSnapshot().allManifests(table.io()); - Assert.assertEquals(2, manifests.size()); + assertThat(manifests).hasSize(2); // last append should be the first in the list @@ -414,15 +410,15 @@ public void testRewriteManifestDuringAppend() { manifests.get(1), ids(appendIdA), files(FILE_A), statuses(ManifestEntry.Status.EXISTING)); } - @Test + @TestTemplate public void testBasicManifestReplacement() throws IOException { - Assert.assertNull("Table should be empty", table.currentSnapshot()); + assertThat(table.currentSnapshot()).isNull(); table.newFastAppend().appendFile(FILE_A).appendFile(FILE_B).commit(); Snapshot firstSnapshot = table.currentSnapshot(); List firstSnapshotManifests = firstSnapshot.allManifests(table.io()); - Assert.assertEquals(1, firstSnapshotManifests.size()); + assertThat(firstSnapshotManifests).hasSize(1); ManifestFile firstSnapshotManifest = firstSnapshotManifests.get(0); table.newFastAppend().appendFile(FILE_C).appendFile(FILE_D).commit(); @@ -445,7 +441,7 @@ public void testBasicManifestReplacement() throws IOException { Snapshot snapshot = table.currentSnapshot(); List manifests = snapshot.allManifests(table.io()); - Assert.assertEquals(3, manifests.size()); + assertThat(manifests).hasSize(3); if (formatVersion == 1) { assertThat(manifests.get(0).path()).isNotEqualTo(firstNewManifest.path()); @@ -476,9 +472,9 @@ public void testBasicManifestReplacement() throws IOException { statuses(ManifestEntry.Status.ADDED, 
ManifestEntry.Status.ADDED)); } - @Test + @TestTemplate public void testBasicManifestReplacementWithSnapshotIdInheritance() throws IOException { - Assert.assertNull("Table should be empty", table.currentSnapshot()); + assertThat(table.currentSnapshot()).isNull(); table.updateProperties().set(SNAPSHOT_ID_INHERITANCE_ENABLED, "true").commit(); @@ -486,7 +482,7 @@ public void testBasicManifestReplacementWithSnapshotIdInheritance() throws IOExc Snapshot firstSnapshot = table.currentSnapshot(); List firstSnapshotManifests = firstSnapshot.allManifests(table.io()); - Assert.assertEquals(1, firstSnapshotManifests.size()); + assertThat(firstSnapshotManifests).hasSize(1); ManifestFile firstSnapshotManifest = firstSnapshotManifests.get(0); table.newFastAppend().appendFile(FILE_C).appendFile(FILE_D).commit(); @@ -509,7 +505,7 @@ public void testBasicManifestReplacementWithSnapshotIdInheritance() throws IOExc Snapshot snapshot = table.currentSnapshot(); List manifests = snapshot.allManifests(table.io()); - Assert.assertEquals(3, manifests.size()); + assertThat(manifests).hasSize(3); assertThat(manifests.get(0).path()).isEqualTo(firstNewManifest.path()); assertThat(manifests.get(1).path()).isEqualTo(secondNewManifest.path()); @@ -538,17 +534,14 @@ public void testBasicManifestReplacementWithSnapshotIdInheritance() throws IOExc table.newDelete().deleteFromRowFilter(Expressions.alwaysTrue()).commit(); } - @Test + @TestTemplate public void testWithMultiplePartitionSpec() throws IOException { - Assert.assertNull("Table should be empty", table.currentSnapshot()); + assertThat(table.currentSnapshot()).isNull(); table.newAppend().appendFile(FILE_A).appendFile(FILE_B).commit(); TableMetadata base = readMetadata(); - Assert.assertEquals( - "Should create 1 manifest for initial write", - 1, - base.currentSnapshot().allManifests(table.io()).size()); + assertThat(base.currentSnapshot().allManifests(table.io())).hasSize(1); ManifestFile initialManifest = 
base.currentSnapshot().allManifests(table.io()).get(0); int initialPartitionSpecId = initialManifest.partitionSpecId(); @@ -580,8 +573,7 @@ public void testWithMultiplePartitionSpec() throws IOException { table.newAppend().appendFile(newFileZ).commit(); - Assert.assertEquals( - "Should use 3 manifest files", 3, table.currentSnapshot().allManifests(table.io()).size()); + assertThat(table.currentSnapshot().allManifests(table.io())).hasSize(3); RewriteManifests rewriteManifests = table.rewriteManifests(); // try to cluster in 1 manifest file, but because of 2 partition specs @@ -589,40 +581,33 @@ public void testWithMultiplePartitionSpec() throws IOException { rewriteManifests.clusterBy(dataFile -> "file").commit(); List manifestFiles = table.currentSnapshot().allManifests(table.io()); - Assert.assertEquals( - "Rewrite manifest should produce 2 manifest files", 2, manifestFiles.size()); + assertThat(manifestFiles).as("Rewrite manifest should produce 2 manifest files").hasSize(2); - Assert.assertEquals( - "2 manifest files should have different partitionSpecId", - true, - manifestFiles.get(0).partitionSpecId() != manifestFiles.get(1).partitionSpecId()); + assertThat(manifestFiles.get(1).partitionSpecId()) + .as("2 manifest files should have different partitionSpecId") + .isNotEqualTo(manifestFiles.get(0).partitionSpecId()); matchNumberOfManifestFileWithSpecId(manifestFiles, initialPartitionSpecId, 1); matchNumberOfManifestFileWithSpecId(manifestFiles, table.ops().current().spec().specId(), 1); - Assert.assertEquals( - "first manifest file should have 2 data files", - Integer.valueOf(2), - manifestFiles.get(0).existingFilesCount()); + assertThat(manifestFiles.get(0).existingFilesCount()) + .as("first manifest file should have 2 data files") + .isEqualTo(2); - Assert.assertEquals( - "second manifest file should have 2 data files", - Integer.valueOf(2), - manifestFiles.get(1).existingFilesCount()); + assertThat(manifestFiles.get(1).existingFilesCount()) + .as("second 
manifest file should have 2 data files") + .isEqualTo(2); } - @Test + @TestTemplate public void testManifestSizeWithMultiplePartitionSpec() throws IOException { - Assert.assertNull("Table should be empty", table.currentSnapshot()); + assertThat(table.currentSnapshot()).isNull(); table.newAppend().appendFile(FILE_A).appendFile(FILE_B).commit(); TableMetadata base = readMetadata(); - Assert.assertEquals( - "Should create 1 manifest for initial write", - 1, - base.currentSnapshot().allManifests(table.io()).size()); + assertThat(base.currentSnapshot().allManifests(table.io())).hasSize(1); ManifestFile initialManifest = base.currentSnapshot().allManifests(table.io()).get(0); int initialPartitionSpecId = initialManifest.partitionSpecId(); @@ -653,10 +638,9 @@ public void testManifestSizeWithMultiplePartitionSpec() throws IOException { table.newAppend().appendFile(newFileZ).commit(); - Assert.assertEquals( - "Rewrite manifests should produce 3 manifest files", - 3, - table.currentSnapshot().allManifests(table.io()).size()); + assertThat(table.currentSnapshot().allManifests(table.io())) + .as("Rewrite manifests should produce 3 manifest files") + .hasSize(3); // cluster by constant will combine manifests into one but small target size will create one per // entry @@ -667,42 +651,28 @@ public void testManifestSizeWithMultiplePartitionSpec() throws IOException { rewriteManifests.clusterBy(dataFile -> "file").commit(); List manifestFiles = table.currentSnapshot().allManifests(table.io()); - Assert.assertEquals("Should use 4 manifest files", 4, manifestFiles.size()); + assertThat(manifestFiles).hasSize(4); matchNumberOfManifestFileWithSpecId(manifestFiles, initialPartitionSpecId, 2); matchNumberOfManifestFileWithSpecId(manifestFiles, table.ops().current().spec().specId(), 2); - Assert.assertEquals( - "first manifest file should have 1 data files", - Integer.valueOf(1), - manifestFiles.get(0).existingFilesCount()); - - Assert.assertEquals( - "second manifest file should have 1 
data files", - Integer.valueOf(1), - manifestFiles.get(1).existingFilesCount()); - - Assert.assertEquals( - "third manifest file should have 1 data files", - Integer.valueOf(1), - manifestFiles.get(2).existingFilesCount()); - - Assert.assertEquals( - "fourth manifest file should have 1 data files", - Integer.valueOf(1), - manifestFiles.get(3).existingFilesCount()); + assertThat(manifestFiles.get(0).existingFilesCount()).isEqualTo(1); + + assertThat(manifestFiles.get(1).existingFilesCount()).isEqualTo(1); + assertThat(manifestFiles.get(2).existingFilesCount()).isEqualTo(1); + assertThat(manifestFiles.get(3).existingFilesCount()).isEqualTo(1); } - @Test + @TestTemplate public void testManifestReplacementConcurrentAppend() throws IOException { - Assert.assertNull("Table should be empty", table.currentSnapshot()); + assertThat(table.currentSnapshot()).isNull(); table.newFastAppend().appendFile(FILE_A).appendFile(FILE_B).commit(); Snapshot firstSnapshot = table.currentSnapshot(); List firstSnapshotManifests = firstSnapshot.allManifests(table.io()); - Assert.assertEquals(1, firstSnapshotManifests.size()); + assertThat(firstSnapshotManifests).hasSize(1); ManifestFile firstSnapshotManifest = firstSnapshotManifests.get(0); ManifestFile firstNewManifest = @@ -722,13 +692,13 @@ public void testManifestReplacementConcurrentAppend() throws IOException { table.newFastAppend().appendFile(FILE_C).appendFile(FILE_D).commit(); Snapshot secondSnapshot = table.currentSnapshot(); - Assert.assertEquals(2, table.currentSnapshot().allManifests(table.io()).size()); + assertThat(table.currentSnapshot().allManifests(table.io())).hasSize(2); rewriteManifests.commit(); Snapshot snapshot = table.currentSnapshot(); List manifests = snapshot.allManifests(table.io()); - Assert.assertEquals(3, manifests.size()); + assertThat(manifests).hasSize(3); validateSummary(snapshot, 1, 1, 2, 0); @@ -751,9 +721,9 @@ public void testManifestReplacementConcurrentAppend() throws IOException { 
statuses(ManifestEntry.Status.ADDED, ManifestEntry.Status.ADDED)); } - @Test + @TestTemplate public void testManifestReplacementConcurrentDelete() throws IOException { - Assert.assertNull("Table should be empty", table.currentSnapshot()); + assertThat(table.currentSnapshot()).isNull(); table.updateProperties().set(MANIFEST_MERGE_ENABLED, "false").commit(); @@ -761,7 +731,7 @@ public void testManifestReplacementConcurrentDelete() throws IOException { Snapshot firstSnapshot = table.currentSnapshot(); List firstSnapshotManifests = firstSnapshot.allManifests(table.io()); - Assert.assertEquals(1, firstSnapshotManifests.size()); + assertThat(firstSnapshotManifests).hasSize(1); ManifestFile firstSnapshotManifest = firstSnapshotManifests.get(0); table.newFastAppend().appendFile(FILE_C).appendFile(FILE_D).commit(); @@ -788,7 +758,7 @@ public void testManifestReplacementConcurrentDelete() throws IOException { Snapshot snapshot = table.currentSnapshot(); List manifests = snapshot.allManifests(table.io()); - Assert.assertEquals(3, manifests.size()); + assertThat(manifests).hasSize(3); validateSummary(snapshot, 1, 1, 2, 0); @@ -811,15 +781,15 @@ public void testManifestReplacementConcurrentDelete() throws IOException { statuses(ManifestEntry.Status.DELETED, ManifestEntry.Status.EXISTING)); } - @Test + @TestTemplate public void testManifestReplacementConcurrentConflictingDelete() throws IOException { - Assert.assertNull("Table should be empty", table.currentSnapshot()); + assertThat(table.currentSnapshot()).isNull(); table.newFastAppend().appendFile(FILE_A).appendFile(FILE_B).commit(); Snapshot firstSnapshot = table.currentSnapshot(); List firstSnapshotManifests = firstSnapshot.allManifests(table.io()); - Assert.assertEquals(1, firstSnapshotManifests.size()); + assertThat(firstSnapshotManifests).hasSize(1); ManifestFile firstSnapshotManifest = firstSnapshotManifests.get(0); ManifestFile firstNewManifest = @@ -838,20 +808,20 @@ public void 
testManifestReplacementConcurrentConflictingDelete() throws IOExcept table.newDelete().deleteFile(FILE_A).commit(); - Assertions.assertThatThrownBy(rewriteManifests::commit) + assertThatThrownBy(rewriteManifests::commit) .isInstanceOf(ValidationException.class) .hasMessageStartingWith("Manifest is missing"); } - @Test + @TestTemplate public void testManifestReplacementCombinedWithRewrite() throws IOException { - Assert.assertNull("Table should be empty", table.currentSnapshot()); + assertThat(table.currentSnapshot()).isNull(); table.newFastAppend().appendFile(FILE_A).commit(); Snapshot firstSnapshot = table.currentSnapshot(); List firstSnapshotManifests = firstSnapshot.allManifests(table.io()); - Assert.assertEquals(1, firstSnapshotManifests.size()); + assertThat(firstSnapshotManifests).hasSize(1); ManifestFile firstSnapshotManifest = firstSnapshotManifests.get(0); table.newFastAppend().appendFile(FILE_B).commit(); @@ -862,7 +832,7 @@ public void testManifestReplacementCombinedWithRewrite() throws IOException { table.newFastAppend().appendFile(FILE_D).commit(); - Assert.assertEquals(4, Iterables.size(table.snapshots())); + assertThat(table.snapshots()).hasSize(4); ManifestFile newManifest = writeManifest( @@ -886,7 +856,7 @@ public void testManifestReplacementCombinedWithRewrite() throws IOException { Snapshot snapshot = table.currentSnapshot(); List manifests = snapshot.allManifests(table.io()); - Assert.assertEquals(3, manifests.size()); + assertThat(manifests).hasSize(3); validateSummary(snapshot, 3, 1, 2, 2); @@ -903,9 +873,9 @@ public void testManifestReplacementCombinedWithRewrite() throws IOException { statuses(ManifestEntry.Status.ADDED)); } - @Test + @TestTemplate public void testManifestReplacementCombinedWithRewriteConcurrentDelete() throws IOException { - Assert.assertNull("Table should be empty", table.currentSnapshot()); + assertThat(table.currentSnapshot()).isNull(); table.updateProperties().set(MANIFEST_MERGE_ENABLED, "false").commit(); @@ -913,7 
+883,7 @@ public void testManifestReplacementCombinedWithRewriteConcurrentDelete() throws Snapshot firstSnapshot = table.currentSnapshot(); List firstSnapshotManifests = firstSnapshot.allManifests(table.io()); - Assert.assertEquals(1, firstSnapshotManifests.size()); + assertThat(firstSnapshotManifests).hasSize(1); ManifestFile firstSnapshotManifest = firstSnapshotManifests.get(0); table.newFastAppend().appendFile(FILE_B).commit(); @@ -922,7 +892,7 @@ public void testManifestReplacementCombinedWithRewriteConcurrentDelete() throws table.newFastAppend().appendFile(FILE_C).commit(); - Assert.assertEquals(3, Iterables.size(table.snapshots())); + assertThat(table.snapshots()).hasSize(3); ManifestEntry entry = manifestEntry(ManifestEntry.Status.EXISTING, firstSnapshot.snapshotId(), FILE_A); @@ -945,7 +915,7 @@ public void testManifestReplacementCombinedWithRewriteConcurrentDelete() throws Snapshot snapshot = table.currentSnapshot(); List manifests = snapshot.allManifests(table.io()); - Assert.assertEquals(2, manifests.size()); + assertThat(manifests).hasSize(2); validateSummary(snapshot, 3, 0, 2, 1); @@ -962,15 +932,15 @@ public void testManifestReplacementCombinedWithRewriteConcurrentDelete() throws statuses(ManifestEntry.Status.EXISTING)); } - @Test + @TestTemplate public void testInvalidUsage() throws IOException { - Assert.assertNull("Table should be empty", table.currentSnapshot()); + assertThat(table.currentSnapshot()).isNull(); table.newFastAppend().appendFile(FILE_A).commit(); Snapshot snapshot = table.currentSnapshot(); List manifests = snapshot.allManifests(table.io()); - Assert.assertEquals(1, manifests.size()); + assertThat(manifests).hasSize(1); ManifestFile manifest = manifests.get(0); ManifestEntry appendEntry = @@ -980,7 +950,7 @@ public void testInvalidUsage() throws IOException { ManifestFile invalidAddedFileManifest = writeManifest("manifest-file-2.avro", appendEntry); - Assertions.assertThatThrownBy( + assertThatThrownBy( () -> table 
.rewriteManifests() @@ -997,7 +967,7 @@ public void testInvalidUsage() throws IOException { ManifestFile invalidDeletedFileManifest = writeManifest("manifest-file-3.avro", deleteEntry); - Assertions.assertThatThrownBy( + assertThatThrownBy( () -> table .rewriteManifests() @@ -1007,28 +977,28 @@ public void testInvalidUsage() throws IOException { .isInstanceOf(IllegalArgumentException.class) .hasMessage("Cannot add manifest with deleted files"); - Assertions.assertThatThrownBy(() -> table.rewriteManifests().deleteManifest(manifest).commit()) + assertThatThrownBy(() -> table.rewriteManifests().deleteManifest(manifest).commit()) .isInstanceOf(ValidationException.class) .hasMessageStartingWith( "Replaced and created manifests must have the same number of active files"); } - @Test + @TestTemplate public void testManifestReplacementFailure() throws IOException { - Assert.assertNull("Table should be empty", table.currentSnapshot()); + assertThat(table.currentSnapshot()).isNull(); table.newFastAppend().appendFile(FILE_A).commit(); Snapshot firstSnapshot = table.currentSnapshot(); List firstSnapshotManifests = firstSnapshot.allManifests(table.io()); - Assert.assertEquals(1, firstSnapshotManifests.size()); + assertThat(firstSnapshotManifests).hasSize(1); ManifestFile firstSnapshotManifest = firstSnapshotManifests.get(0); table.newFastAppend().appendFile(FILE_B).commit(); Snapshot secondSnapshot = table.currentSnapshot(); List secondSnapshotManifests = secondSnapshot.allManifests(table.io()); - Assert.assertEquals(2, secondSnapshotManifests.size()); + assertThat(secondSnapshotManifests).hasSize(2); ManifestFile secondSnapshotManifest = secondSnapshotManifests.get(0); ManifestFile newManifest = @@ -1046,16 +1016,16 @@ public void testManifestReplacementFailure() throws IOException { rewriteManifests.deleteManifest(secondSnapshotManifest); rewriteManifests.addManifest(newManifest); - Assertions.assertThatThrownBy(rewriteManifests::commit) + 
assertThatThrownBy(rewriteManifests::commit) .isInstanceOf(CommitFailedException.class) .hasMessage("Injected failure"); - Assert.assertTrue("New manifest should not be deleted", new File(newManifest.path()).exists()); + assertThat(new File(newManifest.path())).exists(); } - @Test + @TestTemplate public void testManifestReplacementFailureWithSnapshotIdInheritance() throws IOException { - Assert.assertNull("Table should be empty", table.currentSnapshot()); + assertThat(table.currentSnapshot()).isNull(); table.updateProperties().set(SNAPSHOT_ID_INHERITANCE_ENABLED, "true").commit(); @@ -1063,14 +1033,14 @@ public void testManifestReplacementFailureWithSnapshotIdInheritance() throws IOE Snapshot firstSnapshot = table.currentSnapshot(); List firstSnapshotManifests = firstSnapshot.allManifests(table.io()); - Assert.assertEquals(1, firstSnapshotManifests.size()); + assertThat(firstSnapshotManifests).hasSize(1); ManifestFile firstSnapshotManifest = firstSnapshotManifests.get(0); table.newFastAppend().appendFile(FILE_B).commit(); Snapshot secondSnapshot = table.currentSnapshot(); List secondSnapshotManifests = secondSnapshot.allManifests(table.io()); - Assert.assertEquals(2, secondSnapshotManifests.size()); + assertThat(secondSnapshotManifests).hasSize(2); ManifestFile secondSnapshotManifest = secondSnapshotManifests.get(0); ManifestFile newManifest = @@ -1088,27 +1058,27 @@ public void testManifestReplacementFailureWithSnapshotIdInheritance() throws IOE rewriteManifests.deleteManifest(secondSnapshotManifest); rewriteManifests.addManifest(newManifest); - Assertions.assertThatThrownBy(rewriteManifests::commit) + assertThatThrownBy(rewriteManifests::commit) .isInstanceOf(CommitFailedException.class) .hasMessage("Injected failure"); - Assert.assertTrue("New manifest should not be deleted", new File(newManifest.path()).exists()); + assertThat(new File(newManifest.path())).exists(); } - @Test + @TestTemplate public void testRewriteManifestsOnBranchUnsupported() { 
table.newFastAppend().appendFile(FILE_A).appendFile(FILE_B).commit(); - Assert.assertEquals(1, table.currentSnapshot().allManifests(table.io()).size()); + assertThat(table.currentSnapshot().allManifests(table.io())).hasSize(1); - Assertions.assertThatThrownBy(() -> table.rewriteManifests().toBranch("someBranch").commit()) + assertThatThrownBy(() -> table.rewriteManifests().toBranch("someBranch").commit()) .isInstanceOf(UnsupportedOperationException.class) .hasMessage( "Cannot commit to branch someBranch: org.apache.iceberg.BaseRewriteManifests does not support branch commits"); } - @Test + @TestTemplate public void testRewriteDataManifestsPreservesDeletes() { assumeThat(formatVersion).isGreaterThan(1); @@ -1170,7 +1140,7 @@ public void testRewriteDataManifestsPreservesDeletes() { statuses(ManifestEntry.Status.ADDED, ManifestEntry.Status.ADDED)); } - @Test + @TestTemplate public void testReplaceDeleteManifestsOnly() throws IOException { assumeThat(formatVersion).isGreaterThan(1); @@ -1256,7 +1226,7 @@ public void testReplaceDeleteManifestsOnly() throws IOException { statuses(ManifestEntry.Status.EXISTING)); } - @Test + @TestTemplate public void testReplaceDataAndDeleteManifests() throws IOException { assumeThat(formatVersion).isGreaterThan(1); @@ -1375,7 +1345,7 @@ public void testReplaceDataAndDeleteManifests() throws IOException { statuses(ManifestEntry.Status.EXISTING)); } - @Test + @TestTemplate public void testDeleteManifestReplacementConcurrentAppend() throws IOException { assumeThat(formatVersion).isGreaterThan(1); @@ -1478,7 +1448,7 @@ public void testDeleteManifestReplacementConcurrentAppend() throws IOException { statuses(ManifestEntry.Status.EXISTING)); } - @Test + @TestTemplate public void testDeleteManifestReplacementConcurrentDeleteFileRemoval() throws IOException { assumeThat(formatVersion).isGreaterThan(1); @@ -1586,7 +1556,7 @@ public void testDeleteManifestReplacementConcurrentDeleteFileRemoval() throws IO statuses(ManifestEntry.Status.DELETED, 
ManifestEntry.Status.EXISTING)); } - @Test + @TestTemplate public void testDeleteManifestReplacementConflictingDeleteFileRemoval() throws IOException { assumeThat(formatVersion).isGreaterThan(1); @@ -1632,12 +1602,12 @@ public void testDeleteManifestReplacementConflictingDeleteFileRemoval() throws I table.newRewrite().deleteFile(FILE_A_DELETES).commit(); // the rewrite must fail as the original delete manifest was replaced concurrently - Assertions.assertThatThrownBy(rewriteManifests::commit) + assertThatThrownBy(rewriteManifests::commit) .isInstanceOf(ValidationException.class) .hasMessageStartingWith("Manifest is missing"); } - @Test + @TestTemplate public void testDeleteManifestReplacementFailure() throws IOException { assumeThat(formatVersion).isGreaterThan(1); @@ -1693,7 +1663,7 @@ public void testDeleteManifestReplacementFailure() throws IOException { rewriteManifests.addManifest(newDeleteManifest); // the rewrite must fail - Assertions.assertThatThrownBy(rewriteManifests::commit) + assertThatThrownBy(rewriteManifests::commit) .isInstanceOf(CommitFailedException.class) .hasMessage("Injected failure"); @@ -1717,18 +1687,11 @@ private List sortedDataManifests(FileIO io, Snapshot snapshot) { private void validateSummary( Snapshot snapshot, int replaced, int kept, int created, int entryCount) { Map summary = snapshot.summary(); - Assert.assertEquals( - "Replaced manifest count should match", - replaced, - Integer.parseInt(summary.get("manifests-replaced"))); - Assert.assertEquals( - "Kept manifest count should match", kept, Integer.parseInt(summary.get("manifests-kept"))); - Assert.assertEquals( - "Created manifest count should match", - created, - Integer.parseInt(summary.get("manifests-created"))); - Assert.assertEquals( - "Entry count should match", entryCount, Integer.parseInt(summary.get("entries-processed"))); + assertThat(summary) + .containsEntry("manifests-replaced", String.valueOf(replaced)) + .containsEntry("manifests-kept", String.valueOf(kept)) + 
.containsEntry("manifests-created", String.valueOf(created)) + .containsEntry("entries-processed", String.valueOf(entryCount)); } private void matchNumberOfManifestFileWithSpecId( @@ -1740,12 +1703,12 @@ private void matchNumberOfManifestFileWithSpecId( .filter(m -> m.partitionSpecId() == toBeMatchedPartitionSpecId) .count(); - Assert.assertEquals( - "manifest list should have " - + numberOfManifestWithPartitionSpecID - + " manifests matching this partitionSpecId " - + toBeMatchedPartitionSpecId, - numberOfManifestWithPartitionSpecID, - matchedManifestsCounter); + assertThat(matchedManifestsCounter) + .as( + "manifest list should have " + + numberOfManifestWithPartitionSpecID + + " manifests matching this partitionSpecId " + + toBeMatchedPartitionSpecId) + .isEqualTo(numberOfManifestWithPartitionSpecID); } } diff --git a/core/src/test/java/org/apache/iceberg/TestRowDelta.java b/core/src/test/java/org/apache/iceberg/TestRowDelta.java index 56bab52edaf9..867e4b062f4d 100644 --- a/core/src/test/java/org/apache/iceberg/TestRowDelta.java +++ b/core/src/test/java/org/apache/iceberg/TestRowDelta.java @@ -27,7 +27,10 @@ import static org.apache.iceberg.SnapshotSummary.TOTAL_DELETE_FILES_PROP; import static org.apache.iceberg.SnapshotSummary.TOTAL_POS_DELETES_PROP; import static org.apache.iceberg.util.SnapshotUtil.latestSnapshot; +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; +import java.util.Arrays; import java.util.List; import java.util.Map; import java.util.Set; @@ -38,44 +41,34 @@ import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableSet; import org.apache.iceberg.relocated.com.google.common.collect.Sets; -import org.assertj.core.api.Assertions; -import org.junit.Assert; -import org.junit.Test; -import org.junit.runner.RunWith; -import org.junit.runners.Parameterized; +import 
org.junit.jupiter.api.TestTemplate; +import org.junit.jupiter.api.extension.ExtendWith; -@RunWith(Parameterized.class) +@ExtendWith(ParameterizedTestExtension.class) public class TestRowDelta extends V2TableTestBase { - private final String branch; + @Parameter(index = 1) + private String branch; - @Parameterized.Parameters(name = "branch = {0}") - public static Object[] parameters() { - return new Object[][] { - new Object[] {"main"}, new Object[] {"testBranch"}, - }; + @Parameters(name = "formatVersion = {0}, branch = {1}") + protected static List parameters() { + return Arrays.asList(new Object[] {2, "main"}, new Object[] {2, "testBranch"}); } - public TestRowDelta(String branch) { - this.branch = branch; - } - - @Test + @TestTemplate public void testAddDeleteFile() { SnapshotUpdate rowDelta = table.newRowDelta().addRows(FILE_A).addDeletes(FILE_A_DELETES).addDeletes(FILE_B_DELETES); commit(table, rowDelta, branch); Snapshot snap = latestSnapshot(table, branch); - Assert.assertEquals("Commit should produce sequence number 1", 1, snap.sequenceNumber()); - Assert.assertEquals( - "Last sequence number should be 1", 1, table.ops().current().lastSequenceNumber()); - Assert.assertEquals( - "Delta commit should use operation 'overwrite'", - DataOperations.OVERWRITE, - snap.operation()); - - Assert.assertEquals("Should produce 1 data manifest", 1, snap.dataManifests(table.io()).size()); + assertThat(snap.sequenceNumber()).isEqualTo(1); + assertThat(table.ops().current().lastSequenceNumber()).isEqualTo(1); + assertThat(snap.operation()) + .as("Delta commit should use operation 'overwrite'") + .isEqualTo(DataOperations.OVERWRITE); + assertThat(snap.dataManifests(table.io())).hasSize(1); + validateManifest( snap.dataManifests(table.io()).get(0), dataSeqs(1L), @@ -84,8 +77,7 @@ public void testAddDeleteFile() { files(FILE_A), statuses(Status.ADDED)); - Assert.assertEquals( - "Should produce 1 delete manifest", 1, snap.deleteManifests(table.io()).size()); + 
assertThat(snap.deleteManifests(table.io())).hasSize(1); validateDeleteManifest( snap.deleteManifests(table.io()).get(0), dataSeqs(1L, 1L), @@ -95,7 +87,7 @@ public void testAddDeleteFile() { statuses(Status.ADDED, Status.ADDED)); } - @Test + @TestTemplate public void testValidateDataFilesExistDefaults() { SnapshotUpdate rowDelta1 = table.newAppend().appendFile(FILE_A).appendFile(FILE_B); @@ -116,7 +108,7 @@ public void testValidateDataFilesExistDefaults() { long deleteSnapshotId = latestSnapshot(table, branch).snapshotId(); - Assertions.assertThatThrownBy( + assertThatThrownBy( () -> commit( table, @@ -129,15 +121,11 @@ public void testValidateDataFilesExistDefaults() { .isInstanceOf(ValidationException.class) .hasMessageStartingWith("Cannot commit, missing data files"); - Assert.assertEquals( - "Table state should not be modified by failed RowDelta operation", - deleteSnapshotId, - latestSnapshot(table, branch).snapshotId()); + assertThat(latestSnapshot(table, branch).snapshotId()) + .as("Table state should not be modified by failed RowDelta operation") + .isEqualTo(deleteSnapshotId); - Assert.assertEquals( - "Table should not have any delete manifests", - 0, - latestSnapshot(table, branch).deleteManifests(table.io()).size()); + assertThat(latestSnapshot(table, branch).deleteManifests(table.io())).isEmpty(); commit( table, @@ -148,10 +136,7 @@ public void testValidateDataFilesExistDefaults() { .validateFromSnapshot(validateFromSnapshotId), branch); - Assert.assertEquals( - "Table should have one new delete manifest", - 1, - latestSnapshot(table, branch).deleteManifests(table.io()).size()); + assertThat(latestSnapshot(table, branch).deleteManifests(table.io())).hasSize(1); ManifestFile deletes = latestSnapshot(table, branch).deleteManifests(table.io()).get(0); validateDeleteManifest( deletes, @@ -162,7 +147,7 @@ public void testValidateDataFilesExistDefaults() { statuses(Status.ADDED)); } - @Test + @TestTemplate public void testValidateDataFilesExistOverwrite() { 
commit(table, table.newAppend().appendFile(FILE_A).appendFile(FILE_B), branch); @@ -174,7 +159,7 @@ public void testValidateDataFilesExistOverwrite() { long deleteSnapshotId = latestSnapshot(table, branch).snapshotId(); - Assertions.assertThatThrownBy( + assertThatThrownBy( () -> commit( table, @@ -187,18 +172,14 @@ public void testValidateDataFilesExistOverwrite() { .isInstanceOf(ValidationException.class) .hasMessageStartingWith("Cannot commit, missing data files"); - Assert.assertEquals( - "Table state should not be modified by failed RowDelta operation", - deleteSnapshotId, - latestSnapshot(table, branch).snapshotId()); + assertThat(latestSnapshot(table, branch).snapshotId()) + .as("Table state should not be modified by failed RowDelta operation") + .isEqualTo(deleteSnapshotId); - Assert.assertEquals( - "Table should not have any delete manifests", - 0, - latestSnapshot(table, branch).deleteManifests(table.io()).size()); + assertThat(latestSnapshot(table, branch).deleteManifests(table.io())).isEmpty(); } - @Test + @TestTemplate public void testValidateDataFilesExistReplacePartitions() { commit(table, table.newAppend().appendFile(FILE_A).appendFile(FILE_B), branch); @@ -210,7 +191,7 @@ public void testValidateDataFilesExistReplacePartitions() { long deleteSnapshotId = latestSnapshot(table, branch).snapshotId(); - Assertions.assertThatThrownBy( + assertThatThrownBy( () -> commit( table, @@ -223,18 +204,14 @@ public void testValidateDataFilesExistReplacePartitions() { .isInstanceOf(ValidationException.class) .hasMessageStartingWith("Cannot commit, missing data files"); - Assert.assertEquals( - "Table state should not be modified by failed RowDelta operation", - deleteSnapshotId, - latestSnapshot(table, branch).snapshotId()); + assertThat(latestSnapshot(table, branch).snapshotId()) + .as("Table state should not be modified by failed RowDelta operation") + .isEqualTo(deleteSnapshotId); - Assert.assertEquals( - "Table should not have any delete manifests", - 0, - 
latestSnapshot(table, branch).deleteManifests(table.io()).size()); + assertThat(latestSnapshot(table, branch).deleteManifests(table.io())).isEmpty(); } - @Test + @TestTemplate public void testValidateDataFilesExistFromSnapshot() { commit(table, table.newAppend().appendFile(FILE_A).appendFile(FILE_B), branch); @@ -259,11 +236,10 @@ public void testValidateDataFilesExistFromSnapshot() { branch); Snapshot snap = latestSnapshot(table, branch); - Assert.assertEquals("Commit should produce sequence number 2", 3, snap.sequenceNumber()); - Assert.assertEquals( - "Last sequence number should be 3", 3, table.ops().current().lastSequenceNumber()); + assertThat(snap.sequenceNumber()).isEqualTo(3); + assertThat(table.ops().current().lastSequenceNumber()).isEqualTo(3); - Assert.assertEquals("Should have 2 data manifests", 2, snap.dataManifests(table.io()).size()); + assertThat(snap.dataManifests(table.io())).hasSize(2); // manifest with FILE_A2 added validateManifest( snap.dataManifests(table.io()).get(0), @@ -282,8 +258,7 @@ public void testValidateDataFilesExistFromSnapshot() { files(FILE_A, FILE_B), statuses(Status.DELETED, Status.EXISTING)); - Assert.assertEquals( - "Should have 1 delete manifest", 1, snap.deleteManifests(table.io()).size()); + assertThat(snap.deleteManifests(table.io())).hasSize(1); validateDeleteManifest( snap.deleteManifests(table.io()).get(0), dataSeqs(3L), @@ -293,7 +268,7 @@ public void testValidateDataFilesExistFromSnapshot() { statuses(Status.ADDED)); } - @Test + @TestTemplate public void testValidateDataFilesExistRewrite() { commit(table, table.newAppend().appendFile(FILE_A).appendFile(FILE_B), branch); @@ -308,7 +283,7 @@ public void testValidateDataFilesExistRewrite() { long deleteSnapshotId = latestSnapshot(table, branch).snapshotId(); - Assertions.assertThatThrownBy( + assertThatThrownBy( () -> commit( table, @@ -321,18 +296,14 @@ public void testValidateDataFilesExistRewrite() { .isInstanceOf(ValidationException.class) 
.hasMessageStartingWith("Cannot commit, missing data files"); - Assert.assertEquals( - "Table state should not be modified by failed RowDelta operation", - deleteSnapshotId, - latestSnapshot(table, branch).snapshotId()); + assertThat(latestSnapshot(table, branch).snapshotId()) + .as("Table state should not be modified by failed RowDelta operation") + .isEqualTo(deleteSnapshotId); - Assert.assertEquals( - "Table should not have any delete manifests", - 0, - latestSnapshot(table, branch).deleteManifests(table.io()).size()); + assertThat(latestSnapshot(table, branch).deleteManifests(table.io())).isEmpty(); } - @Test + @TestTemplate public void testValidateDataFilesExistValidateDeletes() { commit(table, table.newAppend().appendFile(FILE_A).appendFile(FILE_B), branch); @@ -344,7 +315,7 @@ public void testValidateDataFilesExistValidateDeletes() { long deleteSnapshotId = latestSnapshot(table, branch).snapshotId(); - Assertions.assertThatThrownBy( + assertThatThrownBy( () -> commit( table, @@ -358,18 +329,14 @@ public void testValidateDataFilesExistValidateDeletes() { .isInstanceOf(ValidationException.class) .hasMessageStartingWith("Cannot commit, missing data files"); - Assert.assertEquals( - "Table state should not be modified by failed RowDelta operation", - deleteSnapshotId, - latestSnapshot(table, branch).snapshotId()); + assertThat(latestSnapshot(table, branch).snapshotId()) + .as("Table state should not be modified by failed RowDelta operation") + .isEqualTo(deleteSnapshotId); - Assert.assertEquals( - "Table should not have any delete manifests", - 0, - latestSnapshot(table, branch).deleteManifests(table.io()).size()); + assertThat(latestSnapshot(table, branch).deleteManifests(table.io())).isEmpty(); } - @Test + @TestTemplate public void testValidateNoConflicts() { commit(table, table.newAppend().appendFile(FILE_A), branch); @@ -381,7 +348,7 @@ public void testValidateNoConflicts() { long appendSnapshotId = latestSnapshot(table, branch).snapshotId(); - 
Assertions.assertThatThrownBy( + assertThatThrownBy( () -> commit( table, @@ -396,18 +363,14 @@ public void testValidateNoConflicts() { .isInstanceOf(ValidationException.class) .hasMessageStartingWith("Found conflicting files"); - Assert.assertEquals( - "Table state should not be modified by failed RowDelta operation", - appendSnapshotId, - latestSnapshot(table, branch).snapshotId()); + assertThat(latestSnapshot(table, branch).snapshotId()) + .as("Table state should not be modified by failed RowDelta operation") + .isEqualTo(appendSnapshotId); - Assert.assertEquals( - "Table should not have any delete manifests", - 0, - latestSnapshot(table, branch).deleteManifests(table.io()).size()); + assertThat(latestSnapshot(table, branch).deleteManifests(table.io())).isEmpty(); } - @Test + @TestTemplate public void testValidateNoConflictsFromSnapshot() { commit(table, table.newAppend().appendFile(FILE_A), branch); @@ -433,11 +396,10 @@ public void testValidateNoConflictsFromSnapshot() { branch); Snapshot snap = latestSnapshot(table, branch); - Assert.assertEquals("Commit should produce sequence number 2", 3, snap.sequenceNumber()); - Assert.assertEquals( - "Last sequence number should be 3", 3, table.ops().current().lastSequenceNumber()); + assertThat(snap.sequenceNumber()).isEqualTo(3); + assertThat(table.ops().current().lastSequenceNumber()).isEqualTo(3); - Assert.assertEquals("Should have 2 data manifests", 2, snap.dataManifests(table.io()).size()); + assertThat(snap.dataManifests(table.io())).hasSize(2); // manifest with FILE_A2 added validateManifest( snap.dataManifests(table.io()).get(0), @@ -456,8 +418,7 @@ public void testValidateNoConflictsFromSnapshot() { files(FILE_A), statuses(Status.ADDED)); - Assert.assertEquals( - "Should have 1 delete manifest", 1, snap.deleteManifests(table.io()).size()); + assertThat(snap.deleteManifests(table.io())).hasSize(1); validateDeleteManifest( snap.deleteManifests(table.io()).get(0), dataSeqs(3L), @@ -467,7 +428,7 @@ public void 
testValidateNoConflictsFromSnapshot() { statuses(Status.ADDED)); } - @Test + @TestTemplate public void testOverwriteWithDeleteFile() { commit( table, @@ -475,12 +436,8 @@ public void testOverwriteWithDeleteFile() { branch); long deltaSnapshotId = latestSnapshot(table, branch).snapshotId(); - Assert.assertEquals( - "Commit should produce sequence number 1", - 1, - latestSnapshot(table, branch).sequenceNumber()); - Assert.assertEquals( - "Last sequence number should be 1", 1, table.ops().current().lastSequenceNumber()); + assertThat(latestSnapshot(table, branch).sequenceNumber()).isEqualTo(1); + assertThat(table.ops().current().lastSequenceNumber()).isEqualTo(1); // overwriting by a filter will also remove delete files that match because all matching data // files are removed. @@ -492,11 +449,10 @@ public void testOverwriteWithDeleteFile() { branch); Snapshot snap = latestSnapshot(table, branch); - Assert.assertEquals("Commit should produce sequence number 2", 2, snap.sequenceNumber()); - Assert.assertEquals( - "Last sequence number should be 2", 2, table.ops().current().lastSequenceNumber()); + assertThat(snap.sequenceNumber()).isEqualTo(2); + assertThat(table.ops().current().lastSequenceNumber()).isEqualTo(2); - Assert.assertEquals("Should produce 1 data manifest", 1, snap.dataManifests(table.io()).size()); + assertThat(snap.dataManifests(table.io())).hasSize(1); validateManifest( snap.dataManifests(table.io()).get(0), dataSeqs(1L), @@ -505,8 +461,7 @@ public void testOverwriteWithDeleteFile() { files(FILE_A), statuses(Status.DELETED)); - Assert.assertEquals( - "Should produce 1 delete manifest", 1, snap.deleteManifests(table.io()).size()); + assertThat(snap.deleteManifests(table.io())).hasSize(1); validateDeleteManifest( snap.deleteManifests(table.io()).get(0), dataSeqs(1L, 1L), @@ -516,7 +471,7 @@ public void testOverwriteWithDeleteFile() { statuses(Status.DELETED, Status.EXISTING)); } - @Test + @TestTemplate public void testReplacePartitionsWithDeleteFile() { 
commit( table, @@ -524,24 +479,18 @@ public void testReplacePartitionsWithDeleteFile() { branch); long deltaSnapshotId = latestSnapshot(table, branch).snapshotId(); - Assert.assertEquals( - "Commit should produce sequence number 1", - 1, - latestSnapshot(table, branch).sequenceNumber()); - Assert.assertEquals( - "Last sequence number should be 1", 1, table.ops().current().lastSequenceNumber()); + assertThat(latestSnapshot(table, branch).sequenceNumber()).isEqualTo(1); + assertThat(table.ops().current().lastSequenceNumber()).isEqualTo(1); // overwriting the partition will also remove delete files that match because all matching data // files are removed. commit(table, table.newReplacePartitions().addFile(FILE_A2), branch); Snapshot snap = latestSnapshot(table, branch); - Assert.assertEquals("Commit should produce sequence number 2", 2, snap.sequenceNumber()); - Assert.assertEquals( - "Last sequence number should be 2", 2, table.ops().current().lastSequenceNumber()); + assertThat(snap.sequenceNumber()).isEqualTo(2); + assertThat(table.ops().current().lastSequenceNumber()).isEqualTo(2); - Assert.assertEquals( - "Should produce 2 data manifests", 2, snap.dataManifests(table.io()).size()); + assertThat(snap.dataManifests(table.io())).hasSize(2); int deleteManifestPos = snap.dataManifests(table.io()).get(0).deletedFilesCount() > 0 ? 
0 : 1; validateManifest( snap.dataManifests(table.io()).get(deleteManifestPos), @@ -559,8 +508,7 @@ public void testReplacePartitionsWithDeleteFile() { files(FILE_A2), statuses(Status.ADDED)); - Assert.assertEquals( - "Should produce 1 delete manifest", 1, snap.deleteManifests(table.io()).size()); + assertThat(snap.deleteManifests(table.io())).hasSize(1); validateDeleteManifest( snap.deleteManifests(table.io()).get(0), dataSeqs(1L, 1L), @@ -570,7 +518,7 @@ public void testReplacePartitionsWithDeleteFile() { statuses(Status.DELETED, Status.EXISTING)); } - @Test + @TestTemplate public void testDeleteByExpressionWithDeleteFile() { commit( table, @@ -578,23 +526,18 @@ public void testDeleteByExpressionWithDeleteFile() { branch); long deltaSnapshotId = latestSnapshot(table, branch).snapshotId(); - Assert.assertEquals( - "Commit should produce sequence number 1", - 1, - latestSnapshot(table, branch).sequenceNumber()); - Assert.assertEquals( - "Last sequence number should be 1", 1, table.ops().current().lastSequenceNumber()); + assertThat(latestSnapshot(table, branch).sequenceNumber()).isEqualTo(1); + assertThat(table.ops().current().lastSequenceNumber()).isEqualTo(1); // deleting with a filter will also remove delete files that match because all matching data // files are removed. 
commit(table, table.newDelete().deleteFromRowFilter(Expressions.alwaysTrue()), branch); Snapshot snap = latestSnapshot(table, branch); - Assert.assertEquals("Commit should produce sequence number 2", 2, snap.sequenceNumber()); - Assert.assertEquals( - "Last sequence number should be 2", 2, table.ops().current().lastSequenceNumber()); + assertThat(snap.sequenceNumber()).isEqualTo(2); + assertThat(table.ops().current().lastSequenceNumber()).isEqualTo(2); - Assert.assertEquals("Should produce 1 data manifest", 1, snap.dataManifests(table.io()).size()); + assertThat(snap.dataManifests(table.io())).hasSize(1); validateManifest( snap.dataManifests(table.io()).get(0), dataSeqs(1L), @@ -603,8 +546,7 @@ public void testDeleteByExpressionWithDeleteFile() { files(FILE_A), statuses(Status.DELETED)); - Assert.assertEquals( - "Should produce 1 delete manifest", 1, snap.deleteManifests(table.io()).size()); + assertThat(snap.deleteManifests(table.io())).hasSize(1); validateDeleteManifest( snap.deleteManifests(table.io()).get(0), dataSeqs(1L, 1L), @@ -614,28 +556,22 @@ public void testDeleteByExpressionWithDeleteFile() { statuses(Status.DELETED, Status.DELETED)); } - @Test + @TestTemplate public void testDeleteDataFileWithDeleteFile() { commit(table, table.newRowDelta().addRows(FILE_A).addDeletes(FILE_A_DELETES), branch); long deltaSnapshotId = latestSnapshot(table, branch).snapshotId(); - Assert.assertEquals( - "Commit should produce sequence number 1", - 1, - latestSnapshot(table, branch).sequenceNumber()); - Assert.assertEquals( - "Last sequence number should be 1", 1, table.ops().current().lastSequenceNumber()); + assertThat(latestSnapshot(table, branch).sequenceNumber()).isEqualTo(1); + assertThat(table.ops().current().lastSequenceNumber()).isEqualTo(1); // deleting a specific data file will not affect a delete file commit(table, table.newDelete().deleteFile(FILE_A), branch); Snapshot deleteSnap = latestSnapshot(table, branch); - Assert.assertEquals("Commit should produce 
sequence number 2", 2, deleteSnap.sequenceNumber()); - Assert.assertEquals( - "Last sequence number should be 2", 2, table.ops().current().lastSequenceNumber()); + assertThat(deleteSnap.sequenceNumber()).isEqualTo(2); + assertThat(table.ops().current().lastSequenceNumber()).isEqualTo(2); - Assert.assertEquals( - "Should produce 1 data manifest", 1, deleteSnap.dataManifests(table.io()).size()); + assertThat(deleteSnap.dataManifests(table.io())).hasSize(1); validateManifest( deleteSnap.dataManifests(table.io()).get(0), dataSeqs(1L), @@ -644,8 +580,7 @@ public void testDeleteDataFileWithDeleteFile() { files(FILE_A), statuses(Status.DELETED)); - Assert.assertEquals( - "Should produce 1 delete manifest", 1, deleteSnap.deleteManifests(table.io()).size()); + assertThat(deleteSnap.deleteManifests(table.io())).hasSize(1); validateDeleteManifest( deleteSnap.deleteManifests(table.io()).get(0), dataSeqs(1L), @@ -662,14 +597,11 @@ public void testDeleteDataFileWithDeleteFile() { commit(table, table.newDelete().deleteFile("no-such-file"), branch); Snapshot nextSnap = latestSnapshot(table, branch); - Assert.assertEquals("Append should produce sequence number 3", 3, nextSnap.sequenceNumber()); - Assert.assertEquals( - "Last sequence number should be 3", 3, table.ops().current().lastSequenceNumber()); - - Assert.assertEquals( - "Should have 0 data manifests", 0, nextSnap.dataManifests(table.io()).size()); - Assert.assertEquals( - "Should produce 1 delete manifest", 1, nextSnap.deleteManifests(table.io()).size()); + assertThat(nextSnap.sequenceNumber()).isEqualTo(3); + assertThat(table.ops().current().lastSequenceNumber()).isEqualTo(3); + + assertThat(nextSnap.dataManifests(table.io())).isEmpty(); + assertThat(nextSnap.deleteManifests(table.io())).hasSize(1); validateDeleteManifest( nextSnap.deleteManifests(table.io()).get(0), dataSeqs(1L), @@ -679,28 +611,22 @@ public void testDeleteDataFileWithDeleteFile() { statuses(Status.DELETED)); } - @Test + @TestTemplate public void 
testFastAppendDoesNotRemoveStaleDeleteFiles() { commit(table, table.newRowDelta().addRows(FILE_A).addDeletes(FILE_A_DELETES), branch); long deltaSnapshotId = latestSnapshot(table, branch).snapshotId(); - Assert.assertEquals( - "Commit should produce sequence number 1", - 1, - latestSnapshot(table, branch).sequenceNumber()); - Assert.assertEquals( - "Last sequence number should be 1", 1, table.ops().current().lastSequenceNumber()); + assertThat(latestSnapshot(table, branch).sequenceNumber()).isEqualTo(1); + assertThat(table.ops().current().lastSequenceNumber()).isEqualTo(1); // deleting a specific data file will not affect a delete file commit(table, table.newDelete().deleteFile(FILE_A), branch); Snapshot deleteSnap = latestSnapshot(table, branch); - Assert.assertEquals("Commit should produce sequence number 2", 2, deleteSnap.sequenceNumber()); - Assert.assertEquals( - "Last sequence number should be 2", 2, table.ops().current().lastSequenceNumber()); + assertThat(deleteSnap.sequenceNumber()).isEqualTo(2); + assertThat(table.ops().current().lastSequenceNumber()).isEqualTo(2); - Assert.assertEquals( - "Should produce 1 data manifest", 1, deleteSnap.dataManifests(table.io()).size()); + assertThat(deleteSnap.dataManifests(table.io())).hasSize(1); validateManifest( deleteSnap.dataManifests(table.io()).get(0), dataSeqs(1L), @@ -709,8 +635,7 @@ public void testFastAppendDoesNotRemoveStaleDeleteFiles() { files(FILE_A), statuses(Status.DELETED)); - Assert.assertEquals( - "Should produce 1 delete manifest", 1, deleteSnap.deleteManifests(table.io()).size()); + assertThat(deleteSnap.deleteManifests(table.io())).hasSize(1); validateDeleteManifest( deleteSnap.deleteManifests(table.io()).get(0), dataSeqs(1L), @@ -724,12 +649,10 @@ public void testFastAppendDoesNotRemoveStaleDeleteFiles() { commit(table, table.newFastAppend().appendFile(FILE_B), branch); Snapshot nextSnap = latestSnapshot(table, branch); - Assert.assertEquals("Append should produce sequence number 3", 3, 
nextSnap.sequenceNumber()); - Assert.assertEquals( - "Last sequence number should be 3", 3, table.ops().current().lastSequenceNumber()); + assertThat(nextSnap.sequenceNumber()).isEqualTo(3); + assertThat(table.ops().current().lastSequenceNumber()).isEqualTo(3); - Assert.assertEquals( - "Should have 2 data manifests", 2, nextSnap.dataManifests(table.io()).size()); + assertThat(nextSnap.dataManifests(table.io())).hasSize(2); int deleteManifestPos = nextSnap.dataManifests(table.io()).get(0).deletedFilesCount() > 0 ? 0 : 1; validateManifest( @@ -748,8 +671,7 @@ public void testFastAppendDoesNotRemoveStaleDeleteFiles() { files(FILE_B), statuses(Status.ADDED)); - Assert.assertEquals( - "Should produce 1 delete manifest", 1, nextSnap.deleteManifests(table.io()).size()); + assertThat(nextSnap.deleteManifests(table.io())).hasSize(1); validateDeleteManifest( nextSnap.deleteManifests(table.io()).get(0), dataSeqs(1L), @@ -759,7 +681,7 @@ public void testFastAppendDoesNotRemoveStaleDeleteFiles() { statuses(Status.ADDED)); } - @Test + @TestTemplate public void testValidateDataFilesExistWithConflictDetectionFilter() { // change the spec to be partitioned by data table @@ -820,10 +742,7 @@ public void testValidateDataFilesExistWithConflictDetectionFilter() { // commit the delta for partition A commit(table, rowDelta, branch); - Assert.assertEquals( - "Table should have one new delete manifest", - 1, - latestSnapshot(table, branch).deleteManifests(table.io()).size()); + assertThat(latestSnapshot(table, branch).deleteManifests(table.io())).hasSize(1); ManifestFile deletes = latestSnapshot(table, branch).deleteManifests(table.io()).get(0); validateDeleteManifest( deletes, @@ -834,7 +753,7 @@ public void testValidateDataFilesExistWithConflictDetectionFilter() { statuses(Status.ADDED)); } - @Test + @TestTemplate public void testValidateDataFilesDoNotExistWithConflictDetectionFilter() { // change the spec to be partitioned by data table @@ -881,12 +800,12 @@ public void 
testValidateDataFilesDoNotExistWithConflictDetectionFilter() { // concurrently delete the file for partition A commit(table, table.newDelete().deleteFile(dataFile1), branch); - Assertions.assertThatThrownBy(() -> commit(table, rowDelta, branch)) + assertThatThrownBy(() -> commit(table, rowDelta, branch)) .isInstanceOf(ValidationException.class) .hasMessageStartingWith("Cannot commit, missing data files"); } - @Test + @TestTemplate public void testAddDeleteFilesMultipleSpecs() { // enable partition summaries table.updateProperties().set(TableProperties.WRITE_PARTITION_SUMMARY_LIMIT, "10").commit(); @@ -898,7 +817,7 @@ public void testAddDeleteFilesMultipleSpecs() { // remove the only partition field to make the spec unpartitioned table.updateSpec().removeField(Expressions.bucket("data", 16)).commit(); - Assert.assertTrue("Spec must be unpartitioned", table.spec().isUnpartitioned()); + assertThat(table.spec().isUnpartitioned()).isTrue(); // append an unpartitioned data file DataFile secondSnapshotDataFile = newDataFile(""); @@ -911,7 +830,7 @@ public void testAddDeleteFilesMultipleSpecs() { DataFile thirdSnapshotDataFile = newDataFile("data=abc"); commit(table, table.newAppend().appendFile(thirdSnapshotDataFile), branch); - Assert.assertEquals("Should have 3 specs", 3, table.specs().size()); + assertThat(table.specs()).hasSize(3); // commit a row delta with 1 data file and 3 delete files where delete files have different // specs @@ -931,40 +850,32 @@ public void testAddDeleteFilesMultipleSpecs() { branch); Snapshot snapshot = latestSnapshot(table, branch); - Assert.assertEquals("Commit should produce sequence number 4", 4, snapshot.sequenceNumber()); - Assert.assertEquals( - "Last sequence number should be 4", 4, table.ops().current().lastSequenceNumber()); - Assert.assertEquals( - "Delta commit should be 'overwrite'", DataOperations.OVERWRITE, snapshot.operation()); + assertThat(snapshot.sequenceNumber()).isEqualTo(4); + 
assertThat(table.ops().current().lastSequenceNumber()).isEqualTo(4); + assertThat(snapshot.operation()).isEqualTo(DataOperations.OVERWRITE); Map summary = snapshot.summary(); - Assert.assertEquals( - "Should change 4 partitions", "4", summary.get(CHANGED_PARTITION_COUNT_PROP)); - Assert.assertEquals("Should add 1 data file", "1", summary.get(ADDED_FILES_PROP)); - Assert.assertEquals("Should have 4 data files", "4", summary.get(TOTAL_DATA_FILES_PROP)); - Assert.assertEquals("Should add 3 delete files", "3", summary.get(ADDED_DELETE_FILES_PROP)); - Assert.assertEquals("Should have 3 delete files", "3", summary.get(TOTAL_DELETE_FILES_PROP)); - Assert.assertEquals("Should add 3 position deletes", "3", summary.get(ADDED_POS_DELETES_PROP)); - Assert.assertEquals("Should have 3 position deletes", "3", summary.get(TOTAL_POS_DELETES_PROP)); - - Assert.assertTrue( - "Partition metrics must be correct", - summary - .get(CHANGED_PARTITION_PREFIX + "data_bucket=0") - .contains(ADDED_DELETE_FILES_PROP + "=1")); - Assert.assertTrue( - "Partition metrics must be correct", - summary - .get(CHANGED_PARTITION_PREFIX + "data=abc") - .contains(ADDED_DELETE_FILES_PROP + "=1")); - Assert.assertTrue( - "Partition metrics must be correct", - summary.get(CHANGED_PARTITION_PREFIX + "data=xyz").contains(ADDED_FILES_PROP + "=1")); + assertThat(summary) + .containsEntry(CHANGED_PARTITION_COUNT_PROP, "4") + .containsEntry(ADDED_FILES_PROP, "1") + .containsEntry(TOTAL_DATA_FILES_PROP, "4") + .containsEntry(ADDED_DELETE_FILES_PROP, "3") + .containsEntry(TOTAL_DELETE_FILES_PROP, "3") + .containsEntry(ADDED_POS_DELETES_PROP, "3") + .containsEntry(TOTAL_POS_DELETES_PROP, "3") + .hasEntrySatisfying( + CHANGED_PARTITION_PREFIX + "data_bucket=0", + v -> assertThat(v).contains(ADDED_DELETE_FILES_PROP + "=1")) + .hasEntrySatisfying( + CHANGED_PARTITION_PREFIX + "data=abc", + v -> assertThat(v).contains(ADDED_DELETE_FILES_PROP + "=1")) + .hasEntrySatisfying( + CHANGED_PARTITION_PREFIX + "data=xyz", + v -> 
assertThat(v).contains(ADDED_FILES_PROP + "=1")); // 3 appends + 1 row delta - Assert.assertEquals( - "Should have 4 data manifest", 4, snapshot.dataManifests(table.io()).size()); + assertThat(snapshot.dataManifests(table.io())).hasSize(4); validateManifest( snapshot.dataManifests(table.io()).get(0), dataSeqs(4L), @@ -974,12 +885,10 @@ public void testAddDeleteFilesMultipleSpecs() { statuses(Status.ADDED)); // each delete file goes into a separate manifest as the specs are different - Assert.assertEquals( - "Should produce 3 delete manifest", 3, snapshot.deleteManifests(table.io()).size()); + assertThat(snapshot.deleteManifests(table.io())).hasSize(3); ManifestFile firstDeleteManifest = snapshot.deleteManifests(table.io()).get(2); - Assert.assertEquals( - "Spec must match", firstSnapshotDataFile.specId(), firstDeleteManifest.partitionSpecId()); + assertThat(firstDeleteManifest.partitionSpecId()).isEqualTo(firstSnapshotDataFile.specId()); validateDeleteManifest( firstDeleteManifest, dataSeqs(4L), @@ -989,8 +898,7 @@ public void testAddDeleteFilesMultipleSpecs() { statuses(Status.ADDED)); ManifestFile secondDeleteManifest = snapshot.deleteManifests(table.io()).get(1); - Assert.assertEquals( - "Spec must match", secondSnapshotDataFile.specId(), secondDeleteManifest.partitionSpecId()); + assertThat(secondDeleteManifest.partitionSpecId()).isEqualTo(secondSnapshotDataFile.specId()); validateDeleteManifest( secondDeleteManifest, dataSeqs(4L), @@ -1000,8 +908,7 @@ public void testAddDeleteFilesMultipleSpecs() { statuses(Status.ADDED)); ManifestFile thirdDeleteManifest = snapshot.deleteManifests(table.io()).get(0); - Assert.assertEquals( - "Spec must match", thirdSnapshotDataFile.specId(), thirdDeleteManifest.partitionSpecId()); + assertThat(thirdDeleteManifest.partitionSpecId()).isEqualTo(thirdSnapshotDataFile.specId()); validateDeleteManifest( thirdDeleteManifest, dataSeqs(4L), @@ -1011,7 +918,7 @@ public void testAddDeleteFilesMultipleSpecs() { statuses(Status.ADDED)); } 
- @Test + @TestTemplate public void testManifestMergingMultipleSpecs() { // make sure we enable manifest merging table @@ -1027,7 +934,7 @@ public void testManifestMergingMultipleSpecs() { // remove the only partition field to make the spec unpartitioned table.updateSpec().removeField(Expressions.bucket("data", 16)).commit(); - Assert.assertTrue("Spec must be unpartitioned", table.spec().isUnpartitioned()); + assertThat(table.spec().isUnpartitioned()).isTrue(); // append an unpartitioned data file DataFile secondSnapshotDataFile = newDataFile(""); @@ -1045,10 +952,8 @@ public void testManifestMergingMultipleSpecs() { Snapshot thirdSnapshot = latestSnapshot(table, branch); // 2 appends and 1 row delta where delete files belong to different specs - Assert.assertEquals( - "Should have 2 data manifest", 2, thirdSnapshot.dataManifests(table.io()).size()); - Assert.assertEquals( - "Should have 2 delete manifest", 2, thirdSnapshot.deleteManifests(table.io()).size()); + assertThat(thirdSnapshot.dataManifests(table.io())).hasSize(2); + assertThat(thirdSnapshot.deleteManifests(table.io())).hasSize(2); // commit two more delete files to the same specs to trigger merging DeleteFile thirdDeleteFile = newDeleteFile(firstSnapshotDataFile.specId(), "data_bucket=0"); @@ -1062,14 +967,11 @@ public void testManifestMergingMultipleSpecs() { Snapshot fourthSnapshot = latestSnapshot(table, branch); // make sure merging respects spec boundaries - Assert.assertEquals( - "Should have 2 data manifest", 2, fourthSnapshot.dataManifests(table.io()).size()); - Assert.assertEquals( - "Should have 2 delete manifest", 2, fourthSnapshot.deleteManifests(table.io()).size()); + assertThat(fourthSnapshot.dataManifests(table.io())).hasSize(2); + assertThat(fourthSnapshot.deleteManifests(table.io())).hasSize(2); ManifestFile firstDeleteManifest = fourthSnapshot.deleteManifests(table.io()).get(1); - Assert.assertEquals( - "Spec must match", firstSnapshotDataFile.specId(), 
firstDeleteManifest.partitionSpecId()); + assertThat(firstDeleteManifest.partitionSpecId()).isEqualTo(firstSnapshotDataFile.specId()); validateDeleteManifest( firstDeleteManifest, dataSeqs(4L, 3L), @@ -1079,8 +981,7 @@ public void testManifestMergingMultipleSpecs() { statuses(Status.ADDED, Status.EXISTING)); ManifestFile secondDeleteManifest = fourthSnapshot.deleteManifests(table.io()).get(0); - Assert.assertEquals( - "Spec must match", secondSnapshotDataFile.specId(), secondDeleteManifest.partitionSpecId()); + assertThat(secondDeleteManifest.partitionSpecId()).isEqualTo(secondSnapshotDataFile.specId()); validateDeleteManifest( secondDeleteManifest, dataSeqs(4L, 3L), @@ -1090,7 +991,7 @@ public void testManifestMergingMultipleSpecs() { statuses(Status.ADDED, Status.EXISTING)); } - @Test + @TestTemplate public void testAbortMultipleSpecs() { // append a partitioned data file DataFile firstSnapshotDataFile = newDataFile("data_bucket=0"); @@ -1099,7 +1000,7 @@ public void testAbortMultipleSpecs() { // remove the only partition field to make the spec unpartitioned table.updateSpec().removeField(Expressions.bucket("data", 16)).commit(); - Assert.assertTrue("Spec must be unpartitioned", table.spec().isUnpartitioned()); + assertThat(table.spec().isUnpartitioned()).isTrue(); // append an unpartitioned data file DataFile secondSnapshotDataFile = newDataFile(""); @@ -1127,15 +1028,15 @@ public void testAbortMultipleSpecs() { // perform a conflicting concurrent operation commit(table, table.newDelete().deleteFile(firstSnapshotDataFile), branch); - Assertions.assertThatThrownBy(() -> commit(table, rowDelta, branch)) + assertThatThrownBy(() -> commit(table, rowDelta, branch)) .isInstanceOf(ValidationException.class) .hasMessageStartingWith("Cannot commit, missing data files"); // we should clean up 1 manifest list and 2 delete manifests - Assert.assertEquals("Should delete 3 files", 3, deletedFiles.size()); + assertThat(deletedFiles).hasSize(3); } - @Test + @TestTemplate public 
void testConcurrentConflictingRowDelta() { commit(table, table.newAppend().appendFile(FILE_A), branch); @@ -1164,12 +1065,12 @@ public void testConcurrentConflictingRowDelta() { .validateNoConflictingDataFiles() .commit(); - Assertions.assertThatThrownBy(() -> commit(table, rowDelta, branch)) + assertThatThrownBy(() -> commit(table, rowDelta, branch)) .isInstanceOf(ValidationException.class) .hasMessageStartingWith("Found new conflicting delete files"); } - @Test + @TestTemplate public void testConcurrentConflictingRowDeltaWithoutAppendValidation() { commit(table, table.newAppend().appendFile(FILE_A), branch); @@ -1195,12 +1096,12 @@ public void testConcurrentConflictingRowDeltaWithoutAppendValidation() { .validateNoConflictingDataFiles() .commit(); - Assertions.assertThatThrownBy(() -> commit(table, rowDelta, branch)) + assertThatThrownBy(() -> commit(table, rowDelta, branch)) .isInstanceOf(ValidationException.class) .hasMessageStartingWith("Found new conflicting delete files"); } - @Test + @TestTemplate public void testConcurrentNonConflictingRowDelta() { // change the spec to be partitioned by data table @@ -1278,7 +1179,7 @@ public void testConcurrentNonConflictingRowDelta() { validateBranchDeleteFiles(table, branch, deleteFile1, deleteFile2); } - @Test + @TestTemplate public void testConcurrentNonConflictingRowDeltaAndRewriteFilesWithSequenceNumber() { // change the spec to be partitioned by data table @@ -1328,7 +1229,7 @@ public void testConcurrentNonConflictingRowDeltaAndRewriteFilesWithSequenceNumbe validateBranchFiles(table, branch, dataFile2); } - @Test + @TestTemplate public void testRowDeltaAndRewriteFilesMergeManifestsWithSequenceNumber() { table.updateProperties().set(TableProperties.MANIFEST_MIN_MERGE_COUNT, "1").commit(); // change the spec to be partitioned by data @@ -1376,9 +1277,9 @@ public void testRowDeltaAndRewriteFilesMergeManifestsWithSequenceNumber() { table.refresh(); List dataManifests = latestSnapshot(table, 
branch).dataManifests(table.io()); - Assert.assertEquals("should have 1 data manifest", 1, dataManifests.size()); + assertThat(dataManifests).hasSize(1); ManifestFile mergedDataManifest = dataManifests.get(0); - Assert.assertEquals("Manifest seq number must match", 3L, mergedDataManifest.sequenceNumber()); + assertThat(mergedDataManifest.sequenceNumber()).isEqualTo(3); long currentSnapshotId = latestSnapshot(table, branch).snapshotId(); @@ -1391,7 +1292,7 @@ public void testRowDeltaAndRewriteFilesMergeManifestsWithSequenceNumber() { statuses(Status.ADDED, Status.DELETED)); } - @Test + @TestTemplate public void testConcurrentConflictingRowDeltaAndRewriteFilesWithSequenceNumber() { // change the spec to be partitioned by data table @@ -1433,12 +1334,12 @@ public void testConcurrentConflictingRowDeltaAndRewriteFilesWithSequenceNumber() commit(table, rowDelta, branch); - Assertions.assertThatThrownBy(() -> commit(table, rewriteFiles, branch)) + assertThatThrownBy(() -> commit(table, rewriteFiles, branch)) .isInstanceOf(ValidationException.class) .hasMessageStartingWith("Cannot commit, found new position delete for replaced data file"); } - @Test + @TestTemplate public void testRowDeltaCaseSensitivity() { commit(table, table.newAppend().appendFile(FILE_A).appendFile(FILE_A2), branch); @@ -1448,7 +1349,7 @@ public void testRowDeltaCaseSensitivity() { Expression conflictDetectionFilter = Expressions.equal(Expressions.bucket("dAtA", 16), 0); - Assertions.assertThatThrownBy( + assertThatThrownBy( () -> table .newRowDelta() @@ -1463,7 +1364,7 @@ public void testRowDeltaCaseSensitivity() { .isInstanceOf(ValidationException.class) .hasMessageStartingWith("Cannot find field 'dAtA'"); - Assertions.assertThatThrownBy( + assertThatThrownBy( () -> table .newRowDelta() @@ -1480,7 +1381,7 @@ public void testRowDeltaCaseSensitivity() { .hasMessageStartingWith("Cannot find field 'dAtA'"); // binding should succeed and trigger the validation - Assertions.assertThatThrownBy( + 
assertThatThrownBy( () -> table .newRowDelta() diff --git a/core/src/test/java/org/apache/iceberg/TestSchemaAndMappingUpdate.java b/core/src/test/java/org/apache/iceberg/TestSchemaAndMappingUpdate.java index 3697678d63f0..1bd1761ffc60 100644 --- a/core/src/test/java/org/apache/iceberg/TestSchemaAndMappingUpdate.java +++ b/core/src/test/java/org/apache/iceberg/TestSchemaAndMappingUpdate.java @@ -260,7 +260,7 @@ public void testDeleteAndRenameColumnReassign() { MappedField newMapping = updated.find("id"); assertThat(newMapping).isNotNull(); assertThat(newMapping.id()).isEqualTo(idColumnId); - assertThat(newMapping.names()).isEqualTo(Sets.newHashSet("id", "data")); + assertThat(newMapping.names()).containsExactly("data", "id"); assertThat(newMapping.nestedMapping()).isNull(); MappedField updatedMapping = updated.find(startIdColumnId); @@ -283,8 +283,7 @@ public void testRenameAndAddColumnReassign() { NameMapping afterRename = NameMappingParser.fromJson(table.properties().get(TableProperties.DEFAULT_NAME_MAPPING)); - assertThat(afterRename.find(startIdColumnId).names()) - .isEqualTo(Sets.newHashSet("id", "object_id")); + assertThat(afterRename.find(startIdColumnId).names()).containsExactly("id", "object_id"); // add a new column with the renamed column's old name // also, rename the original column again to ensure its names are handled correctly @@ -312,7 +311,7 @@ public void testRenameAndAddColumnReassign() { MappedField updatedMapping = updated.find(startIdColumnId); assertThat(updatedMapping).isNotNull(); assertThat(updatedMapping.id()).isEqualTo(startIdColumnId); - assertThat(updatedMapping.names()).isEqualTo(Sets.newHashSet("object_id", "oid")); + assertThat(updatedMapping.names()).containsExactly("oid", "object_id"); assertThat(updatedMapping.nestedMapping()).isNull(); } @@ -329,8 +328,7 @@ public void testRenameAndRenameColumnReassign() { NameMapping afterRename = NameMappingParser.fromJson(table.properties().get(TableProperties.DEFAULT_NAME_MAPPING)); - 
assertThat(afterRename.find(startIdColumnId).names()) - .isEqualTo(Sets.newHashSet("id", "object_id")); + assertThat(afterRename.find(startIdColumnId).names()).containsExactly("id", "object_id"); // rename the data column to the renamed column's old name // also, rename the original column again to ensure its names are handled correctly @@ -348,14 +346,14 @@ public void testRenameAndRenameColumnReassign() { MappedField newMapping = updated.find("id"); assertThat(newMapping).isNotNull(); - assertThat(newMapping.names()).isEqualTo(Sets.newHashSet("id", "data")); + assertThat(newMapping.names()).containsExactly("data", "id"); assertThat(newMapping.id()).isEqualTo(idColumnId); assertThat(newMapping.nestedMapping()).isNull(); MappedField updatedMapping = updated.find(startIdColumnId); assertThat(updatedMapping).isNotNull(); assertThat(updatedMapping.id()).isEqualTo(startIdColumnId); - assertThat(updatedMapping.names()).isEqualTo(Sets.newHashSet("object_id", "oid")); + assertThat(updatedMapping.names()).containsExactly("oid", "object_id"); assertThat(updatedMapping.nestedMapping()).isNull(); } diff --git a/core/src/test/java/org/apache/iceberg/TestSchemaUpdate.java b/core/src/test/java/org/apache/iceberg/TestSchemaUpdate.java index 943ff9f51acd..2b91a408850e 100644 --- a/core/src/test/java/org/apache/iceberg/TestSchemaUpdate.java +++ b/core/src/test/java/org/apache/iceberg/TestSchemaUpdate.java @@ -1601,9 +1601,8 @@ public void testAddNewIdentifierFieldColumns() { assertThat(newSchema.identifierFieldIds()) .as("add column then set as identifier should succeed") - .isEqualTo( - Sets.newHashSet( - newSchema.findField("id").fieldId(), newSchema.findField("new_field").fieldId())); + .containsExactly( + newSchema.findField("id").fieldId(), newSchema.findField("new_field").fieldId()); newSchema = new SchemaUpdate(SCHEMA, SCHEMA_LAST_COLUMN_ID) @@ -1614,9 +1613,8 @@ public void testAddNewIdentifierFieldColumns() { assertThat(newSchema.identifierFieldIds()) .as("set identifier 
then add column should succeed") - .isEqualTo( - Sets.newHashSet( - newSchema.findField("id").fieldId(), newSchema.findField("new_field").fieldId())); + .containsExactly( + newSchema.findField("id").fieldId(), newSchema.findField("new_field").fieldId()); } @Test @@ -1686,9 +1684,8 @@ public void testAddDottedIdentifierFieldColumns() { assertThat(newSchema.identifierFieldIds()) .as("add a field with dot as identifier should succeed") - .isEqualTo( - Sets.newHashSet( - newSchema.findField("id").fieldId(), newSchema.findField("dot.field").fieldId())); + .containsExactly( + newSchema.findField("id").fieldId(), newSchema.findField("dot.field").fieldId()); } @Test @@ -1708,10 +1705,9 @@ public void testRemoveIdentifierFields() { assertThat(newSchema.identifierFieldIds()) .as("remove an identifier field should succeed") - .isEqualTo( - Sets.newHashSet( - newSchema.findField("new_field").fieldId(), - newSchema.findField("new_field2").fieldId())); + .containsExactly( + newSchema.findField("new_field").fieldId(), + newSchema.findField("new_field2").fieldId()); newSchema = new SchemaUpdate(newSchema, SCHEMA_LAST_COLUMN_ID) diff --git a/core/src/test/java/org/apache/iceberg/TestSequenceNumberForV2Table.java b/core/src/test/java/org/apache/iceberg/TestSequenceNumberForV2Table.java index 86842b681278..7845f305e3c9 100644 --- a/core/src/test/java/org/apache/iceberg/TestSequenceNumberForV2Table.java +++ b/core/src/test/java/org/apache/iceberg/TestSequenceNumberForV2Table.java @@ -18,21 +18,26 @@ */ package org.apache.iceberg; +import static org.assertj.core.api.Assertions.assertThatThrownBy; + +import java.io.File; +import java.util.Arrays; +import java.util.List; import java.util.Set; import java.util.stream.Collectors; import org.apache.iceberg.ManifestEntry.Status; import org.apache.iceberg.exceptions.CommitFailedException; import org.apache.iceberg.relocated.com.google.common.collect.Sets; -import org.assertj.core.api.Assertions; -import org.junit.Test; +import 
org.junit.jupiter.api.TestTemplate; -public class TestSequenceNumberForV2Table extends TableTestBase { +public class TestSequenceNumberForV2Table extends TestBase { - public TestSequenceNumberForV2Table() { - super(2); + @Parameters(name = "formatVersion = {0}") + protected static List parameters() { + return Arrays.asList(2); } - @Test + @TestTemplate public void testRewrite() { table.newFastAppend().appendFile(FILE_A).commit(); Snapshot snap1 = table.currentSnapshot(); @@ -96,7 +101,7 @@ public void testRewrite() { } } - @Test + @TestTemplate public void testCommitConflict() { AppendFiles appendA = table.newFastAppend(); appendA.appendFile(FILE_A).apply(); @@ -105,7 +110,7 @@ public void testCommitConflict() { table.ops().failCommits(1); - Assertions.assertThatThrownBy(() -> table.newFastAppend().appendFile(FILE_B).commit()) + assertThatThrownBy(() -> table.newFastAppend().appendFile(FILE_B).commit()) .isInstanceOf(CommitFailedException.class) .hasMessage("Injected failure"); @@ -144,7 +149,7 @@ public void testCommitConflict() { "Last sequence number should be 3", 3, readMetadata().lastSequenceNumber()); } - @Test + @TestTemplate public void testRollBack() { table.newFastAppend().appendFile(FILE_A).commit(); Snapshot snap1 = table.currentSnapshot(); @@ -183,7 +188,7 @@ public void testRollBack() { "Last sequence number should be 3", 3, readMetadata().lastSequenceNumber()); } - @Test + @TestTemplate public void testSingleTransaction() { Transaction txn = table.newTransaction(); txn.newAppend().appendFile(FILE_A).commit(); @@ -198,7 +203,7 @@ public void testSingleTransaction() { "Last sequence number should be 1", 1, readMetadata().lastSequenceNumber()); } - @Test + @TestTemplate public void testConcurrentTransaction() { Transaction txn1 = table.newTransaction(); Transaction txn2 = table.newTransaction(); @@ -264,7 +269,7 @@ public void testConcurrentTransaction() { "Last sequence number should be 4", 4, readMetadata().lastSequenceNumber()); } - @Test + 
@TestTemplate public void testMultipleOperationsTransaction() { Transaction txn = table.newTransaction(); txn.newFastAppend().appendFile(FILE_A).commit(); @@ -298,7 +303,7 @@ public void testMultipleOperationsTransaction() { "Last sequence number should be 2", 2, readMetadata().lastSequenceNumber()); } - @Test + @TestTemplate public void testExpirationInTransaction() { table.newFastAppend().appendFile(FILE_A).commit(); Snapshot snap1 = table.currentSnapshot(); @@ -310,7 +315,7 @@ public void testExpirationInTransaction() { V2Assert.assertEquals( "Last sequence number should be 1", 1, readMetadata().lastSequenceNumber()); V2Assert.assertEquals( - "Should be 1 manifest list", 1, listManifestLists(table.location()).size()); + "Should be 1 manifest list", 1, listManifestLists(new File(table.location())).size()); table.newAppend().appendFile(FILE_B).commit(); Snapshot snap2 = table.currentSnapshot(); @@ -322,7 +327,7 @@ public void testExpirationInTransaction() { V2Assert.assertEquals( "Last sequence number should be 2", 2, readMetadata().lastSequenceNumber()); V2Assert.assertEquals( - "Should be 2 manifest lists", 2, listManifestLists(table.location()).size()); + "Should be 2 manifest lists", 2, listManifestLists(new File(table.location())).size()); Transaction txn = table.newTransaction(); txn.expireSnapshots().expireSnapshotId(commitId1).commit(); @@ -332,10 +337,10 @@ public void testExpirationInTransaction() { V2Assert.assertEquals( "Should be 1 manifest list as 1 was deleted", 1, - listManifestLists(table.location()).size()); + listManifestLists(new File(table.location())).size()); } - @Test + @TestTemplate public void testTransactionFailure() { table.newAppend().appendFile(FILE_A).appendFile(FILE_B).commit(); Snapshot snap1 = table.currentSnapshot(); @@ -359,7 +364,7 @@ public void testTransactionFailure() { Transaction txn = table.newTransaction(); txn.newAppend().appendFile(FILE_C).commit(); - Assertions.assertThatThrownBy(txn::commitTransaction) + 
assertThatThrownBy(txn::commitTransaction) .isInstanceOf(CommitFailedException.class) .hasMessage("Injected failure"); @@ -367,7 +372,7 @@ public void testTransactionFailure() { "Last sequence number should be 1", 1, readMetadata().lastSequenceNumber()); } - @Test + @TestTemplate public void testCherryPicking() { table.newAppend().appendFile(FILE_A).commit(); Snapshot snap1 = table.currentSnapshot(); @@ -415,7 +420,7 @@ public void testCherryPicking() { "Last sequence number should be 4", 4, readMetadata().lastSequenceNumber()); } - @Test + @TestTemplate public void testCherryPickFastForward() { table.newAppend().appendFile(FILE_A).commit(); Snapshot snap1 = table.currentSnapshot(); diff --git a/core/src/test/java/org/apache/iceberg/TestSingleValueParser.java b/core/src/test/java/org/apache/iceberg/TestSingleValueParser.java index e04ba440ae3f..cc1578b0e081 100644 --- a/core/src/test/java/org/apache/iceberg/TestSingleValueParser.java +++ b/core/src/test/java/org/apache/iceberg/TestSingleValueParser.java @@ -20,14 +20,15 @@ import static org.apache.iceberg.types.Types.NestedField.optional; import static org.apache.iceberg.types.Types.NestedField.required; +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; import java.io.IOException; import java.util.Locale; import org.apache.iceberg.types.Type; import org.apache.iceberg.types.Types; import org.apache.iceberg.util.JsonUtil; -import org.junit.Assert; -import org.junit.Test; +import org.junit.jupiter.api.Test; public class TestSingleValueParser { @@ -114,22 +115,18 @@ public void testValidDefaults() throws IOException { public void testInvalidFixed() { Type expectedType = Types.FixedType.ofLength(2); String defaultJson = "\"111ff\""; - Exception exception = - Assert.assertThrows( - IllegalArgumentException.class, - () -> defaultValueParseAndUnParseRoundTrip(expectedType, defaultJson)); - 
Assert.assertTrue(exception.getMessage().startsWith("Cannot parse default fixed[2] value")); + assertThatThrownBy(() -> defaultValueParseAndUnParseRoundTrip(expectedType, defaultJson)) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageStartingWith("Cannot parse default fixed[2] value"); } @Test public void testInvalidUUID() { Type expectedType = Types.UUIDType.get(); String defaultJson = "\"eb26bdb1-a1d8-4aa6-990e-da940875492c-abcde\""; - Exception exception = - Assert.assertThrows( - IllegalArgumentException.class, - () -> defaultValueParseAndUnParseRoundTrip(expectedType, defaultJson)); - Assert.assertTrue(exception.getMessage().startsWith("Cannot parse default as a uuid value")); + assertThatThrownBy(() -> defaultValueParseAndUnParseRoundTrip(expectedType, defaultJson)) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageStartingWith("Cannot parse default as a uuid value"); } @Test @@ -137,36 +134,27 @@ public void testInvalidMap() { Type expectedType = Types.MapType.ofOptional(1, 2, Types.IntegerType.get(), Types.StringType.get()); String defaultJson = "{\"keys\": [1, 2, 3], \"values\": [\"foo\", \"bar\"]}"; - Exception exception = - Assert.assertThrows( - IllegalArgumentException.class, - () -> defaultValueParseAndUnParseRoundTrip(expectedType, defaultJson)); - Assert.assertTrue( - exception.getMessage().startsWith("Cannot parse default as a map value")); + assertThatThrownBy(() -> defaultValueParseAndUnParseRoundTrip(expectedType, defaultJson)) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageStartingWith("Cannot parse default as a map value"); } @Test public void testInvalidDecimal() { Type expectedType = Types.DecimalType.of(5, 2); String defaultJson = "123.456"; - Exception exception = - Assert.assertThrows( - IllegalArgumentException.class, - () -> defaultValueParseAndUnParseRoundTrip(expectedType, defaultJson)); - Assert.assertTrue( - exception.getMessage().startsWith("Cannot parse default as a decimal(5, 2) value")); + 
assertThatThrownBy(() -> defaultValueParseAndUnParseRoundTrip(expectedType, defaultJson)) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageStartingWith("Cannot parse default as a decimal(5, 2) value"); } @Test public void testInvalidTimestamptz() { Type expectedType = Types.TimestampType.withZone(); String defaultJson = "\"2007-12-03T10:15:30+01:00\""; - Exception exception = - Assert.assertThrows( - IllegalArgumentException.class, - () -> defaultValueParseAndUnParseRoundTrip(expectedType, defaultJson)); - Assert.assertTrue( - exception.getMessage().startsWith("Cannot parse default as a timestamptz value")); + assertThatThrownBy(() -> defaultValueParseAndUnParseRoundTrip(expectedType, defaultJson)) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageStartingWith("Cannot parse default as a timestamptz value"); } // serialize to json and deserialize back should return the same result @@ -176,6 +164,6 @@ private static String defaultValueParseAndUnParseRoundTrip(Type type, String def } private static void jsonStringEquals(String s1, String s2) throws IOException { - Assert.assertEquals(JsonUtil.mapper().readTree(s1), JsonUtil.mapper().readTree(s2)); + assertThat(JsonUtil.mapper().readTree(s2)).isEqualTo(JsonUtil.mapper().readTree(s1)); } } diff --git a/core/src/test/java/org/apache/iceberg/TestSortOrder.java b/core/src/test/java/org/apache/iceberg/TestSortOrder.java index 8fbc4e11fbc3..a3ba69a808b3 100644 --- a/core/src/test/java/org/apache/iceberg/TestSortOrder.java +++ b/core/src/test/java/org/apache/iceberg/TestSortOrder.java @@ -24,26 +24,27 @@ import static org.apache.iceberg.expressions.Expressions.truncate; import static org.apache.iceberg.types.Types.NestedField.optional; import static org.apache.iceberg.types.Types.NestedField.required; +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; import java.io.File; import java.io.IOException; +import java.nio.file.Files; 
+import java.nio.file.Path; +import java.util.Arrays; +import java.util.List; import java.util.Set; import org.apache.iceberg.exceptions.ValidationException; import org.apache.iceberg.expressions.Expressions; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableSet; import org.apache.iceberg.types.Types; import org.apache.iceberg.util.SortOrderUtil; -import org.assertj.core.api.Assertions; -import org.junit.After; -import org.junit.Assert; -import org.junit.Before; -import org.junit.Rule; -import org.junit.Test; -import org.junit.rules.TemporaryFolder; -import org.junit.runner.RunWith; -import org.junit.runners.Parameterized; - -@RunWith(Parameterized.class) +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.TestTemplate; +import org.junit.jupiter.api.extension.ExtendWith; +import org.junit.jupiter.api.io.TempDir; + +@ExtendWith(ParameterizedTestExtension.class) public class TestSortOrder { // column ids will be reassigned during table creation @@ -69,59 +70,52 @@ public class TestSortOrder { required(30, "ext", Types.StringType.get()), required(42, "Ext1", Types.StringType.get())); - @Rule public TemporaryFolder temp = new TemporaryFolder(); + @TempDir private Path temp; + private File tableDir = null; - @Parameterized.Parameters(name = "formatVersion = {0}") - public static Object[] parameters() { - return new Object[] {1, 2}; + @Parameters(name = "formatVersion = {0}") + protected static List parameters() { + return Arrays.asList(1, 2); } - private final int formatVersion; - - public TestSortOrder(int formatVersion) { - this.formatVersion = formatVersion; - } + @Parameter private int formatVersion; - @Before + @BeforeEach public void setupTableDir() throws IOException { - this.tableDir = temp.newFolder(); + this.tableDir = Files.createTempDirectory(temp, "junit").toFile(); } - @After + @AfterEach public void cleanupTables() { TestTables.clearTables(); } - @Test + @TestTemplate 
public void testSortOrderBuilder() { - Assert.assertEquals( - "Should be able to build unsorted order", - SortOrder.unsorted(), - SortOrder.builderFor(SCHEMA).withOrderId(0).build()); + assertThat(SortOrder.builderFor(SCHEMA).withOrderId(0).build()).isEqualTo(SortOrder.unsorted()); - Assertions.assertThatThrownBy( - () -> SortOrder.builderFor(SCHEMA).asc("data").withOrderId(0).build()) + assertThatThrownBy(() -> SortOrder.builderFor(SCHEMA).asc("data").withOrderId(0).build()) .isInstanceOf(IllegalArgumentException.class) .hasMessage("Sort order ID 0 is reserved for unsorted order"); - Assertions.assertThatThrownBy(() -> SortOrder.builderFor(SCHEMA).withOrderId(1).build()) + assertThatThrownBy(() -> SortOrder.builderFor(SCHEMA).withOrderId(1).build()) .isInstanceOf(IllegalArgumentException.class) .hasMessage("Unsorted order ID must be 0"); } - @Test + @TestTemplate public void testDefaultOrder() { PartitionSpec spec = PartitionSpec.unpartitioned(); TestTables.TestTable table = TestTables.create(tableDir, "test", SCHEMA, spec, formatVersion); - Assert.assertEquals("Expected 1 sort order", 1, table.sortOrders().size()); + assertThat(table.sortOrders()).hasSize(1); SortOrder actualOrder = table.sortOrder(); - Assert.assertEquals("Order ID must match", 0, actualOrder.orderId()); - Assert.assertTrue("Order must unsorted", actualOrder.isUnsorted()); + assertThat(actualOrder.orderId()).isEqualTo(0); + assertThat(actualOrder.isUnsorted()).isTrue(); } - @Test + @TestTemplate public void testFreshIds() { PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).withSpecId(5).identity("data").build(); SortOrder order = @@ -133,20 +127,16 @@ public void testFreshIds() { TestTables.TestTable table = TestTables.create(tableDir, "test", SCHEMA, spec, order, formatVersion); - Assert.assertEquals("Expected 1 sort order", 1, table.sortOrders().size()); - Assert.assertTrue( - "Order ID must be fresh", - table.sortOrders().containsKey(TableMetadata.INITIAL_SORT_ORDER_ID)); + 
assertThat(table.sortOrders()).hasSize(1).containsKey(TableMetadata.INITIAL_SORT_ORDER_ID); SortOrder actualOrder = table.sortOrder(); - Assert.assertEquals( - "Order ID must be fresh", TableMetadata.INITIAL_SORT_ORDER_ID, actualOrder.orderId()); - Assert.assertEquals("Order must have 2 fields", 2, actualOrder.fields().size()); - Assert.assertEquals("Field id must be fresh", 8, actualOrder.fields().get(0).sourceId()); - Assert.assertEquals("Field id must be fresh", 2, actualOrder.fields().get(1).sourceId()); + assertThat(actualOrder.orderId()).isEqualTo(TableMetadata.INITIAL_SORT_ORDER_ID); + assertThat(actualOrder.fields()).hasSize(2); + assertThat(actualOrder.fields().get(0).sourceId()).isEqualTo(8); + assertThat(actualOrder.fields().get(1).sourceId()).isEqualTo(2); } - @Test + @TestTemplate public void testCompatibleOrders() { SortOrder order1 = SortOrder.builderFor(SCHEMA).withOrderId(9).asc("s.id", NULLS_LAST).build(); @@ -175,37 +165,37 @@ public void testCompatibleOrders() { SortOrder.builderFor(SCHEMA).withOrderId(11).desc("s.id", NULLS_LAST).build(); // an unsorted order satisfies only itself - Assert.assertTrue(SortOrder.unsorted().satisfies(SortOrder.unsorted())); - Assert.assertFalse(SortOrder.unsorted().satisfies(order1)); - Assert.assertFalse(SortOrder.unsorted().satisfies(order2)); - Assert.assertFalse(SortOrder.unsorted().satisfies(order3)); - Assert.assertFalse(SortOrder.unsorted().satisfies(order4)); - Assert.assertFalse(SortOrder.unsorted().satisfies(order5)); + assertThat(SortOrder.unsorted().satisfies(SortOrder.unsorted())).isTrue(); + assertThat(SortOrder.unsorted().satisfies(order1)).isFalse(); + assertThat(SortOrder.unsorted().satisfies(order2)).isFalse(); + assertThat(SortOrder.unsorted().satisfies(order3)).isFalse(); + assertThat(SortOrder.unsorted().satisfies(order4)).isFalse(); + assertThat(SortOrder.unsorted().satisfies(order5)).isFalse(); // any ordering satisfies an unsorted ordering - 
Assert.assertTrue(order1.satisfies(SortOrder.unsorted())); - Assert.assertTrue(order2.satisfies(SortOrder.unsorted())); - Assert.assertTrue(order3.satisfies(SortOrder.unsorted())); - Assert.assertTrue(order4.satisfies(SortOrder.unsorted())); - Assert.assertTrue(order5.satisfies(SortOrder.unsorted())); + assertThat(order1.satisfies(SortOrder.unsorted())).isTrue(); + assertThat(order2.satisfies(SortOrder.unsorted())).isTrue(); + assertThat(order3.satisfies(SortOrder.unsorted())).isTrue(); + assertThat(order4.satisfies(SortOrder.unsorted())).isTrue(); + assertThat(order5.satisfies(SortOrder.unsorted())).isTrue(); // order1 has the same fields but different sort direction compared to order5 - Assert.assertFalse(order1.satisfies(order5)); + assertThat(order1.satisfies(order5)).isFalse(); // order2 has more fields than order1 and is compatible - Assert.assertTrue(order2.satisfies(order1)); + assertThat(order2.satisfies(order1)).isTrue(); // order2 has more fields than order5 but is incompatible - Assert.assertFalse(order2.satisfies(order5)); + assertThat(order2.satisfies(order5)).isFalse(); // order2 has the same fields but different null order compared to order3 - Assert.assertFalse(order2.satisfies(order3)); + assertThat(order2.satisfies(order3)).isFalse(); // order2 has the same fields but different sort direction compared to order4 - Assert.assertFalse(order2.satisfies(order4)); + assertThat(order2.satisfies(order4)).isFalse(); // order1 has fewer fields than order2 and is incompatible - Assert.assertFalse(order1.satisfies(order2)); + assertThat(order1.satisfies(order2)).isFalse(); } - @Test + @TestTemplate public void testSatisfiesTruncateFieldOrder() { SortOrder id = SortOrder.builderFor(SCHEMA).asc("data", NULLS_LAST).build(); SortOrder truncate4 = @@ -213,36 +203,35 @@ public void testSatisfiesTruncateFieldOrder() { SortOrder truncate2 = SortOrder.builderFor(SCHEMA).asc(Expressions.truncate("data", 2), NULLS_LAST).build(); - 
Assert.assertTrue(id.satisfies(truncate2)); - Assert.assertTrue(id.satisfies(truncate4)); - Assert.assertFalse(truncate2.satisfies(id)); - Assert.assertFalse(truncate4.satisfies(id)); - Assert.assertTrue(truncate4.satisfies(truncate2)); - Assert.assertFalse(truncate2.satisfies(truncate4)); + assertThat(id.satisfies(truncate2)).isTrue(); + assertThat(truncate2.satisfies(id)).isFalse(); + assertThat(truncate4.satisfies(id)).isFalse(); + assertThat(truncate4.satisfies(truncate2)).isTrue(); + assertThat(truncate2.satisfies(truncate4)).isFalse(); } - @Test + @TestTemplate public void testSatisfiesDateFieldOrder() { SortOrder id = SortOrder.builderFor(SCHEMA).asc("d", NULLS_LAST).build(); SortOrder year = SortOrder.builderFor(SCHEMA).asc(Expressions.year("d"), NULLS_LAST).build(); SortOrder month = SortOrder.builderFor(SCHEMA).asc(Expressions.month("d"), NULLS_LAST).build(); SortOrder day = SortOrder.builderFor(SCHEMA).asc(Expressions.day("d"), NULLS_LAST).build(); - Assert.assertTrue(id.satisfies(year)); - Assert.assertTrue(id.satisfies(month)); - Assert.assertTrue(id.satisfies(day)); - Assert.assertFalse(year.satisfies(id)); - Assert.assertFalse(month.satisfies(id)); - Assert.assertFalse(day.satisfies(id)); - Assert.assertTrue(day.satisfies(year)); - Assert.assertTrue(day.satisfies(month)); - Assert.assertTrue(month.satisfies(year)); - Assert.assertFalse(month.satisfies(day)); - Assert.assertFalse(year.satisfies(day)); - Assert.assertFalse(year.satisfies(month)); + assertThat(id.satisfies(year)).isTrue(); + assertThat(id.satisfies(month)).isTrue(); + assertThat(id.satisfies(day)).isTrue(); + assertThat(year.satisfies(id)).isFalse(); + assertThat(month.satisfies(id)).isFalse(); + assertThat(day.satisfies(id)).isFalse(); + assertThat(day.satisfies(year)).isTrue(); + assertThat(day.satisfies(month)).isTrue(); + assertThat(month.satisfies(year)).isTrue(); + assertThat(month.satisfies(day)).isFalse(); + assertThat(year.satisfies(day)).isFalse(); + 
assertThat(year.satisfies(month)).isFalse(); } - @Test + @TestTemplate public void testSatisfiesTimestampFieldOrder() { SortOrder id = SortOrder.builderFor(SCHEMA).asc("ts", NULLS_LAST).build(); SortOrder year = SortOrder.builderFor(SCHEMA).asc(Expressions.year("ts"), NULLS_LAST).build(); @@ -250,41 +239,40 @@ public void testSatisfiesTimestampFieldOrder() { SortOrder day = SortOrder.builderFor(SCHEMA).asc(Expressions.day("ts"), NULLS_LAST).build(); SortOrder hour = SortOrder.builderFor(SCHEMA).asc(Expressions.hour("ts"), NULLS_LAST).build(); - Assert.assertTrue(id.satisfies(year)); - Assert.assertTrue(id.satisfies(month)); - Assert.assertTrue(id.satisfies(day)); - Assert.assertTrue(id.satisfies(hour)); - Assert.assertFalse(year.satisfies(id)); - Assert.assertFalse(month.satisfies(id)); - Assert.assertFalse(day.satisfies(id)); - Assert.assertFalse(hour.satisfies(id)); - Assert.assertTrue(hour.satisfies(year)); - Assert.assertTrue(hour.satisfies(month)); - Assert.assertTrue(hour.satisfies(day)); - Assert.assertTrue(day.satisfies(year)); - Assert.assertTrue(day.satisfies(month)); - Assert.assertFalse(day.satisfies(hour)); - Assert.assertTrue(month.satisfies(year)); - Assert.assertFalse(month.satisfies(day)); - Assert.assertFalse(month.satisfies(hour)); - Assert.assertFalse(year.satisfies(day)); - Assert.assertFalse(year.satisfies(month)); - Assert.assertFalse(year.satisfies(hour)); + assertThat(id.satisfies(year)).isTrue(); + assertThat(id.satisfies(month)).isTrue(); + assertThat(id.satisfies(day)).isTrue(); + assertThat(id.satisfies(hour)).isTrue(); + assertThat(year.satisfies(id)).isFalse(); + assertThat(month.satisfies(id)).isFalse(); + assertThat(day.satisfies(id)).isFalse(); + assertThat(hour.satisfies(id)).isFalse(); + assertThat(hour.satisfies(year)).isTrue(); + assertThat(hour.satisfies(month)).isTrue(); + assertThat(hour.satisfies(day)).isTrue(); + assertThat(day.satisfies(year)).isTrue(); + assertThat(day.satisfies(month)).isTrue(); + 
assertThat(day.satisfies(hour)).isFalse(); + assertThat(month.satisfies(year)).isTrue(); + assertThat(month.satisfies(day)).isFalse(); + assertThat(month.satisfies(hour)).isFalse(); + assertThat(year.satisfies(day)).isFalse(); + assertThat(year.satisfies(month)).isFalse(); + assertThat(year.satisfies(hour)).isFalse(); } - @Test + @TestTemplate public void testSameOrder() { SortOrder order1 = SortOrder.builderFor(SCHEMA).withOrderId(9).asc("s.id", NULLS_LAST).build(); SortOrder order2 = SortOrder.builderFor(SCHEMA).withOrderId(10).asc("s.id", NULLS_LAST).build(); // orders have different ids but are logically the same - Assert.assertNotEquals("Orders must not be equal", order1, order2); - Assert.assertTrue("Orders must be equivalent", order1.sameOrder(order2)); - Assert.assertTrue("Orders must be equivalent", order2.sameOrder(order1)); + assertThat(order2).isNotEqualTo(order1); + assertThat(order2.fields()).isEqualTo(order1.fields()); } - @Test + @TestTemplate public void testSchemaEvolutionWithSortOrder() { PartitionSpec spec = PartitionSpec.unpartitioned(); SortOrder order = @@ -295,14 +283,13 @@ public void testSchemaEvolutionWithSortOrder() { table.updateSchema().renameColumn("s.id", "s.id2").commit(); SortOrder actualOrder = table.sortOrder(); - Assert.assertEquals( - "Order ID must match", TableMetadata.INITIAL_SORT_ORDER_ID, actualOrder.orderId()); - Assert.assertEquals("Order must have 2 fields", 2, actualOrder.fields().size()); - Assert.assertEquals("Field id must match", 8, actualOrder.fields().get(0).sourceId()); - Assert.assertEquals("Field id must match", 2, actualOrder.fields().get(1).sourceId()); + assertThat(actualOrder.orderId()).isEqualTo(TableMetadata.INITIAL_SORT_ORDER_ID); + assertThat(actualOrder.fields()).hasSize(2); + assertThat(actualOrder.fields().get(0).sourceId()).isEqualTo(8); + assertThat(actualOrder.fields().get(1).sourceId()).isEqualTo(2); } - @Test + @TestTemplate public void testColumnDropWithSortOrder() { PartitionSpec spec = 
PartitionSpec.unpartitioned(); @@ -316,16 +303,14 @@ public void testColumnDropWithSortOrder() { table.updateSchema().deleteColumn("id").commit(); SortOrder actualOrder = table.sortOrder(); - Assert.assertEquals( - "Order ID must match", TableMetadata.INITIAL_SORT_ORDER_ID + 1, actualOrder.orderId()); - Assert.assertEquals( - "Schema must have one less column", initialColSize - 1, table.schema().columns().size()); + assertThat(actualOrder.orderId()).isEqualTo(TableMetadata.INITIAL_SORT_ORDER_ID + 1); + assertThat(table.schema().columns()).hasSize(initialColSize - 1); // ensure that the table metadata can be serialized and reloaded with an invalid order TableMetadataParser.fromJson(TableMetadataParser.toJson(table.ops().current())); } - @Test + @TestTemplate public void testIncompatibleSchemaEvolutionWithSortOrder() { PartitionSpec spec = PartitionSpec.unpartitioned(); SortOrder order = @@ -333,26 +318,26 @@ public void testIncompatibleSchemaEvolutionWithSortOrder() { TestTables.TestTable table = TestTables.create(tableDir, "test", SCHEMA, spec, order, formatVersion); - Assertions.assertThatThrownBy(() -> table.updateSchema().deleteColumn("s.id").commit()) + assertThatThrownBy(() -> table.updateSchema().deleteColumn("s.id").commit()) .isInstanceOf(ValidationException.class) .hasMessageStartingWith("Cannot find source column for sort field"); } - @Test + @TestTemplate public void testEmptySortOrder() { SortOrder order = SortOrder.builderFor(SCHEMA).build(); - Assert.assertEquals("Order must be unsorted", SortOrder.unsorted(), order); + assertThat(order).isEqualTo(SortOrder.unsorted()); } - @Test + @TestTemplate public void testSortedColumnNames() { SortOrder order = SortOrder.builderFor(SCHEMA).withOrderId(10).asc("s.id").desc(truncate("data", 10)).build(); Set sortedCols = SortOrderUtil.orderPreservingSortedColumns(order); - Assert.assertEquals(ImmutableSet.of("s.id", "data"), sortedCols); + assertThat(sortedCols).containsExactly("s.id", "data"); } - @Test + 
@TestTemplate public void testPreservingOrderSortedColumnNames() { SortOrder order = SortOrder.builderFor(SCHEMA) @@ -361,13 +346,13 @@ public void testPreservingOrderSortedColumnNames() { .desc(truncate("data", 10)) .build(); Set sortedCols = SortOrderUtil.orderPreservingSortedColumns(order); - Assert.assertEquals(ImmutableSet.of("data"), sortedCols); + assertThat(sortedCols).containsExactly("data"); } - @Test + @TestTemplate public void testCaseSensitiveSortedColumnNames() { String fieldName = "ext1"; - Assertions.assertThatThrownBy( + assertThatThrownBy( () -> SortOrder.builderFor(SCHEMA) .caseSensitive(true) @@ -380,6 +365,6 @@ public void testCaseSensitiveSortedColumnNames() { SortOrder ext1 = SortOrder.builderFor(SCHEMA).caseSensitive(false).withOrderId(10).asc("ext1").build(); SortField sortField = ext1.fields().get(0); - Assert.assertEquals(sortField.sourceId(), SCHEMA.findField("Ext1").fieldId()); + assertThat(SCHEMA.findField("Ext1").fieldId()).isEqualTo(sortField.sourceId()); } } diff --git a/core/src/test/java/org/apache/iceberg/TestSortOrderParser.java b/core/src/test/java/org/apache/iceberg/TestSortOrderParser.java index 5aba9e9ad533..adcc130c2390 100644 --- a/core/src/test/java/org/apache/iceberg/TestSortOrderParser.java +++ b/core/src/test/java/org/apache/iceberg/TestSortOrderParser.java @@ -20,18 +20,23 @@ import static org.apache.iceberg.NullOrder.NULLS_FIRST; import static org.apache.iceberg.SortDirection.DESC; +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; +import java.util.Arrays; +import java.util.List; import org.apache.iceberg.transforms.UnknownTransform; -import org.assertj.core.api.Assertions; -import org.junit.Assert; -import org.junit.Test; +import org.junit.jupiter.api.TestTemplate; +import org.junit.jupiter.api.extension.ExtendWith; -public class TestSortOrderParser extends TableTestBase { - public TestSortOrderParser() { - super(1); 
+@ExtendWith(ParameterizedTestExtension.class) +public class TestSortOrderParser extends TestBase { + @Parameters(name = "formatVersion = {0}") + protected static List parameters() { + return Arrays.asList(1); } - @Test + @TestTemplate public void testUnknownTransforms() { String jsonString = "{\n" @@ -46,16 +51,18 @@ public void testUnknownTransforms() { SortOrder order = SortOrderParser.fromJson(table.schema(), jsonString); - Assert.assertEquals(10, order.orderId()); - Assert.assertEquals(1, order.fields().size()); - Assertions.assertThat(order.fields().get(0).transform()).isInstanceOf(UnknownTransform.class); - Assert.assertEquals("custom_transform", order.fields().get(0).transform().toString()); - Assert.assertEquals(2, order.fields().get(0).sourceId()); - Assert.assertEquals(DESC, order.fields().get(0).direction()); - Assert.assertEquals(NULLS_FIRST, order.fields().get(0).nullOrder()); + assertThat(order.orderId()).isEqualTo(10); + assertThat(order.fields()).hasSize(1); + assertThat(order.fields().get(0).transform()) + .isInstanceOf(UnknownTransform.class) + .asString() + .isEqualTo("custom_transform"); + assertThat(order.fields().get(0).sourceId()).isEqualTo(2); + assertThat(order.fields().get(0).direction()).isEqualTo(DESC); + assertThat(order.fields().get(0).nullOrder()).isEqualTo(NULLS_FIRST); } - @Test + @TestTemplate public void invalidSortDirection() { String jsonString = "{\n" @@ -68,7 +75,7 @@ public void invalidSortDirection() { + " } ]\n" + "}"; - Assertions.assertThatThrownBy(() -> SortOrderParser.fromJson(table.schema(), jsonString)) + assertThatThrownBy(() -> SortOrderParser.fromJson(table.schema(), jsonString)) .isInstanceOf(IllegalArgumentException.class) .hasMessage("Invalid sort direction: invalid"); } diff --git a/core/src/test/java/org/apache/iceberg/TestSplitPlanning.java b/core/src/test/java/org/apache/iceberg/TestSplitPlanning.java index ab83b277509c..04bb2ae215d8 100644 --- a/core/src/test/java/org/apache/iceberg/TestSplitPlanning.java 
+++ b/core/src/test/java/org/apache/iceberg/TestSplitPlanning.java @@ -19,9 +19,14 @@ package org.apache.iceberg; import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; import java.io.File; import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.Arrays; import java.util.List; import java.util.UUID; import java.util.stream.Collectors; @@ -33,17 +38,13 @@ import org.apache.iceberg.relocated.com.google.common.collect.Iterables; import org.apache.iceberg.relocated.com.google.common.collect.Lists; import org.apache.iceberg.types.Types; -import org.assertj.core.api.Assertions; -import org.junit.Assert; -import org.junit.Before; -import org.junit.Rule; -import org.junit.Test; -import org.junit.rules.TemporaryFolder; -import org.junit.runner.RunWith; -import org.junit.runners.Parameterized; - -@RunWith(Parameterized.class) -public class TestSplitPlanning extends TableTestBase { +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.TestTemplate; +import org.junit.jupiter.api.extension.ExtendWith; +import org.junit.jupiter.api.io.TempDir; + +@ExtendWith(ParameterizedTestExtension.class) +public class TestSplitPlanning extends TestBase { private static final Configuration CONF = new Configuration(); private static final HadoopTables TABLES = new HadoopTables(CONF); @@ -51,22 +52,19 @@ public class TestSplitPlanning extends TableTestBase { new Schema( optional(1, "id", Types.IntegerType.get()), optional(2, "data", Types.StringType.get())); - @Rule public TemporaryFolder temp = new TemporaryFolder(); - private Table table = null; + @TempDir private Path temp; - @Parameterized.Parameters(name = "formatVersion = {0}") - public static Object[] parameters() { - return new Object[] {1, 2}; - } + private Table table = null; - public TestSplitPlanning(int formatVersion) { - 
super(formatVersion); + @Parameters(name = "formatVersion = {0}") + protected static List parameters() { + return Arrays.asList(1, 2); } @Override - @Before + @BeforeEach public void setupTable() throws IOException { - File tableDir = temp.newFolder(); + File tableDir = Files.createTempDirectory(temp, "junit").toFile(); String tableLocation = tableDir.toURI().toString(); table = TABLES.create(SCHEMA, tableLocation); table @@ -77,19 +75,19 @@ public void setupTable() throws IOException { .commit(); } - @Test + @TestTemplate public void testBasicSplitPlanning() { List files128Mb = newFiles(4, 128 * 1024 * 1024); appendFiles(files128Mb); // we expect 4 bins since split size is 128MB and we have 4 files 128MB each - Assert.assertEquals(4, Iterables.size(table.newScan().planTasks())); + assertThat(table.newScan().planTasks()).hasSize(4); List files32Mb = newFiles(16, 32 * 1024 * 1024); appendFiles(files32Mb); // we expect 8 bins after we add 16 files 32MB each as they will form additional 4 bins - Assert.assertEquals(8, Iterables.size(table.newScan().planTasks())); + assertThat(table.newScan().planTasks()).hasSize(8); } - @Test + @TestTemplate public void testSplitPlanningWithSmallFiles() { List files60Mb = newFiles(50, 60 * 1024 * 1024); List files5Kb = newFiles(370, 5 * 1024); @@ -101,10 +99,10 @@ public void testSplitPlanningWithSmallFiles() { // as "read.split.open-file-cost" is 4MB, each of the original 25 bins will get at most 2 files // so 50 of 370 files will be packed into the existing 25 bins and the remaining 320 files // will form additional 10 bins, resulting in 35 bins in total - Assert.assertEquals(35, Iterables.size(table.newScan().planTasks())); + assertThat(table.newScan().planTasks()).hasSize(35); } - @Test + @TestTemplate public void testSplitPlanningWithNoMinWeight() { table.updateProperties().set(TableProperties.SPLIT_OPEN_FILE_COST, "0").commit(); List files60Mb = newFiles(2, 60 * 1024 * 1024); @@ -112,30 +110,30 @@ public void 
testSplitPlanningWithNoMinWeight() { Iterable files = Iterables.concat(files60Mb, files5Kb); appendFiles(files); // all small files will be packed into one bin as "read.split.open-file-cost" is set to 0 - Assert.assertEquals(1, Iterables.size(table.newScan().planTasks())); + assertThat(table.newScan().planTasks()).hasSize(1); } - @Test + @TestTemplate public void testSplitPlanningWithOverridenSize() { List files128Mb = newFiles(4, 128 * 1024 * 1024); appendFiles(files128Mb); // we expect 2 bins since we are overriding split size in scan with 256MB TableScan scan = table.newScan().option(TableProperties.SPLIT_SIZE, String.valueOf(256L * 1024 * 1024)); - Assert.assertEquals(2, Iterables.size(scan.planTasks())); + assertThat(scan.planTasks()).hasSize(2); } - @Test + @TestTemplate public void testSplitPlanningWithOverriddenSizeForMetadataJsonFile() { List files8Mb = newFiles(32, 8 * 1024 * 1024, FileFormat.METADATA); appendFiles(files8Mb); // we expect 16 bins since we are overriding split size in scan with 16MB TableScan scan = table.newScan().option(TableProperties.SPLIT_SIZE, String.valueOf(16L * 1024 * 1024)); - Assert.assertEquals(16, Iterables.size(scan.planTasks())); + assertThat(scan.planTasks()).hasSize(16); } - @Test + @TestTemplate public void testSplitPlanningWithOverriddenSizeForLargeMetadataJsonFile() { List files128Mb = newFiles(4, 128 * 1024 * 1024, FileFormat.METADATA); appendFiles(files128Mb); @@ -143,10 +141,10 @@ public void testSplitPlanningWithOverriddenSizeForLargeMetadataJsonFile() { // splittable TableScan scan = table.newScan().option(TableProperties.SPLIT_SIZE, String.valueOf(8L * 1024 * 1024)); - Assert.assertEquals(4, Iterables.size(scan.planTasks())); + assertThat(scan.planTasks()).hasSize(4); } - @Test + @TestTemplate public void testSplitPlanningWithOverridenLookback() { List files120Mb = newFiles(1, 120 * 1024 * 1024); List file128Mb = newFiles(1, 128 * 1024 * 1024); @@ -155,15 +153,15 @@ public void 
testSplitPlanningWithOverridenLookback() { // we expect 2 bins from non-overriden table properties TableScan scan = table.newScan().option(TableProperties.SPLIT_LOOKBACK, "1"); CloseableIterable tasks = scan.planTasks(); - Assert.assertEquals(2, Iterables.size(tasks)); + assertThat(tasks).hasSize(2); // since lookback was overridden to 1, we expect the first bin to be the largest of the two. CombinedScanTask combinedScanTask = tasks.iterator().next(); FileScanTask task = combinedScanTask.files().iterator().next(); - Assert.assertEquals(128 * 1024 * 1024, task.length()); + assertThat(task.length()).isEqualTo(128 * 1024 * 1024); } - @Test + @TestTemplate public void testSplitPlanningWithOverridenOpenCostSize() { List files16Mb = newFiles(16, 16 * 1024 * 1024); appendFiles(files16Mb); @@ -173,18 +171,18 @@ public void testSplitPlanningWithOverridenOpenCostSize() { table .newScan() .option(TableProperties.SPLIT_OPEN_FILE_COST, String.valueOf(32L * 1024 * 1024)); - Assert.assertEquals(4, Iterables.size(scan.planTasks())); + assertThat(scan.planTasks()).hasSize(4); } - @Test + @TestTemplate public void testSplitPlanningWithNegativeValues() { - Assertions.assertThatThrownBy( + assertThatThrownBy( () -> table.newScan().option(TableProperties.SPLIT_SIZE, String.valueOf(-10)).planTasks()) .isInstanceOf(IllegalArgumentException.class) .hasMessage("Split size must be > 0: -10"); - Assertions.assertThatThrownBy( + assertThatThrownBy( () -> table .newScan() @@ -193,7 +191,7 @@ public void testSplitPlanningWithNegativeValues() { .isInstanceOf(IllegalArgumentException.class) .hasMessage("Split planning lookback must be > 0: -10"); - Assertions.assertThatThrownBy( + assertThatThrownBy( () -> table .newScan() @@ -203,7 +201,7 @@ public void testSplitPlanningWithNegativeValues() { .hasMessage("File open cost must be >= 0: -10"); } - @Test + @TestTemplate public void testSplitPlanningWithOffsets() { List files16Mb = newFiles(16, 16 * 1024 * 1024, 2); appendFiles(files16Mb); @@ -212,11 
+210,10 @@ public void testSplitPlanningWithOffsets() { // 1 split per row group TableScan scan = table.newScan().option(TableProperties.SPLIT_SIZE, String.valueOf(10L * 1024 * 1024)); - Assert.assertEquals( - "We should get one task per row group", 32, Iterables.size(scan.planTasks())); + assertThat(scan.planTasks()).hasSize(32); } - @Test + @TestTemplate public void testSplitPlanningWithOffsetsUnableToSplit() { List files16Mb = newFiles(16, 16 * 1024 * 1024, 2); appendFiles(files16Mb); @@ -228,11 +225,10 @@ public void testSplitPlanningWithOffsetsUnableToSplit() { .newScan() .option(TableProperties.SPLIT_OPEN_FILE_COST, String.valueOf(0)) .option(TableProperties.SPLIT_SIZE, String.valueOf(4L * 1024 * 1024)); - Assert.assertEquals( - "We should still only get 2 tasks per file", 32, Iterables.size(scan.planTasks())); + assertThat(scan.planTasks()).hasSize(32); } - @Test + @TestTemplate public void testBasicSplitPlanningDeleteFiles() { table.updateProperties().set(TableProperties.FORMAT_VERSION, "2").commit(); List files128Mb = newDeleteFiles(4, 128 * 1024 * 1024); @@ -240,14 +236,14 @@ public void testBasicSplitPlanningDeleteFiles() { PositionDeletesTable posDeletesTable = new PositionDeletesTable(table); // we expect 4 bins since split size is 128MB and we have 4 files 128MB each - Assert.assertEquals(4, Iterables.size(posDeletesTable.newBatchScan().planTasks())); + assertThat(posDeletesTable.newBatchScan().planTasks()).hasSize(4); List files32Mb = newDeleteFiles(16, 32 * 1024 * 1024); appendDeleteFiles(files32Mb); // we expect 8 bins after we add 16 files 32MB each as they will form additional 4 bins - Assert.assertEquals(8, Iterables.size(posDeletesTable.newBatchScan().planTasks())); + assertThat(posDeletesTable.newBatchScan().planTasks()).hasSize(8); } - @Test + @TestTemplate public void testBasicSplitPlanningDeleteFilesWithSplitOffsets() { table.updateProperties().set(TableProperties.FORMAT_VERSION, "2").commit(); List files128Mb = newDeleteFiles(4, 128 * 1024 
* 1024, 8); @@ -266,19 +262,19 @@ public void testBasicSplitPlanningDeleteFilesWithSplitOffsets() { long previousOffset = -1; for (ScanTask task : group.tasks()) { tasksPerGroup++; - Assert.assertTrue(task instanceof SplitPositionDeletesScanTask); + assertThat(task).isInstanceOf(SplitPositionDeletesScanTask.class); SplitPositionDeletesScanTask splitPosDelTask = (SplitPositionDeletesScanTask) task; if (previousOffset != -1) { - Assert.assertEquals(splitPosDelTask.start(), previousOffset); + assertThat(previousOffset).isEqualTo(splitPosDelTask.start()); } previousOffset = splitPosDelTask.start() + splitPosDelTask.length(); } - Assert.assertEquals("Should have 1 task as result of task merge", 1, tasksPerGroup); + assertThat(tasksPerGroup).isEqualTo(1); totalTaskGroups++; } // we expect 8 bins since split size is 64MB - Assert.assertEquals(8, totalTaskGroups); + assertThat(totalTaskGroups).isEqualTo(8); } catch (IOException e) { throw new RuntimeException(e); } diff --git a/core/src/test/java/org/apache/iceberg/TestV1ToV2RowDeltaDelete.java b/core/src/test/java/org/apache/iceberg/TestV1ToV2RowDeltaDelete.java index 6d38ab427315..28695e2fffe0 100644 --- a/core/src/test/java/org/apache/iceberg/TestV1ToV2RowDeltaDelete.java +++ b/core/src/test/java/org/apache/iceberg/TestV1ToV2RowDeltaDelete.java @@ -20,19 +20,18 @@ import static org.apache.iceberg.expressions.Expressions.bucket; import static org.apache.iceberg.expressions.Expressions.equal; +import static org.assertj.core.api.Assertions.assertThat; +import java.util.Arrays; import java.util.List; -import java.util.Optional; -import org.apache.iceberg.relocated.com.google.common.collect.Iterables; import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.relocated.com.google.common.collect.Sets; -import org.junit.Assert; -import org.junit.Test; +import org.junit.jupiter.api.TestTemplate; -public class TestV1ToV2RowDeltaDelete extends TableTestBase { +public class 
TestV1ToV2RowDeltaDelete extends TestBase { - public TestV1ToV2RowDeltaDelete() { - super(1 /* table format version */); + @Parameters(name = "formatVersion = {0}") + protected static List parameters() { + return Arrays.asList(1); } static final DeleteFile FILE_A_POS_1 = @@ -55,20 +54,18 @@ public TestV1ToV2RowDeltaDelete() { private void verifyManifestSequenceNumber( ManifestFile mf, long sequenceNum, long minSequenceNum) { - Assert.assertEquals( - "sequence number should be " + sequenceNum, mf.sequenceNumber(), sequenceNum); - Assert.assertEquals( - "min sequence number should be " + minSequenceNum, mf.minSequenceNumber(), minSequenceNum); + assertThat(sequenceNum).isEqualTo(mf.sequenceNumber()); + assertThat(minSequenceNum).isEqualTo(mf.minSequenceNumber()); } - @Test + @TestTemplate public void testPartitionedTableWithPartitionEqDeletes() { table.newAppend().appendFile(FILE_A).appendFile(FILE_B).appendFile(FILE_C).commit(); List dataManifests = table.currentSnapshot().dataManifests(table.io()); List deleteManifests = table.currentSnapshot().deleteManifests(table.io()); - Assert.assertEquals("Should have one data manifest file", 1, dataManifests.size()); - Assert.assertEquals("Should have zero delete manifest file", 0, deleteManifests.size()); + assertThat(dataManifests).hasSize(1); + assertThat(deleteManifests).isEmpty(); ManifestFile dataManifest = dataManifests.get(0); verifyManifestSequenceNumber(dataManifest, 0, 0); @@ -81,56 +78,51 @@ public void testPartitionedTableWithPartitionEqDeletes() { dataManifests = table.currentSnapshot().dataManifests(ops.io()); deleteManifests = table.currentSnapshot().deleteManifests(ops.io()); - Assert.assertEquals("Should have one data manifest file", 1, dataManifests.size()); - Assert.assertEquals("Should have one delete manifest file", 1, deleteManifests.size()); - Assert.assertEquals(dataManifest, dataManifests.get(0)); // data manifest not changed + assertThat(dataManifests).hasSize(1).first().isEqualTo(dataManifest); + 
assertThat(deleteManifests).hasSize(1); ManifestFile deleteManifest = deleteManifests.get(0); verifyManifestSequenceNumber(deleteManifest, 1, 1); - List tasks = Lists.newArrayList(table.newScan().planFiles().iterator()); - Assert.assertEquals("Should have three task", 3, tasks.size()); - Optional task = - tasks.stream().filter(t -> t.file().path().equals(FILE_A.path())).findFirst(); - Assert.assertTrue(task.isPresent()); - Assert.assertEquals("Should have one associated delete file", 1, task.get().deletes().size()); - Assert.assertEquals( - "Should have only pos delete file", FILE_A_EQ_1.path(), task.get().deletes().get(0).path()); + assertThat(table.newScan().planFiles()) + .hasSize(3) + .filteredOn(fileScanTask -> fileScanTask.file().path().equals(FILE_A.path())) + .first() + .satisfies( + fileScanTask -> { + assertThat(fileScanTask.deletes()).hasSize(1); + assertThat(fileScanTask.deletes().get(0).path()).isEqualTo(FILE_A_EQ_1.path()); + }); // first commit after row-delta changes table.newDelete().deleteFile(FILE_B).commit(); dataManifests = table.currentSnapshot().dataManifests(ops.io()); deleteManifests = table.currentSnapshot().deleteManifests(ops.io()); - Assert.assertEquals("Should have one data manifest file", 1, dataManifests.size()); - Assert.assertEquals("Should have one delete manifest file", 1, deleteManifests.size()); + assertThat(dataManifests).hasSize(1).first().isNotEqualTo(dataManifest); + assertThat(deleteManifests).hasSize(1).first().isEqualTo(deleteManifest); ManifestFile dataManifest2 = dataManifests.get(0); verifyManifestSequenceNumber(dataManifest2, 2, 0); - Assert.assertNotEquals(dataManifest, dataManifest2); - Assert.assertEquals(deleteManifest, deleteManifests.get(0)); // delete manifest not changed - tasks = Lists.newArrayList(table.newScan().planFiles().iterator()); - Assert.assertEquals("Should have two task", 2, tasks.size()); - task = tasks.stream().filter(t -> t.file().path().equals(FILE_A.path())).findFirst(); - 
Assert.assertTrue(task.isPresent()); - Assert.assertEquals("Should have one associated delete file", 1, task.get().deletes().size()); + assertThat(table.newScan().planFiles()) + .hasSize(2) + .filteredOn(fileScanTask -> fileScanTask.file().path().equals(FILE_A.path())) + .first() + .satisfies(fileScanTask -> assertThat(fileScanTask.deletes()).hasSize(1)); // second commit after row-delta changes table.newDelete().deleteFile(FILE_C).commit(); dataManifests = table.currentSnapshot().dataManifests(ops.io()); deleteManifests = table.currentSnapshot().deleteManifests(ops.io()); - Assert.assertEquals("Should have one data manifest file", 1, dataManifests.size()); - Assert.assertEquals("Should have one delete manifest file", 1, deleteManifests.size()); - ManifestFile dataManifest3 = dataManifests.get(0); - verifyManifestSequenceNumber(dataManifest3, 3, 0); - Assert.assertNotEquals(dataManifest2, dataManifest3); - Assert.assertEquals(deleteManifest, deleteManifests.get(0)); // delete manifest not changed - tasks = Lists.newArrayList(table.newScan().planFiles().iterator()); - Assert.assertEquals("Should have one task", 1, tasks.size()); - task = tasks.stream().filter(t -> t.file().path().equals(FILE_A.path())).findFirst(); - Assert.assertTrue(task.isPresent()); - Assert.assertEquals("Should have one associated delete file", 1, task.get().deletes().size()); + assertThat(dataManifests).hasSize(1).first().isNotEqualTo(dataManifest2); + assertThat(deleteManifests).hasSize(1).first().isEqualTo(deleteManifest); + verifyManifestSequenceNumber(dataManifests.get(0), 3, 0); + assertThat(table.newScan().planFiles()) + .hasSize(1) + .filteredOn(fileScanTask -> fileScanTask.file().path().equals(FILE_A.path())) + .first() + .satisfies(fileScanTask -> assertThat(fileScanTask.deletes()).hasSize(1)); } - @Test + @TestTemplate public void testPartitionedTableWithUnrelatedPartitionDeletes() { table.newAppend().appendFile(FILE_B).appendFile(FILE_C).appendFile(FILE_D).commit(); @@ -141,27 
+133,29 @@ public void testPartitionedTableWithUnrelatedPartitionDeletes() { table.newRowDelta().addDeletes(FILE_A_POS_1).addDeletes(FILE_A_EQ_1).commit(); - List tasks = Lists.newArrayList(table.newScan().planFiles().iterator()); - Assert.assertEquals("Should have three task", 3, tasks.size()); - Assert.assertEquals( - "Should have the correct data file path", FILE_B.path(), tasks.get(0).file().path()); - Assert.assertEquals( - "Should have zero associated delete file", 0, tasks.get(0).deletes().size()); + assertThat(table.newScan().planFiles()) + .hasSize(3) + .first() + .satisfies( + fileScanTask -> { + assertThat(fileScanTask.file().path()).isEqualTo(FILE_B.path()); + assertThat(fileScanTask.deletes()).isEmpty(); + }); table.newDelete().deleteFile(FILE_B).commit(); - tasks = Lists.newArrayList(table.newScan().planFiles().iterator()); - Assert.assertEquals("Should have two task", 2, tasks.size()); - Assert.assertEquals( - "Should have zero associated delete file", 0, tasks.get(0).deletes().size()); + assertThat(table.newScan().planFiles()) + .hasSize(2) + .first() + .satisfies(fileScanTask -> assertThat(fileScanTask.deletes()).isEmpty()); table.newDelete().deleteFile(FILE_C).commit(); - tasks = Lists.newArrayList(table.newScan().planFiles().iterator()); - Assert.assertEquals("Should have one task", 1, tasks.size()); - Assert.assertEquals( - "Should have zero associated delete file", 0, tasks.get(0).deletes().size()); + assertThat(table.newScan().planFiles()) + .hasSize(1) + .first() + .satisfies(fileScanTask -> assertThat(fileScanTask.deletes()).isEmpty()); } - @Test + @TestTemplate public void testPartitionedTableWithExistingDeleteFile() { table.updateProperties().set(TableProperties.MANIFEST_MERGE_ENABLED, "false").commit(); @@ -182,30 +176,18 @@ public void testPartitionedTableWithExistingDeleteFile() { .set(TableProperties.MANIFEST_MERGE_ENABLED, "true") .commit(); - Assert.assertEquals( - "Should have two delete manifests", - 2, - 
table.currentSnapshot().deleteManifests(table.io()).size()); + assertThat(table.currentSnapshot().deleteManifests(table.io())).hasSize(2); // merge delete manifests table.newAppend().appendFile(FILE_B).commit(); - Assert.assertEquals( - "Should have one delete manifest", - 1, - table.currentSnapshot().deleteManifests(table.io()).size()); - Assert.assertEquals( - "Should have zero added delete file", - 0, - table.currentSnapshot().deleteManifests(table.io()).get(0).addedFilesCount().intValue()); - Assert.assertEquals( - "Should have zero deleted delete file", - 0, - table.currentSnapshot().deleteManifests(table.io()).get(0).deletedFilesCount().intValue()); - Assert.assertEquals( - "Should have two existing delete files", - 2, - table.currentSnapshot().deleteManifests(table.io()).get(0).existingFilesCount().intValue()); + assertThat(table.currentSnapshot().deleteManifests(table.io())).hasSize(1); + assertThat(table.currentSnapshot().deleteManifests(table.io()).get(0).addedFilesCount()) + .isEqualTo(0); + assertThat(table.currentSnapshot().deleteManifests(table.io()).get(0).deletedFilesCount()) + .isEqualTo(0); + assertThat(table.currentSnapshot().deleteManifests(table.io()).get(0).existingFilesCount()) + .isEqualTo(2); List tasks = Lists.newArrayList( @@ -214,19 +196,16 @@ public void testPartitionedTableWithExistingDeleteFile() { .filter(equal(bucket("data", BUCKETS_NUMBER), 0)) .planFiles() .iterator()); - Assert.assertEquals("Should have one task", 1, tasks.size()); + assertThat(tasks).hasSize(1); FileScanTask task = tasks.get(0); - Assert.assertEquals( - "Should have the correct data file path", FILE_A.path(), task.file().path()); - Assert.assertEquals("Should have two associated delete files", 2, task.deletes().size()); - Assert.assertEquals( - "Should have expected delete files", - Sets.newHashSet(FILE_A_EQ_1.path(), FILE_A_POS_1.path()), - Sets.newHashSet(Iterables.transform(task.deletes(), ContentFile::path))); + 
assertThat(task.file().path()).isEqualTo(FILE_A.path()); + assertThat(task.deletes()).hasSize(2); + assertThat(task.deletes().get(0).path()).isEqualTo(FILE_A_EQ_1.path()); + assertThat(task.deletes().get(1).path()).isEqualTo(FILE_A_POS_1.path()); } - @Test + @TestTemplate public void testSequenceNumbersInUpgradedTables() { // add initial data table.newAppend().appendFile(FILE_A).appendFile(FILE_B).commit(); diff --git a/core/src/test/java/org/apache/iceberg/V2TableTestBase.java b/core/src/test/java/org/apache/iceberg/V2TableTestBase.java index 5e46f927f545..7e9dd5e85a00 100644 --- a/core/src/test/java/org/apache/iceberg/V2TableTestBase.java +++ b/core/src/test/java/org/apache/iceberg/V2TableTestBase.java @@ -18,8 +18,12 @@ */ package org.apache.iceberg; -public class V2TableTestBase extends TableTestBase { - public V2TableTestBase() { - super(2); +import java.util.Arrays; +import java.util.List; + +public class V2TableTestBase extends TestBase { + @Parameters(name = "formatVersion = {0}") + protected static List parameters() { + return Arrays.asList(2); } } From 83bacf501e116c0923eb6e77bff4500cd3d8042f Mon Sep 17 00:00:00 2001 From: Csenger Geza Date: Mon, 25 Mar 2024 19:01:42 +0000 Subject: [PATCH 06/25] Add Iceberg version to UserAgent in S3 requests (#9963) This allows developers to monitor which version of Iceberg they have deployed to a cluster (for example, S3 Access Logs contain the user agent field). 
Co-authored-by: Geza Csenger --- .../org/apache/iceberg/aws/AwsClientFactories.java | 1 + .../aws/s3/DefaultS3FileIOAwsClientFactory.java | 1 + .../apache/iceberg/aws/s3/S3FileIOProperties.java | 14 ++++++++++++++ .../iceberg/aws/s3/TestS3FileIOProperties.java | 10 ++++++++++ 4 files changed, 26 insertions(+) diff --git a/aws/src/main/java/org/apache/iceberg/aws/AwsClientFactories.java b/aws/src/main/java/org/apache/iceberg/aws/AwsClientFactories.java index 580e7303ffca..81c7bd6b4bab 100644 --- a/aws/src/main/java/org/apache/iceberg/aws/AwsClientFactories.java +++ b/aws/src/main/java/org/apache/iceberg/aws/AwsClientFactories.java @@ -113,6 +113,7 @@ public S3Client s3() { b -> s3FileIOProperties.applyCredentialConfigurations(awsClientProperties, b)) .applyMutation(s3FileIOProperties::applySignerConfiguration) .applyMutation(s3FileIOProperties::applyS3AccessGrantsConfigurations) + .applyMutation(s3FileIOProperties::applyUserAgentConfigurations) .build(); } diff --git a/aws/src/main/java/org/apache/iceberg/aws/s3/DefaultS3FileIOAwsClientFactory.java b/aws/src/main/java/org/apache/iceberg/aws/s3/DefaultS3FileIOAwsClientFactory.java index a65910612fe6..18b40000a91a 100644 --- a/aws/src/main/java/org/apache/iceberg/aws/s3/DefaultS3FileIOAwsClientFactory.java +++ b/aws/src/main/java/org/apache/iceberg/aws/s3/DefaultS3FileIOAwsClientFactory.java @@ -54,6 +54,7 @@ public S3Client s3() { awsClientProperties, s3ClientBuilder)) .applyMutation(s3FileIOProperties::applySignerConfiguration) .applyMutation(s3FileIOProperties::applyS3AccessGrantsConfigurations) + .applyMutation(s3FileIOProperties::applyUserAgentConfigurations) .build(); } } diff --git a/aws/src/main/java/org/apache/iceberg/aws/s3/S3FileIOProperties.java b/aws/src/main/java/org/apache/iceberg/aws/s3/S3FileIOProperties.java index 9aad784be865..857f35e710ab 100644 --- a/aws/src/main/java/org/apache/iceberg/aws/s3/S3FileIOProperties.java +++ b/aws/src/main/java/org/apache/iceberg/aws/s3/S3FileIOProperties.java @@ 
-24,6 +24,7 @@ import java.util.Map; import java.util.Set; import java.util.stream.Collectors; +import org.apache.iceberg.EnvironmentContext; import org.apache.iceberg.aws.AwsClientProperties; import org.apache.iceberg.aws.glue.GlueCatalog; import org.apache.iceberg.aws.s3.signer.S3V4RestSignerClient; @@ -375,6 +376,14 @@ public class S3FileIOProperties implements Serializable { public static final boolean PRELOAD_CLIENT_ENABLED_DEFAULT = false; + /** + * User Agent Prefix set by the S3 client. + * + *

This allows developers to monitor which version of Iceberg they have deployed to a cluster + * (for example, through the S3 Access Logs, which contain the user agent field). + */ + private static final String S3_FILE_IO_USER_AGENT = "s3fileio/" + EnvironmentContext.get(); + private String sseType; private String sseKey; private String sseMd5; @@ -819,6 +828,11 @@ public void applyS3AccessGrantsConfigurations(T buil } } + public void applyUserAgentConfigurations(T builder) { + builder.overrideConfiguration( + c -> c.putAdvancedOption(SdkAdvancedClientOption.USER_AGENT_PREFIX, S3_FILE_IO_USER_AGENT)); + } + /** * Dynamically load the http client builder to avoid runtime deps requirements of any optional SDK * Plugins diff --git a/aws/src/test/java/org/apache/iceberg/aws/s3/TestS3FileIOProperties.java b/aws/src/test/java/org/apache/iceberg/aws/s3/TestS3FileIOProperties.java index 2ed8a9471d66..658b5b781969 100644 --- a/aws/src/test/java/org/apache/iceberg/aws/s3/TestS3FileIOProperties.java +++ b/aws/src/test/java/org/apache/iceberg/aws/s3/TestS3FileIOProperties.java @@ -478,4 +478,14 @@ public void testApplyEndpointConfiguration() { s3FileIOProperties.applyEndpointConfigurations(mockS3ClientBuilder); Mockito.verify(mockS3ClientBuilder).endpointOverride(Mockito.any(URI.class)); } + + @Test + public void testApplyUserAgentConfigurations() { + Map properties = Maps.newHashMap(); + S3FileIOProperties s3FileIOProperties = new S3FileIOProperties(properties); + S3ClientBuilder mockS3ClientBuilder = Mockito.mock(S3ClientBuilder.class); + s3FileIOProperties.applyUserAgentConfigurations(mockS3ClientBuilder); + + Mockito.verify(mockS3ClientBuilder).overrideConfiguration(Mockito.any(Consumer.class)); + } } From 602186bedc7f5be270f9a0cd5c2c15da7bddcdb9 Mon Sep 17 00:00:00 2001 From: Amogh Jahagirdar Date: Mon, 25 Mar 2024 15:03:33 -0700 Subject: [PATCH 07/25] Core, Spark: Fix handling of null binary values when sorting with zorder (#10026) --- 
.../apache/iceberg/util/ZOrderByteUtils.java | 5 +++ .../iceberg/util/TestZOrderByteUtil.java | 10 ++++++ .../TestRewriteDataFilesProcedure.java | 35 +++++++++++++++++++ .../TestRewriteDataFilesProcedure.java | 35 +++++++++++++++++++ .../TestRewriteDataFilesProcedure.java | 35 +++++++++++++++++++ 5 files changed, 120 insertions(+) diff --git a/core/src/main/java/org/apache/iceberg/util/ZOrderByteUtils.java b/core/src/main/java/org/apache/iceberg/util/ZOrderByteUtils.java index 923f3dc2d5c6..c687fc4e03dc 100644 --- a/core/src/main/java/org/apache/iceberg/util/ZOrderByteUtils.java +++ b/core/src/main/java/org/apache/iceberg/util/ZOrderByteUtils.java @@ -145,6 +145,11 @@ public static ByteBuffer stringToOrderedBytes( @SuppressWarnings("ByteBufferBackingArray") public static ByteBuffer byteTruncateOrFill(byte[] val, int length, ByteBuffer reuse) { ByteBuffer bytes = ByteBuffers.reuse(reuse, length); + if (val == null) { + Arrays.fill(bytes.array(), 0, length, (byte) 0x00); + return bytes; + } + if (val.length < length) { bytes.put(val, 0, val.length); Arrays.fill(bytes.array(), val.length, length, (byte) 0x00); diff --git a/core/src/test/java/org/apache/iceberg/util/TestZOrderByteUtil.java b/core/src/test/java/org/apache/iceberg/util/TestZOrderByteUtil.java index 13e7c843c79f..d05843e20849 100644 --- a/core/src/test/java/org/apache/iceberg/util/TestZOrderByteUtil.java +++ b/core/src/test/java/org/apache/iceberg/util/TestZOrderByteUtil.java @@ -391,4 +391,14 @@ public void testByteTruncateOrFill() { .isEqualTo(stringCompare); } } + + @Test + public void testByteTruncatedOrFillNullIsZeroArray() { + ByteBuffer buffer = ByteBuffer.allocate(128); + byte[] actualBytes = ZOrderByteUtils.byteTruncateOrFill(null, 128, buffer).array(); + ByteBuffer expected = ByteBuffer.allocate(128); + Arrays.fill(expected.array(), 0, 128, (byte) 0x00); + + assertThat(actualBytes).isEqualTo(expected.array()); + } } diff --git 
a/spark/v3.3/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestRewriteDataFilesProcedure.java b/spark/v3.3/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestRewriteDataFilesProcedure.java index 0cdde158bde3..80cacbf376ac 100644 --- a/spark/v3.3/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestRewriteDataFilesProcedure.java +++ b/spark/v3.3/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestRewriteDataFilesProcedure.java @@ -218,6 +218,41 @@ public void testRewriteDataFilesWithZOrder() { assertEquals("Should have expected rows", expectedRows, sql("SELECT * FROM %s", tableName)); } + @Test + public void testRewriteDataFilesWithZOrderNullBinaryColumn() { + sql("CREATE TABLE %s (c1 int, c2 string, c3 binary) USING iceberg", tableName); + + for (int i = 0; i < 5; i++) { + sql("INSERT INTO %s values (1, 'foo', null), (2, 'bar', null)", tableName); + } + + List output = + sql( + "CALL %s.system.rewrite_data_files(table => '%s', " + + "strategy => 'sort', sort_order => 'zorder(c2,c3)')", + catalogName, tableIdent); + + assertEquals( + "Action should rewrite 10 data files and add 1 data files", + row(10, 1), + Arrays.copyOf(output.get(0), 2)); + assertThat(output.get(0)).hasSize(3); + assertThat(snapshotSummary()) + .containsEntry(SnapshotSummary.REMOVED_FILE_SIZE_PROP, String.valueOf(output.get(0)[2])); + assertThat(sql("SELECT * FROM %s", tableName)) + .containsExactly( + row(2, "bar", null), + row(2, "bar", null), + row(2, "bar", null), + row(2, "bar", null), + row(2, "bar", null), + row(1, "foo", null), + row(1, "foo", null), + row(1, "foo", null), + row(1, "foo", null), + row(1, "foo", null)); + } + @Test public void testRewriteDataFilesWithFilter() { createTable(); diff --git a/spark/v3.4/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestRewriteDataFilesProcedure.java 
b/spark/v3.4/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestRewriteDataFilesProcedure.java index 477aa2a1d958..009ab41228c8 100644 --- a/spark/v3.4/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestRewriteDataFilesProcedure.java +++ b/spark/v3.4/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestRewriteDataFilesProcedure.java @@ -261,6 +261,41 @@ public void testRewriteDataFilesWithZOrder() { assertEquals("Should have expected rows", expectedRows, sql("SELECT * FROM %s", tableName)); } + @Test + public void testRewriteDataFilesWithZOrderNullBinaryColumn() { + sql("CREATE TABLE %s (c1 int, c2 string, c3 binary) USING iceberg", tableName); + + for (int i = 0; i < 5; i++) { + sql("INSERT INTO %s values (1, 'foo', null), (2, 'bar', null)", tableName); + } + + List output = + sql( + "CALL %s.system.rewrite_data_files(table => '%s', " + + "strategy => 'sort', sort_order => 'zorder(c2,c3)')", + catalogName, tableIdent); + + assertEquals( + "Action should rewrite 10 data files and add 1 data files", + row(10, 1), + Arrays.copyOf(output.get(0), 2)); + assertThat(output.get(0)).hasSize(4); + assertThat(snapshotSummary()) + .containsEntry(SnapshotSummary.REMOVED_FILE_SIZE_PROP, String.valueOf(output.get(0)[2])); + assertThat(sql("SELECT * FROM %s", tableName)) + .containsExactly( + row(2, "bar", null), + row(2, "bar", null), + row(2, "bar", null), + row(2, "bar", null), + row(2, "bar", null), + row(1, "foo", null), + row(1, "foo", null), + row(1, "foo", null), + row(1, "foo", null), + row(1, "foo", null)); + } + @Test public void testRewriteDataFilesWithZOrderAndMultipleShufflePartitionsPerFile() { createTable(); diff --git a/spark/v3.5/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestRewriteDataFilesProcedure.java b/spark/v3.5/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestRewriteDataFilesProcedure.java index b01438d39dfc..9ba886db4516 100644 --- 
a/spark/v3.5/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestRewriteDataFilesProcedure.java +++ b/spark/v3.5/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestRewriteDataFilesProcedure.java @@ -259,6 +259,41 @@ public void testRewriteDataFilesWithZOrder() { assertEquals("Should have expected rows", expectedRows, sql("SELECT * FROM %s", tableName)); } + @TestTemplate + public void testRewriteDataFilesWithZOrderNullBinaryColumn() { + sql("CREATE TABLE %s (c1 int, c2 string, c3 binary) USING iceberg", tableName); + + for (int i = 0; i < 5; i++) { + sql("INSERT INTO %s values (1, 'foo', null), (2, 'bar', null)", tableName); + } + + List output = + sql( + "CALL %s.system.rewrite_data_files(table => '%s', " + + "strategy => 'sort', sort_order => 'zorder(c2,c3)')", + catalogName, tableIdent); + + assertEquals( + "Action should rewrite 10 data files and add 1 data files", + row(10, 1), + Arrays.copyOf(output.get(0), 2)); + assertThat(output.get(0)).hasSize(4); + assertThat(snapshotSummary()) + .containsEntry(SnapshotSummary.REMOVED_FILE_SIZE_PROP, String.valueOf(output.get(0)[2])); + assertThat(sql("SELECT * FROM %s", tableName)) + .containsExactly( + row(2, "bar", null), + row(2, "bar", null), + row(2, "bar", null), + row(2, "bar", null), + row(2, "bar", null), + row(1, "foo", null), + row(1, "foo", null), + row(1, "foo", null), + row(1, "foo", null), + row(1, "foo", null)); + } + @TestTemplate public void testRewriteDataFilesWithZOrderAndMultipleShufflePartitionsPerFile() { createTable(); From 817a5e1be1616af77329965ac3742c14ca3ae116 Mon Sep 17 00:00:00 2001 From: Naveen Kumar Date: Tue, 26 Mar 2024 12:19:22 +0530 Subject: [PATCH 08/25] Hive: Extract common code to be re-used for View support (#10001) --- .../apache/iceberg/BaseMetastoreCatalog.java | 21 +--- .../iceberg/BaseMetastoreOperations.java | 118 ++++++++++++++++++ .../iceberg/BaseMetastoreTableOperations.java | 100 +++++---------- 
.../java/org/apache/iceberg/CatalogUtil.java | 24 ++++ .../iceberg/view/BaseViewOperations.java | 3 +- .../org/apache/iceberg/TestCatalogUtil.java | 26 ++++ .../org/apache/iceberg/hive/HiveCatalog.java | 52 +++++--- .../iceberg/hive/HiveOperationsBase.java | 68 +++++++++- .../iceberg/hive/HiveTableOperations.java | 39 ++---- 9 files changed, 314 insertions(+), 137 deletions(-) create mode 100644 core/src/main/java/org/apache/iceberg/BaseMetastoreOperations.java diff --git a/core/src/main/java/org/apache/iceberg/BaseMetastoreCatalog.java b/core/src/main/java/org/apache/iceberg/BaseMetastoreCatalog.java index bb7d5a0ffd9d..e794b3121dc3 100644 --- a/core/src/main/java/org/apache/iceberg/BaseMetastoreCatalog.java +++ b/core/src/main/java/org/apache/iceberg/BaseMetastoreCatalog.java @@ -285,26 +285,7 @@ private Map tableOverrideProperties() { } protected static String fullTableName(String catalogName, TableIdentifier identifier) { - StringBuilder sb = new StringBuilder(); - - if (catalogName.contains("/") || catalogName.contains(":")) { - // use / for URI-like names: thrift://host:port/db.table - sb.append(catalogName); - if (!catalogName.endsWith("/")) { - sb.append("/"); - } - } else { - // use . for non-URI named catalogs: prod.db.table - sb.append(catalogName).append("."); - } - - for (String level : identifier.namespace().levels()) { - sb.append(level).append("."); - } - - sb.append(identifier.name()); - - return sb.toString(); + return CatalogUtil.fullTableName(catalogName, identifier); } protected MetricsReporter metricsReporter() { diff --git a/core/src/main/java/org/apache/iceberg/BaseMetastoreOperations.java b/core/src/main/java/org/apache/iceberg/BaseMetastoreOperations.java new file mode 100644 index 000000000000..09c2249046f4 --- /dev/null +++ b/core/src/main/java/org/apache/iceberg/BaseMetastoreOperations.java @@ -0,0 +1,118 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg; + +import static org.apache.iceberg.TableProperties.COMMIT_NUM_STATUS_CHECKS; +import static org.apache.iceberg.TableProperties.COMMIT_NUM_STATUS_CHECKS_DEFAULT; +import static org.apache.iceberg.TableProperties.COMMIT_STATUS_CHECKS_MAX_WAIT_MS; +import static org.apache.iceberg.TableProperties.COMMIT_STATUS_CHECKS_MAX_WAIT_MS_DEFAULT; +import static org.apache.iceberg.TableProperties.COMMIT_STATUS_CHECKS_MIN_WAIT_MS; +import static org.apache.iceberg.TableProperties.COMMIT_STATUS_CHECKS_MIN_WAIT_MS_DEFAULT; +import static org.apache.iceberg.TableProperties.COMMIT_STATUS_CHECKS_TOTAL_WAIT_MS; +import static org.apache.iceberg.TableProperties.COMMIT_STATUS_CHECKS_TOTAL_WAIT_MS_DEFAULT; + +import java.util.Map; +import java.util.concurrent.atomic.AtomicReference; +import java.util.function.Supplier; +import org.apache.iceberg.util.PropertyUtil; +import org.apache.iceberg.util.Tasks; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public abstract class BaseMetastoreOperations { + private static final Logger LOG = LoggerFactory.getLogger(BaseMetastoreOperations.class); + + public enum CommitStatus { + FAILURE, + SUCCESS, + UNKNOWN + } + + /** + * Attempt to load the content and see if any current or past metadata location matches 
the one we + * were attempting to set. This is used as a last resort when we are dealing with exceptions that + * may indicate the commit has failed but don't have proof that this is the case. Note that all + * the previous locations must also be searched on the chance that a second committer was able to + * successfully commit on top of our commit. + * + * @param tableOrViewName full name of the Table/View + * @param newMetadataLocation the path of the new commit file + * @param properties properties for retry + * @param commitStatusSupplier check if the latest metadata presents or not using metadata + * location for table. + * @return Commit Status of Success, Failure or Unknown + */ + protected CommitStatus checkCommitStatus( + String tableOrViewName, + String newMetadataLocation, + Map properties, + Supplier commitStatusSupplier) { + int maxAttempts = + PropertyUtil.propertyAsInt( + properties, COMMIT_NUM_STATUS_CHECKS, COMMIT_NUM_STATUS_CHECKS_DEFAULT); + long minWaitMs = + PropertyUtil.propertyAsLong( + properties, COMMIT_STATUS_CHECKS_MIN_WAIT_MS, COMMIT_STATUS_CHECKS_MIN_WAIT_MS_DEFAULT); + long maxWaitMs = + PropertyUtil.propertyAsLong( + properties, COMMIT_STATUS_CHECKS_MAX_WAIT_MS, COMMIT_STATUS_CHECKS_MAX_WAIT_MS_DEFAULT); + long totalRetryMs = + PropertyUtil.propertyAsLong( + properties, + COMMIT_STATUS_CHECKS_TOTAL_WAIT_MS, + COMMIT_STATUS_CHECKS_TOTAL_WAIT_MS_DEFAULT); + + AtomicReference status = new AtomicReference<>(CommitStatus.UNKNOWN); + + Tasks.foreach(newMetadataLocation) + .retry(maxAttempts) + .suppressFailureWhenFinished() + .exponentialBackoff(minWaitMs, maxWaitMs, totalRetryMs, 2.0) + .onFailure( + (location, checkException) -> + LOG.error("Cannot check if commit to {} exists.", tableOrViewName, checkException)) + .run( + location -> { + boolean commitSuccess = commitStatusSupplier.get(); + + if (commitSuccess) { + LOG.info( + "Commit status check: Commit to {} of {} succeeded", + tableOrViewName, + newMetadataLocation); + 
status.set(CommitStatus.SUCCESS); + } else { + LOG.warn( + "Commit status check: Commit to {} of {} unknown, new metadata location is not current " + + "or in history", + tableOrViewName, + newMetadataLocation); + } + }); + + if (status.get() == CommitStatus.UNKNOWN) { + LOG.error( + "Cannot determine commit state to {}. Failed during checking {} times. " + + "Treating commit state as unknown.", + tableOrViewName, + maxAttempts); + } + return status.get(); + } +} diff --git a/core/src/main/java/org/apache/iceberg/BaseMetastoreTableOperations.java b/core/src/main/java/org/apache/iceberg/BaseMetastoreTableOperations.java index 2fccef5a0ab3..6443cf6e60ea 100644 --- a/core/src/main/java/org/apache/iceberg/BaseMetastoreTableOperations.java +++ b/core/src/main/java/org/apache/iceberg/BaseMetastoreTableOperations.java @@ -18,15 +18,6 @@ */ package org.apache.iceberg; -import static org.apache.iceberg.TableProperties.COMMIT_NUM_STATUS_CHECKS; -import static org.apache.iceberg.TableProperties.COMMIT_NUM_STATUS_CHECKS_DEFAULT; -import static org.apache.iceberg.TableProperties.COMMIT_STATUS_CHECKS_MAX_WAIT_MS; -import static org.apache.iceberg.TableProperties.COMMIT_STATUS_CHECKS_MAX_WAIT_MS_DEFAULT; -import static org.apache.iceberg.TableProperties.COMMIT_STATUS_CHECKS_MIN_WAIT_MS; -import static org.apache.iceberg.TableProperties.COMMIT_STATUS_CHECKS_MIN_WAIT_MS_DEFAULT; -import static org.apache.iceberg.TableProperties.COMMIT_STATUS_CHECKS_TOTAL_WAIT_MS; -import static org.apache.iceberg.TableProperties.COMMIT_STATUS_CHECKS_TOTAL_WAIT_MS_DEFAULT; - import java.util.Set; import java.util.UUID; import java.util.concurrent.atomic.AtomicReference; @@ -44,12 +35,12 @@ import org.apache.iceberg.relocated.com.google.common.base.Preconditions; import org.apache.iceberg.relocated.com.google.common.collect.Sets; import org.apache.iceberg.util.LocationUtil; -import org.apache.iceberg.util.PropertyUtil; import org.apache.iceberg.util.Tasks; import org.slf4j.Logger; import 
org.slf4j.LoggerFactory; -public abstract class BaseMetastoreTableOperations implements TableOperations { +public abstract class BaseMetastoreTableOperations extends BaseMetastoreOperations + implements TableOperations { private static final Logger LOG = LoggerFactory.getLogger(BaseMetastoreTableOperations.class); public static final String TABLE_TYPE_PROP = "table_type"; @@ -291,6 +282,11 @@ public long newSnapshotId() { }; } + /** + * @deprecated since 1.6.0, will be removed in 1.7.0; Use {@link + * BaseMetastoreOperations.CommitStatus} instead + */ + @Deprecated protected enum CommitStatus { FAILURE, SUCCESS, @@ -309,65 +305,29 @@ protected enum CommitStatus { * @return Commit Status of Success, Failure or Unknown */ protected CommitStatus checkCommitStatus(String newMetadataLocation, TableMetadata config) { - int maxAttempts = - PropertyUtil.propertyAsInt( - config.properties(), COMMIT_NUM_STATUS_CHECKS, COMMIT_NUM_STATUS_CHECKS_DEFAULT); - long minWaitMs = - PropertyUtil.propertyAsLong( - config.properties(), - COMMIT_STATUS_CHECKS_MIN_WAIT_MS, - COMMIT_STATUS_CHECKS_MIN_WAIT_MS_DEFAULT); - long maxWaitMs = - PropertyUtil.propertyAsLong( - config.properties(), - COMMIT_STATUS_CHECKS_MAX_WAIT_MS, - COMMIT_STATUS_CHECKS_MAX_WAIT_MS_DEFAULT); - long totalRetryMs = - PropertyUtil.propertyAsLong( - config.properties(), - COMMIT_STATUS_CHECKS_TOTAL_WAIT_MS, - COMMIT_STATUS_CHECKS_TOTAL_WAIT_MS_DEFAULT); - - AtomicReference status = new AtomicReference<>(CommitStatus.UNKNOWN); - - Tasks.foreach(newMetadataLocation) - .retry(maxAttempts) - .suppressFailureWhenFinished() - .exponentialBackoff(minWaitMs, maxWaitMs, totalRetryMs, 2.0) - .onFailure( - (location, checkException) -> - LOG.error("Cannot check if commit to {} exists.", tableName(), checkException)) - .run( - location -> { - TableMetadata metadata = refresh(); - String currentMetadataFileLocation = metadata.metadataFileLocation(); - boolean commitSuccess = - 
currentMetadataFileLocation.equals(newMetadataLocation) - || metadata.previousFiles().stream() - .anyMatch(log -> log.file().equals(newMetadataLocation)); - if (commitSuccess) { - LOG.info( - "Commit status check: Commit to {} of {} succeeded", - tableName(), - newMetadataLocation); - status.set(CommitStatus.SUCCESS); - } else { - LOG.warn( - "Commit status check: Commit to {} of {} unknown, new metadata location is not current " - + "or in history", - tableName(), - newMetadataLocation); - } - }); - - if (status.get() == CommitStatus.UNKNOWN) { - LOG.error( - "Cannot determine commit state to {}. Failed during checking {} times. " - + "Treating commit state as unknown.", - tableName(), - maxAttempts); - } - return status.get(); + return CommitStatus.valueOf( + checkCommitStatus( + tableName(), + newMetadataLocation, + config.properties(), + () -> checkCurrentMetadataLocation(newMetadataLocation)) + .name()); + } + + /** + * Validate if the new metadata location is the current metadata location or present within + * previous metadata files. + * + * @param newMetadataLocation newly written metadata location + * @return true if the new metadata location is the current metadata location or present within + * previous metadata files. 
+ */ + private boolean checkCurrentMetadataLocation(String newMetadataLocation) { + TableMetadata metadata = refresh(); + String currentMetadataFileLocation = metadata.metadataFileLocation(); + return currentMetadataFileLocation.equals(newMetadataLocation) + || metadata.previousFiles().stream() + .anyMatch(log -> log.file().equals(newMetadataLocation)); } private String newTableMetadataFilePath(TableMetadata meta, int newVersion) { diff --git a/core/src/main/java/org/apache/iceberg/CatalogUtil.java b/core/src/main/java/org/apache/iceberg/CatalogUtil.java index e09be748f2ee..4846dbb8e9a9 100644 --- a/core/src/main/java/org/apache/iceberg/CatalogUtil.java +++ b/core/src/main/java/org/apache/iceberg/CatalogUtil.java @@ -27,6 +27,7 @@ import java.util.Map; import java.util.Set; import org.apache.iceberg.catalog.Catalog; +import org.apache.iceberg.catalog.TableIdentifier; import org.apache.iceberg.common.DynClasses; import org.apache.iceberg.common.DynConstructors; import org.apache.iceberg.common.DynMethods; @@ -473,4 +474,27 @@ public static MetricsReporter loadMetricsReporter(Map properties return reporter; } + + public static String fullTableName(String catalogName, TableIdentifier identifier) { + StringBuilder sb = new StringBuilder(); + + if (catalogName.contains("/") || catalogName.contains(":")) { + // use / for URI-like names: thrift://host:port/db.table + sb.append(catalogName); + if (!catalogName.endsWith("/")) { + sb.append("/"); + } + } else { + // use . 
for non-URI named catalogs: prod.db.table + sb.append(catalogName).append("."); + } + + for (String level : identifier.namespace().levels()) { + sb.append(level).append("."); + } + + sb.append(identifier.name()); + + return sb.toString(); + } } diff --git a/core/src/main/java/org/apache/iceberg/view/BaseViewOperations.java b/core/src/main/java/org/apache/iceberg/view/BaseViewOperations.java index 766d217346e0..7a4f546b8860 100644 --- a/core/src/main/java/org/apache/iceberg/view/BaseViewOperations.java +++ b/core/src/main/java/org/apache/iceberg/view/BaseViewOperations.java @@ -22,6 +22,7 @@ import java.util.concurrent.atomic.AtomicReference; import java.util.function.Function; import java.util.function.Predicate; +import org.apache.iceberg.BaseMetastoreOperations; import org.apache.iceberg.TableMetadataParser; import org.apache.iceberg.exceptions.AlreadyExistsException; import org.apache.iceberg.exceptions.CommitFailedException; @@ -35,7 +36,7 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -public abstract class BaseViewOperations implements ViewOperations { +public abstract class BaseViewOperations extends BaseMetastoreOperations implements ViewOperations { private static final Logger LOG = LoggerFactory.getLogger(BaseViewOperations.class); private static final String METADATA_FOLDER_NAME = "metadata"; diff --git a/core/src/test/java/org/apache/iceberg/TestCatalogUtil.java b/core/src/test/java/org/apache/iceberg/TestCatalogUtil.java index 5ac0ebbafc90..878ca36a10ef 100644 --- a/core/src/test/java/org/apache/iceberg/TestCatalogUtil.java +++ b/core/src/test/java/org/apache/iceberg/TestCatalogUtil.java @@ -223,6 +223,32 @@ public void loadCustomMetricsReporter_badClass() { .hasMessageContaining("does not implement MetricsReporter"); } + @Test + public void fullTableNameWithDifferentValues() { + String uriTypeCatalogName = "thrift://host:port/db.table"; + String namespace = "ns"; + String nameSpaceWithTwoLevels = "ns.l2"; + String tableName = "tbl"; + 
TableIdentifier tableIdentifier = TableIdentifier.of(namespace, tableName); + Assertions.assertThat(CatalogUtil.fullTableName(uriTypeCatalogName, tableIdentifier)) + .isEqualTo(uriTypeCatalogName + "/" + namespace + "." + tableName); + + tableIdentifier = TableIdentifier.of(nameSpaceWithTwoLevels, tableName); + Assertions.assertThat(CatalogUtil.fullTableName(uriTypeCatalogName, tableIdentifier)) + .isEqualTo(uriTypeCatalogName + "/" + nameSpaceWithTwoLevels + "." + tableName); + + Assertions.assertThat(CatalogUtil.fullTableName(uriTypeCatalogName + "/", tableIdentifier)) + .isEqualTo(uriTypeCatalogName + "/" + nameSpaceWithTwoLevels + "." + tableName); + + String nonUriCatalogName = "test.db.catalog"; + Assertions.assertThat(CatalogUtil.fullTableName(nonUriCatalogName, tableIdentifier)) + .isEqualTo(nonUriCatalogName + "." + nameSpaceWithTwoLevels + "." + tableName); + + String pathStyleCatalogName = "/test/db"; + Assertions.assertThat(CatalogUtil.fullTableName(pathStyleCatalogName, tableIdentifier)) + .isEqualTo(pathStyleCatalogName + "/" + nameSpaceWithTwoLevels + "." 
+ tableName); + } + public static class TestCatalog extends BaseMetastoreCatalog { private String catalogName; diff --git a/hive-metastore/src/main/java/org/apache/iceberg/hive/HiveCatalog.java b/hive-metastore/src/main/java/org/apache/iceberg/hive/HiveCatalog.java index 34f9e1da4319..b4f49e29fc49 100644 --- a/hive-metastore/src/main/java/org/apache/iceberg/hive/HiveCatalog.java +++ b/hive-metastore/src/main/java/org/apache/iceberg/hive/HiveCatalog.java @@ -129,20 +129,9 @@ public List listTables(Namespace namespace) { .map(t -> TableIdentifier.of(namespace, t)) .collect(Collectors.toList()); } else { - List tableObjects = - clients.run(client -> client.getTableObjectsByName(database, tableNames)); tableIdentifiers = - tableObjects.stream() - .filter( - table -> - table.getParameters() != null - && BaseMetastoreTableOperations.ICEBERG_TABLE_TYPE_VALUE - .equalsIgnoreCase( - table - .getParameters() - .get(BaseMetastoreTableOperations.TABLE_TYPE_PROP))) - .map(table -> TableIdentifier.of(namespace, table.getTableName())) - .collect(Collectors.toList()); + listIcebergTables( + tableNames, namespace, BaseMetastoreTableOperations.ICEBERG_TABLE_TYPE_VALUE); } LOG.debug( @@ -222,6 +211,28 @@ public boolean dropTable(TableIdentifier identifier, boolean purge) { @Override public void renameTable(TableIdentifier from, TableIdentifier originalTo) { + renameTableOrView(from, originalTo, HiveOperationsBase.ContentType.TABLE); + } + + private List listIcebergTables( + List tableNames, Namespace namespace, String tableTypeProp) + throws TException, InterruptedException { + List
tableObjects = + clients.run(client -> client.getTableObjectsByName(namespace.level(0), tableNames)); + return tableObjects.stream() + .filter( + table -> + table.getParameters() != null + && tableTypeProp.equalsIgnoreCase( + table.getParameters().get(BaseMetastoreTableOperations.TABLE_TYPE_PROP))) + .map(table -> TableIdentifier.of(namespace, table.getTableName())) + .collect(Collectors.toList()); + } + + private void renameTableOrView( + TableIdentifier from, + TableIdentifier originalTo, + HiveOperationsBase.ContentType contentType) { if (!isValidIdentifier(from)) { throw new NoSuchTableException("Invalid identifier: %s", from); } @@ -239,7 +250,7 @@ public void renameTable(TableIdentifier from, TableIdentifier originalTo) { try { Table table = clients.run(client -> client.getTable(fromDatabase, fromName)); - HiveOperationsBase.validateTableIsIceberg(table, fullTableName(name, from)); + validateTableIsIcebergTableOrView(contentType, table, CatalogUtil.fullTableName(name, from)); table.setDbName(toDatabase); table.setTableName(to.name()); @@ -250,7 +261,7 @@ public void renameTable(TableIdentifier from, TableIdentifier originalTo) { return null; }); - LOG.info("Renamed table from {}, to {}", from, to); + LOG.info("Renamed {} from {}, to {}", contentType.value(), from, to); } catch (NoSuchObjectException e) { throw new NoSuchTableException("Table does not exist: %s", from); @@ -273,6 +284,17 @@ public void renameTable(TableIdentifier from, TableIdentifier originalTo) { } } + private void validateTableIsIcebergTableOrView( + HiveOperationsBase.ContentType contentType, Table table, String fullName) { + switch (contentType) { + case TABLE: + HiveOperationsBase.validateTableIsIceberg(table, fullName); + break; + case VIEW: + throw new UnsupportedOperationException("View is not supported."); + } + } + @Override public void createNamespace(Namespace namespace, Map meta) { Preconditions.checkArgument( diff --git 
a/hive-metastore/src/main/java/org/apache/iceberg/hive/HiveOperationsBase.java b/hive-metastore/src/main/java/org/apache/iceberg/hive/HiveOperationsBase.java index ea24fe4e1133..a93577a35c73 100644 --- a/hive-metastore/src/main/java/org/apache/iceberg/hive/HiveOperationsBase.java +++ b/hive-metastore/src/main/java/org/apache/iceberg/hive/HiveOperationsBase.java @@ -22,11 +22,14 @@ import java.util.Map; import org.apache.hadoop.hive.metastore.IMetaStoreClient; import org.apache.hadoop.hive.metastore.TableType; +import org.apache.hadoop.hive.metastore.api.NoSuchObjectException; import org.apache.hadoop.hive.metastore.api.SerDeInfo; import org.apache.hadoop.hive.metastore.api.StorageDescriptor; import org.apache.hadoop.hive.metastore.api.Table; +import org.apache.iceberg.BaseMetastoreOperations; import org.apache.iceberg.BaseMetastoreTableOperations; import org.apache.iceberg.ClientPool; +import org.apache.iceberg.Schema; import org.apache.iceberg.SchemaParser; import org.apache.iceberg.TableMetadata; import org.apache.iceberg.TableProperties; @@ -52,6 +55,21 @@ interface HiveOperationsBase { String NO_LOCK_EXPECTED_KEY = "expected_parameter_key"; String NO_LOCK_EXPECTED_VALUE = "expected_parameter_value"; + enum ContentType { + TABLE("Table"), + VIEW("View"); + + private final String value; + + ContentType(String value) { + this.value = value; + } + + public String value() { + return value; + } + } + TableType tableType(); ClientPool metaClients(); @@ -62,6 +80,15 @@ interface HiveOperationsBase { String table(); + default Table loadHmsTable() throws TException, InterruptedException { + try { + return metaClients().run(client -> client.getTable(database(), table())); + } catch (NoSuchObjectException nte) { + LOG.trace("Table not found {}", database() + "." + table(), nte); + return null; + } + } + default Map hmsEnvContext(String metadataLocation) { return metadataLocation == null ? 
ImmutableMap.of() @@ -76,11 +103,19 @@ default boolean exposeInHmsProperties() { return maxHiveTablePropertySize() > 0; } + /** + * @deprecated since 1.6.0, will be removed in 1.7.0; Use {@link #setSchema(Schema, Map)} instead + */ + @Deprecated default void setSchema(TableMetadata metadata, Map parameters) { + setSchema(metadata.schema(), parameters); + } + + default void setSchema(Schema schema, Map parameters) { parameters.remove(TableProperties.CURRENT_SCHEMA); - if (exposeInHmsProperties() && metadata.schema() != null) { - String schema = SchemaParser.toJson(metadata.schema()); - setField(parameters, TableProperties.CURRENT_SCHEMA, schema); + if (exposeInHmsProperties() && schema != null) { + String jsonSchema = SchemaParser.toJson(schema); + setField(parameters, TableProperties.CURRENT_SCHEMA, jsonSchema); } } @@ -123,13 +158,23 @@ default void persistTable(Table hmsTable, boolean updateHiveTable, String metada } } + /** + * @deprecated since 1.6.0, will be removed in 1.7.0; Use {@link #storageDescriptor(Schema, + * String, boolean)} instead + */ + @Deprecated static StorageDescriptor storageDescriptor(TableMetadata metadata, boolean hiveEngineEnabled) { + return storageDescriptor(metadata.schema(), metadata.location(), hiveEngineEnabled); + } + static StorageDescriptor storageDescriptor( + Schema schema, String location, boolean hiveEngineEnabled) { final StorageDescriptor storageDescriptor = new StorageDescriptor(); - storageDescriptor.setCols(HiveSchemaUtil.convert(metadata.schema())); - storageDescriptor.setLocation(metadata.location()); + storageDescriptor.setCols(HiveSchemaUtil.convert(schema)); + storageDescriptor.setLocation(location); SerDeInfo serDeInfo = new SerDeInfo(); serDeInfo.setParameters(Maps.newHashMap()); + if (hiveEngineEnabled) { storageDescriptor.setInputFormat("org.apache.iceberg.mr.hive.HiveIcebergInputFormat"); storageDescriptor.setOutputFormat("org.apache.iceberg.mr.hive.HiveIcebergOutputFormat"); @@ -139,6 +184,7 @@ static 
StorageDescriptor storageDescriptor(TableMetadata metadata, boolean hiveE storageDescriptor.setInputFormat("org.apache.hadoop.mapred.FileInputFormat"); serDeInfo.setSerializationLib("org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe"); } + storageDescriptor.setSerdeInfo(serDeInfo); return storageDescriptor; } @@ -154,6 +200,18 @@ static void cleanupMetadata(FileIO io, String commitStatus, String metadataLocat } } + static void cleanupMetadataAndUnlock( + FileIO io, + BaseMetastoreOperations.CommitStatus commitStatus, + String metadataLocation, + HiveLock lock) { + try { + cleanupMetadata(io, commitStatus.name(), metadataLocation); + } finally { + lock.unlock(); + } + } + default Table newHmsTable(String hmsTableOwner) { Preconditions.checkNotNull(hmsTableOwner, "'hmsOwner' parameter can't be null"); final long currentTimeMillis = System.currentTimeMillis(); diff --git a/hive-metastore/src/main/java/org/apache/iceberg/hive/HiveTableOperations.java b/hive-metastore/src/main/java/org/apache/iceberg/hive/HiveTableOperations.java index a3750b9f3101..bae074d55d24 100644 --- a/hive-metastore/src/main/java/org/apache/iceberg/hive/HiveTableOperations.java +++ b/hive-metastore/src/main/java/org/apache/iceberg/hive/HiveTableOperations.java @@ -36,6 +36,7 @@ import org.apache.hadoop.hive.metastore.api.NoSuchObjectException; import org.apache.hadoop.hive.metastore.api.Table; import org.apache.hadoop.hive.metastore.api.hive_metastoreConstants; +import org.apache.iceberg.BaseMetastoreOperations; import org.apache.iceberg.BaseMetastoreTableOperations; import org.apache.iceberg.ClientPool; import org.apache.iceberg.PartitionSpecParser; @@ -174,7 +175,8 @@ protected void doCommit(TableMetadata base, TableMetadata metadata) { boolean hiveEngineEnabled = hiveEngineEnabled(metadata, conf); boolean keepHiveStats = conf.getBoolean(ConfigProperties.KEEP_HIVE_STATS, false); - CommitStatus commitStatus = CommitStatus.FAILURE; + BaseMetastoreOperations.CommitStatus commitStatus = + 
BaseMetastoreOperations.CommitStatus.FAILURE; boolean updateHiveTable = false; HiveLock lock = lockObject(metadata); @@ -203,7 +205,9 @@ protected void doCommit(TableMetadata base, TableMetadata metadata) { tbl.setSd( HiveOperationsBase.storageDescriptor( - metadata, hiveEngineEnabled)); // set to pickup any schema changes + metadata.schema(), + metadata.location(), + hiveEngineEnabled)); // set to pickup any schema changes String metadataLocation = tbl.getParameters().get(METADATA_LOCATION_PROP); String baseMetadataLocation = base != null ? base.metadataFileLocation() : null; @@ -240,9 +244,9 @@ protected void doCommit(TableMetadata base, TableMetadata metadata) { tbl, updateHiveTable, hiveLockEnabled(metadata, conf) ? null : baseMetadataLocation); lock.ensureActive(); - commitStatus = CommitStatus.SUCCESS; + commitStatus = BaseMetastoreOperations.CommitStatus.SUCCESS; } catch (LockException le) { - commitStatus = CommitStatus.UNKNOWN; + commitStatus = BaseMetastoreOperations.CommitStatus.UNKNOWN; throw new CommitStateUnknownException( "Failed to heartbeat for hive lock while " + "committing changes. This can lead to a concurrent commit attempt be able to overwrite this commit. 
" @@ -282,7 +286,9 @@ protected void doCommit(TableMetadata base, TableMetadata metadata) { database, tableName, e); - commitStatus = checkCommitStatus(newMetadataLocation, metadata); + commitStatus = + BaseMetastoreOperations.CommitStatus.valueOf( + checkCommitStatus(newMetadataLocation, metadata).name()); switch (commitStatus) { case SUCCESS: break; @@ -304,23 +310,13 @@ protected void doCommit(TableMetadata base, TableMetadata metadata) { throw new CommitFailedException(e); } finally { - cleanupMetadataAndUnlock(commitStatus, newMetadataLocation, lock); + HiveOperationsBase.cleanupMetadataAndUnlock(io(), commitStatus, newMetadataLocation, lock); } LOG.info( "Committed to table {} with the new metadata location {}", fullName, newMetadataLocation); } - @VisibleForTesting - Table loadHmsTable() throws TException, InterruptedException { - try { - return metaClients.run(client -> client.getTable(database, tableName)); - } catch (NoSuchObjectException nte) { - LOG.trace("Table not found {}", fullName, nte); - return null; - } - } - private void setHmsTableParameters( String newMetadataLocation, Table tbl, @@ -376,7 +372,7 @@ private void setHmsTableParameters( } setSnapshotStats(metadata, parameters); - setSchema(metadata, parameters); + setSchema(metadata.schema(), parameters); setPartitionSpec(metadata, parameters); setSortOrder(metadata, parameters); @@ -467,15 +463,6 @@ public ClientPool metaClients() { return metaClients; } - private void cleanupMetadataAndUnlock( - CommitStatus commitStatus, String metadataLocation, HiveLock lock) { - try { - HiveOperationsBase.cleanupMetadata(io(), commitStatus.name(), metadataLocation); - } finally { - lock.unlock(); - } - } - /** * Returns if the hive engine related values should be enabled on the table, or not. 
* From 2eabd52a809fa8c56105b42d552170058fad0489 Mon Sep 17 00:00:00 2001 From: Naveen Kumar Date: Tue, 26 Mar 2024 20:10:56 +0530 Subject: [PATCH 09/25] Hive: Add test to make sure iceberg table with same name as hive table can't be created (#9980) --- .../apache/iceberg/hive/HiveTableTest.java | 47 +++++++++++++++++-- 1 file changed, 43 insertions(+), 4 deletions(-) diff --git a/hive-metastore/src/test/java/org/apache/iceberg/hive/HiveTableTest.java b/hive-metastore/src/test/java/org/apache/iceberg/hive/HiveTableTest.java index 0fa6c94bf154..6d8e9b4391c3 100644 --- a/hive-metastore/src/test/java/org/apache/iceberg/hive/HiveTableTest.java +++ b/hive-metastore/src/test/java/org/apache/iceberg/hive/HiveTableTest.java @@ -68,6 +68,7 @@ import org.apache.iceberg.catalog.TableIdentifier; import org.apache.iceberg.exceptions.AlreadyExistsException; import org.apache.iceberg.exceptions.CommitFailedException; +import org.apache.iceberg.exceptions.NoSuchIcebergTableException; import org.apache.iceberg.exceptions.NoSuchTableException; import org.apache.iceberg.exceptions.NotFoundException; import org.apache.iceberg.hadoop.ConfigProperties; @@ -79,6 +80,8 @@ import org.apache.thrift.TException; import org.junit.jupiter.api.Test; import org.junit.jupiter.api.io.TempDir; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.EnumSource; public class HiveTableTest extends HiveTableBaseTest { static final String NON_DEFAULT_DATABASE = "nondefault"; @@ -334,7 +337,8 @@ public void testListTables() throws TException, IOException { // create a hive table String hiveTableName = "test_hive_table"; - org.apache.hadoop.hive.metastore.api.Table hiveTable = createHiveTable(hiveTableName); + org.apache.hadoop.hive.metastore.api.Table hiveTable = + createHiveTable(hiveTableName, TableType.EXTERNAL_TABLE); HIVE_METASTORE_EXTENSION.metastoreClient().createTable(hiveTable); catalog.setListAllTables(false); @@ -349,8 +353,43 @@ public void testListTables() 
throws TException, IOException { HIVE_METASTORE_EXTENSION.metastoreClient().dropTable(DB_NAME, hiveTableName); } - private org.apache.hadoop.hive.metastore.api.Table createHiveTable(String hiveTableName) - throws IOException { + @ParameterizedTest + @EnumSource( + value = TableType.class, + names = {"EXTERNAL_TABLE", "VIRTUAL_VIEW", "MANAGED_TABLE"}) + public void testHiveTableAndIcebergTableWithSameName(TableType tableType) + throws TException, IOException { + + assertThat(catalog.listTables(TABLE_IDENTIFIER.namespace())) + .hasSize(1) + .containsExactly(TABLE_IDENTIFIER); + + // create a hive table with a defined table type. + String hiveTableName = "test_hive_table"; + TableIdentifier identifier = TableIdentifier.of(DB_NAME, hiveTableName); + HIVE_METASTORE_EXTENSION + .metastoreClient() + .createTable(createHiveTable(hiveTableName, tableType)); + + catalog.setListAllTables(true); + assertThat(catalog.listTables(TABLE_IDENTIFIER.namespace())) + .hasSize(2) + .containsExactly(TABLE_IDENTIFIER, identifier); + catalog.setListAllTables(false); // reset to default. 
+ + // create an iceberg table with the same name + assertThatThrownBy(() -> catalog.createTable(identifier, schema, PartitionSpec.unpartitioned())) + .isInstanceOf(NoSuchIcebergTableException.class) + .hasMessageStartingWith(String.format("Not an iceberg table: hive.%s", identifier)); + + assertThat(catalog.tableExists(identifier)).isFalse(); + + assertThat(catalog.tableExists(TABLE_IDENTIFIER)).isTrue(); + HIVE_METASTORE_EXTENSION.metastoreClient().dropTable(DB_NAME, hiveTableName); + } + + private org.apache.hadoop.hive.metastore.api.Table createHiveTable( + String hiveTableName, TableType type) throws IOException { Map parameters = Maps.newHashMap(); parameters.put( serdeConstants.SERIALIZATION_CLASS, "org.apache.hadoop.hive.serde2.thrift.test.IntString"); @@ -387,7 +426,7 @@ private org.apache.hadoop.hive.metastore.api.Table createHiveTable(String hiveTa Maps.newHashMap(), "viewOriginalText", "viewExpandedText", - TableType.EXTERNAL_TABLE.name()); + type.name()); return hiveTable; } From b6cbb528ee38fa990ac9032dc44932060094e98d Mon Sep 17 00:00:00 2001 From: Manu Zhang Date: Tue, 26 Mar 2024 23:58:48 +0800 Subject: [PATCH 10/25] Build: Bump Spark from 3.5 to 3.5.1 (#9832) --- gradle/libs.versions.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gradle/libs.versions.toml b/gradle/libs.versions.toml index ce571331529f..79ef9c78a396 100644 --- a/gradle/libs.versions.toml +++ b/gradle/libs.versions.toml @@ -79,7 +79,7 @@ slf4j = "1.7.36" snowflake-jdbc = "3.14.5" spark-hive33 = "3.3.4" spark-hive34 = "3.4.2" -spark-hive35 = "3.5.0" +spark-hive35 = "3.5.1" spring-boot = "2.7.18" spring-web = "5.3.33" sqlite-jdbc = "3.45.1.0" From 4579b7a1e941780496547f313e6fa07e712b09c5 Mon Sep 17 00:00:00 2001 From: Eduard Tudenhoefner Date: Wed, 27 Mar 2024 07:58:18 +0100 Subject: [PATCH 11/25] Spark: Fail on recursive cycle in view (#9834) --- .../sql/catalyst/analysis/CheckViews.scala | 52 ++++++++++- .../iceberg/spark/extensions/TestViews.java | 87 
+++++++++++++++++++ 2 files changed, 138 insertions(+), 1 deletion(-) diff --git a/spark/v3.5/spark-extensions/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckViews.scala b/spark/v3.5/spark-extensions/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckViews.scala index 9a2dee997a10..b8cd1020298d 100644 --- a/spark/v3.5/spark-extensions/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckViews.scala +++ b/spark/v3.5/spark-extensions/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckViews.scala @@ -20,8 +20,12 @@ package org.apache.spark.sql.catalyst.analysis import org.apache.spark.sql.AnalysisException +import org.apache.spark.sql.catalyst.expressions.SubqueryExpression import org.apache.spark.sql.catalyst.plans.logical.AlterViewAs import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan +import org.apache.spark.sql.catalyst.plans.logical.Project +import org.apache.spark.sql.catalyst.plans.logical.SubqueryAlias +import org.apache.spark.sql.catalyst.plans.logical.View import org.apache.spark.sql.catalyst.plans.logical.views.CreateIcebergView import org.apache.spark.sql.catalyst.plans.logical.views.ResolvedV2View import org.apache.spark.sql.connector.catalog.ViewCatalog @@ -30,12 +34,18 @@ import org.apache.spark.sql.util.SchemaUtils object CheckViews extends (LogicalPlan => Unit) { + import org.apache.spark.sql.connector.catalog.CatalogV2Implicits._ + override def apply(plan: LogicalPlan): Unit = { plan foreach { case CreateIcebergView(resolvedIdent@ResolvedIdentifier(_: ViewCatalog, _), _, query, columnAliases, _, - _, _, _, _, _, _) => + _, _, _, _, replace, _) => verifyColumnCount(resolvedIdent, columnAliases, query) SchemaUtils.checkColumnNameDuplication(query.schema.fieldNames, SQLConf.get.resolver) + if (replace) { + val viewIdent: Seq[String] = resolvedIdent.catalog.name() +: resolvedIdent.identifier.asMultipartIdentifier + checkCyclicViewReference(viewIdent, query, Seq(viewIdent)) + } case 
AlterViewAs(ResolvedV2View(_, _), _, _) => throw new AnalysisException("ALTER VIEW AS is not supported. Use CREATE OR REPLACE VIEW instead") @@ -63,4 +73,44 @@ object CheckViews extends (LogicalPlan => Unit) { } } } + + private def checkCyclicViewReference( + viewIdent: Seq[String], + plan: LogicalPlan, + cyclePath: Seq[Seq[String]]): Unit = { + plan match { + case sub@SubqueryAlias(_, Project(_, _)) => + val currentViewIdent: Seq[String] = sub.identifier.qualifier :+ sub.identifier.name + checkIfRecursiveView(viewIdent, currentViewIdent, cyclePath, sub.children) + case v1View: View => + val currentViewIdent: Seq[String] = v1View.desc.identifier.nameParts + checkIfRecursiveView(viewIdent, currentViewIdent, cyclePath, v1View.children) + case _ => + plan.children.foreach(child => checkCyclicViewReference(viewIdent, child, cyclePath)) + } + + plan.expressions.flatMap(_.flatMap { + case e: SubqueryExpression => + checkCyclicViewReference(viewIdent, e.plan, cyclePath) + None + case _ => None + }) + } + + private def checkIfRecursiveView( + viewIdent: Seq[String], + currentViewIdent: Seq[String], + cyclePath: Seq[Seq[String]], + children: Seq[LogicalPlan] + ): Unit = { + val newCyclePath = cyclePath :+ currentViewIdent + if (currentViewIdent == viewIdent) { + throw new AnalysisException(String.format("Recursive cycle in view detected: %s (cycle: %s)", + viewIdent.asIdentifier, newCyclePath.map(p => p.mkString(".")).mkString(" -> "))) + } else { + children.foreach { c => + checkCyclicViewReference(viewIdent, c, newCyclePath) + } + } + } } diff --git a/spark/v3.5/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestViews.java b/spark/v3.5/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestViews.java index bd611b936a59..3cc1e32d00a1 100644 --- a/spark/v3.5/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestViews.java +++ b/spark/v3.5/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestViews.java @@ 
-1866,6 +1866,93 @@ public void replacingViewWithDialectDropAllowed() { .isEqualTo(ImmutableSQLViewRepresentation.builder().dialect("spark").sql(sql).build()); } + @TestTemplate + public void createViewWithRecursiveCycle() { + String viewOne = viewName("viewOne"); + String viewTwo = viewName("viewTwo"); + + sql("CREATE VIEW %s AS SELECT * FROM %s", viewOne, tableName); + // viewTwo points to viewOne + sql("CREATE VIEW %s AS SELECT * FROM %s", viewTwo, viewOne); + + // viewOne points to viewTwo points to viewOne, creating a recursive cycle + String view1 = String.format("%s.%s.%s", catalogName, NAMESPACE, viewOne); + String view2 = String.format("%s.%s.%s", catalogName, NAMESPACE, viewTwo); + String cycle = String.format("%s -> %s -> %s", view1, view2, view1); + assertThatThrownBy(() -> sql("CREATE OR REPLACE VIEW %s AS SELECT * FROM %s", viewOne, view2)) + .isInstanceOf(AnalysisException.class) + .hasMessageStartingWith( + String.format("Recursive cycle in view detected: %s (cycle: %s)", view1, cycle)); + } + + @TestTemplate + public void createViewWithRecursiveCycleToV1View() { + String viewOne = viewName("view_one"); + String viewTwo = viewName("view_two"); + + sql("CREATE VIEW %s AS SELECT * FROM %s", viewOne, tableName); + // viewTwo points to viewOne + sql("USE spark_catalog"); + sql("CREATE VIEW %s AS SELECT * FROM %s.%s.%s", viewTwo, catalogName, NAMESPACE, viewOne); + + sql("USE %s", catalogName); + // viewOne points to viewTwo points to viewOne, creating a recursive cycle + String view1 = String.format("%s.%s.%s", catalogName, NAMESPACE, viewOne); + String view2 = String.format("%s.%s.%s", "spark_catalog", NAMESPACE, viewTwo); + String cycle = String.format("%s -> %s -> %s", view1, view2, view1); + assertThatThrownBy(() -> sql("CREATE OR REPLACE VIEW %s AS SELECT * FROM %s", viewOne, view2)) + .isInstanceOf(AnalysisException.class) + .hasMessageStartingWith( + String.format("Recursive cycle in view detected: %s (cycle: %s)", view1, cycle)); + } + + 
@TestTemplate + public void createViewWithRecursiveCycleInCTE() { + String viewOne = viewName("viewOne"); + String viewTwo = viewName("viewTwo"); + + sql("CREATE VIEW %s AS SELECT * FROM %s", viewOne, tableName); + // viewTwo points to viewOne + sql("CREATE VIEW %s AS SELECT * FROM %s", viewTwo, viewOne); + + // CTE points to viewTwo + String sql = + String.format( + "WITH max_by_data AS (SELECT max(id) as max FROM %s) " + + "SELECT max, count(1) AS count FROM max_by_data GROUP BY max", + viewTwo); + + // viewOne points to CTE, creating a recursive cycle + String view1 = String.format("%s.%s.%s", catalogName, NAMESPACE, viewOne); + String cycle = String.format("%s -> %s -> %s", view1, viewTwo, view1); + assertThatThrownBy(() -> sql("CREATE OR REPLACE VIEW %s AS %s", viewOne, sql)) + .isInstanceOf(AnalysisException.class) + .hasMessageStartingWith( + String.format("Recursive cycle in view detected: %s (cycle: %s)", view1, cycle)); + } + + @TestTemplate + public void createViewWithRecursiveCycleInSubqueryExpression() { + String viewOne = viewName("viewOne"); + String viewTwo = viewName("viewTwo"); + + sql("CREATE VIEW %s AS SELECT * FROM %s", viewOne, tableName); + // viewTwo points to viewOne + sql("CREATE VIEW %s AS SELECT * FROM %s", viewTwo, viewOne); + + // subquery expression points to viewTwo + String sql = + String.format("SELECT * FROM %s WHERE id = (SELECT id FROM %s)", tableName, viewTwo); + + // viewOne points to subquery expression, creating a recursive cycle + String view1 = String.format("%s.%s.%s", catalogName, NAMESPACE, viewOne); + String cycle = String.format("%s -> %s -> %s", view1, viewTwo, view1); + assertThatThrownBy(() -> sql("CREATE OR REPLACE VIEW %s AS %s", viewOne, sql)) + .isInstanceOf(AnalysisException.class) + .hasMessageStartingWith( + String.format("Recursive cycle in view detected: %s (cycle: %s)", view1, cycle)); + } + private void insertRows(int numRows) throws NoSuchTableException { List records = 
Lists.newArrayListWithCapacity(numRows); for (int i = 1; i <= numRows; i++) { From fa80c85007ebf77f59f3bc4f015c1b4bed1b8f76 Mon Sep 17 00:00:00 2001 From: Manu Zhang Date: Wed, 27 Mar 2024 15:27:51 +0800 Subject: [PATCH 12/25] Build: disable link-check for existing medium blog posts (#10042) --- site/docs/blogs.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/site/docs/blogs.md b/site/docs/blogs.md index cf4e3254981b..4e94c9e71b08 100644 --- a/site/docs/blogs.md +++ b/site/docs/blogs.md @@ -59,16 +59,19 @@ Here is a list of company blogs that talk about Iceberg. The blogs are ordered f **Author**: [Alex Merced](https://www.linkedin.com/in/alexmerced/) + ### [How not to use Apache Iceberg](https://medium.com/@ajanthabhat/how-not-to-use-apache-iceberg-046ae7e7c884) **Date**: January 23rd, 2024, **Company**: Dremio **Authors**: [Ajantha Bhat](https://www.linkedin.com/in/ajanthabhat/) + ### [Apache Hive-4.x with Iceberg Branches & Tags](https://medium.com/@ayushtkn/apache-hive-4-x-with-iceberg-branches-tags-3d52293ac0bf/) **Date**: October 12th, 2023, **Company**: Cloudera **Authors**: [Ayush Saxena](https://www.linkedin.com/in/ayush151/) + ### [Apache Hive 4.x With Apache Iceberg](https://medium.com/@ayushtkn/apache-hive-4-x-with-apache-iceberg-part-i-355e7a380725/) **Date**: October 12th, 2023, **Company**: Cloudera @@ -99,6 +102,7 @@ Here is a list of company blogs that talk about Iceberg. The blogs are ordered f **Authors**: [Riza Suminto](https://www.linkedin.com/in/rizasuminto/) + ### [How Bilibili Builds OLAP Data Lakehouse with Apache Iceberg](https://medium.com/@lirui.fudan/how-bilibili-builds-olap-data-lakehouse-with-apache-iceberg-9f3408e53f9) **Date**: June 14th, 2023, **Company**: Bilibili @@ -165,21 +169,25 @@ Here is a list of company blogs that talk about Iceberg. 
The blogs are ordered f **Author**: [Benny Chow](https://www.linkedin.com/in/bechow/) + ### [Understanding Iceberg Table Metadata](https://medium.com/snowflake/understanding-iceberg-table-metadata-b1209fbcc7c3) **Date**: January 30st, 2023, **Company**: Snowflake **Author**: [Phani Raj](https://www.linkedin.com/in/phani-raj-9830a31b/) + ### [Creating and managing Apache Iceberg tables using serverless features and without coding](https://medium.com/snowflake/creating-and-managing-apache-iceberg-tables-using-serverless-features-and-without-coding-14d2198cf5b5) **Date**: January 27th, 2023, **Company**: Snowflake **Author**: [Parag Jain](https://www.linkedin.com/in/paragjainsa/) + ### [Getting started with Apache Iceberg](https://medium.com/snowflake/getting-started-with-apache-iceberg-80f338921a31) **Date**: January 27th, 2023, **Company**: Snowflake **Author**: [Jedidiah Rajbhushan](https://www.linkedin.com/in/jrajbhushan/) + ### [How Apache Iceberg enables ACID compliance for data lakes](https://medium.com/snowflake/how-apache-iceberg-enables-acid-compliance-for-data-lakes-9069ae783b60/) **Date**: January 13th, 2023, **Company**: Snowflake From 9987314e540cf8b0d4f4d61b561e0ebf273148e1 Mon Sep 17 00:00:00 2001 From: Eduard Tudenhoefner Date: Wed, 27 Mar 2024 16:59:55 +0100 Subject: [PATCH 13/25] Spark 3.4: Fail on recursive cycle in view (#10048) --- .../sql/catalyst/analysis/CheckViews.scala | 52 ++++++++++- .../iceberg/spark/extensions/TestViews.java | 87 +++++++++++++++++++ 2 files changed, 138 insertions(+), 1 deletion(-) diff --git a/spark/v3.4/spark-extensions/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckViews.scala b/spark/v3.4/spark-extensions/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckViews.scala index 4debc4d343a0..685b85a0d75f 100644 --- a/spark/v3.4/spark-extensions/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckViews.scala +++ 
b/spark/v3.4/spark-extensions/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckViews.scala @@ -20,8 +20,12 @@ package org.apache.spark.sql.catalyst.analysis import org.apache.spark.sql.AnalysisException +import org.apache.spark.sql.catalyst.expressions.SubqueryExpression import org.apache.spark.sql.catalyst.plans.logical.AlterViewAs import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan +import org.apache.spark.sql.catalyst.plans.logical.Project +import org.apache.spark.sql.catalyst.plans.logical.SubqueryAlias +import org.apache.spark.sql.catalyst.plans.logical.View import org.apache.spark.sql.catalyst.plans.logical.views.CreateIcebergView import org.apache.spark.sql.catalyst.plans.logical.views.ResolvedV2View import org.apache.spark.sql.connector.catalog.ViewCatalog @@ -30,12 +34,18 @@ import org.apache.spark.sql.util.SchemaUtils object CheckViews extends (LogicalPlan => Unit) { + import org.apache.spark.sql.connector.catalog.CatalogV2Implicits._ + override def apply(plan: LogicalPlan): Unit = { plan foreach { case CreateIcebergView(resolvedIdent@ResolvedIdentifier(_: ViewCatalog, _), _, query, columnAliases, _, - _, _, _, _, _, _) => + _, _, _, _, replace, _) => verifyColumnCount(resolvedIdent, columnAliases, query) SchemaUtils.checkColumnNameDuplication(query.schema.fieldNames, SQLConf.get.resolver) + if (replace) { + val viewIdent: Seq[String] = resolvedIdent.catalog.name() +: resolvedIdent.identifier.asMultipartIdentifier + checkCyclicViewReference(viewIdent, query, Seq(viewIdent)) + } case AlterViewAs(ResolvedV2View(_, _), _, _) => throw new AnalysisException("ALTER VIEW AS is not supported. 
Use CREATE OR REPLACE VIEW instead") @@ -59,4 +69,44 @@ object CheckViews extends (LogicalPlan => Unit) { } } } + + private def checkCyclicViewReference( + viewIdent: Seq[String], + plan: LogicalPlan, + cyclePath: Seq[Seq[String]]): Unit = { + plan match { + case sub@SubqueryAlias(_, Project(_, _)) => + val currentViewIdent: Seq[String] = sub.identifier.qualifier :+ sub.identifier.name + checkIfRecursiveView(viewIdent, currentViewIdent, cyclePath, sub.children) + case v1View: View => + val currentViewIdent: Seq[String] = v1View.desc.identifier.nameParts + checkIfRecursiveView(viewIdent, currentViewIdent, cyclePath, v1View.children) + case _ => + plan.children.foreach(child => checkCyclicViewReference(viewIdent, child, cyclePath)) + } + + plan.expressions.flatMap(_.flatMap { + case e: SubqueryExpression => + checkCyclicViewReference(viewIdent, e.plan, cyclePath) + None + case _ => None + }) + } + + private def checkIfRecursiveView( + viewIdent: Seq[String], + currentViewIdent: Seq[String], + cyclePath: Seq[Seq[String]], + children: Seq[LogicalPlan] + ): Unit = { + val newCyclePath = cyclePath :+ currentViewIdent + if (currentViewIdent == viewIdent) { + throw new AnalysisException(String.format("Recursive cycle in view detected: %s (cycle: %s)", + viewIdent.asIdentifier, newCyclePath.map(p => p.mkString(".")).mkString(" -> "))) + } else { + children.foreach { c => + checkCyclicViewReference(viewIdent, c, newCyclePath) + } + } + } } diff --git a/spark/v3.4/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestViews.java b/spark/v3.4/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestViews.java index 5d1cb2db612b..624b4e354937 100644 --- a/spark/v3.4/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestViews.java +++ b/spark/v3.4/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestViews.java @@ -1867,6 +1867,93 @@ public void replacingViewWithDialectDropAllowed() { 
.isEqualTo(ImmutableSQLViewRepresentation.builder().dialect("spark").sql(sql).build()); } + @Test + public void createViewWithRecursiveCycle() { + String viewOne = viewName("viewOne"); + String viewTwo = viewName("viewTwo"); + + sql("CREATE VIEW %s AS SELECT * FROM %s", viewOne, tableName); + // viewTwo points to viewOne + sql("CREATE VIEW %s AS SELECT * FROM %s", viewTwo, viewOne); + + // viewOne points to viewTwo points to viewOne, creating a recursive cycle + String view1 = String.format("%s.%s.%s", catalogName, NAMESPACE, viewOne); + String view2 = String.format("%s.%s.%s", catalogName, NAMESPACE, viewTwo); + String cycle = String.format("%s -> %s -> %s", view1, view2, view1); + assertThatThrownBy(() -> sql("CREATE OR REPLACE VIEW %s AS SELECT * FROM %s", viewOne, view2)) + .isInstanceOf(AnalysisException.class) + .hasMessageStartingWith( + String.format("Recursive cycle in view detected: %s (cycle: %s)", view1, cycle)); + } + + @Test + public void createViewWithRecursiveCycleToV1View() { + String viewOne = viewName("view_one"); + String viewTwo = viewName("view_two"); + + sql("CREATE VIEW %s AS SELECT * FROM %s", viewOne, tableName); + // viewTwo points to viewOne + sql("USE spark_catalog"); + sql("CREATE VIEW %s AS SELECT * FROM %s.%s.%s", viewTwo, catalogName, NAMESPACE, viewOne); + + sql("USE %s", catalogName); + // viewOne points to viewTwo points to viewOne, creating a recursive cycle + String view1 = String.format("%s.%s.%s", catalogName, NAMESPACE, viewOne); + String view2 = String.format("%s.%s.%s", "spark_catalog", NAMESPACE, viewTwo); + String cycle = String.format("%s -> %s -> %s", view1, view2, view1); + assertThatThrownBy(() -> sql("CREATE OR REPLACE VIEW %s AS SELECT * FROM %s", viewOne, view2)) + .isInstanceOf(AnalysisException.class) + .hasMessageStartingWith( + String.format("Recursive cycle in view detected: %s (cycle: %s)", view1, cycle)); + } + + @Test + public void createViewWithRecursiveCycleInCTE() { + String viewOne = 
viewName("viewOne"); + String viewTwo = viewName("viewTwo"); + + sql("CREATE VIEW %s AS SELECT * FROM %s", viewOne, tableName); + // viewTwo points to viewOne + sql("CREATE VIEW %s AS SELECT * FROM %s", viewTwo, viewOne); + + // CTE points to viewTwo + String sql = + String.format( + "WITH max_by_data AS (SELECT max(id) as max FROM %s) " + + "SELECT max, count(1) AS count FROM max_by_data GROUP BY max", + viewTwo); + + // viewOne points to CTE, creating a recursive cycle + String view1 = String.format("%s.%s.%s", catalogName, NAMESPACE, viewOne); + String cycle = String.format("%s -> %s -> %s", view1, viewTwo, view1); + assertThatThrownBy(() -> sql("CREATE OR REPLACE VIEW %s AS %s", viewOne, sql)) + .isInstanceOf(AnalysisException.class) + .hasMessageStartingWith( + String.format("Recursive cycle in view detected: %s (cycle: %s)", view1, cycle)); + } + + @Test + public void createViewWithRecursiveCycleInSubqueryExpression() { + String viewOne = viewName("viewOne"); + String viewTwo = viewName("viewTwo"); + + sql("CREATE VIEW %s AS SELECT * FROM %s", viewOne, tableName); + // viewTwo points to viewOne + sql("CREATE VIEW %s AS SELECT * FROM %s", viewTwo, viewOne); + + // subquery expression points to viewTwo + String sql = + String.format("SELECT * FROM %s WHERE id = (SELECT id FROM %s)", tableName, viewTwo); + + // viewOne points to subquery expression, creating a recursive cycle + String view1 = String.format("%s.%s.%s", catalogName, NAMESPACE, viewOne); + String cycle = String.format("%s -> %s -> %s", view1, viewTwo, view1); + assertThatThrownBy(() -> sql("CREATE OR REPLACE VIEW %s AS %s", viewOne, sql)) + .isInstanceOf(AnalysisException.class) + .hasMessageStartingWith( + String.format("Recursive cycle in view detected: %s (cycle: %s)", view1, cycle)); + } + private void insertRows(int numRows) throws NoSuchTableException { List records = Lists.newArrayListWithCapacity(numRows); for (int i = 1; i <= numRows; i++) { From 371a6b7ff7f3a776e1616e90a95ec99c64df1ac2 
Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 27 Mar 2024 17:18:49 +0100 Subject: [PATCH 14/25] Build: Bump org.xerial:sqlite-jdbc from 3.45.1.0 to 3.45.2.0 (#9974) Bumps [org.xerial:sqlite-jdbc](https://github.com/xerial/sqlite-jdbc) from 3.45.1.0 to 3.45.2.0. - [Release notes](https://github.com/xerial/sqlite-jdbc/releases) - [Changelog](https://github.com/xerial/sqlite-jdbc/blob/master/CHANGELOG) - [Commits](https://github.com/xerial/sqlite-jdbc/compare/3.45.1.0...3.45.2.0) --- updated-dependencies: - dependency-name: org.xerial:sqlite-jdbc dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- gradle/libs.versions.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gradle/libs.versions.toml b/gradle/libs.versions.toml index 79ef9c78a396..5749c6f82d54 100644 --- a/gradle/libs.versions.toml +++ b/gradle/libs.versions.toml @@ -82,7 +82,7 @@ spark-hive34 = "3.4.2" spark-hive35 = "3.5.1" spring-boot = "2.7.18" spring-web = "5.3.33" -sqlite-jdbc = "3.45.1.0" +sqlite-jdbc = "3.45.2.0" testcontainers = "1.19.5" tez010 = "0.10.3" tez08 = { strictly = "0.8.4"} # see rich version usage explanation above From baaedc6e041bd0f0dfc7bec2d4f2c67c1db1ef77 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 27 Mar 2024 17:19:20 +0100 Subject: [PATCH 15/25] Build: Bump io.netty:netty-buffer from 4.1.107.Final to 4.1.108.Final (#10032) Bumps [io.netty:netty-buffer](https://github.com/netty/netty) from 4.1.107.Final to 4.1.108.Final. - [Commits](https://github.com/netty/netty/compare/netty-4.1.107.Final...netty-4.1.108.Final) --- updated-dependencies: - dependency-name: io.netty:netty-buffer dependency-type: direct:production update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- gradle/libs.versions.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gradle/libs.versions.toml b/gradle/libs.versions.toml index 5749c6f82d54..05b1369cd765 100644 --- a/gradle/libs.versions.toml +++ b/gradle/libs.versions.toml @@ -66,8 +66,8 @@ microprofile-openapi-api = "3.1.1" mockito = "4.11.0" mockserver = "5.15.0" nessie = "0.79.0" -netty-buffer = "4.1.107.Final" -netty-buffer-compat = "4.1.107.Final" +netty-buffer = "4.1.108.Final" +netty-buffer-compat = "4.1.108.Final" object-client-bundle = "3.3.2" orc = "1.9.2" parquet = "1.13.1" From 003cd9477ac2d6226a776110eaf80af8ef422e38 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 27 Mar 2024 17:20:00 +0100 Subject: [PATCH 16/25] Build: Bump arrow from 15.0.1 to 15.0.2 (#10034) Bumps `arrow` from 15.0.1 to 15.0.2. Updates `org.apache.arrow:arrow-memory-netty` from 15.0.1 to 15.0.2 Updates `org.apache.arrow:arrow-vector` from 15.0.1 to 15.0.2 - [Commits](https://github.com/apache/arrow/compare/go/v15.0.1...go/v15.0.2) --- updated-dependencies: - dependency-name: org.apache.arrow:arrow-memory-netty dependency-type: direct:production update-type: version-update:semver-patch - dependency-name: org.apache.arrow:arrow-vector dependency-type: direct:production update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- gradle/libs.versions.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gradle/libs.versions.toml b/gradle/libs.versions.toml index 05b1369cd765..df757c9690ff 100644 --- a/gradle/libs.versions.toml +++ b/gradle/libs.versions.toml @@ -24,7 +24,7 @@ activation = "1.1.1" aliyun-sdk-oss = "3.10.2" antlr = "4.9.3" aircompressor = "0.26" -arrow = "15.0.1" +arrow = "15.0.2" avro = "1.11.3" assertj-core = "3.25.3" awaitility = "4.2.1" From 15e2a16443038f87ecd808c68a764f14030890c8 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 27 Mar 2024 09:33:10 -0700 Subject: [PATCH 17/25] Build: Bump kafka from 3.6.1 to 3.7.0 (#9855) Bumps `kafka` from 3.6.1 to 3.7.0. Updates `org.apache.kafka:kafka-clients` from 3.6.1 to 3.7.0 Updates `org.apache.kafka:connect-api` from 3.6.1 to 3.7.0 Updates `org.apache.kafka:connect-json` from 3.6.1 to 3.7.0 --- updated-dependencies: - dependency-name: org.apache.kafka:kafka-clients dependency-type: direct:production update-type: version-update:semver-minor - dependency-name: org.apache.kafka:connect-api dependency-type: direct:production update-type: version-update:semver-minor - dependency-name: org.apache.kafka:connect-json dependency-type: direct:production update-type: version-update:semver-minor ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- gradle/libs.versions.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gradle/libs.versions.toml b/gradle/libs.versions.toml index df757c9690ff..0e89682a6997 100644 --- a/gradle/libs.versions.toml +++ b/gradle/libs.versions.toml @@ -60,7 +60,7 @@ jaxb-api = "2.3.1" jaxb-runtime = "2.3.3" jetty = "9.4.54.v20240208" junit = "5.10.1" -kafka = "3.6.1" +kafka = "3.7.0" kryo-shaded = "4.0.3" microprofile-openapi-api = "3.1.1" mockito = "4.11.0" From 4de819e80b7031b91d4e40c742959a5cbab96a6e Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 27 Mar 2024 17:35:38 +0100 Subject: [PATCH 18/25] Build: Bump orc from 1.9.2 to 1.9.3 (#10033) Bumps `orc` from 1.9.2 to 1.9.3. Updates `org.apache.orc:orc-core` from 1.9.2 to 1.9.3 Updates `org.apache.orc:orc-tools` from 1.9.2 to 1.9.3 --- updated-dependencies: - dependency-name: org.apache.orc:orc-core dependency-type: direct:production update-type: version-update:semver-patch - dependency-name: org.apache.orc:orc-tools dependency-type: direct:production update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- gradle/libs.versions.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gradle/libs.versions.toml b/gradle/libs.versions.toml index 0e89682a6997..00ab2e4c548f 100644 --- a/gradle/libs.versions.toml +++ b/gradle/libs.versions.toml @@ -69,7 +69,7 @@ nessie = "0.79.0" netty-buffer = "4.1.108.Final" netty-buffer-compat = "4.1.108.Final" object-client-bundle = "3.3.2" -orc = "1.9.2" +orc = "1.9.3" parquet = "1.13.1" pig = "0.17.0" roaringbitmap = "1.0.5" From 66a0954e4a77da72cb0cf398ca4a9b17208b939b Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 27 Mar 2024 17:40:26 +0100 Subject: [PATCH 19/25] Build: Bump com.azure:azure-sdk-bom from 1.2.20 to 1.2.21 (#9857) Bumps [com.azure:azure-sdk-bom](https://github.com/azure/azure-sdk-for-java) from 1.2.20 to 1.2.21. - [Release notes](https://github.com/azure/azure-sdk-for-java/releases) - [Commits](https://github.com/azure/azure-sdk-for-java/compare/azure-sdk-bom_1.2.20...azure-sdk-bom_1.2.21) --- updated-dependencies: - dependency-name: com.azure:azure-sdk-bom dependency-type: direct:production update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- gradle/libs.versions.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gradle/libs.versions.toml b/gradle/libs.versions.toml index 00ab2e4c548f..298a8a74984d 100644 --- a/gradle/libs.versions.toml +++ b/gradle/libs.versions.toml @@ -29,7 +29,7 @@ avro = "1.11.3" assertj-core = "3.25.3" awaitility = "4.2.1" awssdk-bom = "2.24.5" -azuresdk-bom = "1.2.20" +azuresdk-bom = "1.2.21" awssdk-s3accessgrants = "2.0.0" caffeine = "2.9.3" calcite = "1.10.0" From bd4603529ebb32534c90de377919273a007d1ec4 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 27 Mar 2024 20:10:50 +0100 Subject: [PATCH 20/25] Build: Bump com.esotericsoftware:kryo from 4.0.2 to 4.0.3 (#9984) Bumps [com.esotericsoftware:kryo](https://github.com/EsotericSoftware/kryo) from 4.0.2 to 4.0.3. - [Release notes](https://github.com/EsotericSoftware/kryo/releases) - [Commits](https://github.com/EsotericSoftware/kryo/compare/kryo-parent-4.0.2...kryo-parent-4.0.3) --- updated-dependencies: - dependency-name: com.esotericsoftware:kryo dependency-type: direct:production update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- gradle/libs.versions.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gradle/libs.versions.toml b/gradle/libs.versions.toml index 298a8a74984d..cec16519417a 100644 --- a/gradle/libs.versions.toml +++ b/gradle/libs.versions.toml @@ -35,7 +35,7 @@ caffeine = "2.9.3" calcite = "1.10.0" delta-standalone = "3.1.0" delta-spark = "3.1.0" -esotericsoftware-kryo = "4.0.2" +esotericsoftware-kryo = "4.0.3" errorprone-annotations = "2.26.1" findbugs-jsr305 = "3.0.2" flink116 = { strictly = "1.16.3"} From 81b62c78e0c230516090becda7d6040ee03e6a91 Mon Sep 17 00:00:00 2001 From: Steven Zhen Wu Date: Wed, 27 Mar 2024 13:46:49 -0700 Subject: [PATCH 21/25] Flink: implement range partitioner for map data statistics (#9321) --- .../shuffle/MapRangePartitionerBenchmark.java | 199 ++++++++ .../sink/shuffle/MapRangePartitioner.java | 381 +++++++++++++++ .../sink/shuffle/TestMapRangePartitioner.java | 448 ++++++++++++++++++ jmh.gradle | 5 + 4 files changed, 1033 insertions(+) create mode 100644 flink/v1.17/flink/src/jmh/java/org/apache/iceberg/flink/sink/shuffle/MapRangePartitionerBenchmark.java create mode 100644 flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/MapRangePartitioner.java create mode 100644 flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestMapRangePartitioner.java diff --git a/flink/v1.17/flink/src/jmh/java/org/apache/iceberg/flink/sink/shuffle/MapRangePartitionerBenchmark.java b/flink/v1.17/flink/src/jmh/java/org/apache/iceberg/flink/sink/shuffle/MapRangePartitionerBenchmark.java new file mode 100644 index 000000000000..c3917165753d --- /dev/null +++ b/flink/v1.17/flink/src/jmh/java/org/apache/iceberg/flink/sink/shuffle/MapRangePartitionerBenchmark.java @@ -0,0 +1,199 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink.shuffle; + +import java.util.List; +import java.util.Map; +import java.util.NavigableMap; +import java.util.concurrent.ThreadLocalRandom; +import org.apache.flink.table.data.GenericRowData; +import org.apache.flink.table.data.RowData; +import org.apache.iceberg.Schema; +import org.apache.iceberg.SortKey; +import org.apache.iceberg.SortOrder; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.apache.iceberg.types.Types; +import org.openjdk.jmh.annotations.Benchmark; +import org.openjdk.jmh.annotations.BenchmarkMode; +import org.openjdk.jmh.annotations.Fork; +import org.openjdk.jmh.annotations.Measurement; +import org.openjdk.jmh.annotations.Mode; +import org.openjdk.jmh.annotations.Scope; +import org.openjdk.jmh.annotations.Setup; +import org.openjdk.jmh.annotations.State; +import org.openjdk.jmh.annotations.TearDown; +import org.openjdk.jmh.annotations.Threads; +import org.openjdk.jmh.annotations.Warmup; +import org.openjdk.jmh.infra.Blackhole; + +@Fork(1) +@State(Scope.Benchmark) +@Warmup(iterations = 3) +@Measurement(iterations = 5) 
+@BenchmarkMode(Mode.SingleShotTime) +public class MapRangePartitionerBenchmark { + private static final String CHARS = + "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_-.!?"; + private static final int SAMPLE_SIZE = 100_000; + private static final Schema SCHEMA = + new Schema( + Types.NestedField.required(1, "id", Types.IntegerType.get()), + Types.NestedField.required(2, "name2", Types.StringType.get()), + Types.NestedField.required(3, "name3", Types.StringType.get()), + Types.NestedField.required(4, "name4", Types.StringType.get()), + Types.NestedField.required(5, "name5", Types.StringType.get()), + Types.NestedField.required(6, "name6", Types.StringType.get()), + Types.NestedField.required(7, "name7", Types.StringType.get()), + Types.NestedField.required(8, "name8", Types.StringType.get()), + Types.NestedField.required(9, "name9", Types.StringType.get())); + + private static final SortOrder SORT_ORDER = SortOrder.builderFor(SCHEMA).asc("id").build(); + private static final SortKey SORT_KEY = new SortKey(SCHEMA, SORT_ORDER); + + private MapRangePartitioner partitioner; + private RowData[] rows; + + @Setup + public void setupBenchmark() { + NavigableMap weights = longTailDistribution(100_000, 24, 240, 100, 2.0); + Map mapStatistics = Maps.newHashMapWithExpectedSize(weights.size()); + weights.forEach( + (id, weight) -> { + SortKey sortKey = SORT_KEY.copy(); + sortKey.set(0, id); + mapStatistics.put(sortKey, weight); + }); + + MapDataStatistics dataStatistics = new MapDataStatistics(mapStatistics); + this.partitioner = + new MapRangePartitioner( + SCHEMA, SortOrder.builderFor(SCHEMA).asc("id").build(), dataStatistics, 2); + + List keys = Lists.newArrayList(weights.keySet().iterator()); + long[] weightsCDF = new long[keys.size()]; + long totalWeight = 0; + for (int i = 0; i < keys.size(); ++i) { + totalWeight += weights.get(keys.get(i)); + weightsCDF[i] = totalWeight; + } + + // pre-calculate the samples for benchmark run + this.rows = new 
GenericRowData[SAMPLE_SIZE]; + for (int i = 0; i < SAMPLE_SIZE; ++i) { + long weight = ThreadLocalRandom.current().nextLong(totalWeight); + int index = binarySearchIndex(weightsCDF, weight); + rows[i] = + GenericRowData.of( + keys.get(index), + randomString("name2-"), + randomString("name3-"), + randomString("name4-"), + randomString("name5-"), + randomString("name6-"), + randomString("name7-"), + randomString("name8-"), + randomString("name9-")); + } + } + + @TearDown + public void tearDownBenchmark() {} + + @Benchmark + @Threads(1) + public void testPartitionerLongTailDistribution(Blackhole blackhole) { + for (int i = 0; i < SAMPLE_SIZE; ++i) { + blackhole.consume(partitioner.partition(rows[i], 128)); + } + } + + private static String randomString(String prefix) { + int length = ThreadLocalRandom.current().nextInt(200); + byte[] buffer = new byte[length]; + + for (int i = 0; i < length; i += 1) { + buffer[i] = (byte) CHARS.charAt(ThreadLocalRandom.current().nextInt(CHARS.length())); + } + + return prefix + new String(buffer); + } + + /** find the index where weightsUDF[index] < weight && weightsUDF[index+1] >= weight */ + private static int binarySearchIndex(long[] weightsUDF, long target) { + Preconditions.checkArgument( + target < weightsUDF[weightsUDF.length - 1], + "weight is out of range: total weight = %s, search target = %s", + weightsUDF[weightsUDF.length - 1], + target); + int start = 0; + int end = weightsUDF.length - 1; + while (start < end) { + int mid = (start + end) / 2; + if (weightsUDF[mid] < target && weightsUDF[mid + 1] >= target) { + return mid; + } + + if (weightsUDF[mid] >= target) { + end = mid - 1; + } else if (weightsUDF[mid + 1] < target) { + start = mid + 1; + } + } + return start; + } + + /** Key is the id string and value is the weight in long value. 
*/ + private static NavigableMap longTailDistribution( + long startingWeight, + int longTailStartingIndex, + int longTailLength, + long longTailBaseWeight, + double weightRandomJitterPercentage) { + + NavigableMap weights = Maps.newTreeMap(); + + // first part just decays the weight by half + long currentWeight = startingWeight; + for (int index = 0; index < longTailStartingIndex; ++index) { + double jitter = ThreadLocalRandom.current().nextDouble(weightRandomJitterPercentage / 100); + long weight = (long) (currentWeight * (1.0 + jitter)); + weight = weight > 0 ? weight : 1; + weights.put(index, weight); + if (currentWeight > longTailBaseWeight) { + currentWeight = currentWeight / 2; + } + } + + // long tail part + for (int index = longTailStartingIndex; + index < longTailStartingIndex + longTailLength; + ++index) { + long longTailWeight = + (long) + (longTailBaseWeight + * ThreadLocalRandom.current().nextDouble(weightRandomJitterPercentage)); + longTailWeight = longTailWeight > 0 ? longTailWeight : 1; + weights.put(index, longTailWeight); + } + + return weights; + } +} diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/MapRangePartitioner.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/MapRangePartitioner.java new file mode 100644 index 000000000000..fb1a8f03a65c --- /dev/null +++ b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/MapRangePartitioner.java @@ -0,0 +1,381 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink.shuffle; + +import java.util.Arrays; +import java.util.Comparator; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.NavigableMap; +import java.util.concurrent.ThreadLocalRandom; +import java.util.concurrent.TimeUnit; +import org.apache.flink.api.common.functions.Partitioner; +import org.apache.flink.table.data.RowData; +import org.apache.iceberg.Schema; +import org.apache.iceberg.SortKey; +import org.apache.iceberg.SortOrder; +import org.apache.iceberg.SortOrderComparators; +import org.apache.iceberg.StructLike; +import org.apache.iceberg.flink.FlinkSchemaUtil; +import org.apache.iceberg.flink.RowDataWrapper; +import org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting; +import org.apache.iceberg.relocated.com.google.common.base.MoreObjects; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.apache.iceberg.util.Pair; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Internal partitioner implementation that supports MapDataStatistics, which is typically used for + * low-cardinality use cases. While MapDataStatistics can keep accurate counters, it can't be used + * for high-cardinality use cases. Otherwise, the memory footprint is too high. + * + *

It is a greedy algorithm for bin packing. With close file cost, the calculation isn't always + * precise when calculating close cost for every file, target weight per subtask, padding residual + * weight, assigned weight without close cost. + * + *

All actions should be executed in a single Flink mailbox thread. So there is no need to make + * it thread safe. + */ +class MapRangePartitioner implements Partitioner { + private static final Logger LOG = LoggerFactory.getLogger(MapRangePartitioner.class); + + private final RowDataWrapper rowDataWrapper; + private final SortKey sortKey; + private final Comparator comparator; + private final Map mapStatistics; + private final double closeFileCostInWeightPercentage; + + // Counter that tracks how many times a new key encountered + // where there is no traffic statistics learned about it. + private long newSortKeyCounter; + private long lastNewSortKeyLogTimeMilli; + + // lazily computed due to the need of numPartitions + private Map assignment; + private NavigableMap sortedStatsWithCloseFileCost; + + MapRangePartitioner( + Schema schema, + SortOrder sortOrder, + MapDataStatistics dataStatistics, + double closeFileCostInWeightPercentage) { + dataStatistics + .statistics() + .entrySet() + .forEach( + entry -> + Preconditions.checkArgument( + entry.getValue() > 0, + "Invalid statistics: weight is 0 for key %s", + entry.getKey())); + + this.rowDataWrapper = new RowDataWrapper(FlinkSchemaUtil.convert(schema), schema.asStruct()); + this.sortKey = new SortKey(schema, sortOrder); + this.comparator = SortOrderComparators.forSchema(schema, sortOrder); + this.mapStatistics = dataStatistics.statistics(); + this.closeFileCostInWeightPercentage = closeFileCostInWeightPercentage; + this.newSortKeyCounter = 0; + this.lastNewSortKeyLogTimeMilli = System.currentTimeMillis(); + } + + @Override + public int partition(RowData row, int numPartitions) { + // assignment table can only be built lazily when first referenced here, + // because number of partitions (downstream subtasks) is needed. + // the numPartitions is not available in the constructor. 
+ Map assignmentMap = assignment(numPartitions); + // reuse the sortKey and rowDataWrapper + sortKey.wrap(rowDataWrapper.wrap(row)); + KeyAssignment keyAssignment = assignmentMap.get(sortKey); + if (keyAssignment == null) { + LOG.trace( + "Encountered new sort key: {}. Fall back to round robin as statistics not learned yet.", + sortKey); + // Ideally unknownKeyCounter should be published as a counter metric. + // It seems difficult to pass in MetricGroup into the partitioner. + // Just log an INFO message every minute. + newSortKeyCounter += 1; + long now = System.currentTimeMillis(); + if (now - lastNewSortKeyLogTimeMilli > TimeUnit.MINUTES.toMillis(1)) { + LOG.info("Encounter new sort keys in total {} times", newSortKeyCounter); + lastNewSortKeyLogTimeMilli = now; + } + return (int) (newSortKeyCounter % numPartitions); + } + + return keyAssignment.select(); + } + + @VisibleForTesting + Map assignment(int numPartitions) { + if (assignment == null) { + long totalWeight = mapStatistics.values().stream().mapToLong(l -> l).sum(); + double targetWeightPerSubtask = ((double) totalWeight) / numPartitions; + long closeFileCostInWeight = + (long) Math.ceil(targetWeightPerSubtask * closeFileCostInWeightPercentage / 100); + + this.sortedStatsWithCloseFileCost = Maps.newTreeMap(comparator); + mapStatistics.forEach( + (k, v) -> { + int estimatedSplits = (int) Math.ceil(v / targetWeightPerSubtask); + long estimatedCloseFileCost = closeFileCostInWeight * estimatedSplits; + sortedStatsWithCloseFileCost.put(k, v + estimatedCloseFileCost); + }); + + long totalWeightWithCloseFileCost = + sortedStatsWithCloseFileCost.values().stream().mapToLong(l -> l).sum(); + long targetWeightPerSubtaskWithCloseFileCost = + (long) Math.ceil(((double) totalWeightWithCloseFileCost) / numPartitions); + this.assignment = + buildAssignment( + numPartitions, + sortedStatsWithCloseFileCost, + targetWeightPerSubtaskWithCloseFileCost, + closeFileCostInWeight); + } + + return assignment; + } + + 
@VisibleForTesting + Map mapStatistics() { + return mapStatistics; + } + + /** + * @return assignment summary for every subtask. Key is subtaskId. Value pair is (weight assigned + * to the subtask, number of keys assigned to the subtask) + */ + Map> assignmentInfo() { + Map> assignmentInfo = Maps.newTreeMap(); + assignment.forEach( + (key, keyAssignment) -> { + for (int i = 0; i < keyAssignment.assignedSubtasks.length; ++i) { + int subtaskId = keyAssignment.assignedSubtasks[i]; + long subtaskWeight = keyAssignment.subtaskWeightsExcludingCloseCost[i]; + Pair oldValue = assignmentInfo.getOrDefault(subtaskId, Pair.of(0L, 0)); + assignmentInfo.put( + subtaskId, Pair.of(oldValue.first() + subtaskWeight, oldValue.second() + 1)); + } + }); + + return assignmentInfo; + } + + private Map buildAssignment( + int numPartitions, + NavigableMap sortedStatistics, + long targetWeightPerSubtask, + long closeFileCostInWeight) { + Map assignmentMap = + Maps.newHashMapWithExpectedSize(sortedStatistics.size()); + Iterator mapKeyIterator = sortedStatistics.keySet().iterator(); + int subtaskId = 0; + SortKey currentKey = null; + long keyRemainingWeight = 0L; + long subtaskRemainingWeight = targetWeightPerSubtask; + List assignedSubtasks = Lists.newArrayList(); + List subtaskWeights = Lists.newArrayList(); + while (mapKeyIterator.hasNext() || currentKey != null) { + // This should never happen because target weight is calculated using ceil function. + if (subtaskId >= numPartitions) { + LOG.error( + "Internal algorithm error: exhausted subtasks with unassigned keys left. 
number of partitions: {}, " + + "target weight per subtask: {}, close file cost in weight: {}, data statistics: {}", + numPartitions, + targetWeightPerSubtask, + closeFileCostInWeight, + sortedStatistics); + throw new IllegalStateException( + "Internal algorithm error: exhausted subtasks with unassigned keys left"); + } + + if (currentKey == null) { + currentKey = mapKeyIterator.next(); + keyRemainingWeight = sortedStatistics.get(currentKey); + } + + assignedSubtasks.add(subtaskId); + if (keyRemainingWeight < subtaskRemainingWeight) { + // assign the remaining weight of the key to the current subtask + subtaskWeights.add(keyRemainingWeight); + subtaskRemainingWeight -= keyRemainingWeight; + keyRemainingWeight = 0L; + } else { + // filled up the current subtask + long assignedWeight = subtaskRemainingWeight; + keyRemainingWeight -= subtaskRemainingWeight; + + // If assigned weight is less than close file cost, pad it up with close file cost. + // This might cause the subtask assigned weight over the target weight. + // But it should be no more than one close file cost. Small skew is acceptable. + if (assignedWeight <= closeFileCostInWeight) { + long paddingWeight = Math.min(keyRemainingWeight, closeFileCostInWeight); + keyRemainingWeight -= paddingWeight; + assignedWeight += paddingWeight; + } + + subtaskWeights.add(assignedWeight); + // move on to the next subtask + subtaskId += 1; + subtaskRemainingWeight = targetWeightPerSubtask; + } + + Preconditions.checkState( + assignedSubtasks.size() == subtaskWeights.size(), + "List size mismatch: assigned subtasks = %s, subtask weights = %s", + assignedSubtasks, + subtaskWeights); + + // If the remaining key weight is smaller than the close file cost, simply skip the residual + // as it doesn't make sense to assign a weight smaller than close file cost to a new subtask. + // this might lead to some inaccuracy in weight calculation. E.g., assuming the key weight is + // 2 and close file cost is 2. 
key weight with close cost is 4. Let's assume the previous + // task has a weight of 3 available. So weight of 3 for this key is assigned to the task and + // the residual weight of 1 is dropped. Then the routing weight for this key is 1 (minus the + // close file cost), which is inaccurate as the true key weight should be 2. + // Again, this greedy algorithm is not intended to be perfect. Some small inaccuracy is + // expected and acceptable. Traffic distribution should still be balanced. + if (keyRemainingWeight > 0 && keyRemainingWeight <= closeFileCostInWeight) { + keyRemainingWeight = 0; + } + + if (keyRemainingWeight == 0) { + // finishing up the assignment for the current key + KeyAssignment keyAssignment = + new KeyAssignment(assignedSubtasks, subtaskWeights, closeFileCostInWeight); + assignmentMap.put(currentKey, keyAssignment); + assignedSubtasks.clear(); + subtaskWeights.clear(); + currentKey = null; + } + } + + return assignmentMap; + } + + /** Subtask assignment for a key */ + @VisibleForTesting + static class KeyAssignment { + private final int[] assignedSubtasks; + private final long[] subtaskWeightsExcludingCloseCost; + private final long keyWeight; + private final long[] cumulativeWeights; + + /** + * @param assignedSubtasks assigned subtasks for this key. It could be a single subtask. It + * could also be multiple subtasks if the key has heavy weight that should be handled by + * multiple subtasks. + * @param subtaskWeightsWithCloseFileCost assigned weight for each subtask. E.g., if the + * keyWeight is 27 and the key is assigned to 3 subtasks, subtaskWeights could contain + * values as [10, 10, 7] for target weight of 10 per subtask. 
+ */ + KeyAssignment( + List assignedSubtasks, + List subtaskWeightsWithCloseFileCost, + long closeFileCostInWeight) { + Preconditions.checkArgument( + assignedSubtasks != null && !assignedSubtasks.isEmpty(), + "Invalid assigned subtasks: null or empty"); + Preconditions.checkArgument( + subtaskWeightsWithCloseFileCost != null && !subtaskWeightsWithCloseFileCost.isEmpty(), + "Invalid assigned subtasks weights: null or empty"); + Preconditions.checkArgument( + assignedSubtasks.size() == subtaskWeightsWithCloseFileCost.size(), + "Invalid assignment: size mismatch (tasks length = %s, weights length = %s)", + assignedSubtasks.size(), + subtaskWeightsWithCloseFileCost.size()); + subtaskWeightsWithCloseFileCost.forEach( + weight -> + Preconditions.checkArgument( + weight > closeFileCostInWeight, + "Invalid weight: should be larger than close file cost: weight = %s, close file cost = %s", + weight, + closeFileCostInWeight)); + + this.assignedSubtasks = assignedSubtasks.stream().mapToInt(i -> i).toArray(); + // Exclude the close file cost for key routing + this.subtaskWeightsExcludingCloseCost = + subtaskWeightsWithCloseFileCost.stream() + .mapToLong(weightWithCloseFileCost -> weightWithCloseFileCost - closeFileCostInWeight) + .toArray(); + this.keyWeight = Arrays.stream(subtaskWeightsExcludingCloseCost).sum(); + this.cumulativeWeights = new long[subtaskWeightsExcludingCloseCost.length]; + long cumulativeWeight = 0; + for (int i = 0; i < subtaskWeightsExcludingCloseCost.length; ++i) { + cumulativeWeight += subtaskWeightsExcludingCloseCost[i]; + cumulativeWeights[i] = cumulativeWeight; + } + } + + /** @return subtask id */ + int select() { + if (assignedSubtasks.length == 1) { + // only choice. no need to run random number generator. 
+ return assignedSubtasks[0]; + } else { + long randomNumber = ThreadLocalRandom.current().nextLong(keyWeight); + int index = Arrays.binarySearch(cumulativeWeights, randomNumber); + // choose the subtask where randomNumber < cumulativeWeights[pos]. + // this works regardless whether index is negative or not. + int position = Math.abs(index + 1); + Preconditions.checkState( + position < assignedSubtasks.length, + "Invalid selected position: out of range. key weight = %s, random number = %s, cumulative weights array = %s", + keyWeight, + randomNumber, + cumulativeWeights); + return assignedSubtasks[position]; + } + } + + @Override + public int hashCode() { + return 31 * Arrays.hashCode(assignedSubtasks) + + Arrays.hashCode(subtaskWeightsExcludingCloseCost); + } + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + + if (o == null || getClass() != o.getClass()) { + return false; + } + + KeyAssignment that = (KeyAssignment) o; + return Arrays.equals(assignedSubtasks, that.assignedSubtasks) + && Arrays.equals(subtaskWeightsExcludingCloseCost, that.subtaskWeightsExcludingCloseCost); + } + + @Override + public String toString() { + return MoreObjects.toStringHelper(this) + .add("assignedSubtasks", assignedSubtasks) + .add("subtaskWeightsExcludingCloseCost", subtaskWeightsExcludingCloseCost) + .toString(); + } + } +} diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestMapRangePartitioner.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestMapRangePartitioner.java new file mode 100644 index 000000000000..92eb71acc834 --- /dev/null +++ b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestMapRangePartitioner.java @@ -0,0 +1,448 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink.shuffle; + +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.concurrent.atomic.AtomicLong; +import org.apache.flink.table.data.GenericRowData; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.data.StringData; +import org.apache.flink.table.types.logical.RowType; +import org.apache.iceberg.SortKey; +import org.apache.iceberg.SortOrder; +import org.apache.iceberg.flink.FlinkSchemaUtil; +import org.apache.iceberg.flink.RowDataWrapper; +import org.apache.iceberg.flink.TestFixtures; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.apache.iceberg.relocated.com.google.common.collect.Sets; +import org.apache.iceberg.util.Pair; +import org.assertj.core.api.Assertions; +import org.junit.jupiter.api.Test; + +public class TestMapRangePartitioner { + private static final SortOrder SORT_ORDER = + SortOrder.builderFor(TestFixtures.SCHEMA).asc("data").build(); + + private static final SortKey SORT_KEY = new SortKey(TestFixtures.SCHEMA, SORT_ORDER); + private static final RowType ROW_TYPE = FlinkSchemaUtil.convert(TestFixtures.SCHEMA); + 
private static final SortKey[] SORT_KEYS = initSortKeys(); + + private static SortKey[] initSortKeys() { + SortKey[] sortKeys = new SortKey[10]; + for (int i = 0; i < 10; ++i) { + RowData rowData = + GenericRowData.of(StringData.fromString("k" + i), i, StringData.fromString("2023-06-20")); + RowDataWrapper keyWrapper = new RowDataWrapper(ROW_TYPE, TestFixtures.SCHEMA.asStruct()); + keyWrapper.wrap(rowData); + SortKey sortKey = SORT_KEY.copy(); + sortKey.wrap(keyWrapper); + sortKeys[i] = sortKey; + } + return sortKeys; + } + + // Total weight is 800 + private final MapDataStatistics mapDataStatistics = + new MapDataStatistics( + ImmutableMap.of( + SORT_KEYS[0], + 350L, + SORT_KEYS[1], + 230L, + SORT_KEYS[2], + 120L, + SORT_KEYS[3], + 40L, + SORT_KEYS[4], + 10L, + SORT_KEYS[5], + 10L, + SORT_KEYS[6], + 10L, + SORT_KEYS[7], + 10L, + SORT_KEYS[8], + 10L, + SORT_KEYS[9], + 10L)); + + @Test + public void testEvenlyDividableNoClosingFileCost() { + MapRangePartitioner partitioner = + new MapRangePartitioner(TestFixtures.SCHEMA, SORT_ORDER, mapDataStatistics, 0.0); + int numPartitions = 8; + + // each task should get targeted weight of 100 (=800/8) + Map expectedAssignment = + ImmutableMap.of( + SORT_KEYS[0], + new MapRangePartitioner.KeyAssignment( + ImmutableList.of(0, 1, 2, 3), ImmutableList.of(100L, 100L, 100L, 50L), 0L), + SORT_KEYS[1], + new MapRangePartitioner.KeyAssignment( + ImmutableList.of(3, 4, 5), ImmutableList.of(50L, 100L, 80L), 0L), + SORT_KEYS[2], + new MapRangePartitioner.KeyAssignment( + ImmutableList.of(5, 6), ImmutableList.of(20L, 100L), 0L), + SORT_KEYS[3], + new MapRangePartitioner.KeyAssignment(ImmutableList.of(7), ImmutableList.of(40L), 0L), + SORT_KEYS[4], + new MapRangePartitioner.KeyAssignment(ImmutableList.of(7), ImmutableList.of(10L), 0L), + SORT_KEYS[5], + new MapRangePartitioner.KeyAssignment(ImmutableList.of(7), ImmutableList.of(10L), 0L), + SORT_KEYS[6], + new MapRangePartitioner.KeyAssignment(ImmutableList.of(7), ImmutableList.of(10L), 
0L), + SORT_KEYS[7], + new MapRangePartitioner.KeyAssignment(ImmutableList.of(7), ImmutableList.of(10L), 0L), + SORT_KEYS[8], + new MapRangePartitioner.KeyAssignment(ImmutableList.of(7), ImmutableList.of(10L), 0L), + SORT_KEYS[9], + new MapRangePartitioner.KeyAssignment(ImmutableList.of(7), ImmutableList.of(10L), 0L)); + Map actualAssignment = + partitioner.assignment(numPartitions); + Assertions.assertThat(actualAssignment).isEqualTo(expectedAssignment); + + // key: subtask id + // value pair: first is the assigned weight, second is the number of assigned keys + Map> expectedAssignmentInfo = + ImmutableMap.of( + 0, + Pair.of(100L, 1), + 1, + Pair.of(100L, 1), + 2, + Pair.of(100L, 1), + 3, + Pair.of(100L, 2), + 4, + Pair.of(100L, 1), + 5, + Pair.of(100L, 2), + 6, + Pair.of(100L, 1), + 7, + Pair.of(100L, 7)); + Map> actualAssignmentInfo = partitioner.assignmentInfo(); + Assertions.assertThat(actualAssignmentInfo).isEqualTo(expectedAssignmentInfo); + + Map>> partitionResults = + runPartitioner(partitioner, numPartitions); + validatePartitionResults(expectedAssignmentInfo, partitionResults, 5.0); + } + + @Test + public void testEvenlyDividableWithClosingFileCost() { + MapRangePartitioner partitioner = + new MapRangePartitioner(TestFixtures.SCHEMA, SORT_ORDER, mapDataStatistics, 5.0); + int numPartitions = 8; + + // target subtask weight is 100 before close file cost factored in. + // close file cost is 5 = 5% * 100. 
+ // key weights before and after close file cost factored in + // before: 350, 230, 120, 40, 10, 10, 10, 10, 10, 10 + // close-cost: 20, 15, 10, 5, 5, 5, 5, 5, 5, 5 + // after: 370, 245, 130, 45, 15, 15, 15, 15, 15, 15 + // target subtask weight with close cost per subtask is 110 (880/8) + Map expectedAssignment = + ImmutableMap.of( + SORT_KEYS[0], + new MapRangePartitioner.KeyAssignment( + ImmutableList.of(0, 1, 2, 3), ImmutableList.of(110L, 110L, 110L, 40L), 5L), + SORT_KEYS[1], + new MapRangePartitioner.KeyAssignment( + ImmutableList.of(3, 4, 5), ImmutableList.of(70L, 110L, 65L), 5L), + SORT_KEYS[2], + new MapRangePartitioner.KeyAssignment( + ImmutableList.of(5, 6), ImmutableList.of(45L, 85L), 5L), + SORT_KEYS[3], + new MapRangePartitioner.KeyAssignment( + ImmutableList.of(6, 7), ImmutableList.of(25L, 20L), 5L), + SORT_KEYS[4], + new MapRangePartitioner.KeyAssignment(ImmutableList.of(7), ImmutableList.of(15L), 5L), + SORT_KEYS[5], + new MapRangePartitioner.KeyAssignment(ImmutableList.of(7), ImmutableList.of(15L), 5L), + SORT_KEYS[6], + new MapRangePartitioner.KeyAssignment(ImmutableList.of(7), ImmutableList.of(15L), 5L), + SORT_KEYS[7], + new MapRangePartitioner.KeyAssignment(ImmutableList.of(7), ImmutableList.of(15L), 5L), + SORT_KEYS[8], + new MapRangePartitioner.KeyAssignment(ImmutableList.of(7), ImmutableList.of(15L), 5L), + SORT_KEYS[9], + new MapRangePartitioner.KeyAssignment(ImmutableList.of(7), ImmutableList.of(15L), 5L)); + Map actualAssignment = + partitioner.assignment(numPartitions); + Assertions.assertThat(actualAssignment).isEqualTo(expectedAssignment); + + // key: subtask id + // value pair: first is the assigned weight (excluding close file cost) for the subtask, + // second is the number of keys assigned to the subtask + Map> expectedAssignmentInfo = + ImmutableMap.of( + 0, + Pair.of(105L, 1), + 1, + Pair.of(105L, 1), + 2, + Pair.of(105L, 1), + 3, + Pair.of(100L, 2), + 4, + Pair.of(105L, 1), + 5, + Pair.of(100L, 2), + 6, + Pair.of(100L, 2), + 
7, + Pair.of(75L, 7)); + Map> actualAssignmentInfo = partitioner.assignmentInfo(); + Assertions.assertThat(actualAssignmentInfo).isEqualTo(expectedAssignmentInfo); + + Map>> partitionResults = + runPartitioner(partitioner, numPartitions); + validatePartitionResults(expectedAssignmentInfo, partitionResults, 5.0); + } + + @Test + public void testNonDividableNoClosingFileCost() { + MapRangePartitioner partitioner = + new MapRangePartitioner(TestFixtures.SCHEMA, SORT_ORDER, mapDataStatistics, 0.0); + int numPartitions = 9; + + // before: 350, 230, 120, 40, 10, 10, 10, 10, 10, 10 + // each task should get targeted weight of 89 = ceiling(800/9) + Map expectedAssignment = + ImmutableMap.of( + SORT_KEYS[0], + new MapRangePartitioner.KeyAssignment( + ImmutableList.of(0, 1, 2, 3), ImmutableList.of(89L, 89L, 89L, 83L), 0L), + SORT_KEYS[1], + new MapRangePartitioner.KeyAssignment( + ImmutableList.of(3, 4, 5, 6), ImmutableList.of(6L, 89L, 89L, 46L), 0L), + SORT_KEYS[2], + new MapRangePartitioner.KeyAssignment( + ImmutableList.of(6, 7), ImmutableList.of(43L, 77L), 0L), + SORT_KEYS[3], + new MapRangePartitioner.KeyAssignment( + ImmutableList.of(7, 8), ImmutableList.of(12L, 28L), 0L), + SORT_KEYS[4], + new MapRangePartitioner.KeyAssignment(ImmutableList.of(8), ImmutableList.of(10L), 0L), + SORT_KEYS[5], + new MapRangePartitioner.KeyAssignment(ImmutableList.of(8), ImmutableList.of(10L), 0L), + SORT_KEYS[6], + new MapRangePartitioner.KeyAssignment(ImmutableList.of(8), ImmutableList.of(10L), 0L), + SORT_KEYS[7], + new MapRangePartitioner.KeyAssignment(ImmutableList.of(8), ImmutableList.of(10L), 0L), + SORT_KEYS[8], + new MapRangePartitioner.KeyAssignment(ImmutableList.of(8), ImmutableList.of(10L), 0L), + SORT_KEYS[9], + new MapRangePartitioner.KeyAssignment(ImmutableList.of(8), ImmutableList.of(10L), 0L)); + Map actualAssignment = + partitioner.assignment(numPartitions); + Assertions.assertThat(actualAssignment).isEqualTo(expectedAssignment); + + // key: subtask id + // value pair: 
first is the assigned weight, second is the number of assigned keys + Map> expectedAssignmentInfo = + ImmutableMap.of( + 0, + Pair.of(89L, 1), + 1, + Pair.of(89L, 1), + 2, + Pair.of(89L, 1), + 3, + Pair.of(89L, 2), + 4, + Pair.of(89L, 1), + 5, + Pair.of(89L, 1), + 6, + Pair.of(89L, 2), + 7, + Pair.of(89L, 2), + 8, + Pair.of(88L, 7)); + Map> actualAssignmentInfo = partitioner.assignmentInfo(); + Assertions.assertThat(actualAssignmentInfo).isEqualTo(expectedAssignmentInfo); + + Map>> partitionResults = + runPartitioner(partitioner, numPartitions); + validatePartitionResults(expectedAssignmentInfo, partitionResults, 5.0); + } + + @Test + public void testNonDividableWithClosingFileCost() { + MapRangePartitioner partitioner = + new MapRangePartitioner(TestFixtures.SCHEMA, SORT_ORDER, mapDataStatistics, 5.0); + int numPartitions = 9; + + // target subtask weight is 89 before close file cost factored in. + // close file cost is 5 (= 5% * 89) per file. + // key weights before and after close file cost factored in + // before: 350, 230, 120, 40, 10, 10, 10, 10, 10, 10 + // close-cost: 20, 15, 10, 5, 5, 5, 5, 5, 5, 5 + // after: 370, 245, 130, 45, 15, 15, 15, 15, 15, 15 + // target subtask weight per subtask is 98 ceiling(880/9) + Map expectedAssignment = + ImmutableMap.of( + SORT_KEYS[0], + new MapRangePartitioner.KeyAssignment( + ImmutableList.of(0, 1, 2, 3), ImmutableList.of(98L, 98L, 98L, 76L), 5L), + SORT_KEYS[1], + new MapRangePartitioner.KeyAssignment( + ImmutableList.of(3, 4, 5, 6), ImmutableList.of(22L, 98L, 98L, 27L), 5L), + SORT_KEYS[2], + new MapRangePartitioner.KeyAssignment( + ImmutableList.of(6, 7), ImmutableList.of(71L, 59L), 5L), + SORT_KEYS[3], + new MapRangePartitioner.KeyAssignment( + ImmutableList.of(7, 8), ImmutableList.of(39L, 6L), 5L), + SORT_KEYS[4], + new MapRangePartitioner.KeyAssignment(ImmutableList.of(8), ImmutableList.of(15L), 5L), + SORT_KEYS[5], + new MapRangePartitioner.KeyAssignment(ImmutableList.of(8), ImmutableList.of(15L), 5L), + 
SORT_KEYS[6], + new MapRangePartitioner.KeyAssignment(ImmutableList.of(8), ImmutableList.of(15L), 5L), + SORT_KEYS[7], + new MapRangePartitioner.KeyAssignment(ImmutableList.of(8), ImmutableList.of(15L), 5L), + SORT_KEYS[8], + new MapRangePartitioner.KeyAssignment(ImmutableList.of(8), ImmutableList.of(15L), 5L), + SORT_KEYS[9], + new MapRangePartitioner.KeyAssignment(ImmutableList.of(8), ImmutableList.of(15L), 5L)); + Map actualAssignment = + partitioner.assignment(numPartitions); + Assertions.assertThat(actualAssignment).isEqualTo(expectedAssignment); + + // key: subtask id + // value pair: first is the assigned weight for the subtask, second is the number of keys + // assigned to the subtask + Map> expectedAssignmentInfo = + ImmutableMap.of( + 0, + Pair.of(93L, 1), + 1, + Pair.of(93L, 1), + 2, + Pair.of(93L, 1), + 3, + Pair.of(88L, 2), + 4, + Pair.of(93L, 1), + 5, + Pair.of(93L, 1), + 6, + Pair.of(88L, 2), + 7, + Pair.of(88L, 2), + 8, + Pair.of(61L, 7)); + Map> actualAssignmentInfo = partitioner.assignmentInfo(); + Assertions.assertThat(actualAssignmentInfo).isEqualTo(expectedAssignmentInfo); + + Map>> partitionResults = + runPartitioner(partitioner, numPartitions); + // drift threshold is high for non-dividable scenario with close cost + validatePartitionResults(expectedAssignmentInfo, partitionResults, 10.0); + } + + private static Map>> runPartitioner( + MapRangePartitioner partitioner, int numPartitions) { + // The Map key is the subtaskId. + // For the map value pair, the first element is the count of assigned and + // the second element of Set is for the set of assigned keys. 
+ Map>> partitionResults = Maps.newHashMap(); + partitioner + .mapStatistics() + .forEach( + (sortKey, weight) -> { + String key = sortKey.get(0, String.class); + // run 100x times of the weight + long iterations = weight * 100; + for (int i = 0; i < iterations; ++i) { + RowData rowData = + GenericRowData.of( + StringData.fromString(key), 1, StringData.fromString("2023-06-20")); + int subtaskId = partitioner.partition(rowData, numPartitions); + partitionResults.computeIfAbsent( + subtaskId, k -> Pair.of(new AtomicLong(0), Sets.newHashSet())); + Pair> pair = partitionResults.get(subtaskId); + pair.first().incrementAndGet(); + pair.second().add(rowData); + } + }); + return partitionResults; + } + + /** @param expectedAssignmentInfo excluding closing cost */ + private void validatePartitionResults( + Map> expectedAssignmentInfo, + Map>> partitionResults, + double maxDriftPercentage) { + + Assertions.assertThat(partitionResults.size()).isEqualTo(expectedAssignmentInfo.size()); + + List expectedAssignedKeyCounts = + Lists.newArrayListWithExpectedSize(expectedAssignmentInfo.size()); + List actualAssignedKeyCounts = + Lists.newArrayListWithExpectedSize(partitionResults.size()); + List expectedNormalizedWeights = + Lists.newArrayListWithExpectedSize(expectedAssignmentInfo.size()); + List actualNormalizedWeights = + Lists.newArrayListWithExpectedSize(partitionResults.size()); + + long expectedTotalWeight = + expectedAssignmentInfo.values().stream().mapToLong(Pair::first).sum(); + expectedAssignmentInfo.forEach( + (subtaskId, pair) -> { + expectedAssignedKeyCounts.add(pair.second()); + expectedNormalizedWeights.add(pair.first().doubleValue() / expectedTotalWeight); + }); + + long actualTotalWeight = + partitionResults.values().stream().mapToLong(pair -> pair.first().longValue()).sum(); + partitionResults.forEach( + (subtaskId, pair) -> { + actualAssignedKeyCounts.add(pair.second().size()); + actualNormalizedWeights.add(pair.first().doubleValue() / actualTotalWeight); + }); + 
+ // number of assigned keys should match exactly + Assertions.assertThat(actualAssignedKeyCounts) + .as("the number of assigned keys should match for every subtask") + .isEqualTo(expectedAssignedKeyCounts); + + // weight for every subtask shouldn't differ for more than some threshold relative to the + // expected weight + for (int subtaskId = 0; subtaskId < expectedNormalizedWeights.size(); ++subtaskId) { + double expectedWeight = expectedNormalizedWeights.get(subtaskId); + double min = expectedWeight * (1 - maxDriftPercentage / 100); + double max = expectedWeight * (1 + maxDriftPercentage / 100); + Assertions.assertThat(actualNormalizedWeights.get(subtaskId)) + .as( + "Subtask %d weight should within %.1f percent of the expected range %s", + subtaskId, maxDriftPercentage, expectedWeight) + .isBetween(min, max); + } + } +} diff --git a/jmh.gradle b/jmh.gradle index 076899239430..ea317cc2eea1 100644 --- a/jmh.gradle +++ b/jmh.gradle @@ -21,10 +21,15 @@ if (jdkVersion != '8' && jdkVersion != '11' && jdkVersion != '17') { throw new GradleException("The JMH benchmarks must be run with JDK 8 or JDK 11 or JDK 17") } +def flinkVersions = (System.getProperty("flinkVersions") != null ? System.getProperty("flinkVersions") : System.getProperty("defaultFlinkVersions")).split(",") def sparkVersions = (System.getProperty("sparkVersions") != null ? System.getProperty("sparkVersions") : System.getProperty("defaultSparkVersions")).split(",") def scalaVersion = System.getProperty("scalaVersion") != null ? 
System.getProperty("scalaVersion") : System.getProperty("defaultScalaVersion") def jmhProjects = [project(":iceberg-core"), project(":iceberg-data")] +if (flinkVersions.contains("1.17")) { + jmhProjects.add(project(":iceberg-flink:iceberg-flink-1.17")) +} + if (sparkVersions.contains("3.3")) { jmhProjects.add(project(":iceberg-spark:iceberg-spark-3.3_${scalaVersion}")) jmhProjects.add(project(":iceberg-spark:iceberg-spark-extensions-3.3_${scalaVersion}")) From 8e6c08e357a7c6024d146e0add958bf055d8c565 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 28 Mar 2024 11:38:26 +0100 Subject: [PATCH 22/25] Build: Bump software.amazon.awssdk:bom from 2.24.5 to 2.25.18 (#10050) Bumps software.amazon.awssdk:bom from 2.24.5 to 2.25.18. --- updated-dependencies: - dependency-name: software.amazon.awssdk:bom dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- gradle/libs.versions.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gradle/libs.versions.toml b/gradle/libs.versions.toml index cec16519417a..a0fb79bcd363 100644 --- a/gradle/libs.versions.toml +++ b/gradle/libs.versions.toml @@ -28,7 +28,7 @@ arrow = "15.0.2" avro = "1.11.3" assertj-core = "3.25.3" awaitility = "4.2.1" -awssdk-bom = "2.24.5" +awssdk-bom = "2.25.18" azuresdk-bom = "1.2.21" awssdk-s3accessgrants = "2.0.0" caffeine = "2.9.3" From 783158aca9c0287c4990976dc18eafdfb472ae1f Mon Sep 17 00:00:00 2001 From: Fokko Driesprong Date: Thu, 28 Mar 2024 11:39:25 +0100 Subject: [PATCH 23/25] CI: Run Markdown links checker only when `{docs,site}/**` changes (#10049) --- .github/workflows/docs-check-links.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/docs-check-links.yml b/.github/workflows/docs-check-links.yml index 9de842813dc3..4275a2d6efc4 100644 --- 
a/.github/workflows/docs-check-links.yml +++ b/.github/workflows/docs-check-links.yml @@ -27,6 +27,9 @@ on: branches: - 'main' pull_request: + paths: + - docs/** + - site/** workflow_dispatch: jobs: From 4eef2fe8263f11e8e448a11c4e07acf2cbecda7f Mon Sep 17 00:00:00 2001 From: Tom Tanaka <43331405+tomtongue@users.noreply.github.com> Date: Thu, 28 Mar 2024 23:21:31 +0900 Subject: [PATCH 24/25] Core, Data: Migrate tests to JUnit5 (#10039) --- .../apache/iceberg/FilterFilesTestBase.java | 11 +- .../java/org/apache/iceberg/TestMetrics.java | 170 +++--- .../org/apache/iceberg/TestWapWorkflow.java | 554 ++++++------------ .../iceberg/actions/TestCommitService.java | 39 +- .../actions/TestSizeBasedRewriter.java | 26 +- .../apache/iceberg/avro/AvroTestHelpers.java | 24 +- .../avro/TestNameMappingWithAvroSchema.java | 13 +- .../iceberg/encryption/TestGcmStreams.java | 104 ++-- .../TestStandardKeyMetadataParser.java | 13 +- .../iceberg/io/TestOutputFileFactory.java | 48 +- .../iceberg/mapping/TestMappingUpdates.java | 172 +++--- .../iceberg/mapping/TestNameMapping.java | 81 +-- .../apache/iceberg/orc/TestOrcMetrics.java | 28 +- .../iceberg/parquet/TestParquetMetrics.java | 17 +- 14 files changed, 554 insertions(+), 746 deletions(-) diff --git a/core/src/test/java/org/apache/iceberg/FilterFilesTestBase.java b/core/src/test/java/org/apache/iceberg/FilterFilesTestBase.java index ad92d0f662a3..bb4bb282a330 100644 --- a/core/src/test/java/org/apache/iceberg/FilterFilesTestBase.java +++ b/core/src/test/java/org/apache/iceberg/FilterFilesTestBase.java @@ -19,7 +19,7 @@ package org.apache.iceberg; import static org.apache.iceberg.types.Types.NestedField.required; -import static org.junit.Assert.assertEquals; +import static org.assertj.core.api.Assertions.assertThat; import java.io.File; import java.io.IOException; @@ -28,7 +28,6 @@ import java.nio.file.Path; import java.util.Map; import org.apache.iceberg.expressions.Expressions; -import 
org.apache.iceberg.relocated.com.google.common.collect.Iterables; import org.apache.iceberg.relocated.com.google.common.collect.Maps; import org.apache.iceberg.types.Conversions; import org.apache.iceberg.types.Types; @@ -119,10 +118,10 @@ private void testFilterFiles(Table table) { table.refresh(); ScanT emptyScan = newScan(table).filter(Expressions.equal("id", 5)); - assertEquals(0, Iterables.size(emptyScan.planFiles())); + assertThat(emptyScan.planFiles()).isEmpty(); ScanT nonEmptyScan = newScan(table).filter(Expressions.equal("id", 1)); - assertEquals(1, Iterables.size(nonEmptyScan.planFiles())); + assertThat(nonEmptyScan.planFiles()).hasSize(1); } private void testCaseInsensitiveFilterFiles(Table table) { @@ -153,9 +152,9 @@ private void testCaseInsensitiveFilterFiles(Table table) { table.refresh(); ScanT emptyScan = newScan(table).caseSensitive(false).filter(Expressions.equal("ID", 5)); - assertEquals(0, Iterables.size(emptyScan.planFiles())); + assertThat(emptyScan.planFiles()).hasSize(0); ScanT nonEmptyScan = newScan(table).caseSensitive(false).filter(Expressions.equal("ID", 1)); - assertEquals(1, Iterables.size(nonEmptyScan.planFiles())); + assertThat(nonEmptyScan.planFiles()).hasSize(1); } } diff --git a/core/src/test/java/org/apache/iceberg/TestMetrics.java b/core/src/test/java/org/apache/iceberg/TestMetrics.java index 32bc6299ce1b..424e0e0a7b93 100644 --- a/core/src/test/java/org/apache/iceberg/TestMetrics.java +++ b/core/src/test/java/org/apache/iceberg/TestMetrics.java @@ -21,6 +21,8 @@ import static org.apache.iceberg.types.Conversions.fromByteBuffer; import static org.apache.iceberg.types.Types.NestedField.optional; import static org.apache.iceberg.types.Types.NestedField.required; +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assumptions.assumeThat; import java.io.File; import java.io.IOException; @@ -28,9 +30,11 @@ import java.nio.ByteBuffer; import java.nio.CharBuffer; import 
java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.Arrays; import java.util.List; import java.util.Map; -import java.util.Objects; import org.apache.iceberg.data.GenericRecord; import org.apache.iceberg.data.Record; import org.apache.iceberg.io.InputFile; @@ -56,21 +60,19 @@ import org.apache.iceberg.types.Types.TimeType; import org.apache.iceberg.types.Types.TimestampType; import org.apache.iceberg.util.DateTimeUtil; -import org.junit.After; -import org.junit.Assert; -import org.junit.Assume; -import org.junit.Rule; -import org.junit.Test; -import org.junit.rules.TemporaryFolder; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.TestTemplate; +import org.junit.jupiter.api.io.TempDir; /** Tests for Metrics. */ public abstract class TestMetrics { - protected TestMetrics(int formatVersion) { - this.formatVersion = formatVersion; + @Parameters(name = "formatVersion = {0}") + public static List parameters() { + return Arrays.asList(1, 2); } - @Rule public TemporaryFolder temp = new TemporaryFolder(); + @TempDir public Path temp; private static final StructType LEAF_STRUCT_TYPE = StructType.of( @@ -113,10 +115,10 @@ protected TestMetrics(int formatVersion) { private static final Record NAN_ONLY_RECORD = createRecordWithFloatAndDouble(Float.NaN, Double.NaN); - private final int formatVersion; + @Parameter private int formatVersion; private final byte[] fixed = "abcd".getBytes(StandardCharsets.UTF_8); - @After + @AfterEach public void after() { TestTables.clearTables(); } @@ -146,7 +148,7 @@ public boolean supportsSmallRowGroups() { protected abstract OutputFile createOutputFile() throws IOException; - @Test + @TestTemplate public void testMetricsForRepeatedValues() throws IOException { Record record = GenericRecord.create(SIMPLE_SCHEMA); record.setField("booleanCol", true); @@ -164,7 +166,7 @@ public void testMetricsForRepeatedValues() throws IOException { 
record.setField("timestampColBelowEpoch", DateTimeUtil.timestampFromMicros(0L)); Metrics metrics = getMetrics(SIMPLE_SCHEMA, record, record); - Assert.assertEquals(2L, (long) metrics.recordCount()); + assertThat(metrics.recordCount()).isEqualTo(2L); assertCounts(1, 2L, 0L, metrics); assertCounts(2, 2L, 0L, metrics); assertCounts(3, 2L, 2L, metrics); @@ -180,7 +182,7 @@ public void testMetricsForRepeatedValues() throws IOException { assertCounts(13, 2L, 0L, metrics); } - @Test + @TestTemplate public void testMetricsForTopLevelFields() throws IOException { Record firstRecord = GenericRecord.create(SIMPLE_SCHEMA); firstRecord.setField("booleanCol", true); @@ -212,7 +214,7 @@ public void testMetricsForTopLevelFields() throws IOException { secondRecord.setField("timestampColBelowEpoch", DateTimeUtil.timestampFromMicros(-7_000L)); Metrics metrics = getMetrics(SIMPLE_SCHEMA, firstRecord, secondRecord); - Assert.assertEquals(2L, (long) metrics.recordCount()); + assertThat(metrics.recordCount()).isEqualTo(2L); assertCounts(1, 2L, 0L, metrics); assertBounds(1, BooleanType.get(), false, true, metrics); assertCounts(2, 2L, 0L, metrics); @@ -255,7 +257,7 @@ public void testMetricsForTopLevelFields() throws IOException { } } - @Test + @TestTemplate public void testMetricsForDecimals() throws IOException { Schema schema = new Schema( @@ -269,7 +271,7 @@ public void testMetricsForDecimals() throws IOException { record.setField("decimalAsFixed", new BigDecimal("5.80")); Metrics metrics = getMetrics(schema, record); - Assert.assertEquals(1L, (long) metrics.recordCount()); + assertThat(metrics.recordCount()).isEqualTo(1); assertCounts(1, 1L, 0L, metrics); assertBounds(1, DecimalType.of(4, 2), new BigDecimal("2.55"), new BigDecimal("2.55"), metrics); assertCounts(2, 1L, 0L, metrics); @@ -278,10 +280,10 @@ public void testMetricsForDecimals() throws IOException { assertBounds(3, DecimalType.of(22, 2), new BigDecimal("5.80"), new BigDecimal("5.80"), metrics); } - @Test + @TestTemplate 
public void testMetricsForNestedStructFields() throws IOException { Metrics metrics = getMetrics(NESTED_SCHEMA, buildNestedTestRecord()); - Assert.assertEquals(1L, (long) metrics.recordCount()); + assertThat(metrics.recordCount()).isEqualTo(1L); assertCounts(1, 1L, 0L, metrics); assertBounds(1, IntegerType.get(), Integer.MAX_VALUE, Integer.MAX_VALUE, metrics); assertCounts(3, 1L, 0L, metrics); @@ -299,7 +301,7 @@ public void testMetricsForNestedStructFields() throws IOException { assertBounds(7, DoubleType.get(), null, null, metrics); } - @Test + @TestTemplate public void testMetricsModeForNestedStructFields() throws IOException { Map properties = ImmutableMap.of( @@ -310,9 +312,9 @@ public void testMetricsModeForNestedStructFields() throws IOException { MetricsConfig config = MetricsConfig.fromProperties(properties); Metrics metrics = getMetrics(NESTED_SCHEMA, config, buildNestedTestRecord()); - Assert.assertEquals(1L, (long) metrics.recordCount()); - Assert.assertEquals(1, metrics.lowerBounds().size()); - Assert.assertEquals(1, metrics.upperBounds().size()); + assertThat(metrics.recordCount()).isEqualTo(1L); + assertThat(metrics.lowerBounds()).hasSize(1); + assertThat(metrics.upperBounds()).hasSize(1); assertBounds(3, LongType.get(), 100L, 100L, metrics); } @@ -331,7 +333,7 @@ private Record buildNestedTestRecord() { return record; } - @Test + @TestTemplate public void testMetricsForListAndMapElements() throws IOException { StructType structType = StructType.of( @@ -352,7 +354,7 @@ public void testMetricsForListAndMapElements() throws IOException { record.set(1, map); Metrics metrics = getMetrics(schema, record); - Assert.assertEquals(1L, (long) metrics.recordCount()); + assertThat(metrics.recordCount()).isEqualTo(1L); if (fileFormat() != FileFormat.ORC) { assertCounts(1, 1L, 0L, metrics); assertCounts(2, 1L, 0L, metrics); @@ -371,7 +373,7 @@ public void testMetricsForListAndMapElements() throws IOException { assertBounds(7, structType, null, null, metrics); } - 
@Test + @TestTemplate public void testMetricsForNullColumns() throws IOException { Schema schema = new Schema(optional(1, "intCol", IntegerType.get())); Record firstRecord = GenericRecord.create(schema); @@ -380,15 +382,15 @@ public void testMetricsForNullColumns() throws IOException { secondRecord.setField("intCol", null); Metrics metrics = getMetrics(schema, firstRecord, secondRecord); - Assert.assertEquals(2L, (long) metrics.recordCount()); + assertThat(metrics.recordCount()).isEqualTo(2L); assertCounts(1, 2L, 2L, metrics); assertBounds(1, IntegerType.get(), null, null, metrics); } - @Test + @TestTemplate public void testMetricsForNaNColumns() throws IOException { Metrics metrics = getMetrics(FLOAT_DOUBLE_ONLY_SCHEMA, NAN_ONLY_RECORD, NAN_ONLY_RECORD); - Assert.assertEquals(2L, (long) metrics.recordCount()); + assertThat(metrics.recordCount()).isEqualTo(2L); assertCounts(1, 2L, 0L, 2L, metrics); assertCounts(2, 2L, 0L, 2L, metrics); @@ -396,7 +398,7 @@ public void testMetricsForNaNColumns() throws IOException { assertBounds(2, DoubleType.get(), null, null, metrics); } - @Test + @TestTemplate public void testColumnBoundsWithNaNValueAtFront() throws IOException { Metrics metrics = getMetrics( @@ -404,7 +406,7 @@ public void testColumnBoundsWithNaNValueAtFront() throws IOException { NAN_ONLY_RECORD, FLOAT_DOUBLE_RECORD_1, FLOAT_DOUBLE_RECORD_2); - Assert.assertEquals(3L, (long) metrics.recordCount()); + assertThat(metrics.recordCount()).isEqualTo(3L); assertCounts(1, 3L, 0L, 1L, metrics); assertCounts(2, 3L, 0L, 1L, metrics); @@ -412,7 +414,7 @@ public void testColumnBoundsWithNaNValueAtFront() throws IOException { assertBounds(2, DoubleType.get(), 3.4D, 7.8D, metrics); } - @Test + @TestTemplate public void testColumnBoundsWithNaNValueInMiddle() throws IOException { Metrics metrics = getMetrics( @@ -420,7 +422,7 @@ public void testColumnBoundsWithNaNValueInMiddle() throws IOException { FLOAT_DOUBLE_RECORD_1, NAN_ONLY_RECORD, FLOAT_DOUBLE_RECORD_2); - 
Assert.assertEquals(3L, (long) metrics.recordCount()); + assertThat(metrics.recordCount()).isEqualTo(3L); assertCounts(1, 3L, 0L, 1L, metrics); assertCounts(2, 3L, 0L, 1L, metrics); @@ -428,7 +430,7 @@ public void testColumnBoundsWithNaNValueInMiddle() throws IOException { assertBounds(2, DoubleType.get(), 3.4D, 7.8D, metrics); } - @Test + @TestTemplate public void testColumnBoundsWithNaNValueAtEnd() throws IOException { Metrics metrics = getMetrics( @@ -436,7 +438,7 @@ public void testColumnBoundsWithNaNValueAtEnd() throws IOException { FLOAT_DOUBLE_RECORD_1, FLOAT_DOUBLE_RECORD_2, NAN_ONLY_RECORD); - Assert.assertEquals(3L, (long) metrics.recordCount()); + assertThat(metrics.recordCount()).isEqualTo(3L); assertCounts(1, 3L, 0L, 1L, metrics); assertCounts(2, 3L, 0L, 1L, metrics); @@ -444,10 +446,11 @@ public void testColumnBoundsWithNaNValueAtEnd() throws IOException { assertBounds(2, DoubleType.get(), 3.4D, 7.8D, metrics); } - @Test + @TestTemplate public void testMetricsForTopLevelWithMultipleRowGroup() throws Exception { - Assume.assumeTrue( - "Skip test for formats that do not support small row groups", supportsSmallRowGroups()); + assumeThat(supportsSmallRowGroups()) + .as("Skip test for formats that do not support small row groups") + .isTrue(); int recordCount = 201; List records = Lists.newArrayListWithExpectedSize(recordCount); @@ -479,11 +482,11 @@ public void testMetricsForTopLevelWithMultipleRowGroup() throws Exception { SIMPLE_SCHEMA, outputFile, records.toArray(new Record[0])); InputFile recordsFile = outputFile.toInputFile(); - Assert.assertNotNull(recordsFile); + assertThat(recordsFile).isNotNull(); // rowgroup size should be > 1 - Assert.assertEquals(3, splitCount(recordsFile)); + assertThat(splitCount(recordsFile)).isEqualTo(3); - Assert.assertEquals(201L, (long) metrics.recordCount()); + assertThat(metrics.recordCount()).isEqualTo(201L); assertCounts(1, 201L, 0L, metrics); assertBounds(1, Types.BooleanType.get(), false, true, metrics); 
assertBounds(2, Types.IntegerType.get(), 1, 201, metrics); @@ -498,10 +501,11 @@ public void testMetricsForTopLevelWithMultipleRowGroup() throws Exception { 6, Types.DecimalType.of(10, 2), new BigDecimal("2.00"), new BigDecimal("201.00"), metrics); } - @Test + @TestTemplate public void testMetricsForNestedStructFieldsWithMultipleRowGroup() throws IOException { - Assume.assumeTrue( - "Skip test for formats that do not support small row groups", supportsSmallRowGroups()); + assumeThat(supportsSmallRowGroups()) + .as("Skip test for formats that do not support small row groups") + .isTrue(); int recordCount = 201; List records = Lists.newArrayListWithExpectedSize(recordCount); @@ -527,11 +531,11 @@ public void testMetricsForNestedStructFieldsWithMultipleRowGroup() throws IOExce NESTED_SCHEMA, outputFile, records.toArray(new Record[0])); InputFile recordsFile = outputFile.toInputFile(); - Assert.assertNotNull(recordsFile); + assertThat(recordsFile).isNotNull(); // rowgroup size should be > 1 - Assert.assertEquals(3, splitCount(recordsFile)); + assertThat(splitCount(recordsFile)).isEqualTo(3); - Assert.assertEquals(201L, (long) metrics.recordCount()); + assertThat(metrics.recordCount()).isEqualTo(201L); assertCounts(1, 201L, 0L, metrics); assertBounds(1, IntegerType.get(), 1, 201, metrics); assertCounts(3, 201L, 0L, metrics); @@ -549,15 +553,15 @@ public void testMetricsForNestedStructFieldsWithMultipleRowGroup() throws IOExce assertBounds(7, DoubleType.get(), null, null, metrics); } - @Test + @TestTemplate public void testNoneMetricsMode() throws IOException { Metrics metrics = getMetrics( NESTED_SCHEMA, MetricsConfig.fromProperties(ImmutableMap.of("write.metadata.metrics.default", "none")), buildNestedTestRecord()); - Assert.assertEquals(1L, (long) metrics.recordCount()); - Assert.assertTrue(metrics.columnSizes().values().stream().allMatch(Objects::nonNull)); + assertThat(metrics.recordCount()).isEqualTo(1L); + 
assertThat(metrics.columnSizes()).doesNotContainValue(null); assertCounts(1, null, null, metrics); assertBounds(1, Types.IntegerType.get(), null, null, metrics); assertCounts(3, null, null, metrics); @@ -570,7 +574,7 @@ public void testNoneMetricsMode() throws IOException { assertBounds(7, Types.DoubleType.get(), null, null, metrics); } - @Test + @TestTemplate public void testCountsMetricsMode() throws IOException { Metrics metrics = getMetrics( @@ -578,8 +582,8 @@ public void testCountsMetricsMode() throws IOException { MetricsConfig.fromProperties( ImmutableMap.of("write.metadata.metrics.default", "counts")), buildNestedTestRecord()); - Assert.assertEquals(1L, (long) metrics.recordCount()); - Assert.assertTrue(metrics.columnSizes().values().stream().allMatch(Objects::nonNull)); + assertThat(metrics.recordCount()).isEqualTo(1L); + assertThat(metrics.columnSizes()).doesNotContainValue(null); assertCounts(1, 1L, 0L, metrics); assertBounds(1, Types.IntegerType.get(), null, null, metrics); assertCounts(3, 1L, 0L, metrics); @@ -592,15 +596,15 @@ public void testCountsMetricsMode() throws IOException { assertBounds(7, Types.DoubleType.get(), null, null, metrics); } - @Test + @TestTemplate public void testFullMetricsMode() throws IOException { Metrics metrics = getMetrics( NESTED_SCHEMA, MetricsConfig.fromProperties(ImmutableMap.of("write.metadata.metrics.default", "full")), buildNestedTestRecord()); - Assert.assertEquals(1L, (long) metrics.recordCount()); - Assert.assertTrue(metrics.columnSizes().values().stream().allMatch(Objects::nonNull)); + assertThat(metrics.recordCount()).isEqualTo(1L); + assertThat(metrics.columnSizes()).doesNotContainValue(null); assertCounts(1, 1L, 0L, metrics); assertBounds(1, Types.IntegerType.get(), Integer.MAX_VALUE, Integer.MAX_VALUE, metrics); assertCounts(3, 1L, 0L, metrics); @@ -618,7 +622,7 @@ public void testFullMetricsMode() throws IOException { assertBounds(7, Types.DoubleType.get(), null, null, metrics); } - @Test + @TestTemplate 
public void testTruncateStringMetricsMode() throws IOException { String colName = "str_to_truncate"; Schema singleStringColSchema = new Schema(required(1, colName, Types.StringType.get())); @@ -636,13 +640,13 @@ public void testTruncateStringMetricsMode() throws IOException { CharBuffer expectedMinBound = CharBuffer.wrap("Lorem ipsu"); CharBuffer expectedMaxBound = CharBuffer.wrap("Lorem ipsv"); - Assert.assertEquals(1L, (long) metrics.recordCount()); - Assert.assertTrue(metrics.columnSizes().values().stream().allMatch(Objects::nonNull)); + assertThat(metrics.recordCount()).isEqualTo(1L); + assertThat(metrics.columnSizes()).doesNotContainValue(null); assertCounts(1, 1L, 0L, metrics); assertBounds(1, Types.StringType.get(), expectedMinBound, expectedMaxBound, metrics); } - @Test + @TestTemplate public void testTruncateBinaryMetricsMode() throws IOException { String colName = "bin_to_truncate"; Schema singleBinaryColSchema = new Schema(required(1, colName, Types.BinaryType.get())); @@ -660,16 +664,16 @@ public void testTruncateBinaryMetricsMode() throws IOException { ByteBuffer expectedMinBounds = ByteBuffer.wrap(new byte[] {0x1, 0x2, 0x3, 0x4, 0x5}); ByteBuffer expectedMaxBounds = ByteBuffer.wrap(new byte[] {0x1, 0x2, 0x3, 0x4, 0x6}); - Assert.assertEquals(1L, (long) metrics.recordCount()); - Assert.assertTrue(metrics.columnSizes().values().stream().allMatch(Objects::nonNull)); + assertThat(metrics.recordCount()).isEqualTo(1L); + assertThat(metrics.columnSizes()).doesNotContainValue(null); assertCounts(1, 1L, 0L, metrics); assertBounds(1, Types.BinaryType.get(), expectedMinBounds, expectedMaxBounds, metrics); } - @Test + @TestTemplate public void testSortedColumnMetrics() throws IOException { - File tableDir = temp.newFolder(); - tableDir.delete(); // created by table create + File tableDir = Files.createTempDirectory(temp, "junit").toFile(); + assertThat(tableDir.delete()).isTrue(); // created by table create SortOrder sortOrder = 
SortOrder.builderFor(SIMPLE_SCHEMA) @@ -719,7 +723,7 @@ public void testSortedColumnMetrics() throws IOException { Metrics metrics = getMetrics(SIMPLE_SCHEMA, MetricsConfig.forTable(table), firstRecord, secondRecord); - Assert.assertEquals(2L, (long) metrics.recordCount()); + assertThat(metrics.recordCount()).isEqualTo(2L); assertBounds(1, BooleanType.get(), false, true, metrics); assertBounds(2, IntegerType.get(), Integer.MIN_VALUE, Integer.MAX_VALUE, metrics); assertBounds(3, LongType.get(), Long.MIN_VALUE, Long.MAX_VALUE, metrics); @@ -729,10 +733,10 @@ public void testSortedColumnMetrics() throws IOException { assertBounds(8, DateType.get(), 1500, 3000, metrics); } - @Test + @TestTemplate public void testMetricsForSortedNestedStructFields() throws IOException { - File tableDir = temp.newFolder(); - tableDir.delete(); // created by table create + File tableDir = Files.createTempDirectory(temp, "junit").toFile(); + assertThat(tableDir.delete()).isTrue(); // created by table create SortOrder sortOrder = SortOrder.builderFor(NESTED_SCHEMA) @@ -766,12 +770,9 @@ protected void assertCounts(int fieldId, Long valueCount, Long nullValueCount, M protected void assertCounts( int fieldId, Long valueCount, Long nullValueCount, Long nanValueCount, Metrics metrics) { - Map valueCounts = metrics.valueCounts(); - Map nullValueCounts = metrics.nullValueCounts(); - Map nanValueCounts = metrics.nanValueCounts(); - Assert.assertEquals(valueCount, valueCounts.get(fieldId)); - Assert.assertEquals(nullValueCount, nullValueCounts.get(fieldId)); - Assert.assertEquals(nanValueCount, nanValueCounts.get(fieldId)); + assertThat(metrics.valueCounts().get(fieldId)).isEqualTo(valueCount); + assertThat(metrics.nullValueCounts().get(fieldId)).isEqualTo(nullValueCount); + assertThat(metrics.nanValueCounts().get(fieldId)).isEqualTo(nanValueCount); } protected void assertBounds( @@ -779,11 +780,16 @@ protected void assertBounds( Map lowerBounds = metrics.lowerBounds(); Map upperBounds = 
metrics.upperBounds(); - Assert.assertEquals( - lowerBound, - lowerBounds.containsKey(fieldId) ? fromByteBuffer(type, lowerBounds.get(fieldId)) : null); - Assert.assertEquals( - upperBound, - upperBounds.containsKey(fieldId) ? fromByteBuffer(type, upperBounds.get(fieldId)) : null); + if (lowerBounds.containsKey(fieldId)) { + assertThat((Object) fromByteBuffer(type, lowerBounds.get(fieldId))).isEqualTo(lowerBound); + } else { + assertThat(lowerBound).isNull(); + } + + if (upperBounds.containsKey(fieldId)) { + assertThat((Object) fromByteBuffer(type, upperBounds.get(fieldId))).isEqualTo(upperBound); + } else { + assertThat(upperBound).isNull(); + } } } diff --git a/core/src/test/java/org/apache/iceberg/TestWapWorkflow.java b/core/src/test/java/org/apache/iceberg/TestWapWorkflow.java index c65d469e3262..58b83f43f399 100644 --- a/core/src/test/java/org/apache/iceberg/TestWapWorkflow.java +++ b/core/src/test/java/org/apache/iceberg/TestWapWorkflow.java @@ -18,35 +18,32 @@ */ package org.apache.iceberg; +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; + +import java.util.Arrays; +import java.util.List; import org.apache.iceberg.exceptions.CherrypickAncestorCommitException; import org.apache.iceberg.exceptions.DuplicateWAPCommitException; import org.apache.iceberg.exceptions.ValidationException; -import org.apache.iceberg.relocated.com.google.common.collect.Iterables; import org.apache.iceberg.relocated.com.google.common.collect.Streams; -import org.assertj.core.api.Assertions; -import org.junit.Assert; -import org.junit.Before; -import org.junit.Test; -import org.junit.runner.RunWith; -import org.junit.runners.Parameterized; - -@RunWith(Parameterized.class) -public class TestWapWorkflow extends TableTestBase { - @Parameterized.Parameters(name = "formatVersion = {0}") - public static Object[] parameters() { - return new Object[] {1, 2}; - } - - public TestWapWorkflow(int formatVersion) { - 
super(formatVersion); +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.TestTemplate; +import org.junit.jupiter.api.extension.ExtendWith; + +@ExtendWith(ParameterizedTestExtension.class) +public class TestWapWorkflow extends TestBase { + @Parameters(name = "formatVersion = {0}") + protected static List parameters() { + return Arrays.asList(1, 2); } - @Before + @BeforeEach public void setupTableProperties() { table.updateProperties().set(TableProperties.WRITE_AUDIT_PUBLISH_ENABLED, "true").commit(); } - @Test + @TestTemplate public void testCherryPickOverwrite() { table.newAppend().appendFile(FILE_A).commit(); @@ -68,7 +65,7 @@ public void testCherryPickOverwrite() { validateTableFiles(table, FILE_B); } - @Test + @TestTemplate public void testCherryPickOverwriteFailsIfCurrentHasChanged() { table.newAppend().appendFile(FILE_A).commit(); @@ -87,8 +84,7 @@ public void testCherryPickOverwriteFailsIfCurrentHasChanged() { .get(); // try to cherry-pick, which should fail because the overwrite's parent is no longer current - Assertions.assertThatThrownBy( - () -> table.manageSnapshots().cherrypick(overwrite.snapshotId()).commit()) + assertThatThrownBy(() -> table.manageSnapshots().cherrypick(overwrite.snapshotId()).commit()) .isInstanceOf(ValidationException.class) .hasMessage( "Cannot cherry-pick snapshot 2: not append, dynamic overwrite, or fast-forward"); @@ -97,7 +93,7 @@ public void testCherryPickOverwriteFailsIfCurrentHasChanged() { validateTableFiles(table, FILE_A, FILE_C); } - @Test + @TestTemplate public void testCurrentSnapshotOperation() { table.newAppend().appendFile(FILE_A).commit(); @@ -109,38 +105,27 @@ public void testCurrentSnapshotOperation() { Snapshot wapSnapshot = base.snapshots().get(1); - Assert.assertEquals("Metadata should have both snapshots", 2, base.snapshots().size()); - Assert.assertEquals( - "Snapshot should have wap id in summary", "123456789", wapSnapshot.summary().get("wap.id")); - Assert.assertEquals( - "Current 
snapshot should be first commit's snapshot", - firstSnapshotId, - base.currentSnapshot().snapshotId()); - Assert.assertEquals( - "Snapshot log should indicate number of snapshots committed", 1, base.snapshotLog().size()); + assertThat(base.snapshots()).hasSize(2); + assertThat(wapSnapshot.summary()).containsEntry("wap.id", "123456789"); + assertThat(base.currentSnapshot().snapshotId()).isEqualTo(firstSnapshotId); + assertThat(base.snapshotLog()) + .as("Snapshot log should indicate number of snapshots committed") + .hasSize(1); // do setCurrentSnapshot table.manageSnapshots().setCurrentSnapshot(wapSnapshot.snapshotId()).commit(); base = readMetadata(); - Assert.assertEquals( - "Current snapshot should be what we rolled back to", - wapSnapshot.snapshotId(), - base.currentSnapshot().snapshotId()); - Assert.assertEquals("Metadata should have both snapshots", 2, base.snapshots().size()); - Assert.assertEquals( - "Should contain manifests for both files", - 2, - base.currentSnapshot().allManifests(table.io()).size()); - Assert.assertEquals( - "Should contain append from last commit", - 1, - Iterables.size(base.currentSnapshot().addedDataFiles(table.io()))); - Assert.assertEquals( - "Snapshot log should indicate number of snapshots committed", 2, base.snapshotLog().size()); + assertThat(base.currentSnapshot().snapshotId()).isEqualTo(wapSnapshot.snapshotId()); + assertThat(base.snapshots()).hasSize(2); + assertThat(base.currentSnapshot().allManifests(table.io())).hasSize(2); + assertThat(base.currentSnapshot().addedDataFiles(table.io())).hasSize(1); + assertThat(base.snapshotLog()) + .as("Snapshot log should indicate number of snapshots committed") + .hasSize(2); } - @Test + @TestTemplate public void testSetCurrentSnapshotNoWAP() { table.newAppend().appendFile(FILE_A).commit(); @@ -154,24 +139,16 @@ public void testSetCurrentSnapshotNoWAP() { table.manageSnapshots().setCurrentSnapshot(firstSnapshotId).commit(); base = readMetadata(); - Assert.assertEquals( - "Current 
snapshot should be what we rolled back to", - firstSnapshotId, - base.currentSnapshot().snapshotId()); - Assert.assertEquals("Metadata should have both snapshots", 2, base.snapshots().size()); - Assert.assertEquals( - "Should contain manifests for both files", - 1, - base.currentSnapshot().allManifests(table.io()).size()); - Assert.assertEquals( - "Should contain append from last commit", - 1, - Iterables.size(base.currentSnapshot().addedDataFiles(table.io()))); - Assert.assertEquals( - "Snapshot log should indicate number of snapshots committed", 3, base.snapshotLog().size()); + assertThat(base.currentSnapshot().snapshotId()).isEqualTo(firstSnapshotId); + assertThat(base.snapshots()).hasSize(2); + assertThat(base.currentSnapshot().allManifests(table.io())).hasSize(1); + assertThat(base.currentSnapshot().addedDataFiles(table.io())).hasSize(1); + assertThat(base.snapshotLog()) + .as("Snapshot log should indicate number of snapshots committed") + .hasSize(3); } - @Test + @TestTemplate public void testRollbackOnInvalidNonAncestor() { table.newAppend().appendFile(FILE_A).commit(); @@ -183,42 +160,31 @@ public void testRollbackOnInvalidNonAncestor() { Snapshot wapSnapshot = base.snapshots().get(1); - Assert.assertEquals("Metadata should have both snapshots", 2, base.snapshots().size()); - Assert.assertEquals( - "Snapshot should have wap id in summary", "123456789", wapSnapshot.summary().get("wap.id")); - Assert.assertEquals( - "Current snapshot should be first commit's snapshot", - firstSnapshotId, - base.currentSnapshot().snapshotId()); - Assert.assertEquals( - "Snapshot log should indicate number of snapshots committed", 1, base.snapshotLog().size()); + assertThat(base.snapshots()).hasSize(2); + assertThat(wapSnapshot.summary()).containsEntry("wap.id", "123456789"); + assertThat(base.currentSnapshot().snapshotId()).isEqualTo(firstSnapshotId); + assertThat(base.snapshotLog()) + .as("Snapshot log should indicate number of snapshots committed") + .hasSize(1); // do 
rollback - Assertions.assertThatThrownBy( + assertThatThrownBy( // rollback to snapshot that is not an ancestor () -> table.manageSnapshots().rollbackTo(wapSnapshot.snapshotId()).commit()) .isInstanceOf(ValidationException.class) .hasMessage("Cannot roll back to snapshot, not an ancestor of the current state: 2"); base = readMetadata(); - Assert.assertEquals( - "Current snapshot should be what we rolled back to", - firstSnapshotId, - base.currentSnapshot().snapshotId()); - Assert.assertEquals("Metadata should have both snapshots", 2, base.snapshots().size()); - Assert.assertEquals( - "Should contain manifests for one snapshot", - 1, - base.currentSnapshot().allManifests(table.io()).size()); - Assert.assertEquals( - "Should contain append from last commit", - 1, - Iterables.size(base.currentSnapshot().addedDataFiles(table.io()))); - Assert.assertEquals( - "Snapshot log should indicate number of snapshots committed", 1, base.snapshotLog().size()); + assertThat(base.currentSnapshot().snapshotId()).isEqualTo(firstSnapshotId); + assertThat(base.snapshots()).hasSize(2); + assertThat(base.currentSnapshot().allManifests(table.io())).hasSize(1); + assertThat(base.currentSnapshot().addedDataFiles(table.io())).hasSize(1); + assertThat(base.snapshotLog()) + .as("Snapshot log should indicate number of snapshots committed") + .hasSize(1); } - @Test + @TestTemplate public void testRollbackAndCherrypick() { // first snapshot table.newAppend().appendFile(FILE_A).commit(); @@ -239,26 +205,22 @@ public void testRollbackAndCherrypick() { // rollback to first snapshot table.manageSnapshots().rollbackTo(firstSnapshotId).commit(); base = readMetadata(); - Assert.assertEquals( - "Should be at first snapshot", firstSnapshotId, base.currentSnapshot().snapshotId()); - Assert.assertEquals( - "Should have all three snapshots in the system", 3, base.snapshots().size()); + assertThat(base.currentSnapshot().snapshotId()).isEqualTo(firstSnapshotId); + assertThat(base.snapshots()).hasSize(3); // 
fast forward to third snapshot table.manageSnapshots().cherrypick(thirdSnapshot.snapshotId()).commit(); base = readMetadata(); - Assert.assertEquals( - "Current state should be at third snapshot", 4, base.currentSnapshot().snapshotId()); + assertThat(base.currentSnapshot().snapshotId()).isEqualTo(4); // fast forward to 2nd snapshot table.manageSnapshots().cherrypick(secondSnapshot.snapshotId()).commit(); base = readMetadata(); - Assert.assertEquals( - "Current state should be at second snapshot", 5, base.currentSnapshot().snapshotId()); - Assert.assertEquals("Count all snapshots", 5, base.snapshots().size()); + assertThat(base.currentSnapshot().snapshotId()).isEqualTo(5); + assertThat(base.snapshots()).hasSize(5); } - @Test + @TestTemplate public void testRollbackToTime() { // first snapshot @@ -279,13 +241,11 @@ public void testRollbackToTime() { table.manageSnapshots().rollbackToTime(secondSnapshot.timestampMillis()).commit(); base = readMetadata(); - Assert.assertEquals( - "Should be at first snapshot", firstSnapshotId, base.currentSnapshot().snapshotId()); - Assert.assertEquals( - "Should have all three snapshots in the system", 3, base.snapshots().size()); + assertThat(base.currentSnapshot().snapshotId()).isEqualTo(firstSnapshotId); + assertThat(base.snapshots()).hasSize(3); } - @Test + @TestTemplate public void testWithCherryPicking() { table.newAppend().appendFile(FILE_A).commit(); @@ -299,15 +259,12 @@ public void testWithCherryPicking() { // pick the snapshot that's staged but not committed Snapshot wapSnapshot = base.snapshots().get(1); - Assert.assertEquals("Should have both snapshots", 2, base.snapshots().size()); - Assert.assertEquals( - "Should have first wap id in summary", "123456789", wapSnapshot.summary().get("wap.id")); - Assert.assertEquals( - "Current snapshot should be first commit's snapshot", - firstSnapshotId, - base.currentSnapshot().snapshotId()); - Assert.assertEquals( - "Snapshot log should indicate number of snapshots committed", 1, 
base.snapshotLog().size()); + assertThat(base.snapshots()).hasSize(2); + assertThat(wapSnapshot.summary()).containsEntry("wap.id", "123456789"); + assertThat(base.currentSnapshot().snapshotId()).isEqualTo(firstSnapshotId); + assertThat(base.snapshotLog()) + .as("Snapshot log should indicate number of snapshots committed") + .hasSize(1); // cherry-pick snapshot table.manageSnapshots().cherrypick(wapSnapshot.snapshotId()).commit(); @@ -315,24 +272,16 @@ public void testWithCherryPicking() { // check if the effective current snapshot is set to the new snapshot created // as a result of the cherry-pick operation - Assert.assertEquals( - "Current snapshot should be fast-forwarded to wap snapshot", - wapSnapshot.snapshotId(), - base.currentSnapshot().snapshotId()); - Assert.assertEquals("Should have two snapshots", 2, base.snapshots().size()); - Assert.assertEquals( - "Should contain manifests for both files", - 2, - base.currentSnapshot().allManifests(table.io()).size()); - Assert.assertEquals( - "Should contain append from last commit", - 1, - Iterables.size(base.currentSnapshot().addedDataFiles(table.io()))); - Assert.assertEquals( - "Snapshot log should indicate number of snapshots committed", 2, base.snapshotLog().size()); + assertThat(base.currentSnapshot().snapshotId()).isEqualTo(wapSnapshot.snapshotId()); + assertThat(base.snapshots()).hasSize(2); + assertThat(base.currentSnapshot().allManifests(table.io())).hasSize(2); + assertThat(base.currentSnapshot().addedDataFiles(table.io())).hasSize(1); + assertThat(base.snapshotLog()) + .as("Snapshot log should indicate number of snapshots committed") + .hasSize(2); } - @Test + @TestTemplate public void testWithTwoPhaseCherryPicking() { table.newAppend().appendFile(FILE_A).commit(); @@ -352,25 +301,15 @@ public void testWithTwoPhaseCherryPicking() { Snapshot wap1Snapshot = base.snapshots().get(1); Snapshot wap2Snapshot = base.snapshots().get(2); - Assert.assertEquals("Should have three snapshots", 3, 
base.snapshots().size()); - Assert.assertEquals( - "Should have first wap id in summary", "123456789", wap1Snapshot.summary().get("wap.id")); - Assert.assertEquals( - "Should have second wap id in summary", "987654321", wap2Snapshot.summary().get("wap.id")); - Assert.assertEquals( - "Current snapshot should be first commit's snapshot", - firstSnapshotId, - base.currentSnapshot().snapshotId()); - Assert.assertEquals( - "Parent snapshot id should be same for first WAP snapshot", - firstSnapshotId, - wap1Snapshot.parentId().longValue()); - Assert.assertEquals( - "Parent snapshot id should be same for second WAP snapshot", - firstSnapshotId, - wap2Snapshot.parentId().longValue()); - Assert.assertEquals( - "Snapshot log should indicate number of snapshots committed", 1, base.snapshotLog().size()); + assertThat(base.snapshots()).hasSize(3); + assertThat(wap1Snapshot.summary()).containsEntry("wap.id", "123456789"); + assertThat(wap2Snapshot.summary()).containsEntry("wap.id", "987654321"); + assertThat(base.currentSnapshot().snapshotId()).isEqualTo(firstSnapshotId); + assertThat(wap1Snapshot.parentId()).isEqualTo(firstSnapshotId); + assertThat(wap2Snapshot.parentId()).isEqualTo(firstSnapshotId); + assertThat(base.snapshotLog()) + .as("Snapshot log should indicate number of snapshots committed") + .hasSize(1); // load current snapshot parentSnapshot = base.currentSnapshot(); @@ -380,24 +319,15 @@ public void testWithTwoPhaseCherryPicking() { // check if the effective current snapshot is set to the new snapshot created // as a result of the cherry-pick operation - Assert.assertEquals( - "Current snapshot should be set to one after wap snapshot", - parentSnapshot.snapshotId() + 1, - base.currentSnapshot().snapshotId()); - Assert.assertEquals( - "Should contain manifests for both files", - 2, - base.currentSnapshot().allManifests(table.io()).size()); - Assert.assertEquals( - "Should contain append from last commit", - 1, - 
Iterables.size(base.currentSnapshot().addedDataFiles(table.io()))); - Assert.assertEquals( - "Parent snapshot id should change to latest snapshot before commit", - parentSnapshot.snapshotId(), - base.currentSnapshot().parentId().longValue()); - Assert.assertEquals( - "Snapshot log should indicate number of snapshots committed", 2, base.snapshotLog().size()); + assertThat(base.currentSnapshot().snapshotId()).isEqualTo(parentSnapshot.snapshotId() + 1); + assertThat(base.currentSnapshot().allManifests(table.io())).hasSize(2); + assertThat(base.currentSnapshot().addedDataFiles(table.io())).hasSize(1); + assertThat(base.currentSnapshot().parentId()) + .as("Parent snapshot id should change to latest snapshot before commit") + .isEqualTo(parentSnapshot.snapshotId()); + assertThat(base.snapshotLog()) + .as("Snapshot log should indicate number of snapshots committed") + .hasSize(2); // load current snapshot parentSnapshot = base.currentSnapshot(); @@ -407,27 +337,19 @@ public void testWithTwoPhaseCherryPicking() { // check if the effective current snapshot is set to the new snapshot created // as a result of the cherry-pick operation - Assert.assertEquals( - "Current snapshot should be set to one after wap snapshot", - parentSnapshot.snapshotId() + 1 /* one fast-forwarded snapshot */ + 1, - base.currentSnapshot().snapshotId()); - Assert.assertEquals( - "Should contain manifests for both files", - 3, - base.currentSnapshot().allManifests(table.io()).size()); - Assert.assertEquals( - "Should contain append from last commit", - 1, - Iterables.size(base.currentSnapshot().addedDataFiles(table.io()))); - Assert.assertEquals( - "Parent snapshot id should change to latest snapshot before commit", - parentSnapshot.snapshotId(), - base.currentSnapshot().parentId().longValue()); - Assert.assertEquals( - "Snapshot log should indicate number of snapshots committed", 3, base.snapshotLog().size()); + assertThat(base.currentSnapshot().snapshotId()) + .isEqualTo(parentSnapshot.snapshotId() 
+ 1 /* one fast-forwarded snapshot */ + 1); + assertThat(base.currentSnapshot().allManifests(table.io())).hasSize(3); + assertThat(base.currentSnapshot().addedDataFiles(table.io())).hasSize(1); + assertThat(base.currentSnapshot().parentId()) + .as("Parent snapshot id should change to latest snapshot before commit") + .isEqualTo(parentSnapshot.snapshotId()); + assertThat(base.snapshotLog()) + .as("Snapshot log should indicate number of snapshots committed") + .hasSize(3); } - @Test + @TestTemplate public void testWithCommitsBetweenCherryPicking() { table.newAppend().appendFile(FILE_A).commit(); TableMetadata base = readMetadata(); @@ -446,25 +368,15 @@ public void testWithCommitsBetweenCherryPicking() { Snapshot wap1Snapshot = base.snapshots().get(1); Snapshot wap2Snapshot = base.snapshots().get(2); - Assert.assertEquals("Should have three snapshots", 3, base.snapshots().size()); - Assert.assertEquals( - "Should have first wap id in summary", "123456789", wap1Snapshot.summary().get("wap.id")); - Assert.assertEquals( - "Should have second wap id in summary", "987654321", wap2Snapshot.summary().get("wap.id")); - Assert.assertEquals( - "Current snapshot should be first commit's snapshot", - firstSnapshotId, - base.currentSnapshot().snapshotId()); - Assert.assertEquals( - "Parent snapshot id should be same for first WAP snapshot", - firstSnapshotId, - wap1Snapshot.parentId().longValue()); - Assert.assertEquals( - "Parent snapshot id should be same for second WAP snapshot", - firstSnapshotId, - wap2Snapshot.parentId().longValue()); - Assert.assertEquals( - "Snapshot log should indicate number of snapshots committed", 1, base.snapshotLog().size()); + assertThat(base.snapshots()).hasSize(3); + assertThat(wap1Snapshot.summary()).containsEntry("wap.id", "123456789"); + assertThat(wap2Snapshot.summary()).containsEntry("wap.id", "987654321"); + assertThat(base.currentSnapshot().snapshotId()).isEqualTo(firstSnapshotId); + 
assertThat(wap1Snapshot.parentId()).isEqualTo(firstSnapshotId); + assertThat(wap2Snapshot.parentId()).isEqualTo(firstSnapshotId); + assertThat(base.snapshotLog()) + .as("Snapshot log should indicate number of snapshots committed") + .hasSize(1); // load current snapshot parentSnapshot = base.currentSnapshot(); @@ -473,17 +385,12 @@ public void testWithCommitsBetweenCherryPicking() { table.newAppend().appendFile(FILE_D).commit(); base = readMetadata(); - Assert.assertEquals("Should have four snapshots", 4, base.snapshots().size()); - Assert.assertEquals( - "Current snapshot should carry over the parent snapshot", - parentSnapshot.snapshotId(), - base.currentSnapshot().parentId().longValue()); - Assert.assertEquals( - "Should contain manifests for two files", - 2, - base.currentSnapshot().allManifests(table.io()).size()); - Assert.assertEquals( - "Snapshot log should indicate number of snapshots committed", 2, base.snapshotLog().size()); + assertThat(base.snapshots()).hasSize(4); + assertThat(base.currentSnapshot().parentId()).isEqualTo(parentSnapshot.snapshotId()); + assertThat(base.currentSnapshot().allManifests(table.io())).hasSize(2); + assertThat(base.snapshotLog()) + .as("Snapshot log should indicate number of snapshots committed") + .hasSize(2); // load current snapshot parentSnapshot = base.currentSnapshot(); @@ -493,25 +400,14 @@ public void testWithCommitsBetweenCherryPicking() { // check if the effective current snapshot is set to the new snapshot created // as a result of the cherry-pick operation - Assert.assertEquals("Should have five snapshots", 5, base.snapshots().size()); - Assert.assertEquals( - "Current snapshot should be set to one after wap snapshot", - parentSnapshot.snapshotId() + 1, - base.currentSnapshot().snapshotId()); - Assert.assertEquals( - "Should contain manifests for three files", - 3, - base.currentSnapshot().allManifests(table.io()).size()); - Assert.assertEquals( - "Should contain append from last commit", - 1, - 
Iterables.size(base.currentSnapshot().addedDataFiles(table.io()))); - Assert.assertEquals( - "Parent snapshot id should point to same snapshot", - parentSnapshot.snapshotId(), - base.currentSnapshot().parentId().longValue()); - Assert.assertEquals( - "Snapshot log should indicate number of snapshots committed", 3, base.snapshotLog().size()); + assertThat(base.snapshots()).hasSize(5); + assertThat(base.currentSnapshot().snapshotId()).isEqualTo(parentSnapshot.snapshotId() + 1); + assertThat(base.currentSnapshot().allManifests(table.io())).hasSize(3); + assertThat(base.currentSnapshot().addedDataFiles(table.io())).hasSize(1); + assertThat(base.currentSnapshot().parentId()).isEqualTo(parentSnapshot.snapshotId()); + assertThat(base.snapshotLog()) + .as("Snapshot log should indicate number of snapshots committed") + .hasSize(3); // load current snapshot parentSnapshot = base.currentSnapshot(); @@ -521,28 +417,17 @@ public void testWithCommitsBetweenCherryPicking() { // check if the effective current snapshot is set to the new snapshot created // as a result of the cherry-pick operation - Assert.assertEquals("Should have all the snapshots", 6, base.snapshots().size()); - Assert.assertEquals( - "Current snapshot should be set to one after wap snapshot", - parentSnapshot.snapshotId() + 1, - base.currentSnapshot().snapshotId()); - Assert.assertEquals( - "Should contain manifests for four files", - 4, - base.currentSnapshot().allManifests(table.io()).size()); - Assert.assertEquals( - "Should contain append from last commit", - 1, - Iterables.size(base.currentSnapshot().addedDataFiles(table.io()))); - Assert.assertEquals( - "Parent snapshot id should point to same snapshot", - parentSnapshot.snapshotId(), - base.currentSnapshot().parentId().longValue()); - Assert.assertEquals( - "Snapshot log should indicate number of snapshots committed", 4, base.snapshotLog().size()); + assertThat(base.snapshots()).hasSize(6); + 
assertThat(base.currentSnapshot().snapshotId()).isEqualTo(parentSnapshot.snapshotId() + 1); + assertThat(base.currentSnapshot().allManifests(table.io())).hasSize(4); + assertThat(base.currentSnapshot().addedDataFiles(table.io())).hasSize(1); + assertThat(base.currentSnapshot().parentId()).isEqualTo(parentSnapshot.snapshotId()); + assertThat(base.snapshotLog()) + .as("Snapshot log should indicate number of snapshots committed") + .hasSize(4); } - @Test + @TestTemplate public void testWithCherryPickingWithCommitRetry() { table.newAppend().appendFile(FILE_A).commit(); @@ -559,19 +444,13 @@ public void testWithCherryPickingWithCommitRetry() { // pick the snapshot that's staged but not committed Snapshot wap1Snapshot = base.snapshots().get(1); - Assert.assertEquals("Should have two snapshots", 2, base.snapshots().size()); - Assert.assertEquals( - "Should have first wap id in summary", "123456789", wap1Snapshot.summary().get("wap.id")); - Assert.assertEquals( - "Current snapshot should be first commit's snapshot", - firstSnapshotId, - base.currentSnapshot().snapshotId()); - Assert.assertEquals( - "Parent snapshot id should be same for first WAP snapshot", - firstSnapshotId, - wap1Snapshot.parentId().longValue()); - Assert.assertEquals( - "Snapshot log should indicate number of snapshots committed", 1, base.snapshotLog().size()); + assertThat(base.snapshots()).hasSize(2); + assertThat(wap1Snapshot.summary()).containsEntry("wap.id", "123456789"); + assertThat(base.currentSnapshot().snapshotId()).isEqualTo(firstSnapshotId); + assertThat(wap1Snapshot.parentId()).isEqualTo(firstSnapshotId); + assertThat(base.snapshotLog()) + .as("Snapshot log should indicate number of snapshots committed") + .hasSize(1); // load current snapshot base = readMetadata(); @@ -583,27 +462,16 @@ public void testWithCherryPickingWithCommitRetry() { // check if the effective current snapshot is set to the new snapshot created // as a result of the cherry-pick operation - Assert.assertEquals( - 
"Current snapshot should be set to one after wap snapshot", - parentSnapshot.snapshotId() + 1, - base.currentSnapshot().snapshotId()); - Assert.assertEquals( - "Should contain manifests for both files", - 2, - base.currentSnapshot().allManifests(table.io()).size()); - Assert.assertEquals( - "Should not contain redundant append due to retry", - 1, - Iterables.size(base.currentSnapshot().addedDataFiles(table.io()))); - Assert.assertEquals( - "Parent snapshot id should change to latest snapshot before commit", - parentSnapshot.snapshotId(), - base.currentSnapshot().parentId().longValue()); - Assert.assertEquals( - "Snapshot log should indicate number of snapshots committed", 2, base.snapshotLog().size()); + assertThat(base.currentSnapshot().snapshotId()).isEqualTo(parentSnapshot.snapshotId() + 1); + assertThat(base.currentSnapshot().allManifests(table.io())).hasSize(2); + assertThat(base.currentSnapshot().addedDataFiles(table.io())).hasSize(1); + assertThat(base.currentSnapshot().parentId()).isEqualTo(parentSnapshot.snapshotId()); + assertThat(base.snapshotLog()) + .as("Snapshot log should indicate number of snapshots committed") + .hasSize(2); } - @Test + @TestTemplate public void testCherrypickingAncestor() { table.newAppend().appendFile(FILE_A).commit(); @@ -617,15 +485,12 @@ public void testCherrypickingAncestor() { // pick the snapshot that's staged but not committed Snapshot wapSnapshot = base.snapshots().get(1); - Assert.assertEquals("Should have both snapshots", 2, base.snapshots().size()); - Assert.assertEquals( - "Should have first wap id in summary", "123456789", wapSnapshot.summary().get("wap.id")); - Assert.assertEquals( - "Current snapshot should be first commit's snapshot", - firstSnapshotId, - base.currentSnapshot().snapshotId()); - Assert.assertEquals( - "Snapshot log should indicate number of snapshots committed", 1, base.snapshotLog().size()); + assertThat(base.snapshots()).hasSize(2); + assertThat(wapSnapshot.summary()).containsEntry("wap.id", 
"123456789"); + assertThat(base.currentSnapshot().snapshotId()).isEqualTo(firstSnapshotId); + assertThat(base.snapshotLog()) + .as("Snapshot log should indicate number of snapshots committed") + .hasSize(1); // cherry-pick snapshot table.manageSnapshots().cherrypick(wapSnapshot.snapshotId()).commit(); @@ -634,30 +499,22 @@ public void testCherrypickingAncestor() { // check if the effective current snapshot is set to the new snapshot created // as a result of the cherry-pick operation - Assert.assertEquals( - "Current snapshot should be fast-forwarded to wap snapshot", - wapSnapshot.snapshotId(), - base.currentSnapshot().snapshotId()); - Assert.assertEquals("Should have two snapshots", 2, base.snapshots().size()); - Assert.assertEquals( - "Should contain manifests for both files", - 2, - base.currentSnapshot().allManifests(table.io()).size()); - Assert.assertEquals( - "Should contain append from last commit", - 1, - Iterables.size(base.currentSnapshot().addedDataFiles(table.io()))); - Assert.assertEquals( - "Snapshot log should indicate number of snapshots committed", 2, base.snapshotLog().size()); - - Assertions.assertThatThrownBy( + assertThat(base.currentSnapshot().snapshotId()).isEqualTo(wapPublishedId); + assertThat(base.snapshots()).hasSize(2); + assertThat(base.currentSnapshot().allManifests(table.io())).hasSize(2); + assertThat(base.currentSnapshot().addedDataFiles(table.io())).hasSize(1); + assertThat(base.snapshotLog()) + .as("Snapshot log should indicate number of snapshots committed") + .hasSize(2); + + assertThatThrownBy( // duplicate cherry-pick snapshot () -> table.manageSnapshots().cherrypick(firstSnapshotId).commit()) .isInstanceOf(CherrypickAncestorCommitException.class) .hasMessage("Cannot cherrypick snapshot 1: already an ancestor"); } - @Test + @TestTemplate public void testDuplicateCherrypick() { table.newAppend().appendFile(FILE_A).commit(); TableMetadata base = readMetadata(); @@ -673,39 +530,26 @@ public void testDuplicateCherrypick() { 
Snapshot wapSnapshot1 = base.snapshots().get(1); Snapshot wapSnapshot2 = base.snapshots().get(2); - Assert.assertEquals("Should have both snapshots", 3, base.snapshots().size()); - Assert.assertEquals( - "Should have wap id in first wap snapshot summary", - "123456789", - wapSnapshot1.summary().get("wap.id")); - Assert.assertEquals( - "Should have wap id in second wap snapshot summary", - "123456789", - wapSnapshot2.summary().get("wap.id")); - Assert.assertEquals( - "Current snapshot should be first commit's snapshot", - firstSnapshotId, - base.currentSnapshot().snapshotId()); - Assert.assertEquals( - "Snapshot log should indicate number of snapshots committed", 1, base.snapshotLog().size()); + assertThat(base.snapshots()).hasSize(3); + assertThat(wapSnapshot1.summary()).containsEntry("wap.id", "123456789"); + assertThat(wapSnapshot2.summary()).containsEntry("wap.id", "123456789"); + assertThat(base.currentSnapshot().snapshotId()).isEqualTo(firstSnapshotId); + assertThat(base.snapshotLog()) + .as("Snapshot log should indicate number of snapshots committed") + .hasSize(1); // cherry-pick snapshot table.manageSnapshots().cherrypick(wapSnapshot1.snapshotId()).commit(); base = readMetadata(); - Assert.assertEquals("Should have three snapshots", 3, base.snapshots().size()); - Assert.assertEquals( - "Should contain manifests for both files", - 2, - base.currentSnapshot().allManifests(table.io()).size()); - Assert.assertEquals( - "Should contain append from last commit", - 1, - Iterables.size(base.currentSnapshot().addedDataFiles(table.io()))); - Assert.assertEquals( - "Snapshot log should indicate number of snapshots committed", 2, base.snapshotLog().size()); - - Assertions.assertThatThrownBy( + assertThat(base.snapshots()).hasSize(3); + assertThat(base.currentSnapshot().allManifests(table.io())).hasSize(2); + assertThat(base.currentSnapshot().addedDataFiles(table.io())).hasSize(1); + assertThat(base.snapshotLog()) + .as("Snapshot log should indicate number of snapshots 
committed") + .hasSize(2); + + assertThatThrownBy( // duplicate cherry-pick snapshot () -> table.manageSnapshots().cherrypick(wapSnapshot2.snapshotId()).commit()) .isInstanceOf(DuplicateWAPCommitException.class) @@ -713,7 +557,7 @@ public void testDuplicateCherrypick() { "Duplicate request to cherry pick wap id that was published already: 123456789"); } - @Test + @TestTemplate public void testNonWapCherrypick() { table.newAppend().appendFile(FILE_A).commit(); TableMetadata base = readMetadata(); @@ -727,41 +571,29 @@ public void testNonWapCherrypick() { base = readMetadata(); long thirdSnapshotId = base.currentSnapshot().snapshotId(); - Assert.assertEquals( - "Should be pointing to third snapshot", - thirdSnapshotId, - table.currentSnapshot().snapshotId()); + assertThat(table.currentSnapshot().snapshotId()).isEqualTo(thirdSnapshotId); // NOOP commit table.manageSnapshots().commit(); - Assert.assertEquals( - "Should still be pointing to third snapshot", - thirdSnapshotId, - table.currentSnapshot().snapshotId()); + assertThat(table.currentSnapshot().snapshotId()).isEqualTo(thirdSnapshotId); // Rollback to second snapshot table.manageSnapshots().rollbackTo(secondSnapshotId).commit(); - Assert.assertEquals( - "Should be pointing to second snapshot", - secondSnapshotId, - table.currentSnapshot().snapshotId()); + assertThat(table.currentSnapshot().snapshotId()).isEqualTo(secondSnapshotId); // Cherrypick down to third table.manageSnapshots().cherrypick(thirdSnapshotId).commit(); - Assert.assertEquals( - "Should be re-using wap snapshot after cherrypick", - 3, - table.currentSnapshot().snapshotId()); + assertThat(table.currentSnapshot().snapshotId()).isEqualTo(3); // try double cherrypicking of the third snapshot - Assertions.assertThatThrownBy( + assertThatThrownBy( // double cherrypicking of second snapshot () -> table.manageSnapshots().cherrypick(thirdSnapshotId).commit()) .isInstanceOf(CherrypickAncestorCommitException.class) .hasMessage("Cannot cherrypick snapshot 3: 
already an ancestor"); // try cherrypicking an ancestor - Assertions.assertThatThrownBy( + assertThatThrownBy( // double cherrypicking of second snapshot () -> table.manageSnapshots().cherrypick(firstSnapshotId).commit()) .isInstanceOf(CherrypickAncestorCommitException.class) diff --git a/core/src/test/java/org/apache/iceberg/actions/TestCommitService.java b/core/src/test/java/org/apache/iceberg/actions/TestCommitService.java index 1aae6483337f..2d2e78a81557 100644 --- a/core/src/test/java/org/apache/iceberg/actions/TestCommitService.java +++ b/core/src/test/java/org/apache/iceberg/actions/TestCommitService.java @@ -19,31 +19,36 @@ package org.apache.iceberg.actions; import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; import static org.mockito.Mockito.doReturn; import static org.mockito.Mockito.spy; +import java.util.Arrays; +import java.util.List; import java.util.Set; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import java.util.concurrent.TimeUnit; import java.util.stream.IntStream; +import org.apache.iceberg.ParameterizedTestExtension; +import org.apache.iceberg.Parameters; import org.apache.iceberg.Table; -import org.apache.iceberg.TableTestBase; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableSet; +import org.apache.iceberg.TestBase; import org.apache.iceberg.relocated.com.google.common.collect.Sets; import org.apache.iceberg.util.Tasks; -import org.assertj.core.api.Assertions; import org.awaitility.Awaitility; -import org.junit.Test; +import org.junit.jupiter.api.TestTemplate; +import org.junit.jupiter.api.extension.ExtendWith; -public class TestCommitService extends TableTestBase { +@ExtendWith(ParameterizedTestExtension.class) +public class TestCommitService extends TestBase { - public TestCommitService() { - super(1); + @Parameters(name = 
"formatVersion = {0}") + protected static List parameters() { + return Arrays.asList(1); } - @Test + @TestTemplate public void testCommittedResultsCorrectly() { CustomCommitService commitService = new CustomCommitService(table, 5, 10000); commitService.start(); @@ -55,10 +60,10 @@ public void testCommittedResultsCorrectly() { Set expected = Sets.newHashSet(IntStream.range(0, 100).iterator()); Set actual = Sets.newHashSet(commitService.results()); - Assertions.assertThat(actual).isEqualTo(expected); + assertThat(actual).isEqualTo(expected); } - @Test + @TestTemplate public void testAbortFileGroupsAfterTimeout() { CustomCommitService commitService = new CustomCommitService(table, 5, 200); commitService.start(); @@ -90,7 +95,7 @@ public void testAbortFileGroupsAfterTimeout() { // simulate timeout on the main thread, which then tries to abort file groups [5-7]. // This tests the race conditions, as the committerService is also trying to commit groups // [5-7]. - Assertions.assertThatThrownBy(commitService::close) + assertThatThrownBy(commitService::close) .isInstanceOf(IllegalArgumentException.class) .hasMessageContaining("Timeout occurred when waiting for commits"); @@ -102,14 +107,12 @@ public void testAbortFileGroupsAfterTimeout() { .untilAsserted(() -> assertThat(commitService.completedRewritesAllCommitted()).isTrue()); if (commitService.aborted.isEmpty()) { // All file groups are committed - Assertions.assertThat(commitService.results()) - .isEqualTo(ImmutableList.of(0, 1, 2, 3, 4, 5, 6, 7)); + assertThat(commitService.results()).containsExactly(0, 1, 2, 3, 4, 5, 6, 7); } else { // File groups [5-7] are aborted - Assertions.assertThat(commitService.results()) - .doesNotContainAnyElementsOf(commitService.aborted); - Assertions.assertThat(commitService.results()).isEqualTo(ImmutableList.of(0, 1, 2, 3, 4)); - Assertions.assertThat(commitService.aborted).isEqualTo(ImmutableSet.of(5, 6, 7)); + 
assertThat(commitService.results()).doesNotContainAnyElementsOf(commitService.aborted); + assertThat(commitService.results()).containsExactly(0, 1, 2, 3, 4); + assertThat(commitService.aborted).containsExactly(5, 6, 7); } } diff --git a/core/src/test/java/org/apache/iceberg/actions/TestSizeBasedRewriter.java b/core/src/test/java/org/apache/iceberg/actions/TestSizeBasedRewriter.java index c33bbc6f6d1e..a9a23d587ac9 100644 --- a/core/src/test/java/org/apache/iceberg/actions/TestSizeBasedRewriter.java +++ b/core/src/test/java/org/apache/iceberg/actions/TestSizeBasedRewriter.java @@ -20,33 +20,31 @@ import static org.assertj.core.api.Assertions.assertThat; +import java.util.Arrays; import java.util.List; import java.util.Map; import java.util.Set; import org.apache.iceberg.DataFile; import org.apache.iceberg.FileScanTask; import org.apache.iceberg.MockFileScanTask; +import org.apache.iceberg.ParameterizedTestExtension; +import org.apache.iceberg.Parameters; import org.apache.iceberg.Table; -import org.apache.iceberg.TableTestBase; +import org.apache.iceberg.TestBase; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.junit.Test; -import org.junit.runner.RunWith; -import org.junit.runners.Parameterized; +import org.junit.jupiter.api.TestTemplate; +import org.junit.jupiter.api.extension.ExtendWith; -@RunWith(Parameterized.class) -public class TestSizeBasedRewriter extends TableTestBase { +@ExtendWith(ParameterizedTestExtension.class) +public class TestSizeBasedRewriter extends TestBase { - @Parameterized.Parameters(name = "formatVersion = {0}") - public static Object[] parameters() { - return new Object[] {1, 2}; + @Parameters(name = "formatVersion = {0}") + protected static List parameters() { + return Arrays.asList(1, 2); } - public TestSizeBasedRewriter(int formatVersion) { - super(formatVersion); - } - - @Test + @TestTemplate public void 
testSplitSizeLowerBound() { SizeBasedDataFileRewriterImpl rewriter = new SizeBasedDataFileRewriterImpl(table); diff --git a/core/src/test/java/org/apache/iceberg/avro/AvroTestHelpers.java b/core/src/test/java/org/apache/iceberg/avro/AvroTestHelpers.java index af35e27f5b22..03108376eb4b 100644 --- a/core/src/test/java/org/apache/iceberg/avro/AvroTestHelpers.java +++ b/core/src/test/java/org/apache/iceberg/avro/AvroTestHelpers.java @@ -19,6 +19,7 @@ package org.apache.iceberg.avro; import static org.apache.iceberg.avro.AvroSchemaUtil.toOption; +import static org.assertj.core.api.Assertions.assertThat; import java.util.Arrays; import java.util.List; @@ -28,7 +29,6 @@ import org.apache.avro.generic.GenericData.Record; import org.apache.iceberg.types.Type; import org.apache.iceberg.types.Types; -import org.assertj.core.api.Assertions; class AvroTestHelpers { @@ -81,7 +81,7 @@ static void assertEquals(Types.StructType struct, Record expected, Record actual static void assertEquals(Types.ListType list, List expected, List actual) { Type elementType = list.elementType(); - Assertions.assertThat(actual).as("List size should match").hasSameSizeAs(expected); + assertThat(actual).as("List size should match").hasSameSizeAs(expected); for (int i = 0; i < expected.size(); i += 1) { Object expectedValue = expected.get(i); @@ -94,7 +94,7 @@ static void assertEquals(Types.ListType list, List expected, List actual) static void assertEquals(Types.MapType map, Map expected, Map actual) { Type valueType = map.valueType(); - Assertions.assertThat(actual).as("Map keys should match").hasSameSizeAs(expected); + assertThat(actual).as("Map keys should match").hasSameSizeAs(expected); for (Object expectedKey : expected.keySet()) { Object expectedValue = expected.get(expectedKey); @@ -123,25 +123,21 @@ private static void assertEquals(Type type, Object expected, Object actual) { case FIXED: case BINARY: case DECIMAL: - Assertions.assertThat(actual) - .as("Primitive value should be equal to 
expected") - .isEqualTo(expected); + assertThat(actual).as("Primitive value should be equal to expected").isEqualTo(expected); break; case STRUCT: - Assertions.assertThat(expected) - .as("Expected should be a Record") - .isInstanceOf(Record.class); - Assertions.assertThat(actual).as("Actual should be a Record").isInstanceOf(Record.class); + assertThat(expected).as("Expected should be a Record").isInstanceOf(Record.class); + assertThat(actual).as("Actual should be a Record").isInstanceOf(Record.class); assertEquals(type.asStructType(), (Record) expected, (Record) actual); break; case LIST: - Assertions.assertThat(expected).as("Expected should be a List").isInstanceOf(List.class); - Assertions.assertThat(actual).as("Actual should be a List").isInstanceOf(List.class); + assertThat(expected).as("Expected should be a List").isInstanceOf(List.class); + assertThat(actual).as("Actual should be a List").isInstanceOf(List.class); assertEquals(type.asListType(), (List) expected, (List) actual); break; case MAP: - Assertions.assertThat(expected).as("Expected should be a Map").isInstanceOf(Map.class); - Assertions.assertThat(actual).as("Actual should be a Map").isInstanceOf(Map.class); + assertThat(expected).as("Expected should be a Map").isInstanceOf(Map.class); + assertThat(actual).as("Actual should be a Map").isInstanceOf(Map.class); assertEquals(type.asMapType(), (Map) expected, (Map) actual); break; default: diff --git a/core/src/test/java/org/apache/iceberg/avro/TestNameMappingWithAvroSchema.java b/core/src/test/java/org/apache/iceberg/avro/TestNameMappingWithAvroSchema.java index 9c8ea8453df5..686456b3b16f 100644 --- a/core/src/test/java/org/apache/iceberg/avro/TestNameMappingWithAvroSchema.java +++ b/core/src/test/java/org/apache/iceberg/avro/TestNameMappingWithAvroSchema.java @@ -18,12 +18,13 @@ */ package org.apache.iceberg.avro; +import static org.assertj.core.api.Assertions.assertThat; + import org.apache.avro.Schema; import org.apache.iceberg.mapping.MappedField; 
import org.apache.iceberg.mapping.MappedFields; import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.junit.Assert; -import org.junit.Test; +import org.junit.jupiter.api.Test; public class TestNameMappingWithAvroSchema { @Test @@ -132,9 +133,9 @@ public void testNameMappingWithAvroSchema() { MappedField.of(13, "string"), MappedField.of(14, "int"))))), MappedField.of(21, "timezone"), MappedField.of(22, "bitmap")))); - Assert.assertEquals( - expected, - AvroWithPartnerByStructureVisitor.visit( - icebergSchema.asStruct(), schema, nameMappingWithAvroSchema)); + assertThat( + AvroWithPartnerByStructureVisitor.visit( + icebergSchema.asStruct(), schema, nameMappingWithAvroSchema)) + .isEqualTo(expected); } } diff --git a/core/src/test/java/org/apache/iceberg/encryption/TestGcmStreams.java b/core/src/test/java/org/apache/iceberg/encryption/TestGcmStreams.java index a954cf760baa..af910f9fdd45 100644 --- a/core/src/test/java/org/apache/iceberg/encryption/TestGcmStreams.java +++ b/core/src/test/java/org/apache/iceberg/encryption/TestGcmStreams.java @@ -18,10 +18,14 @@ */ package org.apache.iceberg.encryption; +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; + import java.io.File; import java.io.IOException; import java.nio.ByteBuffer; import java.nio.channels.FileChannel; +import java.nio.file.Path; import java.nio.file.StandardOpenOption; import java.util.Arrays; import java.util.Random; @@ -29,15 +33,12 @@ import org.apache.iceberg.Files; import org.apache.iceberg.io.PositionOutputStream; import org.apache.iceberg.io.SeekableInputStream; -import org.assertj.core.api.Assertions; -import org.junit.Assert; -import org.junit.Rule; -import org.junit.Test; -import org.junit.rules.TemporaryFolder; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; public class TestGcmStreams { - @Rule public TemporaryFolder temp = new TemporaryFolder(); + @TempDir 
private Path temp; @Test public void testEmptyFile() throws IOException { @@ -48,7 +49,7 @@ public void testEmptyFile() throws IOException { random.nextBytes(aadPrefix); byte[] readBytes = new byte[1]; - File testFile = temp.newFile(); + File testFile = File.createTempFile("test", null, temp.toFile()); AesGcmOutputFile encryptedFile = new AesGcmOutputFile(Files.localOutput(testFile), key, aadPrefix); @@ -56,20 +57,20 @@ public void testEmptyFile() throws IOException { encryptedStream.close(); AesGcmInputFile decryptedFile = new AesGcmInputFile(Files.localInput(testFile), key, aadPrefix); - Assert.assertEquals("File size", 0, decryptedFile.getLength()); + assertThat(decryptedFile.getLength()).isEqualTo(0); try (SeekableInputStream decryptedStream = decryptedFile.newStream()) { - Assert.assertEquals("Read empty stream", -1, decryptedStream.read(readBytes)); + assertThat(decryptedStream.read(readBytes)).as("Read empty stream").isEqualTo(-1); } // check that the AAD is still verified, even for an empty file byte[] badAAD = Arrays.copyOf(aadPrefix, aadPrefix.length); badAAD[1] -= 1; // modify the AAD slightly AesGcmInputFile badAADFile = new AesGcmInputFile(Files.localInput(testFile), key, badAAD); - Assert.assertEquals("File size", 0, badAADFile.getLength()); + assertThat(badAADFile.getLength()).isEqualTo(0); try (SeekableInputStream decryptedStream = badAADFile.newStream()) { - Assertions.assertThatThrownBy(() -> decryptedStream.read(readBytes)) + assertThatThrownBy(() -> decryptedStream.read(readBytes)) .isInstanceOf(RuntimeException.class) .hasCauseInstanceOf(AEADBadTagException.class) .hasMessageContaining("GCM tag check failed"); @@ -86,7 +87,7 @@ public void testAADValidation() throws IOException { byte[] content = new byte[Ciphers.PLAIN_BLOCK_SIZE / 2]; // half a block random.nextBytes(content); - File testFile = temp.newFile(); + File testFile = File.createTempFile("test", null, temp.toFile()); AesGcmOutputFile encryptedFile = new 
AesGcmOutputFile(Files.localOutput(testFile), key, aadPrefix); @@ -96,27 +97,24 @@ public void testAADValidation() throws IOException { // verify the data can be read correctly with the right AAD AesGcmInputFile decryptedFile = new AesGcmInputFile(Files.localInput(testFile), key, aadPrefix); - Assert.assertEquals("File size", content.length, decryptedFile.getLength()); + assertThat(decryptedFile.getLength()).isEqualTo(content.length); try (SeekableInputStream decryptedStream = decryptedFile.newStream()) { byte[] readContent = new byte[Ciphers.PLAIN_BLOCK_SIZE]; int bytesRead = decryptedStream.read(readContent); - Assert.assertEquals("Bytes read should match bytes written", content.length, bytesRead); - Assert.assertEquals( - "Content should match", - ByteBuffer.wrap(content), - ByteBuffer.wrap(readContent, 0, bytesRead)); + assertThat(bytesRead).as("Bytes read should match bytes written").isEqualTo(content.length); + assertThat(ByteBuffer.wrap(readContent, 0, bytesRead)).isEqualTo(ByteBuffer.wrap(content)); } // test with the wrong AAD byte[] badAAD = Arrays.copyOf(aadPrefix, aadPrefix.length); badAAD[1] -= 1; // modify the AAD slightly AesGcmInputFile badAADFile = new AesGcmInputFile(Files.localInput(testFile), key, badAAD); - Assert.assertEquals("File size", content.length, badAADFile.getLength()); + assertThat(badAADFile.getLength()).isEqualTo(content.length); try (SeekableInputStream decryptedStream = badAADFile.newStream()) { byte[] readContent = new byte[Ciphers.PLAIN_BLOCK_SIZE]; - Assertions.assertThatThrownBy(() -> decryptedStream.read(readContent)) + assertThatThrownBy(() -> decryptedStream.read(readContent)) .isInstanceOf(RuntimeException.class) .hasCauseInstanceOf(AEADBadTagException.class) .hasMessageContaining("GCM tag check failed"); @@ -132,7 +130,7 @@ public void testAADValidation() throws IOException { // read with the correct AAD and verify the tag check fails try (SeekableInputStream decryptedStream = decryptedFile.newStream()) { byte[] 
readContent = new byte[Ciphers.PLAIN_BLOCK_SIZE]; - Assertions.assertThatThrownBy(() -> decryptedStream.read(readContent)) + assertThatThrownBy(() -> decryptedStream.read(readContent)) .isInstanceOf(RuntimeException.class) .hasCauseInstanceOf(AEADBadTagException.class) .hasMessageContaining("GCM tag check failed"); @@ -149,7 +147,7 @@ public void testCorruptNonce() throws IOException { byte[] content = new byte[Ciphers.PLAIN_BLOCK_SIZE / 2]; // half a block random.nextBytes(content); - File testFile = temp.newFile(); + File testFile = File.createTempFile("test", null, temp.toFile()); AesGcmOutputFile encryptedFile = new AesGcmOutputFile(Files.localOutput(testFile), key, aadPrefix); @@ -159,16 +157,13 @@ public void testCorruptNonce() throws IOException { // verify the data can be read correctly with the right AAD AesGcmInputFile decryptedFile = new AesGcmInputFile(Files.localInput(testFile), key, aadPrefix); - Assert.assertEquals("File size", content.length, decryptedFile.getLength()); + assertThat(decryptedFile.getLength()).isEqualTo(content.length); try (SeekableInputStream decryptedStream = decryptedFile.newStream()) { byte[] readContent = new byte[Ciphers.PLAIN_BLOCK_SIZE]; int bytesRead = decryptedStream.read(readContent); - Assert.assertEquals("Bytes read should match bytes written", content.length, bytesRead); - Assert.assertEquals( - "Content should match", - ByteBuffer.wrap(content), - ByteBuffer.wrap(readContent, 0, bytesRead)); + assertThat(bytesRead).as("Bytes read should match bytes written").isEqualTo(content.length); + assertThat(ByteBuffer.wrap(readContent, 0, bytesRead)).isEqualTo(ByteBuffer.wrap(content)); } // replace the first block's nonce @@ -181,7 +176,7 @@ public void testCorruptNonce() throws IOException { // read with the correct AAD and verify the read fails try (SeekableInputStream decryptedStream = decryptedFile.newStream()) { byte[] readContent = new byte[Ciphers.PLAIN_BLOCK_SIZE]; - Assertions.assertThatThrownBy(() -> 
decryptedStream.read(readContent)) + assertThatThrownBy(() -> decryptedStream.read(readContent)) .isInstanceOf(RuntimeException.class) .hasCauseInstanceOf(AEADBadTagException.class) .hasMessageContaining("GCM tag check failed"); @@ -198,7 +193,7 @@ public void testCorruptCiphertext() throws IOException { byte[] content = new byte[Ciphers.PLAIN_BLOCK_SIZE / 2]; // half a block random.nextBytes(content); - File testFile = temp.newFile(); + File testFile = File.createTempFile("test", null, temp.toFile()); AesGcmOutputFile encryptedFile = new AesGcmOutputFile(Files.localOutput(testFile), key, aadPrefix); @@ -208,16 +203,13 @@ public void testCorruptCiphertext() throws IOException { // verify the data can be read correctly with the right AAD AesGcmInputFile decryptedFile = new AesGcmInputFile(Files.localInput(testFile), key, aadPrefix); - Assert.assertEquals("File size", content.length, decryptedFile.getLength()); + assertThat(decryptedFile.getLength()).isEqualTo(content.length); try (SeekableInputStream decryptedStream = decryptedFile.newStream()) { byte[] readContent = new byte[Ciphers.PLAIN_BLOCK_SIZE]; int bytesRead = decryptedStream.read(readContent); - Assert.assertEquals("Bytes read should match bytes written", content.length, bytesRead); - Assert.assertEquals( - "Content should match", - ByteBuffer.wrap(content), - ByteBuffer.wrap(readContent, 0, bytesRead)); + assertThat(bytesRead).as("Bytes read should match bytes written").isEqualTo(content.length); + assertThat(ByteBuffer.wrap(readContent, 0, bytesRead)).isEqualTo(ByteBuffer.wrap(content)); } // replace part of the first block's content @@ -230,7 +222,7 @@ public void testCorruptCiphertext() throws IOException { // read with the correct AAD and verify the read fails try (SeekableInputStream decryptedStream = decryptedFile.newStream()) { byte[] readContent = new byte[Ciphers.PLAIN_BLOCK_SIZE]; - Assertions.assertThatThrownBy(() -> decryptedStream.read(readContent)) + assertThatThrownBy(() -> 
decryptedStream.read(readContent)) .isInstanceOf(RuntimeException.class) .hasCauseInstanceOf(AEADBadTagException.class) .hasMessageContaining("GCM tag check failed"); @@ -260,7 +252,7 @@ public void testRandomWriteRead() throws IOException { byte[] key = new byte[keyLength]; random.nextBytes(key); random.nextBytes(aadPrefix); - File testFile = temp.newFile(); + File testFile = File.createTempFile("test", null, temp.toFile()); AesGcmOutputFile encryptedFile = new AesGcmOutputFile(Files.localOutput(testFile), key, aadPrefix); @@ -277,17 +269,19 @@ public void testRandomWriteRead() throws IOException { } encryptedStream.write(testFileContents, offset, chunkLen); offset += chunkLen; - Assert.assertEquals("Position", offset, encryptedStream.getPos()); + assertThat(encryptedStream.getPos()).isEqualTo(offset); left -= chunkLen; } encryptedStream.close(); - Assert.assertEquals("Final position in closed stream", offset, encryptedStream.getPos()); + assertThat(encryptedStream.getPos()) + .as("Final position in closed stream") + .isEqualTo(offset); AesGcmInputFile decryptedFile = new AesGcmInputFile(Files.localInput(testFile), key, aadPrefix); SeekableInputStream decryptedStream = decryptedFile.newStream(); - Assert.assertEquals("File size", testFileSize, decryptedFile.getLength()); + assertThat(decryptedFile.getLength()).isEqualTo(testFileSize); byte[] chunk = new byte[testFileSize]; @@ -303,26 +297,26 @@ public void testRandomWriteRead() throws IOException { decryptedStream.seek(pos); int len = decryptedStream.read(chunk, 0, chunkLen); - Assert.assertEquals("Read length", len, chunkLen); + assertThat(chunkLen).isEqualTo(len); long pos2 = decryptedStream.getPos(); - Assert.assertEquals("Position", pos + len, pos2); + assertThat(pos2).isEqualTo(pos + len); ByteBuffer bb1 = ByteBuffer.wrap(chunk, 0, chunkLen); ByteBuffer bb2 = ByteBuffer.wrap(testFileContents, pos, chunkLen); - Assert.assertEquals("Read contents", bb1, bb2); + assertThat(bb2).isEqualTo(bb1); // Test skip long 
toSkip = random.nextInt(testFileSize); long skipped = decryptedStream.skip(toSkip); if (pos2 + toSkip < testFileSize) { - Assert.assertEquals("Skipped", toSkip, skipped); + assertThat(skipped).isEqualTo(toSkip); } else { - Assert.assertEquals("Skipped", (testFileSize - pos2), skipped); + assertThat(skipped).isEqualTo(testFileSize - pos2); } int pos3 = (int) decryptedStream.getPos(); - Assert.assertEquals("Position", pos2 + skipped, pos3); + assertThat(pos3).isEqualTo(pos2 + skipped); chunkLen = random.nextInt(testFileSize); left = testFileSize - pos3; @@ -334,7 +328,7 @@ public void testRandomWriteRead() throws IOException { decryptedStream.read(chunk, 0, chunkLen); bb1 = ByteBuffer.wrap(chunk, 0, chunkLen); bb2 = ByteBuffer.wrap(testFileContents, pos3, chunkLen); - Assert.assertEquals("Read contents", bb1, bb2); + assertThat(bb2).isEqualTo(bb1); } decryptedStream.close(); @@ -357,7 +351,7 @@ public void testAlignedWriteRead() throws IOException { byte[] aadPrefix = new byte[16]; random.nextBytes(aadPrefix); - File testFile = temp.newFile(); + File testFile = File.createTempFile("test", null, temp.toFile()); AesGcmOutputFile encryptedFile = new AesGcmOutputFile(Files.localOutput(testFile), key, aadPrefix); PositionOutputStream encryptedStream = encryptedFile.createOrOverwrite(); @@ -374,17 +368,17 @@ public void testAlignedWriteRead() throws IOException { encryptedStream.write(testFileContents, offset, chunkLen); offset += chunkLen; - Assert.assertEquals("Position", offset, encryptedStream.getPos()); + assertThat(encryptedStream.getPos()).isEqualTo(offset); left -= chunkLen; } encryptedStream.close(); - Assert.assertEquals("Final position in closed stream", offset, encryptedStream.getPos()); + assertThat(encryptedStream.getPos()).as("Final position in closed stream").isEqualTo(offset); AesGcmInputFile decryptedFile = new AesGcmInputFile(Files.localInput(testFile), key, aadPrefix); SeekableInputStream decryptedStream = decryptedFile.newStream(); - 
Assert.assertEquals("File size", testFileSize, decryptedFile.getLength()); + assertThat(decryptedFile.getLength()).isEqualTo(testFileSize); offset = 0; chunkLen = Ciphers.PLAIN_BLOCK_SIZE; @@ -399,12 +393,12 @@ public void testAlignedWriteRead() throws IOException { decryptedStream.seek(offset); int len = decryptedStream.read(chunk, 0, chunkLen); - Assert.assertEquals("Read length", len, chunkLen); - Assert.assertEquals("Position", offset + len, decryptedStream.getPos()); + assertThat(chunkLen).isEqualTo(len); + assertThat(decryptedStream.getPos()).isEqualTo(offset + len); ByteBuffer bb1 = ByteBuffer.wrap(chunk, 0, chunkLen); ByteBuffer bb2 = ByteBuffer.wrap(testFileContents, offset, chunkLen); - Assert.assertEquals("Read contents", bb1, bb2); + assertThat(bb2).isEqualTo(bb1); offset += len; left = testFileSize - offset; diff --git a/core/src/test/java/org/apache/iceberg/encryption/TestStandardKeyMetadataParser.java b/core/src/test/java/org/apache/iceberg/encryption/TestStandardKeyMetadataParser.java index 889506cb93e4..d73f52da4661 100644 --- a/core/src/test/java/org/apache/iceberg/encryption/TestStandardKeyMetadataParser.java +++ b/core/src/test/java/org/apache/iceberg/encryption/TestStandardKeyMetadataParser.java @@ -18,11 +18,12 @@ */ package org.apache.iceberg.encryption; +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; + import java.nio.ByteBuffer; import java.nio.charset.StandardCharsets; -import org.assertj.core.api.Assertions; -import org.junit.Assert; -import org.junit.Test; +import org.junit.jupiter.api.Test; public class TestStandardKeyMetadataParser { @@ -35,14 +36,14 @@ public void testParser() { ByteBuffer serialized = metadata.buffer(); StandardKeyMetadata parsedMetadata = StandardKeyMetadata.parse(serialized); - Assert.assertEquals(parsedMetadata.encryptionKey(), encryptionKey); - Assert.assertEquals(parsedMetadata.aadPrefix(), aadPrefix); + 
assertThat(encryptionKey).isEqualTo(parsedMetadata.encryptionKey()); + assertThat(aadPrefix).isEqualTo(parsedMetadata.aadPrefix()); } @Test public void testUnsupportedVersion() { ByteBuffer badBuffer = ByteBuffer.wrap(new byte[] {0x02}); - Assertions.assertThatThrownBy(() -> StandardKeyMetadata.parse(badBuffer)) + assertThatThrownBy(() -> StandardKeyMetadata.parse(badBuffer)) .isInstanceOf(UnsupportedOperationException.class) .hasMessage("Cannot resolve schema for version: 2"); } diff --git a/core/src/test/java/org/apache/iceberg/io/TestOutputFileFactory.java b/core/src/test/java/org/apache/iceberg/io/TestOutputFileFactory.java index f7c81ae879c9..ceffeb3749da 100644 --- a/core/src/test/java/org/apache/iceberg/io/TestOutputFileFactory.java +++ b/core/src/test/java/org/apache/iceberg/io/TestOutputFileFactory.java @@ -18,36 +18,35 @@ */ package org.apache.iceberg.io; +import static org.assertj.core.api.Assertions.assertThat; + +import java.util.Arrays; +import java.util.List; import org.apache.iceberg.FileFormat; +import org.apache.iceberg.ParameterizedTestExtension; +import org.apache.iceberg.Parameters; import org.apache.iceberg.PartitionKey; import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.TableTestBase; +import org.apache.iceberg.TestBase; import org.apache.iceberg.data.GenericRecord; import org.apache.iceberg.data.Record; import org.apache.iceberg.encryption.EncryptedOutputFile; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.assertj.core.api.Assertions; -import org.junit.Assert; -import org.junit.Test; -import org.junit.runner.RunWith; -import org.junit.runners.Parameterized; +import org.junit.jupiter.api.TestTemplate; +import org.junit.jupiter.api.extension.ExtendWith; -@RunWith(Parameterized.class) -public class TestOutputFileFactory extends TableTestBase { +@ExtendWith(ParameterizedTestExtension.class) +public class TestOutputFileFactory extends TestBase { - @Parameterized.Parameters(name = 
"formatVersion = {0}") - public static Object[] parameters() { - return new Object[] {1, 2}; + @Parameters(name = "formatVersion = {0}") + protected static List parameters() { + return Arrays.asList(1, 2); } private static final int PARTITION_ID = 1; private static final int TASK_ID = 100; - public TestOutputFileFactory(int formatVersion) { - super(formatVersion); - } - - @Test + @TestTemplate public void testOutputFileFactoryWithCustomFormat() { table.updateProperties().defaultFormat(FileFormat.ORC).commit(); @@ -55,11 +54,10 @@ public void testOutputFileFactoryWithCustomFormat() { OutputFileFactory.builderFor(table, PARTITION_ID, TASK_ID).format(FileFormat.AVRO).build(); String location = fileFactory.newOutputFile().encryptingOutputFile().location(); - Assert.assertEquals( - "File format should be correct", FileFormat.AVRO, FileFormat.fromFileName(location)); + assertThat(FileFormat.fromFileName(location)).isEqualTo(FileFormat.AVRO); } - @Test + @TestTemplate public void testOutputFileFactoryWithMultipleSpecs() { OutputFileFactory fileFactory = OutputFileFactory.builderFor(table, PARTITION_ID, TASK_ID).operationId("append").build(); @@ -67,18 +65,17 @@ public void testOutputFileFactoryWithMultipleSpecs() { EncryptedOutputFile unpartitionedFile = fileFactory.newOutputFile(PartitionSpec.unpartitioned(), null); String unpartitionedFileLocation = unpartitionedFile.encryptingOutputFile().location(); - Assert.assertTrue(unpartitionedFileLocation.endsWith("data/00001-100-append-00001.parquet")); + assertThat(unpartitionedFileLocation).endsWith("data/00001-100-append-00001.parquet"); Record record = GenericRecord.create(table.schema()).copy(ImmutableMap.of("data", "aaa")); PartitionKey partitionKey = new PartitionKey(table.spec(), table.schema()); partitionKey.partition(record); EncryptedOutputFile partitionedFile = fileFactory.newOutputFile(table.spec(), partitionKey); String partitionedFileLocation = partitionedFile.encryptingOutputFile().location(); - 
Assert.assertTrue( - partitionedFileLocation.endsWith("data_bucket=7/00001-100-append-00002.parquet")); + assertThat(partitionedFileLocation).endsWith("data_bucket=7/00001-100-append-00002.parquet"); } - @Test + @TestTemplate public void testWithCustomSuffix() { OutputFileFactory fileFactory = OutputFileFactory.builderFor(table, PARTITION_ID, TASK_ID) @@ -89,15 +86,14 @@ public void testWithCustomSuffix() { EncryptedOutputFile unpartitionedFile = fileFactory.newOutputFile(PartitionSpec.unpartitioned(), null); String unpartitionedFileLocation = unpartitionedFile.encryptingOutputFile().location(); - Assertions.assertThat(unpartitionedFileLocation) - .endsWith("data/00001-100-append-00001-suffix.parquet"); + assertThat(unpartitionedFileLocation).endsWith("data/00001-100-append-00001-suffix.parquet"); Record record = GenericRecord.create(table.schema()).copy(ImmutableMap.of("data", "aaa")); PartitionKey partitionKey = new PartitionKey(table.spec(), table.schema()); partitionKey.partition(record); EncryptedOutputFile partitionedFile = fileFactory.newOutputFile(table.spec(), partitionKey); String partitionedFileLocation = partitionedFile.encryptingOutputFile().location(); - Assertions.assertThat(partitionedFileLocation) + assertThat(partitionedFileLocation) .endsWith("data_bucket=7/00001-100-append-00002-suffix.parquet"); } } diff --git a/core/src/test/java/org/apache/iceberg/mapping/TestMappingUpdates.java b/core/src/test/java/org/apache/iceberg/mapping/TestMappingUpdates.java index 6b59095225d8..b9ae9dc273f2 100644 --- a/core/src/test/java/org/apache/iceberg/mapping/TestMappingUpdates.java +++ b/core/src/test/java/org/apache/iceberg/mapping/TestMappingUpdates.java @@ -19,28 +19,27 @@ package org.apache.iceberg.mapping; import static org.apache.iceberg.types.Types.NestedField.required; +import static org.assertj.core.api.Assertions.assertThat; +import java.util.Arrays; +import java.util.List; +import org.apache.iceberg.ParameterizedTestExtension; +import 
org.apache.iceberg.Parameters; import org.apache.iceberg.TableProperties; -import org.apache.iceberg.TableTestBase; +import org.apache.iceberg.TestBase; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; import org.apache.iceberg.types.Types; -import org.junit.Assert; -import org.junit.Test; -import org.junit.runner.RunWith; -import org.junit.runners.Parameterized; - -@RunWith(Parameterized.class) -public class TestMappingUpdates extends TableTestBase { - @Parameterized.Parameters(name = "formatVersion = {0}") - public static Object[] parameters() { - return new Object[] {1, 2}; +import org.junit.jupiter.api.TestTemplate; +import org.junit.jupiter.api.extension.ExtendWith; + +@ExtendWith(ParameterizedTestExtension.class) +public class TestMappingUpdates extends TestBase { + @Parameters(name = "formatVersion = {0}") + protected static List parameters() { + return Arrays.asList(1, 2); } - public TestMappingUpdates(int formatVersion) { - super(formatVersion); - } - - @Test + @TestTemplate public void testAddColumnMappingUpdate() { NameMapping mapping = MappingUtil.create(table.schema()); table @@ -48,22 +47,21 @@ public void testAddColumnMappingUpdate() { .set(TableProperties.DEFAULT_NAME_MAPPING, NameMappingParser.toJson(mapping)) .commit(); - Assert.assertEquals( - MappedFields.of(MappedField.of(1, "id"), MappedField.of(2, "data")), - mapping.asMappedFields()); + assertThat(mapping.asMappedFields()) + .isEqualTo(MappedFields.of(MappedField.of(1, "id"), MappedField.of(2, "data"))); table.updateSchema().addColumn("ts", Types.TimestampType.withZone()).commit(); NameMapping updated = NameMappingParser.fromJson(table.properties().get(TableProperties.DEFAULT_NAME_MAPPING)); - Assert.assertEquals( - MappedFields.of( - MappedField.of(1, "id"), MappedField.of(2, "data"), MappedField.of(3, "ts")), - updated.asMappedFields()); + assertThat(updated.asMappedFields()) + .isEqualTo( + MappedFields.of( + MappedField.of(1, "id"), MappedField.of(2, "data"), 
MappedField.of(3, "ts"))); } - @Test + @TestTemplate public void testAddNestedColumnMappingUpdate() { NameMapping mapping = MappingUtil.create(table.schema()); table @@ -71,9 +69,8 @@ public void testAddNestedColumnMappingUpdate() { .set(TableProperties.DEFAULT_NAME_MAPPING, NameMappingParser.toJson(mapping)) .commit(); - Assert.assertEquals( - MappedFields.of(MappedField.of(1, "id"), MappedField.of(2, "data")), - mapping.asMappedFields()); + assertThat(mapping.asMappedFields()) + .isEqualTo(MappedFields.of(MappedField.of(1, "id"), MappedField.of(2, "data"))); table .updateSchema() @@ -86,32 +83,32 @@ public void testAddNestedColumnMappingUpdate() { NameMapping updated = NameMappingParser.fromJson(table.properties().get(TableProperties.DEFAULT_NAME_MAPPING)); - Assert.assertEquals( - MappedFields.of( - MappedField.of(1, "id"), - MappedField.of(2, "data"), - MappedField.of( - 3, "point", MappedFields.of(MappedField.of(4, "x"), MappedField.of(5, "y")))), - updated.asMappedFields()); + assertThat(updated.asMappedFields()) + .isEqualTo( + MappedFields.of( + MappedField.of(1, "id"), + MappedField.of(2, "data"), + MappedField.of( + 3, "point", MappedFields.of(MappedField.of(4, "x"), MappedField.of(5, "y"))))); table.updateSchema().addColumn("point", "z", Types.DoubleType.get()).commit(); NameMapping pointUpdated = NameMappingParser.fromJson(table.properties().get(TableProperties.DEFAULT_NAME_MAPPING)); - Assert.assertEquals( - MappedFields.of( - MappedField.of(1, "id"), - MappedField.of(2, "data"), - MappedField.of( - 3, - "point", - MappedFields.of( - MappedField.of(4, "x"), MappedField.of(5, "y"), MappedField.of(6, "z")))), - pointUpdated.asMappedFields()); + assertThat(pointUpdated.asMappedFields()) + .isEqualTo( + MappedFields.of( + MappedField.of(1, "id"), + MappedField.of(2, "data"), + MappedField.of( + 3, + "point", + MappedFields.of( + MappedField.of(4, "x"), MappedField.of(5, "y"), MappedField.of(6, "z"))))); } - @Test + @TestTemplate public void 
testRenameMappingUpdate() { NameMapping mapping = MappingUtil.create(table.schema()); table @@ -119,22 +116,21 @@ public void testRenameMappingUpdate() { .set(TableProperties.DEFAULT_NAME_MAPPING, NameMappingParser.toJson(mapping)) .commit(); - Assert.assertEquals( - MappedFields.of(MappedField.of(1, "id"), MappedField.of(2, "data")), - mapping.asMappedFields()); + assertThat(mapping.asMappedFields()) + .isEqualTo(MappedFields.of(MappedField.of(1, "id"), MappedField.of(2, "data"))); table.updateSchema().renameColumn("id", "object_id").commit(); NameMapping updated = NameMappingParser.fromJson(table.properties().get(TableProperties.DEFAULT_NAME_MAPPING)); - Assert.assertEquals( - MappedFields.of( - MappedField.of(1, ImmutableList.of("id", "object_id")), MappedField.of(2, "data")), - updated.asMappedFields()); + assertThat(updated.asMappedFields()) + .isEqualTo( + MappedFields.of( + MappedField.of(1, ImmutableList.of("id", "object_id")), MappedField.of(2, "data"))); } - @Test + @TestTemplate public void testRenameNestedFieldMappingUpdate() { NameMapping mapping = MappingUtil.create(table.schema()); table @@ -153,33 +149,33 @@ public void testRenameNestedFieldMappingUpdate() { NameMapping updated = NameMappingParser.fromJson(table.properties().get(TableProperties.DEFAULT_NAME_MAPPING)); - Assert.assertEquals( - MappedFields.of( - MappedField.of(1, "id"), - MappedField.of(2, "data"), - MappedField.of( - 3, "point", MappedFields.of(MappedField.of(4, "x"), MappedField.of(5, "y")))), - updated.asMappedFields()); + assertThat(updated.asMappedFields()) + .isEqualTo( + MappedFields.of( + MappedField.of(1, "id"), + MappedField.of(2, "data"), + MappedField.of( + 3, "point", MappedFields.of(MappedField.of(4, "x"), MappedField.of(5, "y"))))); table.updateSchema().renameColumn("point.x", "X").renameColumn("point.y", "Y").commit(); NameMapping pointUpdated = NameMappingParser.fromJson(table.properties().get(TableProperties.DEFAULT_NAME_MAPPING)); - Assert.assertEquals( - 
MappedFields.of( - MappedField.of(1, "id"), - MappedField.of(2, "data"), - MappedField.of( - 3, - "point", - MappedFields.of( - MappedField.of(4, ImmutableList.of("x", "X")), - MappedField.of(5, ImmutableList.of("y", "Y"))))), - pointUpdated.asMappedFields()); + assertThat(pointUpdated.asMappedFields()) + .isEqualTo( + MappedFields.of( + MappedField.of(1, "id"), + MappedField.of(2, "data"), + MappedField.of( + 3, + "point", + MappedFields.of( + MappedField.of(4, ImmutableList.of("x", "X")), + MappedField.of(5, ImmutableList.of("y", "Y")))))); } - @Test + @TestTemplate public void testRenameComplexFieldMappingUpdate() { NameMapping mapping = MappingUtil.create(table.schema()); table @@ -198,27 +194,27 @@ public void testRenameComplexFieldMappingUpdate() { NameMapping updated = NameMappingParser.fromJson(table.properties().get(TableProperties.DEFAULT_NAME_MAPPING)); - Assert.assertEquals( - MappedFields.of( - MappedField.of(1, "id"), - MappedField.of(2, "data"), - MappedField.of( - 3, "point", MappedFields.of(MappedField.of(4, "x"), MappedField.of(5, "y")))), - updated.asMappedFields()); + assertThat(updated.asMappedFields()) + .isEqualTo( + MappedFields.of( + MappedField.of(1, "id"), + MappedField.of(2, "data"), + MappedField.of( + 3, "point", MappedFields.of(MappedField.of(4, "x"), MappedField.of(5, "y"))))); table.updateSchema().renameColumn("point", "p2").commit(); NameMapping pointUpdated = NameMappingParser.fromJson(table.properties().get(TableProperties.DEFAULT_NAME_MAPPING)); - Assert.assertEquals( - MappedFields.of( - MappedField.of(1, "id"), - MappedField.of(2, "data"), - MappedField.of( - 3, - ImmutableList.of("point", "p2"), - MappedFields.of(MappedField.of(4, "x"), MappedField.of(5, "y")))), - pointUpdated.asMappedFields()); + assertThat(pointUpdated.asMappedFields()) + .isEqualTo( + MappedFields.of( + MappedField.of(1, "id"), + MappedField.of(2, "data"), + MappedField.of( + 3, + ImmutableList.of("point", "p2"), + MappedFields.of(MappedField.of(4, "x"), 
MappedField.of(5, "y"))))); } } diff --git a/core/src/test/java/org/apache/iceberg/mapping/TestNameMapping.java b/core/src/test/java/org/apache/iceberg/mapping/TestNameMapping.java index bde99b4113ab..d30a93d50d49 100644 --- a/core/src/test/java/org/apache/iceberg/mapping/TestNameMapping.java +++ b/core/src/test/java/org/apache/iceberg/mapping/TestNameMapping.java @@ -19,12 +19,12 @@ package org.apache.iceberg.mapping; import static org.apache.iceberg.types.Types.NestedField.required; +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; import org.apache.iceberg.Schema; import org.apache.iceberg.types.Types; -import org.assertj.core.api.Assertions; -import org.junit.Assert; -import org.junit.Test; +import org.junit.jupiter.api.Test; public class TestNameMapping { @Test @@ -36,7 +36,7 @@ public void testFlatSchemaToMapping() { MappedFields expected = MappedFields.of(MappedField.of(1, "id"), MappedField.of(2, "data")); NameMapping mapping = MappingUtil.create(schema); - Assert.assertEquals(expected, mapping.asMappedFields()); + assertThat(mapping.asMappedFields()).isEqualTo(expected); } @Test @@ -62,7 +62,7 @@ public void testNestedStructSchemaToMapping() { MappedFields.of(MappedField.of(4, "latitude"), MappedField.of(5, "longitude")))); NameMapping mapping = MappingUtil.create(schema); - Assert.assertEquals(expected, mapping.asMappedFields()); + assertThat(mapping.asMappedFields()).isEqualTo(expected); } @Test @@ -84,7 +84,7 @@ public void testMapSchemaToMapping() { 3, "map", MappedFields.of(MappedField.of(4, "key"), MappedField.of(5, "value")))); NameMapping mapping = MappingUtil.create(schema); - Assert.assertEquals(expected, mapping.asMappedFields()); + assertThat(mapping.asMappedFields()).isEqualTo(expected); } @Test @@ -117,7 +117,7 @@ public void testComplexKeyMapSchemaToMapping() { MappedField.of(5, "value")))); NameMapping mapping = MappingUtil.create(schema); - 
Assert.assertEquals(expected, mapping.asMappedFields()); + assertThat(mapping.asMappedFields()).isEqualTo(expected); } @Test @@ -152,7 +152,7 @@ public void testComplexValueMapSchemaToMapping() { MappedFields.of(MappedField.of(6, "x"), MappedField.of(7, "y")))))); NameMapping mapping = MappingUtil.create(schema); - Assert.assertEquals(expected, mapping.asMappedFields()); + assertThat(mapping.asMappedFields()).isEqualTo(expected); } @Test @@ -170,13 +170,13 @@ public void testListSchemaToMapping() { MappedField.of(3, "list", MappedFields.of(MappedField.of(4, "element")))); NameMapping mapping = MappingUtil.create(schema); - Assert.assertEquals(expected, mapping.asMappedFields()); + assertThat(mapping.asMappedFields()).isEqualTo(expected); } @Test public void testFailsDuplicateId() { // the schema can be created because ID indexing is lazy - Assertions.assertThatThrownBy( + assertThatThrownBy( () -> new Schema( required(1, "id", Types.LongType.get()), @@ -187,7 +187,7 @@ public void testFailsDuplicateId() { @Test public void testFailsDuplicateName() { - Assertions.assertThatThrownBy( + assertThatThrownBy( () -> new NameMapping(MappedFields.of(MappedField.of(1, "x"), MappedField.of(2, "x")))) .isInstanceOf(IllegalArgumentException.class) .hasMessage("Multiple entries with same key: x=2 and x=1"); @@ -227,17 +227,17 @@ public void testMappingFindById() { NameMapping mapping = MappingUtil.create(schema); - Assert.assertNull("Should not return a field mapping for a missing ID", mapping.find(100)); - Assert.assertEquals(MappedField.of(2, "data"), mapping.find(2)); - Assert.assertEquals(MappedField.of(6, "x"), mapping.find(6)); - Assert.assertEquals(MappedField.of(9, "element"), mapping.find(9)); - Assert.assertEquals(MappedField.of(11, "latitude"), mapping.find(11)); - Assert.assertEquals( - MappedField.of( - 10, - "location", - MappedFields.of(MappedField.of(11, "latitude"), MappedField.of(12, "longitude"))), - mapping.find(10)); + 
assertThat(mapping.find(100)).as("Should not return a field mapping for a missing ID").isNull(); + assertThat(mapping.find(2)).isEqualTo(MappedField.of(2, "data")); + assertThat(mapping.find(6)).isEqualTo(MappedField.of(6, "x")); + assertThat(mapping.find(9)).isEqualTo(MappedField.of(9, "element")); + assertThat(mapping.find(11)).isEqualTo(MappedField.of(11, "latitude")); + assertThat(mapping.find(10)) + .isEqualTo( + MappedField.of( + 10, + "location", + MappedFields.of(MappedField.of(11, "latitude"), MappedField.of(12, "longitude")))); } @Test @@ -266,20 +266,27 @@ public void testMappingFindByName() { NameMapping mapping = MappingUtil.create(schema); - Assert.assertNull( - "Should not return a field mapping for a nested name", mapping.find("element")); - Assert.assertNull("Should not return a field mapping for a nested name", mapping.find("x")); - Assert.assertNull("Should not return a field mapping for a nested name", mapping.find("key")); - Assert.assertNull("Should not return a field mapping for a nested name", mapping.find("value")); - Assert.assertEquals(MappedField.of(2, "data"), mapping.find("data")); - Assert.assertEquals(MappedField.of(6, "x"), mapping.find("map", "value", "x")); - Assert.assertEquals(MappedField.of(9, "element"), mapping.find("list", "element")); - Assert.assertEquals(MappedField.of(11, "latitude"), mapping.find("location", "latitude")); - Assert.assertEquals( - MappedField.of( - 10, - "location", - MappedFields.of(MappedField.of(11, "latitude"), MappedField.of(12, "longitude"))), - mapping.find("location")); + assertThat(mapping.find("element")) + .as("Should not return a field mapping for a nested name") + .isNull(); + assertThat(mapping.find("x")) + .as("Should not return a field mapping for a nested name") + .isNull(); + assertThat(mapping.find("key")) + .as("Should not return a field mapping for a nested name") + .isNull(); + assertThat(mapping.find("value")) + .as("Should not return a field mapping for a nested name") + 
.isNull(); + assertThat(mapping.find("data")).isEqualTo(MappedField.of(2, "data")); + assertThat(mapping.find("map", "value", "x")).isEqualTo(MappedField.of(6, "x")); + assertThat(mapping.find("list", "element")).isEqualTo(MappedField.of(9, "element")); + assertThat(mapping.find("location", "latitude")).isEqualTo(MappedField.of(11, "latitude")); + assertThat(mapping.find("location")) + .isEqualTo( + MappedField.of( + 10, + "location", + MappedFields.of(MappedField.of(11, "latitude"), MappedField.of(12, "longitude")))); } } diff --git a/data/src/test/java/org/apache/iceberg/orc/TestOrcMetrics.java b/data/src/test/java/org/apache/iceberg/orc/TestOrcMetrics.java index 724970bc09ed..d85a73dbd233 100644 --- a/data/src/test/java/org/apache/iceberg/orc/TestOrcMetrics.java +++ b/data/src/test/java/org/apache/iceberg/orc/TestOrcMetrics.java @@ -18,6 +18,8 @@ */ package org.apache.iceberg.orc; +import static org.assertj.core.api.Assertions.assertThat; + import java.io.File; import java.io.IOException; import java.util.Map; @@ -26,6 +28,7 @@ import org.apache.iceberg.Files; import org.apache.iceberg.Metrics; import org.apache.iceberg.MetricsConfig; +import org.apache.iceberg.ParameterizedTestExtension; import org.apache.iceberg.Schema; import org.apache.iceberg.TestMetrics; import org.apache.iceberg.data.Record; @@ -37,29 +40,18 @@ import org.apache.iceberg.relocated.com.google.common.collect.ImmutableSet; import org.apache.iceberg.relocated.com.google.common.collect.Lists; import org.apache.iceberg.types.Type; -import org.junit.Assert; -import org.junit.runner.RunWith; -import org.junit.runners.Parameterized; +import org.junit.jupiter.api.extension.ExtendWith; /** Test Metrics for ORC. 
*/ -@RunWith(Parameterized.class) +@ExtendWith(ParameterizedTestExtension.class) public class TestOrcMetrics extends TestMetrics { static final ImmutableSet BINARY_TYPES = ImmutableSet.of(Type.TypeID.BINARY, Type.TypeID.FIXED, Type.TypeID.UUID); - @Parameterized.Parameters(name = "formatVersion = {0}") - public static Object[] parameters() { - return new Object[] {1, 2}; - } - - public TestOrcMetrics(int formatVersion) { - super(formatVersion); - } - @Override protected OutputFile createOutputFile() throws IOException { - File tmpFolder = temp.newFolder("orc"); + File tmpFolder = java.nio.file.Files.createTempDirectory(temp, "orc").toFile(); String filename = UUID.randomUUID().toString(); return Files.localOutput(new File(tmpFolder, FileFormat.ORC.addExtension(filename))); } @@ -119,12 +111,8 @@ private boolean isBinaryType(Type type) { protected void assertBounds( int fieldId, Type type, T lowerBound, T upperBound, Metrics metrics) { if (isBinaryType(type)) { - Assert.assertFalse( - "ORC binary field should not have lower bounds.", - metrics.lowerBounds().containsKey(fieldId)); - Assert.assertFalse( - "ORC binary field should not have upper bounds.", - metrics.upperBounds().containsKey(fieldId)); + assertThat(metrics.lowerBounds()).doesNotContainKey(fieldId); + assertThat(metrics.upperBounds()).doesNotContainKey(fieldId); return; } super.assertBounds(fieldId, type, lowerBound, upperBound, metrics); diff --git a/data/src/test/java/org/apache/iceberg/parquet/TestParquetMetrics.java b/data/src/test/java/org/apache/iceberg/parquet/TestParquetMetrics.java index f363e5d979d3..545cff9311d9 100644 --- a/data/src/test/java/org/apache/iceberg/parquet/TestParquetMetrics.java +++ b/data/src/test/java/org/apache/iceberg/parquet/TestParquetMetrics.java @@ -26,6 +26,7 @@ import org.apache.iceberg.Files; import org.apache.iceberg.Metrics; import org.apache.iceberg.MetricsConfig; +import org.apache.iceberg.ParameterizedTestExtension; import org.apache.iceberg.Schema; import 
org.apache.iceberg.TableProperties; import org.apache.iceberg.TestMetrics; @@ -37,24 +38,14 @@ import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; import org.apache.iceberg.relocated.com.google.common.collect.Lists; import org.apache.parquet.hadoop.ParquetFileReader; -import org.junit.runner.RunWith; -import org.junit.runners.Parameterized; +import org.junit.jupiter.api.extension.ExtendWith; /** Test Metrics for Parquet. */ -@RunWith(Parameterized.class) +@ExtendWith(ParameterizedTestExtension.class) public class TestParquetMetrics extends TestMetrics { private static final Map SMALL_ROW_GROUP_CONFIG = ImmutableMap.of(TableProperties.PARQUET_ROW_GROUP_SIZE_BYTES, "1600"); - @Parameterized.Parameters(name = "formatVersion = {0}") - public static Object[] parameters() { - return new Object[] {1, 2}; - } - - public TestParquetMetrics(int formatVersion) { - super(formatVersion); - } - @Override public FileFormat fileFormat() { return FileFormat.PARQUET; @@ -62,7 +53,7 @@ public FileFormat fileFormat() { @Override protected OutputFile createOutputFile() throws IOException { - File tmpFolder = temp.newFolder("parquet"); + File tmpFolder = java.nio.file.Files.createTempDirectory(temp, "parquet").toFile(); String filename = UUID.randomUUID().toString(); return Files.localOutput(new File(tmpFolder, FileFormat.PARQUET.addExtension(filename))); } From 2d76c91d6a27378fe753eb7f912bf3031368e6b0 Mon Sep 17 00:00:00 2001 From: Manu Zhang Date: Thu, 28 Mar 2024 23:17:10 +0800 Subject: [PATCH 25/25] Build: disable link-check for all medium blog posts (#10057) --- site/docs/blogs.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/site/docs/blogs.md b/site/docs/blogs.md index 4e94c9e71b08..e7cda2f997a6 100644 --- a/site/docs/blogs.md +++ b/site/docs/blogs.md @@ -409,6 +409,7 @@ Here is a list of company blogs that talk about Iceberg. 
The blogs are ordered f **Author**: [Jason Hughes](https://www.linkedin.com/in/jasonhhughes/) + ### [Migrating to Apache Iceberg at Adobe Experience Platform](https://medium.com/adobetech/migrating-to-apache-iceberg-at-adobe-experience-platform-40fa80f8b8de) **Date**: Jun 17th, 2021, **Company**: Adobe @@ -439,11 +440,13 @@ Here is a list of company blogs that talk about Iceberg. The blogs are ordered f **Author**: [Susan Hall](https://thenewstack.io/author/susanhall/) + ### [A Short Introduction to Apache Iceberg](https://medium.com/expedia-group-tech/a-short-introduction-to-apache-iceberg-d34f628b6799) **Date**: Jan 26th, 2021, **Company**: Expedia **Author**: [Christine Mathiesen](https://www.linkedin.com/in/christine-mathiesen-676a98159/) + ### [Taking Query Optimizations to the Next Level with Iceberg](https://medium.com/adobetech/taking-query-optimizations-to-the-next-level-with-iceberg-6c968b83cd6f) **Date**: Jan 14th, 2021, **Company**: Adobe @@ -454,6 +457,7 @@ Here is a list of company blogs that talk about Iceberg. The blogs are ordered f **Author**: [Zihan Li](https://www.linkedin.com/in/zihan-li-0a8a15149/), [Sudarshan Vasudevan](https://www.linkedin.com/in/suddu/), [Lei Sun](https://www.linkedin.com/in/lei-s-a93138a0/), [Shirshanka Das](https://www.linkedin.com/in/shirshankadas/) + ### [High Throughput Ingestion with Iceberg](https://medium.com/adobetech/high-throughput-ingestion-with-iceberg-ccf7877a413f) **Date**: Dec 22nd, 2020, **Company**: Adobe @@ -464,6 +468,7 @@ Here is a list of company blogs that talk about Iceberg. The blogs are ordered f **Author**: [Anupom Syam](https://www.linkedin.com/in/anupom/) + ### [Iceberg at Adobe](https://medium.com/adobetech/iceberg-at-adobe-88cf1950e866) **Date**: Dec 3rd, 2020, **Company**: Adobe