From b69f02b4e7bafb5585ddd7e95918e69ec9fc726e Mon Sep 17 00:00:00 2001 From: Yi Hu Date: Tue, 12 Mar 2024 10:14:40 -0400 Subject: [PATCH 1/4] Beam 2.55.0 website updates --- CHANGES.md | 13 +- website/www/site/config.toml | 2 +- .../www/site/content/en/blog/beam-2.55.0.md | 181 ++++++++++++++++++ .../site/content/en/get-started/downloads.md | 14 +- 4 files changed, 195 insertions(+), 15 deletions(-) create mode 100644 website/www/site/content/en/blog/beam-2.55.0.md diff --git a/CHANGES.md b/CHANGES.md index 2064d2387ae0..60dab0e2352c 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -86,12 +86,10 @@ * ([#X](https://github.com/apache/beam/issues/X)). -# [2.55.0] - Unreleased +# [2.55.0] - 2024-03-25 ## Highlights -* New highly anticipated feature X added to Python SDK ([#X](https://github.com/apache/beam/issues/X)). -* New highly anticipated feature Y added to Java SDK ([#Y](https://github.com/apache/beam/issues/Y)). * The Python SDK will now include automatically generated wrappers for external Java transforms! ([#29834](https://github.com/apache/beam/pull/29834)) ## I/Os @@ -103,7 +101,6 @@ * Added support for handling bad records to PubSubIO ([#30372](https://github.com/apache/beam/pull/30372)). * Support is not available for handling schema mismatches, and enabling error handling for writing to pubsub topics with schemas is not recommended * `--enableBundling` pipeline option for BigQueryIO DIRECT_READ is replaced by `--enableStorageReadApiV2`. Both were considered experimental and may subject to change (Java) ([#26354](https://github.com/apache/beam/issues/26354)). -* Support for X source added (Java/Python) ([#X](https://github.com/apache/beam/issues/X)). ## New Features / Improvements @@ -115,26 +112,20 @@ ## Breaking Changes -* X behavior was changed ([#X](https://github.com/apache/beam/issues/X)). * Arrow version was bumped to 15.0.0 from 5.0.0 ([#30181](https://github.com/apache/beam/pull/30181)). 
* Go SDK users who build custom worker containers may run into issues with the move to distroless containers as a base (see Security Fixes). * The issue stems from distroless containers lacking additional tools, which current custom container processes may rely on. * See https://beam.apache.org/documentation/runtime/environments/#from-scratch-go for instructions on building and using a custom container. * Python SDK has changed the default value for the `--max_cache_memory_usage_mb` pipeline option from 100 to 0. This option was first introduced in 2.52.0 SDK. This change restores the behavior of 2.51.0 SDK, which does not use the state cache. If your pipeline uses iterable side inputs views, consider increasing the cache size by setting the option manually. ([#30360](https://github.com/apache/beam/issues/30360)). -## Deprecations - -* X behavior is deprecated and will be removed in X versions ([#X](https://github.com/apache/beam/issues/X)). - ## Bugfixes -* Fixed X (Java/Python) ([#X](https://github.com/apache/beam/issues/X)). * Fixed SpannerIO.readChangeStream to support propagating credentials from pipeline options to the getDialect calls for authenticating with Spanner (Java) ([#30361](https://github.com/apache/beam/pull/30361)). * Reduced the number of HTTP requests in GCSIO function calls (Python) ([#30205](https://github.com/apache/beam/pull/30205)) ## Security Fixes -* Fixed (CVE-YYYY-NNNN)[https://www.cve.org/CVERecord?id=CVE-YYYY-NNNN] (Java/Python/Go) ([#X](https://github.com/apache/beam/issues/X)). + * Go SDK base container image moved to distroless/base-nossl-debian12, reducing vulnerable container surface to kernel and glibc ([#30011](https://github.com/apache/beam/pull/30011)). 
## Known Issues diff --git a/website/www/site/config.toml b/website/www/site/config.toml index bbd8193915c0..e3e2638825f0 100644 --- a/website/www/site/config.toml +++ b/website/www/site/config.toml @@ -104,7 +104,7 @@ github_project_repo = "https://github.com/apache/beam" [params] description = "Apache Beam is an open source, unified model and set of language-specific SDKs for defining and executing data processing workflows, and also data ingestion and integration flows, supporting Enterprise Integration Patterns (EIPs) and Domain Specific Languages (DSLs). Dataflow pipelines simplify the mechanics of large-scale batch and streaming data processing and can run on a number of runtimes like Apache Flink, Apache Spark, and Google Cloud Dataflow (a cloud service). Beam also brings DSL in different languages, allowing users to easily implement their data integration processes." -release_latest = "2.54.0" +release_latest = "2.55.0" # The repository and branch where the files live in Github or Colab. This is used # to serve and stage from your local branch, but publish to the master branch. # e.g. https://github.com/{{< param branch_repo >}}/path/to/notebook.ipynb diff --git a/website/www/site/content/en/blog/beam-2.55.0.md b/website/www/site/content/en/blog/beam-2.55.0.md new file mode 100644 index 000000000000..8d132b5b4c4f --- /dev/null +++ b/website/www/site/content/en/blog/beam-2.55.0.md @@ -0,0 +1,181 @@ +--- +title: "Apache Beam 2.55.0" +date: 2024-03-25 10:00:00 -0400 +categories: + - blog + - release +authors: + - yhu +--- + + +We are happy to present the new 2.55.0 release of Beam. +This release includes both improvements and new functionality. +See the [download page](/get-started/downloads/#2550-2024-03-25) for this release. + + + +For more information on changes in 2.55.0, check out the [detailed release notes](https://github.com/apache/beam/milestone/19). 
+ +## Highlights + +* The Python SDK will now include automatically generated wrappers for external Java transforms! ([#29834](https://github.com/apache/beam/pull/29834)) + +## I/Os + +* Added support for handling bad records to BigQueryIO ([#30081](https://github.com/apache/beam/pull/30081)). + * Full Support for Storage Read and Write APIs + * Partial Support for File Loads (Failures writing to files supported, failures loading files to BQ unsupported) + * No Support for Extract or Streaming Inserts +* Added support for handling bad records to PubSubIO ([#30372](https://github.com/apache/beam/pull/30372)). + * Support is not available for handling schema mismatches, and enabling error handling for writing to pubsub topics with schemas is not recommended +* `--enableBundling` pipeline option for BigQueryIO DIRECT_READ is replaced by `--enableStorageReadApiV2`. Both were considered experimental and may subject to change (Java) ([#26354](https://github.com/apache/beam/issues/26354)). + +## New Features / Improvements + +* Allow writing clustered and not time partitioned BigQuery tables (Java) ([#30094](https://github.com/apache/beam/pull/30094)). +* Redis cache support added to RequestResponseIO and Enrichment transform (Python) ([#30307](https://github.com/apache/beam/pull/30307)) +* Merged sdks/java/fn-execution and runners/core-construction-java into the main SDK. These artifacts were never meant for users, but noting + that they no longer exist. These are steps to bring portability into the core SDK alongside all other core functionality. +* Added Vertex AI Feature Store handler for Enrichment transform (Python) ([#30388](https://github.com/apache/beam/pull/30388)) +* Python Dataflow users no longer need to manually specify --streaming for pipelines using unbounded sources such as ReadFromPubSub. + +## Breaking Changes + +* Arrow version was bumped to 15.0.0 from 5.0.0 ([#30181](https://github.com/apache/beam/pull/30181)). 
+* Go SDK users who build custom worker containers may run into issues with the move to distroless containers as a base (see Security Fixes). + * The issue stems from distroless containers lacking additional tools, which current custom container processes may rely on. + * See https://beam.apache.org/documentation/runtime/environments/#from-scratch-go for instructions on building and using a custom container. +* Python SDK has changed the default value for the `--max_cache_memory_usage_mb` pipeline option from 100 to 0. This option was first introduced in 2.52.0 SDK. This change restores the behavior of 2.51.0 SDK, which does not use the state cache. If your pipeline uses iterable side inputs views, consider increasing the cache size by setting the option manually. ([#30360](https://github.com/apache/beam/issues/30360)). + +## Deprecations + +* N/A + +## Bugfixes + +* Fixed SpannerIO.readChangeStream to support propagating credentials from pipeline options + to the getDialect calls for authenticating with Spanner (Java) ([#30361](https://github.com/apache/beam/pull/30361)). +* Reduced the number of HTTP requests in GCSIO function calls (Python) ([#30205](https://github.com/apache/beam/pull/30205)) + +## Security Fixes + +* Go SDK base container image moved to distroless/base-nossl-debian12, reducing vulnerable container surface to kernel and glibc ([#30011](https://github.com/apache/beam/pull/30011)). + +## Known Issues + +* In Python pipelines, when shutting down inactive bundle processors, shutdown logic can overaggressively hold the lock, blocking acceptance of new work. Symptoms of this issue include slowness or stuckness in long-running jobs. Fixed in 2.56.0 ([#30679](https://github.com/apache/beam/pull/30679)). + +## List of Contributors + +According to git shortlog, the following people contributed to the 2.55.0 release. Thank you to all contributors! 
+ +Ahmed Abualsaud + +Anand Inguva + +Andrew Crites + +Andrey Devyatkin + +Arun Pandian + +Arvind Ram + +Chamikara Jayalath + +Chris Gray + +Claire McGinty + +Damon Douglas + +Dan Ellis + +Danny McCormick + +Daria Bezkorovaina + +Dima I + +Edward Cui + +Ferran Fernández Garrido + +GStravinsky + +Jan Lukavský + +Jason Mitchell + +JayajP + +Jeff Kinard + +Jeffrey Kinard + +Kenneth Knowles + +Mattie Fu + +Michel Davit + +Oleh Borysevych + +Ritesh Ghorse + +Ritesh Tarway + +Robert Bradshaw + +Robert Burke + +Sam Whittle + +Scott Strong + +Shunping Huang + +Steven van Rossum + +Svetak Sundhar + +Talat UYARER + +Ukjae Jeong (Jay) + +Vitaly Terentyev + +Vlado Djerek + +Yi Hu + +akashorabek + +case-k + +clmccart + +dengwe1 + +dhruvdua + +hardshah + +johnjcasey + +liferoad + +martin trieu + +tvalentyn diff --git a/website/www/site/content/en/get-started/downloads.md b/website/www/site/content/en/get-started/downloads.md index 7b52429d9b54..a598a3e905cd 100644 --- a/website/www/site/content/en/get-started/downloads.md +++ b/website/www/site/content/en/get-started/downloads.md @@ -96,10 +96,18 @@ versions denoted `0.x.y`. ## Releases +### 2.55.0 (2024-03-25) +Official [source code download](https://downloads.apache.org/beam/2.55.0/apache-beam-2.55.0-source-release.zip). +[SHA-512](https://downloads.apache.org/beam/2.55.0/apache-beam-2.55.0-source-release.zip.sha512). +[signature](https://downloads.apache.org/beam/2.55.0/apache-beam-2.55.0-source-release.zip.asc). + +[Release notes](https://github.com/apache/beam/releases/tag/v2.55.0) +[Blog post](/blog/beam-2.55.0). + ### 2.54.0 (2024-02-14) -Official [source code download](https://downloads.apache.org/beam/2.54.0/apache-beam-2.54.0-source-release.zip). -[SHA-512](https://downloads.apache.org/beam/2.54.0/apache-beam-2.54.0-source-release.zip.sha512). -[signature](https://downloads.apache.org/beam/2.54.0/apache-beam-2.54.0-source-release.zip.asc). 
+Official [source code download](https://archive.apache.org/dist/beam/2.54.0/apache-beam-2.54.0-source-release.zip). +[SHA-512](https://archive.apache.org/dist/beam/2.54.0/apache-beam-2.54.0-source-release.zip.sha512). +[signature](https://archive.apache.org/dist/beam/2.54.0/apache-beam-2.54.0-source-release.zip.asc). [Release notes](https://github.com/apache/beam/releases/tag/v2.54.0) [Blog post](/blog/beam-2.54.0). From a52e0d020a5dbd02ea7ca39d75afa70f2de2f143 Mon Sep 17 00:00:00 2001 From: Yi Hu Date: Mon, 25 Mar 2024 10:56:40 -0400 Subject: [PATCH 2/4] Fix wrong item that not in 2.55.0 --- website/www/site/content/en/blog/beam-2.55.0.md | 1 - 1 file changed, 1 deletion(-) diff --git a/website/www/site/content/en/blog/beam-2.55.0.md b/website/www/site/content/en/blog/beam-2.55.0.md index 8d132b5b4c4f..505e9e90b9e4 100644 --- a/website/www/site/content/en/blog/beam-2.55.0.md +++ b/website/www/site/content/en/blog/beam-2.55.0.md @@ -48,7 +48,6 @@ For more information on changes in 2.55.0, check out the [detailed release notes * Merged sdks/java/fn-execution and runners/core-construction-java into the main SDK. These artifacts were never meant for users, but noting that they no longer exist. These are steps to bring portability into the core SDK alongside all other core functionality. * Added Vertex AI Feature Store handler for Enrichment transform (Python) ([#30388](https://github.com/apache/beam/pull/30388)) -* Python Dataflow users no longer need to manually specify --streaming for pipelines using unbounded sources such as ReadFromPubSub. 
## Breaking Changes From 978e34e26877dcd3ae864a34e0afb9b72b4fff7e Mon Sep 17 00:00:00 2001 From: Yi Hu Date: Mon, 25 Mar 2024 13:24:01 -0400 Subject: [PATCH 3/4] Apply suggestions from code review Co-authored-by: Rebecca Szper <98840847+rszper@users.noreply.github.com> --- website/www/site/content/en/blog/beam-2.55.0.md | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/website/www/site/content/en/blog/beam-2.55.0.md b/website/www/site/content/en/blog/beam-2.55.0.md index 505e9e90b9e4..38761aee1f70 100644 --- a/website/www/site/content/en/blog/beam-2.55.0.md +++ b/website/www/site/content/en/blog/beam-2.55.0.md @@ -38,14 +38,14 @@ For more information on changes in 2.55.0, check out the [detailed release notes * Partial Support for File Loads (Failures writing to files supported, failures loading files to BQ unsupported) * No Support for Extract or Streaming Inserts * Added support for handling bad records to PubSubIO ([#30372](https://github.com/apache/beam/pull/30372)). - * Support is not available for handling schema mismatches, and enabling error handling for writing to pubsub topics with schemas is not recommended -* `--enableBundling` pipeline option for BigQueryIO DIRECT_READ is replaced by `--enableStorageReadApiV2`. Both were considered experimental and may subject to change (Java) ([#26354](https://github.com/apache/beam/issues/26354)). + * Support is not available for handling schema mismatches, and enabling error handling for writing to Pub/Sub topics with schemas is not recommended +* `--enableBundling` pipeline option for BigQueryIO DIRECT_READ is replaced by `--enableStorageReadApiV2`. Both were considered experimental and subject to change (Java) ([#26354](https://github.com/apache/beam/issues/26354)). ## New Features / Improvements -* Allow writing clustered and not time partitioned BigQuery tables (Java) ([#30094](https://github.com/apache/beam/pull/30094)). 
+* Allow writing clustered and not time-partitioned BigQuery tables (Java) ([#30094](https://github.com/apache/beam/pull/30094)). * Redis cache support added to RequestResponseIO and Enrichment transform (Python) ([#30307](https://github.com/apache/beam/pull/30307)) -* Merged sdks/java/fn-execution and runners/core-construction-java into the main SDK. These artifacts were never meant for users, but noting +* Merged `sdks/java/fn-execution` and `runners/core-construction-java` into the main SDK. These artifacts were never meant for users, but noting that they no longer exist. These are steps to bring portability into the core SDK alongside all other core functionality. * Added Vertex AI Feature Store handler for Enrichment transform (Python) ([#30388](https://github.com/apache/beam/pull/30388)) @@ -55,16 +55,16 @@ For more information on changes in 2.55.0, check out the [detailed release notes * Go SDK users who build custom worker containers may run into issues with the move to distroless containers as a base (see Security Fixes). * The issue stems from distroless containers lacking additional tools, which current custom container processes may rely on. * See https://beam.apache.org/documentation/runtime/environments/#from-scratch-go for instructions on building and using a custom container. -* Python SDK has changed the default value for the `--max_cache_memory_usage_mb` pipeline option from 100 to 0. This option was first introduced in 2.52.0 SDK. This change restores the behavior of 2.51.0 SDK, which does not use the state cache. If your pipeline uses iterable side inputs views, consider increasing the cache size by setting the option manually. ([#30360](https://github.com/apache/beam/issues/30360)). +* Python SDK has changed the default value for the `--max_cache_memory_usage_mb` pipeline option from 100 to 0. This option was first introduced in the 2.52.0 SDK version. This change restores the behavior of the 2.51.0 SDK, which does not use the state cache. 
If your pipeline uses iterable side inputs views, consider increasing the cache size by setting the option manually. ([#30360](https://github.com/apache/beam/issues/30360)). ## Deprecations * N/A -## Bugfixes +## Bug fixes * Fixed SpannerIO.readChangeStream to support propagating credentials from pipeline options - to the getDialect calls for authenticating with Spanner (Java) ([#30361](https://github.com/apache/beam/pull/30361)). + to the `getDialect` calls for authenticating with Spanner (Java) ([#30361](https://github.com/apache/beam/pull/30361)). * Reduced the number of HTTP requests in GCSIO function calls (Python) ([#30205](https://github.com/apache/beam/pull/30205)) ## Security Fixes From c7dc99ab35818a8cd9738eb86c973eeb136a06a8 Mon Sep 17 00:00:00 2001 From: Yi Hu Date: Mon, 25 Mar 2024 13:24:14 -0400 Subject: [PATCH 4/4] Update website/www/site/content/en/blog/beam-2.55.0.md Co-authored-by: Rebecca Szper <98840847+rszper@users.noreply.github.com> --- website/www/site/content/en/blog/beam-2.55.0.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/www/site/content/en/blog/beam-2.55.0.md b/website/www/site/content/en/blog/beam-2.55.0.md index 38761aee1f70..703e49fbf79b 100644 --- a/website/www/site/content/en/blog/beam-2.55.0.md +++ b/website/www/site/content/en/blog/beam-2.55.0.md @@ -63,7 +63,7 @@ For more information on changes in 2.55.0, check out the [detailed release notes ## Bug fixes -* Fixed SpannerIO.readChangeStream to support propagating credentials from pipeline options +* Fixed `SpannerIO.readChangeStream` to support propagating credentials from pipeline options to the `getDialect` calls for authenticating with Spanner (Java) ([#30361](https://github.com/apache/beam/pull/30361)). * Reduced the number of HTTP requests in GCSIO function calls (Python) ([#30205](https://github.com/apache/beam/pull/30205))