From 1bd1d9f27bf8c2c8bae921b443115ce5c59ff708 Mon Sep 17 00:00:00 2001 From: Szehon Ho Date: Tue, 30 Jun 2026 11:09:21 -0700 Subject: [PATCH] Spark: Document broadcast size-map scaling in rewrite table action Follow-up to #15470. Add a comment explaining that the delete-file size map collected to the driver and broadcast to the manifest-rewrite tasks scales with the number of distinct delete files being rewritten, not the table's total file count. Generated-by: Cursor (Claude Opus 4.8) --- .../iceberg/spark/actions/RewriteTablePathSparkAction.java | 3 +++ .../iceberg/spark/actions/RewriteTablePathSparkAction.java | 3 +++ .../iceberg/spark/actions/RewriteTablePathSparkAction.java | 3 +++ 3 files changed, 9 insertions(+) diff --git a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/actions/RewriteTablePathSparkAction.java b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/actions/RewriteTablePathSparkAction.java index 11935e815e76..8aa5fbed1076 100644 --- a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/actions/RewriteTablePathSparkAction.java +++ b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/actions/RewriteTablePathSparkAction.java @@ -327,6 +327,9 @@ private Result rebuildMetadata() { Map rewrittenDeleteFileSizes = rewritePositionDeletes(deleteFilesToRewrite); // rebuild manifest files + // The size map holds one entry per distinct rewritten delete-file location. It is collected to + // the driver and broadcast to the manifest-rewrite tasks, so its footprint scales with the + // number of distinct delete files being rewritten rather than the table's total file count. RewriteContentFileResult rewriteManifestResult = rewriteManifests( deltaSnapshots, diff --git a/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/actions/RewriteTablePathSparkAction.java b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/actions/RewriteTablePathSparkAction.java index 11935e815e76..8aa5fbed1076 100644 --- a/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/actions/RewriteTablePathSparkAction.java +++ b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/actions/RewriteTablePathSparkAction.java @@ -327,6 +327,9 @@ private Result rebuildMetadata() { Map rewrittenDeleteFileSizes = rewritePositionDeletes(deleteFilesToRewrite); // rebuild manifest files + // The size map holds one entry per distinct rewritten delete-file location. It is collected to + // the driver and broadcast to the manifest-rewrite tasks, so its footprint scales with the + // number of distinct delete files being rewritten rather than the table's total file count. RewriteContentFileResult rewriteManifestResult = rewriteManifests( deltaSnapshots, diff --git a/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/actions/RewriteTablePathSparkAction.java b/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/actions/RewriteTablePathSparkAction.java index 11935e815e76..8aa5fbed1076 100644 --- a/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/actions/RewriteTablePathSparkAction.java +++ b/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/actions/RewriteTablePathSparkAction.java @@ -327,6 +327,9 @@ private Result rebuildMetadata() { Map rewrittenDeleteFileSizes = rewritePositionDeletes(deleteFilesToRewrite); // rebuild manifest files + // The size map holds one entry per distinct rewritten delete-file location. It is collected to + // the driver and broadcast to the manifest-rewrite tasks, so its footprint scales with the + // number of distinct delete files being rewritten rather than the table's total file count. RewriteContentFileResult rewriteManifestResult = rewriteManifests( deltaSnapshots,