Skip to content

Commit 22fefeb

Browse files
BigQuery: Decouple clustering from time partitioning when writing (#30094)
* Decouple clustering from time partitioning when writing
* Update sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/DynamicDestinationsHelpers.java

  Co-authored-by: Michel Davit <michel@davit.fr>
* Refactor the tests, remove redundant input validations and reuse existing variables
* add to CHANGES.md
* add PR and not issue in CHANGES.md

---------

Co-authored-by: Michel Davit <michel@davit.fr>
1 parent b9fd39c commit 22fefeb

File tree

6 files changed

+62
-50
lines changed

6 files changed

+62
-50
lines changed

CHANGES.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,8 @@
6565

6666
## New Features / Improvements
6767

68-
* X feature added (Java/Python) ([#X](https://github.com/apache/beam/issues/X)).
68+
* [Enrichment Transform](https://s.apache.org/enrichment-transform) along with GCP BigTable handler added to Python SDK ([#30001](https://github.com/apache/beam/pull/30001)).
69+
* Allow writing clustered and not time partitioned BigQuery tables (Java) ([#30094](https://github.com/apache/beam/pull/30094)).
6970

7071
## Breaking Changes
7172

sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryIO.java

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -96,7 +96,7 @@
9696
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryServices.StorageClient;
9797
import org.apache.beam.sdk.io.gcp.bigquery.BigQuerySourceBase.ExtractResult;
9898
import org.apache.beam.sdk.io.gcp.bigquery.DynamicDestinationsHelpers.ConstantSchemaDestinations;
99-
import org.apache.beam.sdk.io.gcp.bigquery.DynamicDestinationsHelpers.ConstantTimePartitioningDestinations;
99+
import org.apache.beam.sdk.io.gcp.bigquery.DynamicDestinationsHelpers.ConstantTimePartitioningClusteringDestinations;
100100
import org.apache.beam.sdk.io.gcp.bigquery.DynamicDestinationsHelpers.SchemaFromViewDestinations;
101101
import org.apache.beam.sdk.io.gcp.bigquery.DynamicDestinationsHelpers.TableFunctionDestinations;
102102
import org.apache.beam.sdk.io.gcp.bigquery.PassThroughThenCleanup.CleanupOperation;
@@ -2744,8 +2744,7 @@ public Write<T> withJsonTimePartitioning(ValueProvider<String> partitioning) {
27442744
}
27452745

27462746
/**
2747-
* Specifies the clustering fields to use when writing to a single output table. Can only be
2748-
* used when {@link#withTimePartitioning(TimePartitioning)} is set. If {@link
2747+
* Specifies the clustering fields to use when writing to a single output table. If {@link
27492748
* #to(SerializableFunction)} or {@link #to(DynamicDestinations)} is used to write to dynamic
27502749
* tables, the fields here will be ignored; call {@link #withClustering()} instead.
27512750
*/
@@ -3357,9 +3356,10 @@ && getStorageApiTriggeringFrequency(bqOptions) != null) {
33573356
}
33583357

33593358
// Wrap with a DynamicDestinations class that will provide the proper TimePartitioning.
3360-
if (getJsonTimePartitioning() != null) {
3359+
if (getJsonTimePartitioning() != null
3360+
|| Optional.ofNullable(getClustering()).map(Clustering::getFields).isPresent()) {
33613361
dynamicDestinations =
3362-
new ConstantTimePartitioningDestinations<>(
3362+
new ConstantTimePartitioningClusteringDestinations<>(
33633363
(DynamicDestinations<T, TableDestination>) dynamicDestinations,
33643364
getJsonTimePartitioning(),
33653365
StaticValueProvider.of(BigQueryHelpers.toJsonString(getClustering())));

sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/CreateTableHelpers.java

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -179,11 +179,13 @@ private static void tryCreateTable(
179179
TimePartitioning timePartitioning = tableDestination.getTimePartitioning();
180180
if (timePartitioning != null) {
181181
table.setTimePartitioning(timePartitioning);
182-
Clustering clustering = tableDestination.getClustering();
183-
if (clustering != null) {
184-
table.setClustering(clustering);
185-
}
186182
}
183+
184+
Clustering clustering = tableDestination.getClustering();
185+
if (clustering != null) {
186+
table.setClustering(clustering);
187+
}
188+
187189
if (kmsKey != null) {
188190
table.setEncryptionConfiguration(new EncryptionConfiguration().setKmsKeyName(kmsKey));
189191
}

sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/DynamicDestinationsHelpers.java

Lines changed: 23 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -272,37 +272,41 @@ public String toString() {
272272
}
273273
}
274274

275-
static class ConstantTimePartitioningDestinations<T>
275+
static class ConstantTimePartitioningClusteringDestinations<T>
276276
extends DelegatingDynamicDestinations<T, TableDestination> {
277277

278-
private final ValueProvider<String> jsonTimePartitioning;
278+
private final @Nullable ValueProvider<String> jsonTimePartitioning;
279279
private final @Nullable ValueProvider<String> jsonClustering;
280280

281-
ConstantTimePartitioningDestinations(
281+
ConstantTimePartitioningClusteringDestinations(
282282
DynamicDestinations<T, TableDestination> inner,
283283
ValueProvider<String> jsonTimePartitioning,
284284
ValueProvider<String> jsonClustering) {
285285
super(inner);
286-
Preconditions.checkArgumentNotNull(
287-
jsonTimePartitioning, "jsonTimePartitioning provider can not be null");
288-
if (jsonTimePartitioning.isAccessible()) {
289-
Preconditions.checkArgumentNotNull(
290-
jsonTimePartitioning.get(), "jsonTimePartitioning can not be null");
291-
}
286+
287+
checkArgument(
288+
(jsonTimePartitioning != null
289+
&& jsonTimePartitioning.isAccessible()
290+
&& jsonTimePartitioning.get() != null)
291+
|| (jsonClustering != null
292+
&& jsonClustering.isAccessible()
293+
&& jsonClustering.get() != null),
294+
"at least one of jsonTimePartitioning or jsonClustering must be non-null, accessible "
295+
+ "and present");
296+
292297
this.jsonTimePartitioning = jsonTimePartitioning;
293298
this.jsonClustering = jsonClustering;
294299
}
295300

296301
@Override
297302
public TableDestination getDestination(@Nullable ValueInSingleWindow<T> element) {
298303
TableDestination destination = super.getDestination(element);
299-
String partitioning = this.jsonTimePartitioning.get();
300-
checkArgument(partitioning != null, "jsonTimePartitioning can not be null");
304+
String partitioning =
305+
Optional.ofNullable(jsonTimePartitioning).map(ValueProvider::get).orElse(null);
306+
String clustering = Optional.ofNullable(jsonClustering).map(ValueProvider::get).orElse(null);
307+
301308
return new TableDestination(
302-
destination.getTableSpec(),
303-
destination.getTableDescription(),
304-
partitioning,
305-
Optional.ofNullable(jsonClustering).map(ValueProvider::get).orElse(null));
309+
destination.getTableSpec(), destination.getTableDescription(), partitioning, clustering);
306310
}
307311

308312
@Override
@@ -316,10 +320,10 @@ public Coder<TableDestination> getDestinationCoder() {
316320

317321
@Override
318322
public String toString() {
319-
MoreObjects.ToStringHelper helper =
320-
MoreObjects.toStringHelper(this)
321-
.add("inner", inner)
322-
.add("jsonTimePartitioning", jsonTimePartitioning);
323+
MoreObjects.ToStringHelper helper = MoreObjects.toStringHelper(this).add("inner", inner);
324+
if (jsonTimePartitioning != null) {
325+
helper.add("jsonTimePartitioning", jsonTimePartitioning);
326+
}
323327
if (jsonClustering != null) {
324328
helper.add("jsonClustering", jsonClustering);
325329
}

sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/UpdateSchemaDestination.java

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -288,11 +288,12 @@ private BigQueryHelpers.PendingJob startZeroLoadJob(
288288
}
289289
if (timePartitioning != null) {
290290
loadConfig.setTimePartitioning(timePartitioning);
291-
// only set clustering if timePartitioning is set
292-
if (clustering != null) {
293-
loadConfig.setClustering(clustering);
294-
}
295291
}
292+
293+
if (clustering != null) {
294+
loadConfig.setClustering(clustering);
295+
}
296+
296297
if (kmsKey != null) {
297298
loadConfig.setDestinationEncryptionConfiguration(
298299
new EncryptionConfiguration().setKmsKeyName(kmsKey));

sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryIOWriteTest.java

Lines changed: 21 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -500,7 +500,7 @@ private void verifySideInputs() {
500500
}
501501
}
502502

503-
void testTimePartitioningClustering(
503+
void testTimePartitioningAndClustering(
504504
BigQueryIO.Write.Method insertMethod, boolean enablePartitioning, boolean enableClustering)
505505
throws Exception {
506506
TableRow row1 = new TableRow().set("date", "2018-01-01").set("number", "1");
@@ -545,16 +545,8 @@ void testTimePartitioningClustering(
545545
}
546546
}
547547

548-
void testTimePartitioning(BigQueryIO.Write.Method insertMethod) throws Exception {
549-
testTimePartitioningClustering(insertMethod, true, false);
550-
}
551-
552-
void testClustering(BigQueryIO.Write.Method insertMethod) throws Exception {
553-
testTimePartitioningClustering(insertMethod, true, true);
554-
}
555-
556-
@Test
557-
public void testTimePartitioning() throws Exception {
548+
void testTimePartitioningAndClusteringWithAllMethods(
549+
Boolean enablePartitioning, Boolean enableClustering) throws Exception {
558550
BigQueryIO.Write.Method method;
559551
if (useStorageApi) {
560552
method =
@@ -564,15 +556,27 @@ public void testTimePartitioning() throws Exception {
564556
} else {
565557
method = Method.FILE_LOADS;
566558
}
567-
testTimePartitioning(method);
559+
testTimePartitioningAndClustering(method, enablePartitioning, enableClustering);
568560
}
569561

570562
@Test
571-
public void testClusteringStorageApi() throws Exception {
572-
if (useStorageApi) {
573-
testClustering(
574-
useStorageApiApproximate ? Method.STORAGE_API_AT_LEAST_ONCE : Method.STORAGE_WRITE_API);
575-
}
563+
public void testTimePartitioningWithoutClustering() throws Exception {
564+
testTimePartitioningAndClusteringWithAllMethods(true, false);
565+
}
566+
567+
@Test
568+
public void testTimePartitioningWithClustering() throws Exception {
569+
testTimePartitioningAndClusteringWithAllMethods(true, true);
570+
}
571+
572+
@Test
573+
public void testClusteringWithoutPartitioning() throws Exception {
574+
testTimePartitioningAndClusteringWithAllMethods(false, true);
575+
}
576+
577+
@Test
578+
public void testNoClusteringNoPartitioning() throws Exception {
579+
testTimePartitioningAndClusteringWithAllMethods(false, false);
576580
}
577581

578582
@Test

0 commit comments

Comments
 (0)