From 32c1e6d89fbfea994713de7b6709a8d4bca49f13 Mon Sep 17 00:00:00 2001 From: Cheng Li Date: Tue, 20 Oct 2020 19:48:50 +0000 Subject: [PATCH 01/36] Support import from HBase snapshot New files for configuring HBaseSnapshotInputFormat resovle version conflict and upgrade Beam version to 2.24.0 revert disk option change, not enough quota Code reorg code reduction Refactor naming Add integration config Add unit test for HBaseSnapshotInputConfiguration Set up skeleton for integration testing Ship test data with code, integration tests pass Clean up code for PR Add HBase commands that generates our test snapshot Addressing review comments 1. revert pom file overrides for SkipITs 2. Store SerializableConfiguration as member variable 3. Rever log4j.properties 4. Disable BIGTABLE_BULK_AUTOFLUSH_MS_KEY to prevent bulk mutation failures failing the jobs. --- .../bigtable-beam-import/pom.xml | 74 ++++++- .../beam/{sequencefiles => }/Main.java | 11 +- .../HBaseSnapshotInputConfiguration.java | 95 +++++++++ .../ImportJobFromHbaseSnapshot.java | 158 ++++++++++++++ .../beam/sequencefiles/CreateTableHelper.java | 2 +- .../HBaseResultToMutationFn.java | 4 +- .../bigtable/beam/sequencefiles/Utils.java | 6 +- .../src/main/resources/log4j.properties | 2 + .../test-snapshot/..snapshotinfo.crc | Bin 0 -> 12 bytes .../test-snapshot/.data.manifest.crc | Bin 0 -> 20 bytes .../test-snapshot/.snapshotinfo | 2 + .../test-snapshot/data.manifest | Bin 0 -> 1090 bytes .../cf/.b0f68aca966b48f1b171614e582b1cbb.crc | Bin 0 -> 52 bytes .../cf/b0f68aca966b48f1b171614e582b1cbb | Bin 0 -> 5264 bytes .../cf/.8aff180e3a244dcc807e4de8b6fce0a7.crc | Bin 0 -> 52 bytes .../cf/8aff180e3a244dcc807e4de8b6fce0a7 | Bin 0 -> 5264 bytes .../cf/.c2945aa8dac34922913a1f60fedb6154.crc | Bin 0 -> 52 bytes .../cf/c2945aa8dac34922913a1f60fedb6154 | Bin 0 -> 5264 bytes .../cf/.cda93ca899f3475fb1c0f8989a8f0d18.crc | Bin 0 -> 52 bytes .../cf/cda93ca899f3475fb1c0f8989a8f0d18 | Bin 0 -> 5264 bytes 
.../cf/.d8b49b374391407ba35d5e0db1c835c9.crc | Bin 0 -> 52 bytes .../cf/d8b49b374391407ba35d5e0db1c835c9 | Bin 0 -> 5299 bytes .../cf/.32053565831341128b8d8f5567d48fdc.crc | Bin 0 -> 52 bytes .../cf/32053565831341128b8d8f5567d48fdc | Bin 0 -> 5264 bytes .../cf/.36798a163ed046b193818e21dd7516b4.crc | Bin 0 -> 52 bytes .../cf/36798a163ed046b193818e21dd7516b4 | Bin 0 -> 5264 bytes .../cf/.65b9c6860f5f4de39d61d1674947b030.crc | Bin 0 -> 52 bytes .../cf/65b9c6860f5f4de39d61d1674947b030 | Bin 0 -> 5264 bytes .../cf/.b83044f76ba6474aa829e3bae7fd82d1.crc | Bin 0 -> 52 bytes .../cf/b83044f76ba6474aa829e3bae7fd82d1 | Bin 0 -> 5264 bytes .../src/test/generate_test_data.txt | 107 ++++++++++ .../beam/hbasesnapshots/EndToEndIT.java | 193 ++++++++++++++++++ .../HBaseSnapshotInputConfigurationTest.java | 51 +++++ pom.xml | 2 +- 34 files changed, 695 insertions(+), 12 deletions(-) rename bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/{sequencefiles => }/Main.java (79%) create mode 100644 bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/HBaseSnapshotInputConfiguration.java create mode 100644 bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/ImportJobFromHbaseSnapshot.java create mode 100644 bigtable-dataflow-parent/bigtable-beam-import/src/test/data/.hbase-snapshot/test-snapshot/..snapshotinfo.crc create mode 100644 bigtable-dataflow-parent/bigtable-beam-import/src/test/data/.hbase-snapshot/test-snapshot/.data.manifest.crc create mode 100644 bigtable-dataflow-parent/bigtable-beam-import/src/test/data/.hbase-snapshot/test-snapshot/.snapshotinfo create mode 100644 bigtable-dataflow-parent/bigtable-beam-import/src/test/data/.hbase-snapshot/test-snapshot/data.manifest create mode 100644 
bigtable-dataflow-parent/bigtable-beam-import/src/test/data/archive/data/default/test/01ef4b8bb8d79f360bf182fedfb1c0e8/cf/.b0f68aca966b48f1b171614e582b1cbb.crc create mode 100644 bigtable-dataflow-parent/bigtable-beam-import/src/test/data/archive/data/default/test/01ef4b8bb8d79f360bf182fedfb1c0e8/cf/b0f68aca966b48f1b171614e582b1cbb create mode 100644 bigtable-dataflow-parent/bigtable-beam-import/src/test/data/archive/data/default/test/1a1358ba82be4a98feff54032986bbf2/cf/.8aff180e3a244dcc807e4de8b6fce0a7.crc create mode 100644 bigtable-dataflow-parent/bigtable-beam-import/src/test/data/archive/data/default/test/1a1358ba82be4a98feff54032986bbf2/cf/8aff180e3a244dcc807e4de8b6fce0a7 create mode 100644 bigtable-dataflow-parent/bigtable-beam-import/src/test/data/archive/data/default/test/1bf20ce0551df953331936d20dbd18fa/cf/.c2945aa8dac34922913a1f60fedb6154.crc create mode 100644 bigtable-dataflow-parent/bigtable-beam-import/src/test/data/archive/data/default/test/1bf20ce0551df953331936d20dbd18fa/cf/c2945aa8dac34922913a1f60fedb6154 create mode 100644 bigtable-dataflow-parent/bigtable-beam-import/src/test/data/archive/data/default/test/2c25a1cedf575cd08267e0013e45872e/cf/.cda93ca899f3475fb1c0f8989a8f0d18.crc create mode 100644 bigtable-dataflow-parent/bigtable-beam-import/src/test/data/archive/data/default/test/2c25a1cedf575cd08267e0013e45872e/cf/cda93ca899f3475fb1c0f8989a8f0d18 create mode 100644 bigtable-dataflow-parent/bigtable-beam-import/src/test/data/archive/data/default/test/3264826a5972b18c5a59b2f612678316/cf/.d8b49b374391407ba35d5e0db1c835c9.crc create mode 100644 bigtable-dataflow-parent/bigtable-beam-import/src/test/data/archive/data/default/test/3264826a5972b18c5a59b2f612678316/cf/d8b49b374391407ba35d5e0db1c835c9 create mode 100644 bigtable-dataflow-parent/bigtable-beam-import/src/test/data/archive/data/default/test/3d397f3b97e7fd2358fb5c93060b3a60/cf/.32053565831341128b8d8f5567d48fdc.crc create mode 100644 
bigtable-dataflow-parent/bigtable-beam-import/src/test/data/archive/data/default/test/3d397f3b97e7fd2358fb5c93060b3a60/cf/32053565831341128b8d8f5567d48fdc create mode 100644 bigtable-dataflow-parent/bigtable-beam-import/src/test/data/archive/data/default/test/7466202f701dc0e3af8cc747c9a37ec8/cf/.36798a163ed046b193818e21dd7516b4.crc create mode 100644 bigtable-dataflow-parent/bigtable-beam-import/src/test/data/archive/data/default/test/7466202f701dc0e3af8cc747c9a37ec8/cf/36798a163ed046b193818e21dd7516b4 create mode 100644 bigtable-dataflow-parent/bigtable-beam-import/src/test/data/archive/data/default/test/958c660f0e406404ffdfc81110e7eaf9/cf/.65b9c6860f5f4de39d61d1674947b030.crc create mode 100644 bigtable-dataflow-parent/bigtable-beam-import/src/test/data/archive/data/default/test/958c660f0e406404ffdfc81110e7eaf9/cf/65b9c6860f5f4de39d61d1674947b030 create mode 100644 bigtable-dataflow-parent/bigtable-beam-import/src/test/data/archive/data/default/test/dab1d611586e861818af77de74073d47/cf/.b83044f76ba6474aa829e3bae7fd82d1.crc create mode 100644 bigtable-dataflow-parent/bigtable-beam-import/src/test/data/archive/data/default/test/dab1d611586e861818af77de74073d47/cf/b83044f76ba6474aa829e3bae7fd82d1 create mode 100644 bigtable-dataflow-parent/bigtable-beam-import/src/test/generate_test_data.txt create mode 100644 bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/hbasesnapshots/EndToEndIT.java create mode 100644 bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/hbasesnapshots/HBaseSnapshotInputConfigurationTest.java diff --git a/bigtable-dataflow-parent/bigtable-beam-import/pom.xml b/bigtable-dataflow-parent/bigtable-beam-import/pom.xml index 24de03f557..97b29208f0 100644 --- a/bigtable-dataflow-parent/bigtable-beam-import/pom.xml +++ b/bigtable-dataflow-parent/bigtable-beam-import/pom.xml @@ -25,7 +25,7 @@ limitations under the License. 
bigtable-beam-import - com.google.cloud.bigtable.beam.sequencefiles.Main + com.google.cloud.bigtable.beam.Main @@ -77,14 +77,12 @@ limitations under the License. org.apache.beam - - beam-sdks-java-extensions-google-cloud-platform-core - + beam-sdks-java-io-hadoop-common ${beam.version} org.apache.beam - beam-sdks-java-io-hadoop-common + beam-sdks-java-io-hadoop-format ${beam.version} @@ -94,6 +92,21 @@ limitations under the License. ${hbase.version} + + + org.apache.hbase + hbase-shaded-server + ${hbase.version} + + + + + org.apache.hbase + hbase-common + ${hbase.version} + com.google.auto.value auto-value @@ -149,6 +162,13 @@ limitations under the License. slf4j-api ${slf4j.version} + + + com.google.cloud.bigdataoss + gcs-connector + hadoop2-2.1.4 + shaded + @@ -181,6 +201,12 @@ limitations under the License. ${junit.version} test + + org.apache.hbase + hbase-shaded-testing-util + ${hbase.version} + test + @@ -265,6 +291,16 @@ limitations under the License. + + + *:* + + META-INF/*.SF + META-INF/*.DSA + META-INF/*.RSA + + + @@ -376,5 +412,33 @@ limitations under the License. 
+ + + hbasesnapshotsIntegrationTest + + + + org.apache.maven.plugins + maven-failsafe-plugin + + + hbasesnapshots-integration-test + + integration-test + + integration-test + + 1 + + **/hbasesnapshots/*IT.java + + false + + + + + + + diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/sequencefiles/Main.java b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/Main.java similarity index 79% rename from bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/sequencefiles/Main.java rename to bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/Main.java index 2d5cc71a9e..52fee350d7 100644 --- a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/sequencefiles/Main.java +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/Main.java @@ -13,10 +13,14 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -package com.google.cloud.bigtable.beam.sequencefiles; +package com.google.cloud.bigtable.beam; import com.google.bigtable.repackaged.com.google.api.core.InternalApi; import com.google.bigtable.repackaged.com.google.api.core.InternalExtensionOnly; +import com.google.cloud.bigtable.beam.hbasesnapshots.ImportJobFromHbaseSnapshot; +import com.google.cloud.bigtable.beam.sequencefiles.CreateTableHelper; +import com.google.cloud.bigtable.beam.sequencefiles.ExportJob; +import com.google.cloud.bigtable.beam.sequencefiles.ImportJob; import java.io.File; import java.net.URISyntaxException; import java.util.Arrays; @@ -43,6 +47,9 @@ public static void main(String[] args) throws Exception { case "import": ImportJob.main(subArgs); break; + case "importsnapshot": + ImportJobFromHbaseSnapshot.main(subArgs); + break; case "create-table": CreateTableHelper.main(subArgs); break; @@ -65,7 +72,7 @@ private static void usage() { System.out.printf( "java -jar %s \n" - + "Where can be 'export', 'import' or 'create-table'. To get further help, run: \n" + + "Where can be 'export', 'import' , 'importsnapshot' or 'create-table'. To get further help, run: \n" + "java -jar %s --help\n", jarName, jarName); } diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/HBaseSnapshotInputConfiguration.java b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/HBaseSnapshotInputConfiguration.java new file mode 100644 index 0000000000..631496a81f --- /dev/null +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/HBaseSnapshotInputConfiguration.java @@ -0,0 +1,95 @@ +/* + * Copyright 2017 Google Inc. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.google.cloud.bigtable.beam.hbasesnapshots; + +import static java.lang.System.*; + +import com.google.common.base.Preconditions; +import org.apache.beam.sdk.io.hadoop.SerializableConfiguration; +import org.apache.beam.sdk.options.ValueProvider; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hbase.HBaseConfiguration; +import org.apache.hadoop.hbase.client.Result; +import org.apache.hadoop.hbase.client.Scan; +import org.apache.hadoop.hbase.io.ImmutableBytesWritable; +import org.apache.hadoop.hbase.mapreduce.TableInputFormat; +import org.apache.hadoop.hbase.mapreduce.TableSnapshotInputFormat; +import org.apache.hadoop.hbase.protobuf.ProtobufUtil; +import org.apache.hadoop.hbase.protobuf.generated.ClientProtos; +import org.apache.hadoop.hbase.util.Base64; +import org.apache.hadoop.io.Writable; +import org.apache.hadoop.mapreduce.InputFormat; +import org.apache.hadoop.mapreduce.Job; + +/** + * A {@link Configuration} that could be used in {@link HadoopFormatIO} for reading HBase snapshot + * hosted in Google Cloud Storage(GCS) bucket via GCS connector. It uses {@link + * TableSnapshotInputFormat} for reading HBase snapshots. 
+ */ +class HBaseSnapshotInputConfiguration { + + private static final Log LOG = LogFactory.getLog(HBaseSnapshotInputConfiguration.class); + private static final int BATCH_SIZE = 1000; + + private final Configuration hbaseConf; + + /** + * Constructs a new top level source. + * + * @param snapshotDir The path or pattern of the file(s) to read. + */ + HBaseSnapshotInputConfiguration( + ValueProvider gcsProjectId, + ValueProvider snapshotDir, + ValueProvider snapshotName, + ValueProvider restoreDir) { + + Preconditions.checkArgument( + snapshotDir.toString().startsWith("gs://"), + "snapshot folder must be hosted in a GCS bucket "); + + Configuration conf = HBaseConfiguration.create(); + try { + conf.set("hbase.rootdir", snapshotDir.toString()); + conf.set("fs.AbstractFileSystem.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS"); + conf.set("fs.gs.project.id", gcsProjectId.toString()); + conf.set("fs.defaultFS", snapshotDir.toString()); + conf.set("google.cloud.auth.service.account.enable", "true"); + conf.setClass( + "mapreduce.job.inputformat.class", TableSnapshotInputFormat.class, InputFormat.class); + conf.setClass("key.class", ImmutableBytesWritable.class, Writable.class); + conf.setClass("value.class", Result.class, Object.class); + ClientProtos.Scan proto = ProtobufUtil.toScan(new Scan().setBatch(BATCH_SIZE)); + conf.set(TableInputFormat.SCAN, Base64.encodeBytes(proto.toByteArray())); + + this.LOG.debug(conf); + Job job = Job.getInstance(conf); // creates internal clone of hbaseConf + TableSnapshotInputFormat.setInput( + job, snapshotName.toString(), new Path(restoreDir.toString())); + conf = job.getConfiguration(); // extract the modified clone + } catch (Exception e) { + this.LOG.fatal(e); + } + this.hbaseConf = new SerializableConfiguration(conf); + } + + public Configuration getHbaseConf() { + return hbaseConf.get(); + } +} diff --git 
a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/ImportJobFromHbaseSnapshot.java b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/ImportJobFromHbaseSnapshot.java new file mode 100644 index 0000000000..91bda824a2 --- /dev/null +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/ImportJobFromHbaseSnapshot.java @@ -0,0 +1,158 @@ +/* + * Copyright 2017 Google Inc. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package com.google.cloud.bigtable.beam.hbasesnapshots; + +import com.google.bigtable.repackaged.com.google.api.core.InternalExtensionOnly; +import com.google.cloud.bigtable.beam.CloudBigtableIO; +import com.google.cloud.bigtable.beam.CloudBigtableTableConfiguration; +import com.google.cloud.bigtable.beam.TemplateUtils; +import com.google.cloud.bigtable.beam.sequencefiles.HBaseResultToMutationFn; +import com.google.cloud.bigtable.beam.sequencefiles.ImportJob; +import com.google.cloud.bigtable.beam.sequencefiles.Utils; +import com.google.cloud.bigtable.hbase.BigtableOptionsFactory; +import com.google.common.annotations.VisibleForTesting; +import org.apache.beam.sdk.Pipeline; +import org.apache.beam.sdk.PipelineResult; +import org.apache.beam.sdk.io.hadoop.format.HadoopFormatIO; +import org.apache.beam.sdk.options.Description; +import org.apache.beam.sdk.options.PipelineOptionsFactory; +import org.apache.beam.sdk.options.ValueProvider; +import org.apache.beam.sdk.transforms.PTransform; +import org.apache.beam.sdk.transforms.ParDo; +import org.apache.beam.sdk.values.PCollection; +import org.apache.beam.sdk.values.PDone; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.hbase.client.Mutation; +import org.apache.hadoop.hbase.client.Result; +import org.apache.hadoop.hbase.io.ImmutableBytesWritable; + +/** + * A job that imports data from HBase snapshot exports hosted in Cloud Storage bucket into Cloud + * Bigtable. This job can be run directly or as a Dataflow template. + * + *

Execute the following command to run the job directly: + * + *

+ * mvn compile exec:java \
+ *   -DmainClass=com.google.cloud.bigtable.beam.hbasesnapshots.ImportJobFromHbaseSnapshot \
+ *   -Dexec.args="--runner=DataflowRunner \
+ *                --stagingLocation=gs://$STAGING_PATH \
+ *                --project=$PROJECT \
+ *                --bigtableInstanceId=$INSTANCE \
+ *                --bigtableTableId=$TABLE \
+ *                --hbaseRootDir=gs://$HBASE_ROOT_PATH \
+ *                --snapshotName=$SNAPSHOT_NAME  \
+ *                --restoreDir=gs://$RESTORE_PATH"
+ * 
+ * + *

Execute the following command to create the Dataflow template: + * + *

+ * mvn compile exec:java \
+ *   -DmainClass=com.google.cloud.bigtable.beam.hbasesnapshots.ImportJobFromHbaseSnapshot \
+ *   -Dexec.args="--runner=DataflowRunner \
+ *                --project=$PROJECT \
+ *                --stagingLocation=gs://$STAGING_PATH \
+ *                --templateLocation=gs://$TEMPLATE_PATH \
+ *                --wait=false"
+ * 
+ * + *

There are a few ways to run the pipeline using the template. See Dataflow doc for details: + * https://cloud.google.com/dataflow/docs/templates/executing-templates. Optionally, you can upload + * a metadata file that contains information about the runtime parameters that can be used for + * parameter validation purposes and more. A sample metadata file can be found at + * "src/main/resources/ImportJob_metadata". + * + *

An example using gcloud command line: + * + *

+ * gcloud beta dataflow jobs run $JOB_NAME \
+ *   --gcs-location gs://$TEMPLATE_PATH \
+ *   --parameters bigtableProject=$PROJECT,bigtableInstanceId=$INSTANCE,bigtableTableId=$TABLE,hbaseRootDir=gs://$HBASE_ROOT_PATH,snapshotName=$SNAPSHOT_NAME,restoreDir=gs://$RESTORE_PATH
+ * 
+ */ +@InternalExtensionOnly +public class ImportJobFromHbaseSnapshot { + private static final Log LOG = LogFactory.getLog(ImportJobFromHbaseSnapshot.class); + + public interface ImportOptions extends ImportJob.ImportOptions { + @Description("The HBase root dir where HBase snapshot files resides.") + ValueProvider getHbaseRootDir(); + + @SuppressWarnings("unused") + void setHbaseRootDir(ValueProvider hbaseRootDir); + + @Description("Temp location for restoring snapshots") + ValueProvider getRestoreDir(); + + @SuppressWarnings("unused") + void setRestoreDir(ValueProvider restoreDir); + + @Description("Snapshot name") + ValueProvider getSnapshotName(); + + @SuppressWarnings("unused") + void setSnapshotName(ValueProvider snapshotName); + } + + public static void main(String[] args) { + PipelineOptionsFactory.register(ImportOptions.class); + + ImportOptions opts = + PipelineOptionsFactory.fromArgs(args).withValidation().as(ImportOptions.class); + + LOG.info("DEBUG===> Building Pipeline"); + Pipeline pipeline = buildPipeline(opts); + + LOG.info("DEBUG===> Running Pipeline"); + PipelineResult result = pipeline.run(); + + if (opts.getWait()) { + Utils.waitForPipelineToFinish(result); + } + } + + @VisibleForTesting + static Pipeline buildPipeline(ImportOptions opts) { + Pipeline pipeline = Pipeline.create(Utils.tweakOptions(opts)); + pipeline + .apply( + "Read from HBase Snapshot", + HadoopFormatIO.read() + .withConfiguration( + new HBaseSnapshotInputConfiguration( + opts.getBigtableProject(), + opts.getHbaseRootDir(), + opts.getSnapshotName(), + opts.getRestoreDir()) + .getHbaseConf())) + .apply("Create Mutations", ParDo.of(new HBaseResultToMutationFn())) + .apply("Write to Bigtable", createSink(opts)); + + return pipeline; + } + + static PTransform, PDone> createSink(ImportOptions opts) { + CloudBigtableTableConfiguration config = TemplateUtils.BuildImportConfig(opts); + config = + config + .toBuilder() + 
.withConfiguration(BigtableOptionsFactory.BIGTABLE_BULK_AUTOFLUSH_MS_KEY, "0") + .build(); + return CloudBigtableIO.writeToTable(config); + } +} diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/sequencefiles/CreateTableHelper.java b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/sequencefiles/CreateTableHelper.java index b4b3862817..4c794ed7eb 100644 --- a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/sequencefiles/CreateTableHelper.java +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/sequencefiles/CreateTableHelper.java @@ -57,7 +57,7 @@ * intended to be a preparation step before running an {@link ImportJob}. */ @InternalApi -class CreateTableHelper { +public class CreateTableHelper { private static final Log LOG = LogFactory.getLog(CreateTableHelper.class); @InternalApi diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/sequencefiles/HBaseResultToMutationFn.java b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/sequencefiles/HBaseResultToMutationFn.java index 6b2e628a5d..45954c7762 100644 --- a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/sequencefiles/HBaseResultToMutationFn.java +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/sequencefiles/HBaseResultToMutationFn.java @@ -15,6 +15,7 @@ */ package com.google.cloud.bigtable.beam.sequencefiles; +import com.google.bigtable.repackaged.com.google.api.core.InternalApi; import com.google.common.annotations.VisibleForTesting; import com.google.common.base.Function; import com.google.common.base.Predicate; @@ -43,7 +44,8 @@ * A {@link DoFn} function that converts a {@link Result} in the pipeline input to a {@link * Mutation} for output. 
*/ -class HBaseResultToMutationFn extends DoFn, Mutation> { +@InternalApi +public class HBaseResultToMutationFn extends DoFn, Mutation> { private static Logger logger = LoggerFactory.getLogger(HBaseResultToMutationFn.class); private static final long serialVersionUID = 1L; diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/sequencefiles/Utils.java b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/sequencefiles/Utils.java index 142c4a17ef..27e8f3debf 100644 --- a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/sequencefiles/Utils.java +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/sequencefiles/Utils.java @@ -15,6 +15,7 @@ */ package com.google.cloud.bigtable.beam.sequencefiles; +import com.google.bigtable.repackaged.com.google.api.core.InternalApi; import org.apache.beam.runners.dataflow.DataflowRunner; import org.apache.beam.runners.dataflow.options.DataflowPipelineOptions; import org.apache.beam.sdk.PipelineResult; @@ -28,7 +29,8 @@ import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; -class Utils { +@InternalApi +public class Utils { private static final Log LOG = LogFactory.getLog(Utils.class); /** @@ -74,7 +76,7 @@ public ResourceId apply(String input) { * * @param result */ - static void waitForPipelineToFinish(PipelineResult result) { + public static void waitForPipelineToFinish(PipelineResult result) { try { // Check to see if we are creating a template. // This should throw {@link UnsupportedOperationException} when creating a template. 
diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/main/resources/log4j.properties b/bigtable-dataflow-parent/bigtable-beam-import/src/main/resources/log4j.properties index 04f0fab6bb..cff29c2435 100644 --- a/bigtable-dataflow-parent/bigtable-beam-import/src/main/resources/log4j.properties +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/main/resources/log4j.properties @@ -22,3 +22,5 @@ log4j.appender.stdout.layout=org.apache.log4j.PatternLayout log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n log4j.category.org.apache.hadoop.io.compress.CodecPool=WARN +log4j.logger.org.apache.hadoop.io.compress.Compression=TRACE +log4j.logger.org.apache.hadoop.util.NativeCodeLoader=DEBUG diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/data/.hbase-snapshot/test-snapshot/..snapshotinfo.crc b/bigtable-dataflow-parent/bigtable-beam-import/src/test/data/.hbase-snapshot/test-snapshot/..snapshotinfo.crc new file mode 100644 index 0000000000000000000000000000000000000000..8fe4533a0159f76b5bb3a1968ac5d1fa7fc45a58 GIT binary patch literal 12 TcmYc;N@ieSU}AWGHv9?z6(s~B literal 0 HcmV?d00001 diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/data/.hbase-snapshot/test-snapshot/.data.manifest.crc b/bigtable-dataflow-parent/bigtable-beam-import/src/test/data/.hbase-snapshot/test-snapshot/.data.manifest.crc new file mode 100644 index 0000000000000000000000000000000000000000..1467a17f1f9924f6a69bd2963d5e21ff088ca3f6 GIT binary patch literal 20 bcmYc;N@ieSU}8|8vgZ5;2Nu4voskOwJbDJ( literal 0 HcmV?d00001 diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/data/.hbase-snapshot/test-snapshot/.snapshotinfo b/bigtable-dataflow-parent/bigtable-beam-import/src/test/data/.hbase-snapshot/test-snapshot/.snapshotinfo new file mode 100644 index 0000000000..83e482aac0 --- /dev/null +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/test/data/.hbase-snapshot/test-snapshot/.snapshotinfo @@ -0,0 
+1,2 @@ + + test-snapshottest�����. ( \ No newline at end of file diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/data/.hbase-snapshot/test-snapshot/data.manifest b/bigtable-dataflow-parent/bigtable-beam-import/src/test/data/.hbase-snapshot/test-snapshot/data.manifest new file mode 100644 index 0000000000000000000000000000000000000000..180516dc03055633111dc6316e9e50e08c8196ba GIT binary patch literal 1090 zcmaiz&2H2%6on`K5yb^TT?tiXg91pPO304?u9ri`g{YayD2YI@8QHPD>Wa4S!HOr~ z4cPMb!XD$;cVJZR|#C6t<0iqX4T>i-dQvYgAdSl-`5xwlq(KQ za)l4ke%Z{It?903{;j(=PhydBE&jE|vADh-xO7Chn7QO$rtD5&>JQ0e$asLC@KLmi0{>|nJ_ z1c=m8MJ44yK#@c{cMWav)BoGo`DDxl8zTi0m@PQ>!vI7D?gKa(hd`{_C{*T|8bgOd zb73ubJ20+`LQ`gGlq3f4Bo%A-C$|Wlr#t6V`BxZAc6@gfKkRNEx>_? zD1=l#CFh5YwrHLeW2gtmwPT7iO*v7{GT}uaK7|iV4EpHXhL^(gOpKup7~woB>1_HY sJf&9Xz?=;#dMG^Is1)){kD*dHJxEB2iiZN{4DK6}G>vmhg%ep|Z7_g)uhQttp zF)QTz&CdJ0d2hb+W|Oih$JMDb z2uaIei@(`$AvBI);yyflU=2u$xaJmYm|CCw6#L>?x0h}fA|7!0!5Wb(60Z5jZZm)Q z2nP&sz!n1r8aQB(1MV%HV7QqBMmS)H0V9|LMmgXC14dgo zfRkbDAp;_<91!JJc+7xk8wbQVV2=SY?-SpLA8}Pn!D$Qv90c>#odz2C_WGl}^}IE? 
z{jBp*+57p|(|6xzmE^Rc>n08A0qd#{x&V`GNV~rJb3C`X@snGD9%vOlU`>E}zIJWO zDMckpIt6K@eXuumX{7g=crNwQjXSVEuIjpGL4ZjN%bdxOfZNb7_vZkmrRzlCE%>X^!%D^Y^Q6d|vLb9fn$(fd!rB0_E z@p0^jrL4TH=7^OiMix>8=w>7w#9mc~7^fbpz7;Es6RhL@UwcX-gwo z*3=CGi7I7QEAiq^?4>}KT(XqpA)gPQ#m%Oo5kpQ-kU)2}KIc8Q=WM0+q;(e;7MBVk zIX^*HvQ*;K?SuWc*IdrVs?G-7jf0LeVGtQKqd7T6&L6|QxXmHx=IPG-a7Pcf-u9{a zC~Y~7RJ!K|)OEe?SkVqlx>>>WM?kNm#J=gZpo2EIG*l{I3)w67T(J-;+ViE-N~kz% zuNFd6g^fJ@S@aq9t@y@TVKq@&Ua`yeTB%%#M$#_4N2JKB$z<3x=>Kffx$)rhzr0T2jWcvFeCu`92jaAfMEf6%Yoq*0T>a00}hN30T>m4Lk^6# z3c#2EyyL*wDFGN4fcG32ZxaAP$cZBkD8ViD_+vsh(y$wGfTLicy4^s74efnTzJ8i) zwhr;r&+fBtU)py@wbYztnl=mS0qd#{nuJL{@NTGnAAQFQdEpOiL#yxsZvxcwv+I$X z)^w_6(vSwf276hT27kyV^63|T(n-Pz-82mc0&HqI_I#E`{FZr>oF~zwX=K%;t(h5z z7HGzHX6bw+z#_>7(vhM%wW*U<4NaeSl4d?*N7(LG202ICscPvO&Dl;iZ##C5IlXoy zXGj=+%wkEv5Ekb(a>GmWwg9 zFwOqt=+vt_1ncdnxx9r{osFcML_KHHqAF;{atexEyGVLTn@6z4vz3M6jGk=0AN2A* zYdM2fy5|AZb-!*~(PNnO^Mct$z>Xv4-|Seh!I~QnmDldY+*Nm>RE(9}yXEp~taQg+ zFUDqz+XZ%6^jYp!Vtb>wo-D7dx@+!6d94zS@GiVZu!7^iPTLl{j(YpF=I4RXfxok> zd#}GA51{eZDSRDe(XH4&(?GsjhTG7?a!3WF0#X5~fK)&#AQg}bNCl(pnJn$(xZ z5mG8Jw@&!*AT*C);R!qfFguVE_pEO^uynZiE%C#5=f zFeB`+&w-h<0x&B8ha8yg6@Y{Q)Hsmvuk}6qoYajBoW>-;VVHro(?AnD;}4GNS?}XV zFS=fSeg5HO<6ZrWmR`0@(`HV?FbCV6uEHW8NZWN>aIlF*?Uft}c`fq>zKFvq)5xhQTQjqi z6iL>mOJpVFW1jdD?n@J$*o0XN$#88*1L5#d@ zS_A?$(rjkp#}}}l0eNzrYUxw90KSa7ZOtH-nq45Fp=N&Wd#>h|(t5JI%bQy}rHEQw zU^`KrxOoSl-oBQ}Ev#wmz(Y9f8dDZgK{FOpP~=JlkKkUHV8ye_f^bJqx8Co8qmQiR z4ASVHJCN6Q+p(gju;}FpvmXJwjtc)~*Mbe!+)AvvcRS+jI_u?9r0i@|tGki%nzLVu zER||S_OlqZocqbzL1{l#t?W8`&Ovpr;f<6!yr-2o9ey9PE%rMa8P&Rue1T(c|KQNu zu`wlzW_#w9Ybb|qM*bNF;=kP1izqyka_slb1w GfcFm(gDz_T literal 0 HcmV?d00001 diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/data/archive/data/default/test/2c25a1cedf575cd08267e0013e45872e/cf/.cda93ca899f3475fb1c0f8989a8f0d18.crc b/bigtable-dataflow-parent/bigtable-beam-import/src/test/data/archive/data/default/test/2c25a1cedf575cd08267e0013e45872e/cf/.cda93ca899f3475fb1c0f8989a8f0d18.crc new file mode 
100644 index 0000000000000000000000000000000000000000..931ebfb54555d336879fa44ef956de26ba9c2a4e GIT binary patch literal 52 mcmYc;N@ieSU}9MD$S>Yy!SuIZFSEYew5qfM7k&ilN1?|tXZW;Cj$jNEmE zlx>)szxnVXG>>562p%fTHl)OT>l-dC9W0(Dz6B24tw)u308Am6?Z_7&Z`zY{+&+Fm z044-ro&ytY0uU2`3{zFp&YBN=lK0*dQ8}X9cTkS;Z1;sehoa;vbsU^ zd=|ne?_s`e38Q=}q)OS>e%y;gDZ{c%2OKD|9lBZ|VZUuH<109lw#WI2qD6(7zIag(9pog%Zdf`#S`EVF||d@hA@9qVm1Y_OiXRXF^O5EmQ5f~ zJZbJ{tAr+7cNCl(MA=N`FDt4b?y;j9U&TL27c7ajlr literal 0 HcmV?d00001 diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/data/archive/data/default/test/3264826a5972b18c5a59b2f612678316/cf/d8b49b374391407ba35d5e0db1c835c9 b/bigtable-dataflow-parent/bigtable-beam-import/src/test/data/archive/data/default/test/3264826a5972b18c5a59b2f612678316/cf/d8b49b374391407ba35d5e0db1c835c9 new file mode 100644 index 0000000000000000000000000000000000000000..d640fc8498e06935ecaf06a2714dda361af9ed7e GIT binary patch literal 5299 zcmeH{&rcIU6vt=V%8vp1W2p)LK!I=|M4K*9pkNH!c8UwT-MZaY@UpI0tOjib#1Q`k zjlrWaG2!5aXd)i`FFdGc5>F&vh(_=FraQGmFJ3*&ByTr6@AKxp`Oe$PsJ5EY7e^72 zcEFzgX2XZj9D<3r@CbvoAxZJAthz9@weT&uAJ}r&9+s5=xFTTf$fv}c{+ST-$4`J$ z2J1k6CC(AEpBqFx#I3>Qn29n%Y2$z~E=eB)#@abxoCAg#Fy6rd6C5zYfQe2HNN~Uf z1|%>COmcw2fXOZnnBss01E#t;V44G-Ghn)h1Cku@f&t0k&)~CjDc#7zfsFwi1q-(v z7*doYxAW-Nr>`r4(FgB#H0^QnRaMI@S*B^zph2+y)}YHU$%eEaZawO*?>}nX3iQky z@FD9HG#K37+skS?(Q;WxBYgmS-;zf9l1~-0Z~V9qM^d_J8V&^5#B%KAJdypDc@3Y% z(X?sg)wHddIfs-;&UO~bvK*k1_&n~-5S`e>$*P8?FFR?on6qViy7ht2;1E$QT_XkC z$ro+ME>Ne}vG^2@z*0_0)r-U_5vu?xWV#s%M{!VBA;znUCiE#u2#6Z$f@wjTza>$h zglHv(y5tzdDA=Y&AW^-{MkOJ98V4y*AXgnNbIcdP7jdVp8N^a^^F$tK)aSitKYLqg zK55;R)jMnDm|B{rD>*vx>W;vEdz&tAV-06J9>7u0nYM@un$et`B9~k|gnK-KZl3Ne z0>AO`)_Z;MGDKU>AocFK4|UybIac%pCjG2ndRNfjQDWb;>*=7)Esa;#Z^zu4yHY8~ zD(8xj@73IOceA=)k4Dlyyc3clODA`2i{4m6 z!&=A3K={z#*FSJDG9ty%boZQe1?AE8*gw-izGj9S(8FQ~1%v`Z0il3UKqw#-5DEwd SgaSeVp@2|8DDYn?;Qs^s0WsnL literal 0 HcmV?d00001 diff --git 
a/bigtable-dataflow-parent/bigtable-beam-import/src/test/data/archive/data/default/test/3d397f3b97e7fd2358fb5c93060b3a60/cf/.32053565831341128b8d8f5567d48fdc.crc b/bigtable-dataflow-parent/bigtable-beam-import/src/test/data/archive/data/default/test/3d397f3b97e7fd2358fb5c93060b3a60/cf/.32053565831341128b8d8f5567d48fdc.crc new file mode 100644 index 0000000000000000000000000000000000000000..80317a1515597ecbac0015cf7edba1283ce6824b GIT binary patch literal 52 mcmYc;N@ieSU}9J$_&t8|r`=f(wzKMOT2)$s3%_IzQ3C*MFc~EP literal 0 HcmV?d00001 diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/data/archive/data/default/test/3d397f3b97e7fd2358fb5c93060b3a60/cf/32053565831341128b8d8f5567d48fdc b/bigtable-dataflow-parent/bigtable-beam-import/src/test/data/archive/data/default/test/3d397f3b97e7fd2358fb5c93060b3a60/cf/32053565831341128b8d8f5567d48fdc new file mode 100644 index 0000000000000000000000000000000000000000..5320c6c58dbe6e391c4994185de1655b0e874b09 GIT binary patch literal 5264 zcmeH{&rcIU6vt=VT40dcmYSeK1i}H0G|=*^#1N+K6jye;b-S%#Ox6{O#Q<#pF~r1+ z@uCT5P4wcyM57lE95694$RA+5dN%QDJb2SL-KiaV@#< zokvJn16%pchXub%rP zqiR&mW+08S2llQhjq)XzEM(qzaTg9HHN((t2(YMWTMIcF_L|03+>IkCL(h?vr5ahA z7HQV9=jcM%$0G4r+?l2twWysTx~eVMDWj0J!t8Ww1E0YsC^0pa<}Eu{uxu;OoNhbf zQ#b@m*+rris9mIH9#VwaW)vL30gXV6TNO*_i6tN)qLa&p32FY8#C!^3mFQ&N)~TMi z43k2lTAB4q0{AozFd$E_*lPNSFNlY5o2Ba1B-vRS?ycA7zNZe|tu&sj?(*`=YAH&J zvuq_>qi)?n*l%aUGY#ZxWViu6EQeG;Dj*e*3P=T{0#X5~fK)&#AQg}bNCl(< I|CIvXKg{tgK>z>% literal 0 HcmV?d00001 diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/data/archive/data/default/test/7466202f701dc0e3af8cc747c9a37ec8/cf/.36798a163ed046b193818e21dd7516b4.crc b/bigtable-dataflow-parent/bigtable-beam-import/src/test/data/archive/data/default/test/7466202f701dc0e3af8cc747c9a37ec8/cf/.36798a163ed046b193818e21dd7516b4.crc new file mode 100644 index 0000000000000000000000000000000000000000..00a9d7720d3d3867ea0cd0a153d5265158a9b5ff GIT binary patch literal 52 
mcmYc;N@ieSU}89xHtAGe`OGB@e!ed@ttzd+g?$pMob@YMq*J2@c10Y@H?2v%M_jwf{^1E(T89t3CP4i^ zWB1gIrV}ljfi%(!u;)!_q{CdYka_0E-8h`oP1CR;fD+55r5uU)E%O>ai=!#i$f+r+ znOU0@NtW94q!bCTNZgCN(nKee*csK(^pc%23t1Xrr&}BN3_e9vOV>!A+PMO?X`VUV zcEmk63`^NXRWA^`NUS`hh_KB_IEsV13Nda~ETKy-rYX&hugo?NxH^f6xuU&QTHGl->TXGx^5UZ4A(I&!ztc(S@HODk(- zSuM`8m2923b%$WTT@9DJvAVMb_u;7POj$$)%~(!Ok;|<(jyqj~ZJzBc40rT+>&4~e z6l*zy)Vk*m)V11ltmqL;`n`hLkAPiA$@^y4f(_Q(l2YAVmYoe}u~L>R&h2V-L$2I% zw#xE+d9TQR76X>Eo7~$jZ>6g18_uS)UEQojBWWMrW0JD^{_qmD*zYJlptU^>gbw`O zvA&msgVGS1?3j_Rpd7j`|1%BbYh<_yJuHS$Kqw#-5DEwdgaSeVp@2|8C?FIN3J3*+ J0{@i){y%HhEbjmS literal 0 HcmV?d00001 diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/data/archive/data/default/test/958c660f0e406404ffdfc81110e7eaf9/cf/.65b9c6860f5f4de39d61d1674947b030.crc b/bigtable-dataflow-parent/bigtable-beam-import/src/test/data/archive/data/default/test/958c660f0e406404ffdfc81110e7eaf9/cf/.65b9c6860f5f4de39d61d1674947b030.crc new file mode 100644 index 0000000000000000000000000000000000000000..1d7e3d8653bfae2874b0d726f295edbbd3e92fa8 GIT binary patch literal 52 lcmYc;N@ieSU}CVbo1{1?#CqoE%2>fot4b?y;g`%IY5*h+7hwPZ literal 0 HcmV?d00001 diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/data/archive/data/default/test/958c660f0e406404ffdfc81110e7eaf9/cf/65b9c6860f5f4de39d61d1674947b030 b/bigtable-dataflow-parent/bigtable-beam-import/src/test/data/archive/data/default/test/958c660f0e406404ffdfc81110e7eaf9/cf/65b9c6860f5f4de39d61d1674947b030 new file mode 100644 index 0000000000000000000000000000000000000000..e8d9789f5e9c26ecc1eed6ed9e7061b6e0a9d461 GIT binary patch literal 5264 zcmeH{&rcIU6vt=V@@o)mOAq*i@FN_E!G_-~Vhr1MiVM5lx}8?)WnHmY3|It2iHV6< zO*nWlF=|ZIXvCP{!2^jWPR4lhU-05pW5P|}bfD`P6}`fAS&s z57Y_3pa4ALz+k-q3<$(9PkD4N!&zrlh|5nnuNP5^?C2f9d9euC#$=-u((u+ zYPo5)l4B6B?jY>9t?KeNR&mzjE*$ZkahqtM8Otdsa(V>!;1-Wyn`b)`w_6~sPJ!gE!beqtqhk|uSDGycfMGN7TwFG(n_>= z$z3Z%XA7G-_Os}<-J7w^^}<@bw7lZ3y6dIYax_x5;619SKl8UD)Mme= LkP7@)3i$s3LhCH5 literal 0 HcmV?d00001 
diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/data/archive/data/default/test/dab1d611586e861818af77de74073d47/cf/.b83044f76ba6474aa829e3bae7fd82d1.crc b/bigtable-dataflow-parent/bigtable-beam-import/src/test/data/archive/data/default/test/dab1d611586e861818af77de74073d47/cf/.b83044f76ba6474aa829e3bae7fd82d1.crc new file mode 100644 index 0000000000000000000000000000000000000000..ca57c97e2deddae20f6c82712db07fd2e35620d7 GIT binary patch literal 52 lcmYc;N@ieSU}Esk=Ghmve-7`}t(p;=R+U!Z!Y`Ra)BrhK7wrH5 literal 0 HcmV?d00001 diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/data/archive/data/default/test/dab1d611586e861818af77de74073d47/cf/b83044f76ba6474aa829e3bae7fd82d1 b/bigtable-dataflow-parent/bigtable-beam-import/src/test/data/archive/data/default/test/dab1d611586e861818af77de74073d47/cf/b83044f76ba6474aa829e3bae7fd82d1 new file mode 100644 index 0000000000000000000000000000000000000000..c119dd13ef4179dcde442da0461e0f1b50569ba1 GIT binary patch literal 5264 zcmeH{&rcIU6vt=V%8x-xTY5kgL4G8}gbjuAqhJivc8UwT-MXDt@Zh>)p%}0P5JO^2 zjPc-)5Iyq;m>5qUGAD`@juXbF!5kCF~OPc)DFFP^)QpX-R!*2oA>5BZ#Ibt zitDo{0Z>+{E${Q;0bmNyiF@=BpjHQzsApl(p;KEkUt*tpTh7Y0LexiHL2C8D6CJGj zCq8rk;2{Bs2*6hkMCt@!NC3WZV5nXIh6Ug|2ZkF2K&Wxz2M3~!0x%-1@QVW@Pyj{+ z;5P?Gn*?A?0QNaBc31$$tH7j}1LMsC5EFnN4#fN|FOT1i>qd&6#t?y(L-o0@<(z%pT`}VZtoMoDX1)ZSQQ5$rcPV%Ab1ZvOm!kyB4VFh-e6?l&~ z0XXrq<0eXJI@Z!Dnnrm}?O9D4`ac> zB+Hy`JHjI{NSCtnNY7zAkF6|C5n`KBU>N#!L}T2lSVB)UMgzhInlUY!=5I;NrvO%o zf#z%j8(Csnm?kQhS*gSi+o7KUS$x*kk_UVNH~<@oW?&1Yr*WvOQlI;t>~XhJeX_cX zi_0qo73HVdO16&Ox&w5-tyP!1v5K<}cEPagOjsCE%~(!Bky8kE!)BLYn`b)<(mQ&v z^>+IvZ?Tp$aJhS~(z>qH94oj_C%wF2_9I}|QQ_b0TCl;ITNy5`FR9L&vrsIkMdxy< zw5Ap>IU5Cawy>RNKZ{<=xgOu%ENmo7t831>vsqd%M+0StzDJbk>-HCKiN$_L-Mw1F z6JKD@+t$(bs=r?u1mjIp${COW=hc6vq4}yAF47*BLn ["1", "2", "3", "4", "5", "6", "7", "8", "9"]} +put 'test','1', 'cf:a', 'value1' +put 'test','2', 'cf:a', 'value2' +put 'test','3', 'cf:a', 'value3' +put 'test','4', 'cf:a', 'value4' +put 'test','5', 
'cf:a', 'value5' +put 'test','6', 'cf:a', 'value6' +put 'test','7', 'cf:a', 'value7' +put 'test','8', 'cf:a', 'value8' +put 'test','9', 'cf:a', 'value9' +put 'test','10', 'cf:a', 'value10' +put 'test','11', 'cf:a', 'value11' +put 'test','12', 'cf:a', 'value12' +put 'test','13', 'cf:a', 'value13' +put 'test','14', 'cf:a', 'value14' +put 'test','15', 'cf:a', 'value15' +put 'test','16', 'cf:a', 'value16' +put 'test','17', 'cf:a', 'value17' +put 'test','18', 'cf:a', 'value18' +put 'test','19', 'cf:a', 'value19' +put 'test','20', 'cf:a', 'value20' +put 'test','21', 'cf:a', 'value21' +put 'test','22', 'cf:a', 'value22' +put 'test','23', 'cf:a', 'value23' +put 'test','24', 'cf:a', 'value24' +put 'test','25', 'cf:a', 'value25' +put 'test','26', 'cf:a', 'value26' +put 'test','27', 'cf:a', 'value27' +put 'test','28', 'cf:a', 'value28' +put 'test','29', 'cf:a', 'value29' +put 'test','30', 'cf:a', 'value30' +put 'test','31', 'cf:a', 'value31' +put 'test','32', 'cf:a', 'value32' +put 'test','33', 'cf:a', 'value33' +put 'test','34', 'cf:a', 'value34' +put 'test','35', 'cf:a', 'value35' +put 'test','36', 'cf:a', 'value36' +put 'test','37', 'cf:a', 'value37' +put 'test','38', 'cf:a', 'value38' +put 'test','39', 'cf:a', 'value39' +put 'test','40', 'cf:a', 'value40' +put 'test','41', 'cf:a', 'value41' +put 'test','42', 'cf:a', 'value42' +put 'test','43', 'cf:a', 'value43' +put 'test','44', 'cf:a', 'value44' +put 'test','45', 'cf:a', 'value45' +put 'test','46', 'cf:a', 'value46' +put 'test','47', 'cf:a', 'value47' +put 'test','48', 'cf:a', 'value48' +put 'test','49', 'cf:a', 'value49' +put 'test','50', 'cf:a', 'value50' +put 'test','51', 'cf:a', 'value51' +put 'test','52', 'cf:a', 'value52' +put 'test','53', 'cf:a', 'value53' +put 'test','54', 'cf:a', 'value54' +put 'test','55', 'cf:a', 'value55' +put 'test','56', 'cf:a', 'value56' +put 'test','57', 'cf:a', 'value57' +put 'test','58', 'cf:a', 'value58' +put 'test','59', 'cf:a', 'value59' +put 'test','60', 'cf:a', 'value60' +put 
'test','61', 'cf:a', 'value61' +put 'test','62', 'cf:a', 'value62' +put 'test','63', 'cf:a', 'value63' +put 'test','64', 'cf:a', 'value64' +put 'test','65', 'cf:a', 'value65' +put 'test','66', 'cf:a', 'value66' +put 'test','67', 'cf:a', 'value67' +put 'test','68', 'cf:a', 'value68' +put 'test','69', 'cf:a', 'value69' +put 'test','70', 'cf:a', 'value70' +put 'test','71', 'cf:a', 'value71' +put 'test','72', 'cf:a', 'value72' +put 'test','73', 'cf:a', 'value73' +put 'test','74', 'cf:a', 'value74' +put 'test','75', 'cf:a', 'value75' +put 'test','76', 'cf:a', 'value76' +put 'test','77', 'cf:a', 'value77' +put 'test','78', 'cf:a', 'value78' +put 'test','79', 'cf:a', 'value79' +put 'test','80', 'cf:a', 'value80' +put 'test','81', 'cf:a', 'value81' +put 'test','82', 'cf:a', 'value82' +put 'test','83', 'cf:a', 'value83' +put 'test','84', 'cf:a', 'value84' +put 'test','85', 'cf:a', 'value85' +put 'test','86', 'cf:a', 'value86' +put 'test','87', 'cf:a', 'value87' +put 'test','88', 'cf:a', 'value88' +put 'test','89', 'cf:a', 'value89' +put 'test','90', 'cf:a', 'value90' +put 'test','91', 'cf:a', 'value91' +put 'test','92', 'cf:a', 'value92' +put 'test','93', 'cf:a', 'value93' +put 'test','94', 'cf:a', 'value94' +put 'test','95', 'cf:a', 'value95' +put 'test','96', 'cf:a', 'value96' +put 'test','97', 'cf:a', 'value97' +put 'test','98', 'cf:a', 'value98' +put 'test','99', 'cf:a', 'value99' +put 'test','100', 'cf:a', 'value100' +snapshot 'test', 'test-snapshot' +list_snapshots + +disable 'test' +drop 'test' +exit diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/hbasesnapshots/EndToEndIT.java b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/hbasesnapshots/EndToEndIT.java new file mode 100644 index 0000000000..cd142bd790 --- /dev/null +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/hbasesnapshots/EndToEndIT.java @@ -0,0 +1,193 @@ +/* + 
* Copyright 2020 Google Inc. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.google.cloud.bigtable.beam.hbasesnapshots; + +import static com.google.common.base.Preconditions.checkNotNull; + +import com.google.cloud.bigtable.beam.sequencefiles.testing.BigtableTableUtils; +import com.google.cloud.bigtable.hbase.BigtableConfiguration; +import com.google.cloud.bigtable.hbase.BigtableOptionsFactory; +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; +import java.util.UUID; +import org.apache.beam.runners.dataflow.options.DataflowPipelineOptions; +import org.apache.beam.runners.direct.DirectRunner; +import org.apache.beam.sdk.PipelineResult.State; +import org.apache.beam.sdk.extensions.gcp.options.GcpOptions; +import org.apache.beam.sdk.extensions.gcp.util.GcsUtil; +import org.apache.beam.sdk.extensions.gcp.util.gcsfs.GcsPath; +import org.apache.beam.sdk.options.PipelineOptionsFactory; +import org.apache.beam.sdk.options.ValueProvider.StaticValueProvider; +import org.apache.hadoop.hbase.HColumnDescriptor; +import org.apache.hadoop.hbase.HTableDescriptor; +import org.apache.hadoop.hbase.TableName; +import org.apache.hadoop.hbase.client.Connection; +import org.apache.hadoop.hbase.snapshot.SnapshotTestingUtils; +import org.junit.After; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; + +/* + * End to end integration test for pipeline that import 
HBase snapshot data into Cloud Bigtable. + * Prepare test data with gsutil(https://cloud.google.com/storage/docs/quickstart-gsutil): + * gsutil -m cp -r /bigtable-dataflow-parent/bigtable-beam-import/src/test/data/ \ + * gs:///integration-test/ + * + * Setup GCP credential: https://cloud.google.com/docs/authentication + * Ensure your credential have access to Bigtable and Dataflow + * + * Run with: + * mvn integration-test -PhbasesnapshotsIntegrationTest \ + * -Dgoogle.bigtable.project.id= \ + * -Dgoogle.bigtable.instance.id= \ + * -Dgoogle.dataflow.stagingLocation=gs:///staging \ + * -Dcloud.test.data.folder=gs:///integration-test/ + */ +public class EndToEndIT { + + private static final String TEST_SNAPSHOT_NAME = "test-snapshot"; + // Location of test data hosted on Google Cloud Storage, for on-cloud dataflow tests. + private static final String CLOUD_TEST_DATA_FOLDER = "cloud.test.data.folder"; + + // Column family name used in all test bigtables. + private static final String CF = "cf"; + + // Full path of the Cloud Storage folder where dataflow jars are uploaded to. 
+ private static final String GOOGLE_DATAFLOW_STAGING_LOCATION = "google.dataflow.stagingLocation"; + + private Connection connection; + private String projectId; + private String instanceId; + private String tableId; + + private GcsUtil gcsUtil; + private String dataflowStagingLocation; + private String workDir; + private byte[][] keySplits; + + // Snapshot data setup + private String hbaseSnapshotDir; + private String restoreDir; + + @Before + public void setup() throws Exception { + projectId = getTestProperty(BigtableOptionsFactory.PROJECT_ID_KEY); + instanceId = getTestProperty(BigtableOptionsFactory.INSTANCE_ID_KEY); + dataflowStagingLocation = getTestProperty(GOOGLE_DATAFLOW_STAGING_LOCATION); + String cloudTestDataFolder = getTestProperty(CLOUD_TEST_DATA_FOLDER); + if (!cloudTestDataFolder.endsWith(File.separator)) { + cloudTestDataFolder = cloudTestDataFolder + File.separator; + } + + hbaseSnapshotDir = cloudTestDataFolder + "data/"; + UUID test_uuid = UUID.randomUUID(); + restoreDir = cloudTestDataFolder + "restore/" + test_uuid; + + // Cloud Storage config + GcpOptions gcpOptions = PipelineOptionsFactory.create().as(GcpOptions.class); + gcpOptions.setProject(projectId); + gcsUtil = new GcsUtil.GcsUtilFactory().create(gcpOptions); + + // Bigtable config + connection = BigtableConfiguration.connect(projectId, instanceId); + tableId = "test_" + UUID.randomUUID().toString(); + + System.out.println("Setting up integration tests"); + + String[] keys = new String[] {"1", "2", "3", "4", "5", "6", "7", "8", "9"}; + keySplits = new byte[keys.length][]; + for (int i = 0; i < keys.length; i++) { + keySplits[i] = keys[i].getBytes(); + } + } + + private static String getTestProperty(String name) { + return checkNotNull(System.getProperty(name), "Required property missing: " + name); + } + + @After + public void teardown() throws IOException { + final List paths = gcsUtil.expand(GcsPath.fromUri(restoreDir + "/*")); + + if (!paths.isEmpty()) { + final List pathStrs = 
new ArrayList<>(); + + for (GcsPath path : paths) { + pathStrs.add(path.toString()); + } + this.gcsUtil.remove(pathStrs); + } + + connection.close(); + + // delete test table + BigtableConfiguration.connect(projectId, instanceId) + .getAdmin() + .deleteTable(TableName.valueOf(tableId)); + } + + @Test + public void testHBaseSnapshotImport() throws Exception { + + try (Connection connection = BigtableConfiguration.connect(projectId, instanceId)) { + // Crete table + System.out.println("DEBUG (create test table) ==>"); + TableName tableName = TableName.valueOf(tableId); + HTableDescriptor descriptor = new HTableDescriptor(tableName); + + descriptor.addFamily(new HColumnDescriptor(CF)); + + connection.getAdmin().createTable(descriptor, SnapshotTestingUtils.getSplitKeys()); + + // Start import + System.out.println("DEBUG (import snapshot) ==>"); + DataflowPipelineOptions importPipelineOpts = + PipelineOptionsFactory.as(DataflowPipelineOptions.class); + importPipelineOpts.setRunner(DirectRunner.class); + importPipelineOpts.setGcpTempLocation(dataflowStagingLocation); + importPipelineOpts.setNumWorkers(1); + importPipelineOpts.setProject(projectId); + + ImportJobFromHbaseSnapshot.ImportOptions importOpts = + importPipelineOpts.as(ImportJobFromHbaseSnapshot.ImportOptions.class); + // setup GCP and bigtable + importOpts.setBigtableProject(StaticValueProvider.of(projectId)); + importOpts.setBigtableInstanceId(StaticValueProvider.of(instanceId)); + importOpts.setBigtableTableId(StaticValueProvider.of(tableId)); + importOpts.setBigtableAppProfileId(null); + + // setup Hbase snapshot info + importOpts.setHbaseRootDir(StaticValueProvider.of(hbaseSnapshotDir)); + importOpts.setRestoreDir(StaticValueProvider.of(restoreDir)); + importOpts.setSnapshotName(StaticValueProvider.of(TEST_SNAPSHOT_NAME)); + + // run pipeline + State state = ImportJobFromHbaseSnapshot.buildPipeline(importOpts).run().waitUntilFinish(); + Assert.assertEquals(State.DONE, state); + + // check data in bigtable 
+ BigtableTableUtils destTable = new BigtableTableUtils(connection, tableId, CF); + Assert.assertEquals( + 100 /* There are 100 rows in test snapshot*/, + destTable.readAllCellsFromTable().toArray().length); + + // TODO(vermas2012): Add more validations after this. + } + } +} diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/hbasesnapshots/HBaseSnapshotInputConfigurationTest.java b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/hbasesnapshots/HBaseSnapshotInputConfigurationTest.java new file mode 100644 index 0000000000..d5290a8562 --- /dev/null +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/hbasesnapshots/HBaseSnapshotInputConfigurationTest.java @@ -0,0 +1,51 @@ +/* + * Copyright 2020 Google Inc. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package com.google.cloud.bigtable.beam.hbasesnapshots; + +import static org.junit.Assert.assertEquals; + +import org.apache.beam.sdk.options.ValueProvider; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hbase.mapreduce.TableSnapshotInputFormat; +import org.apache.hadoop.mapreduce.InputFormat; +import org.junit.Test; + +public class HBaseSnapshotInputConfigurationTest { + + private static final String TEST_PROJECT = "test_project"; + private static final String TEST_SNAPSHOT_DIR = "gs://test-bucket/hbase-export"; + private static final String TEST_SNAPSHOT_NAME = "test_snapshot"; + private static final String TEST_RESTORE_DIR = "gs://test-bucket/hbase-restore"; + + @Test + public void testBuildingHBaseSnapshotInputConfiguration() { + Configuration conf = + new HBaseSnapshotInputConfiguration( + ValueProvider.StaticValueProvider.of(TEST_PROJECT), + ValueProvider.StaticValueProvider.of(TEST_SNAPSHOT_DIR), + ValueProvider.StaticValueProvider.of(TEST_SNAPSHOT_NAME), + ValueProvider.StaticValueProvider.of(TEST_RESTORE_DIR)) + .getHbaseConf(); + assertEquals( + "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS", conf.get("fs.AbstractFileSystem.gs.impl")); + assertEquals(TEST_PROJECT, conf.get("fs.gs.project.id")); + assertEquals(TEST_SNAPSHOT_DIR, conf.get("hbase.rootdir")); + assertEquals( + TableSnapshotInputFormat.class, + conf.getClass( + "mapreduce.job.inputformat.class", TableSnapshotInputFormat.class, InputFormat.class)); + } +} diff --git a/pom.xml b/pom.xml index 790d7a0c06..fe179a02a1 100644 --- a/pom.xml +++ b/pom.xml @@ -79,7 +79,7 @@ limitations under the License. 
2.24.0 30.1-android - 20.0 + 29.0-jre 1.7.4 1.29.0 From 7d6349065736074896f0ddfb9cdd36a754d23eaa Mon Sep 17 00:00:00 2001 From: Cheng Li Date: Mon, 14 Dec 2020 20:18:43 +0000 Subject: [PATCH 02/36] Update document --- .../beam/hbasesnapshots/HBaseSnapshotInputConfiguration.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/HBaseSnapshotInputConfiguration.java b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/HBaseSnapshotInputConfiguration.java index 631496a81f..a9a4148b94 100644 --- a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/HBaseSnapshotInputConfiguration.java +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/HBaseSnapshotInputConfiguration.java @@ -50,7 +50,7 @@ class HBaseSnapshotInputConfiguration { private final Configuration hbaseConf; /** - * Constructs a new top level source. + * Constructs a HBase Configuration that could read HBase snapshot files from GCS Bucket. * * @param snapshotDir The path or pattern of the file(s) to read. 
*/ From 6c0ed5e73ff848740ed4272604964bcd31f87f84 Mon Sep 17 00:00:00 2001 From: Cheng Li Date: Mon, 14 Dec 2020 22:24:10 +0000 Subject: [PATCH 03/36] Change the conf type for HBaseSnapshotInputConfiguration for Serialization support --- .../beam/hbasesnapshots/HBaseSnapshotInputConfiguration.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/HBaseSnapshotInputConfiguration.java b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/HBaseSnapshotInputConfiguration.java index a9a4148b94..f09c0680f3 100644 --- a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/HBaseSnapshotInputConfiguration.java +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/HBaseSnapshotInputConfiguration.java @@ -47,7 +47,7 @@ class HBaseSnapshotInputConfiguration { private static final Log LOG = LogFactory.getLog(HBaseSnapshotInputConfiguration.class); private static final int BATCH_SIZE = 1000; - private final Configuration hbaseConf; + private final SerializableConfiguration hbaseConf; /** * Constructs a HBase Configuration that could read HBase snapshot files from GCS Bucket. 
From fc728a856f6e853d5a6d90c73bf1c7f6767e2896 Mon Sep 17 00:00:00 2001 From: Cheng Li Date: Tue, 15 Dec 2020 16:43:01 +0000 Subject: [PATCH 04/36] Rename HBASE_ROOT_PATH to HBASE_EXPORT_ROOT_PATH in example doc for it to be more intuitive --- .../beam/hbasesnapshots/ImportJobFromHbaseSnapshot.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/ImportJobFromHbaseSnapshot.java b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/ImportJobFromHbaseSnapshot.java index 91bda824a2..072ee2fc09 100644 --- a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/ImportJobFromHbaseSnapshot.java +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/ImportJobFromHbaseSnapshot.java @@ -54,7 +54,7 @@ * --project=$PROJECT \ * --bigtableInstanceId=$INSTANCE \ * --bigtableTableId=$TABLE \ - * --hbaseRootDir=gs://$HBASE_ROOT_PATH \ + * --hbaseRootDir=gs://$HBASE_EXPORT_ROOT_PATH \ * --snapshotName=$SNAPSHOT_NAME \ * --restoreDir=gs://$RESTORE_PATH * @@ -82,7 +82,7 @@ *
  * gcloud beta dataflow jobs run $JOB_NAME \
  *   --gcs-location gs://$TEMPLATE_PATH \
- *   --parameters bigtableProject=$PROJECT,bigtableInstanceId=$INSTANCE,bigtableTableId=$TABLE,hbaseRootDir=gs://$HBASE_ROOT_PATH,snapshotName=$SNAPSHOT_NAME,restoreDir=gs://$RESTORE_PATH
+ *   --parameters bigtableProject=$PROJECT,bigtableInstanceId=$INSTANCE,bigtableTableId=$TABLE,hbaseRootDir=gs://$HBASE_EXPORT_ROOT_PATH,snapshotName=$SNAPSHOT_NAME,restoreDir=gs://$RESTORE_PATH
  * 
*/ @InternalExtensionOnly From 316c0aafdbf0080981cd26cad9a1d322f2074c1f Mon Sep 17 00:00:00 2001 From: Cheng Li Date: Mon, 21 Dec 2020 16:44:18 +0000 Subject: [PATCH 05/36] Addressing the review comments: 1. use guava.version instead of beam-guava.version 2. fix typo --- bigtable-dataflow-parent/bigtable-beam-import/pom.xml | 4 ++-- .../beam/hbasesnapshots/HBaseSnapshotInputConfiguration.java | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/bigtable-dataflow-parent/bigtable-beam-import/pom.xml b/bigtable-dataflow-parent/bigtable-beam-import/pom.xml index 97b29208f0..54cabf6148 100644 --- a/bigtable-dataflow-parent/bigtable-beam-import/pom.xml +++ b/bigtable-dataflow-parent/bigtable-beam-import/pom.xml @@ -101,12 +101,12 @@ limitations under the License. ${hbase.version} - + com.google.auto.value auto-value diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/HBaseSnapshotInputConfiguration.java b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/HBaseSnapshotInputConfiguration.java index f09c0680f3..ec7b7fbf78 100644 --- a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/HBaseSnapshotInputConfiguration.java +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/HBaseSnapshotInputConfiguration.java @@ -78,13 +78,13 @@ class HBaseSnapshotInputConfiguration { ClientProtos.Scan proto = ProtobufUtil.toScan(new Scan().setBatch(BATCH_SIZE)); conf.set(TableInputFormat.SCAN, Base64.encodeBytes(proto.toByteArray())); - this.LOG.debug(conf); + // LOG.debug(conf); Job job = Job.getInstance(conf); // creates internal clone of hbaseConf TableSnapshotInputFormat.setInput( job, snapshotName.toString(), new Path(restoreDir.toString())); conf = job.getConfiguration(); // extract the modified clone } catch (Exception e) { - 
this.LOG.fatal(e); + LOG.fatal(e); } this.hbaseConf = new SerializableConfiguration(conf); } From 6188efab22f27dd571e68af594d422d8aeba7912 Mon Sep 17 00:00:00 2001 From: Cheng Li Date: Tue, 22 Dec 2020 21:12:09 +0000 Subject: [PATCH 06/36] Add the original Main.java under sequencefiles back --- .../bigtable/beam/sequencefiles/Main.java | 72 +++++++++++++++++++ 1 file changed, 72 insertions(+) create mode 100644 bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/sequencefiles/Main.java diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/sequencefiles/Main.java b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/sequencefiles/Main.java new file mode 100644 index 0000000000..2d5cc71a9e --- /dev/null +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/sequencefiles/Main.java @@ -0,0 +1,72 @@ +/* + * Copyright 2017 Google Inc. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.google.cloud.bigtable.beam.sequencefiles; + +import com.google.bigtable.repackaged.com.google.api.core.InternalApi; +import com.google.bigtable.repackaged.com.google.api.core.InternalExtensionOnly; +import java.io.File; +import java.net.URISyntaxException; +import java.util.Arrays; + +/** Entry point for create-table/import/export job submission. 
*/ +@InternalExtensionOnly +public class Main { + /** For internal use only - public for technical reasons. */ + @InternalApi("For internal usage only") + public Main() {} + + public static void main(String[] args) throws Exception { + if (args.length < 1) { + usage(); + System.exit(1); + } + + String[] subArgs = Arrays.copyOfRange(args, 1, args.length); + + switch (args[0]) { + case "export": + ExportJob.main(subArgs); + break; + case "import": + ImportJob.main(subArgs); + break; + case "create-table": + CreateTableHelper.main(subArgs); + break; + default: + usage(); + System.exit(1); + } + } + + private static void usage() { + String jarName; + + try { + jarName = + new File(Main.class.getProtectionDomain().getCodeSource().getLocation().toURI().getPath()) + .getName(); + } catch (URISyntaxException e) { + jarName = ""; + } + + System.out.printf( + "java -jar %s \n" + + "Where can be 'export', 'import' or 'create-table'. To get further help, run: \n" + + "java -jar %s --help\n", + jarName, jarName); + } +} From f621dc0272c39b95566e2182a843c20315788875 Mon Sep 17 00:00:00 2001 From: Cheng Li Date: Mon, 28 Dec 2020 22:04:19 +0000 Subject: [PATCH 07/36] gcs connector still requires non-android guava version --- bigtable-dataflow-parent/bigtable-beam-import/pom.xml | 2 +- pom.xml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/bigtable-dataflow-parent/bigtable-beam-import/pom.xml b/bigtable-dataflow-parent/bigtable-beam-import/pom.xml index 54cabf6148..fec37e287f 100644 --- a/bigtable-dataflow-parent/bigtable-beam-import/pom.xml +++ b/bigtable-dataflow-parent/bigtable-beam-import/pom.xml @@ -117,7 +117,7 @@ limitations under the License. com.google.guava guava - ${guava.version} + ${gcs-guava.version} diff --git a/pom.xml b/pom.xml index fe179a02a1..fb77fefd26 100644 --- a/pom.xml +++ b/pom.xml @@ -79,7 +79,7 @@ limitations under the License. 
2.24.0 30.1-android - 29.0-jre + 29.0-jre 1.7.4 1.29.0 From 68d88f465a0cd6317b48975224c4914e9d0602ee Mon Sep 17 00:00:00 2001 From: Cheng Li Date: Tue, 20 Oct 2020 19:48:50 +0000 Subject: [PATCH 08/36] Support import from HBase snapshot New files for configuring HBaseSnapshotInputFormat resovle version conflict and upgrade Beam version to 2.24.0 revert disk option change, not enough quota Code reorg code reduction Refactor naming Add integration config Add unit test for HBaseSnapshotInputConfiguration Set up skeleton for integration testing Ship test data with code, integration tests pass Clean up code for PR Add HBase commands that generates our test snapshot Addressing review comments 1. revert pom file overrides for SkipITs 2. Store SerializableConfiguration as member variable 3. Rever log4j.properties 4. Disable BIGTABLE_BULK_AUTOFLUSH_MS_KEY to prevent bulk mutation failures failing the jobs. --- .../bigtable-beam-import/pom.xml | 6 -- .../HBaseSnapshotInputConfiguration.java | 27 ------- .../bigtable/beam/sequencefiles/Main.java | 72 ------------------- 3 files changed, 105 deletions(-) delete mode 100644 bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/sequencefiles/Main.java diff --git a/bigtable-dataflow-parent/bigtable-beam-import/pom.xml b/bigtable-dataflow-parent/bigtable-beam-import/pom.xml index fec37e287f..cb20c36528 100644 --- a/bigtable-dataflow-parent/bigtable-beam-import/pom.xml +++ b/bigtable-dataflow-parent/bigtable-beam-import/pom.xml @@ -101,12 +101,6 @@ limitations under the License. 
${hbase.version} - com.google.auto.value auto-value diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/HBaseSnapshotInputConfiguration.java b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/HBaseSnapshotInputConfiguration.java index ec7b7fbf78..0ede2c3214 100644 --- a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/HBaseSnapshotInputConfiguration.java +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/HBaseSnapshotInputConfiguration.java @@ -51,33 +51,6 @@ class HBaseSnapshotInputConfiguration { /** * Constructs a HBase Configuration that could read HBase snapshot files from GCS Bucket. - * - * @param snapshotDir The path or pattern of the file(s) to read. - */ - HBaseSnapshotInputConfiguration( - ValueProvider gcsProjectId, - ValueProvider snapshotDir, - ValueProvider snapshotName, - ValueProvider restoreDir) { - - Preconditions.checkArgument( - snapshotDir.toString().startsWith("gs://"), - "snapshot folder must be hosted in a GCS bucket "); - - Configuration conf = HBaseConfiguration.create(); - try { - conf.set("hbase.rootdir", snapshotDir.toString()); - conf.set("fs.AbstractFileSystem.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS"); - conf.set("fs.gs.project.id", gcsProjectId.toString()); - conf.set("fs.defaultFS", snapshotDir.toString()); - conf.set("google.cloud.auth.service.account.enable", "true"); - conf.setClass( - "mapreduce.job.inputformat.class", TableSnapshotInputFormat.class, InputFormat.class); - conf.setClass("key.class", ImmutableBytesWritable.class, Writable.class); - conf.setClass("value.class", Result.class, Object.class); - ClientProtos.Scan proto = ProtobufUtil.toScan(new Scan().setBatch(BATCH_SIZE)); - conf.set(TableInputFormat.SCAN, Base64.encodeBytes(proto.toByteArray())); - // 
LOG.debug(conf); Job job = Job.getInstance(conf); // creates internal clone of hbaseConf TableSnapshotInputFormat.setInput( diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/sequencefiles/Main.java b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/sequencefiles/Main.java deleted file mode 100644 index 2d5cc71a9e..0000000000 --- a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/sequencefiles/Main.java +++ /dev/null @@ -1,72 +0,0 @@ -/* - * Copyright 2017 Google Inc. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package com.google.cloud.bigtable.beam.sequencefiles; - -import com.google.bigtable.repackaged.com.google.api.core.InternalApi; -import com.google.bigtable.repackaged.com.google.api.core.InternalExtensionOnly; -import java.io.File; -import java.net.URISyntaxException; -import java.util.Arrays; - -/** Entry point for create-table/import/export job submission. */ -@InternalExtensionOnly -public class Main { - /** For internal use only - public for technical reasons. 
*/ - @InternalApi("For internal usage only") - public Main() {} - - public static void main(String[] args) throws Exception { - if (args.length < 1) { - usage(); - System.exit(1); - } - - String[] subArgs = Arrays.copyOfRange(args, 1, args.length); - - switch (args[0]) { - case "export": - ExportJob.main(subArgs); - break; - case "import": - ImportJob.main(subArgs); - break; - case "create-table": - CreateTableHelper.main(subArgs); - break; - default: - usage(); - System.exit(1); - } - } - - private static void usage() { - String jarName; - - try { - jarName = - new File(Main.class.getProtectionDomain().getCodeSource().getLocation().toURI().getPath()) - .getName(); - } catch (URISyntaxException e) { - jarName = ""; - } - - System.out.printf( - "java -jar %s \n" - + "Where can be 'export', 'import' or 'create-table'. To get further help, run: \n" - + "java -jar %s --help\n", - jarName, jarName); - } -} From 53f73bc8419a8d4319f839f61e78db187ebbdc02 Mon Sep 17 00:00:00 2001 From: Cheng Li Date: Mon, 21 Dec 2020 16:44:18 +0000 Subject: [PATCH 09/36] Addressing the review comments: 1. use guava.version instead of beam-guava.version 2. 
fix typo --- .../HBaseSnapshotInputConfiguration.java | 27 +++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/HBaseSnapshotInputConfiguration.java b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/HBaseSnapshotInputConfiguration.java index 0ede2c3214..ec7b7fbf78 100644 --- a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/HBaseSnapshotInputConfiguration.java +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/HBaseSnapshotInputConfiguration.java @@ -51,6 +51,33 @@ class HBaseSnapshotInputConfiguration { /** * Constructs a HBase Configuration that could read HBase snapshot files from GCS Bucket. + * + * @param snapshotDir The path or pattern of the file(s) to read. + */ + HBaseSnapshotInputConfiguration( + ValueProvider gcsProjectId, + ValueProvider snapshotDir, + ValueProvider snapshotName, + ValueProvider restoreDir) { + + Preconditions.checkArgument( + snapshotDir.toString().startsWith("gs://"), + "snapshot folder must be hosted in a GCS bucket "); + + Configuration conf = HBaseConfiguration.create(); + try { + conf.set("hbase.rootdir", snapshotDir.toString()); + conf.set("fs.AbstractFileSystem.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS"); + conf.set("fs.gs.project.id", gcsProjectId.toString()); + conf.set("fs.defaultFS", snapshotDir.toString()); + conf.set("google.cloud.auth.service.account.enable", "true"); + conf.setClass( + "mapreduce.job.inputformat.class", TableSnapshotInputFormat.class, InputFormat.class); + conf.setClass("key.class", ImmutableBytesWritable.class, Writable.class); + conf.setClass("value.class", Result.class, Object.class); + ClientProtos.Scan proto = ProtobufUtil.toScan(new Scan().setBatch(BATCH_SIZE)); + 
conf.set(TableInputFormat.SCAN, Base64.encodeBytes(proto.toByteArray())); + // LOG.debug(conf); Job job = Job.getInstance(conf); // creates internal clone of hbaseConf TableSnapshotInputFormat.setInput( From cfe86e253555ff1191bb88025c874f521c57e300 Mon Sep 17 00:00:00 2001 From: Cheng Li Date: Tue, 22 Dec 2020 20:06:05 +0000 Subject: [PATCH 10/36] switch HBasesnapshotConfiguration to a builder class --- ...a => HBaseSnapshotInputConfigBuilder.java} | 66 +++++++++-------- .../ImportJobFromHbaseSnapshot.java | 12 ++-- .../bigtable/beam/sequencefiles/Main.java | 72 +++++++++++++++++++ ... HBaseSnapshotInputConfigBuilderTest.java} | 17 +++-- 4 files changed, 124 insertions(+), 43 deletions(-) rename bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/{HBaseSnapshotInputConfiguration.java => HBaseSnapshotInputConfigBuilder.java} (66%) create mode 100644 bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/sequencefiles/Main.java rename bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/hbasesnapshots/{HBaseSnapshotInputConfigurationTest.java => HBaseSnapshotInputConfigBuilderTest.java} (76%) diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/HBaseSnapshotInputConfiguration.java b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/HBaseSnapshotInputConfigBuilder.java similarity index 66% rename from bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/HBaseSnapshotInputConfiguration.java rename to bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/HBaseSnapshotInputConfigBuilder.java index ec7b7fbf78..21091c6e8b 100644 --- 
a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/HBaseSnapshotInputConfiguration.java +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/HBaseSnapshotInputConfigBuilder.java @@ -18,8 +18,6 @@ import static java.lang.System.*; import com.google.common.base.Preconditions; -import org.apache.beam.sdk.io.hadoop.SerializableConfiguration; -import org.apache.beam.sdk.options.ValueProvider; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configuration; @@ -42,34 +40,51 @@ * hosted in Google Cloud Storage(GCS) bucket via GCS connector. It uses {@link * TableSnapshotInputFormat} for reading HBase snapshots. */ -class HBaseSnapshotInputConfiguration { +class HBaseSnapshotInputConfigBuilder { - private static final Log LOG = LogFactory.getLog(HBaseSnapshotInputConfiguration.class); + private static final Log LOG = LogFactory.getLog(HBaseSnapshotInputConfigBuilder.class); private static final int BATCH_SIZE = 1000; - private final SerializableConfiguration hbaseConf; + private String projectId; + private String exportedSnapshotDir; + private String snapshotName; + private String restoreDir; - /** - * Constructs a HBase Configuration that could read HBase snapshot files from GCS Bucket. - * - * @param snapshotDir The path or pattern of the file(s) to read. 
- */ - HBaseSnapshotInputConfiguration( - ValueProvider gcsProjectId, - ValueProvider snapshotDir, - ValueProvider snapshotName, - ValueProvider restoreDir) { + public HBaseSnapshotInputConfigBuilder() {} + public HBaseSnapshotInputConfigBuilder setProjectId(String projectId) { + this.projectId = projectId; + return this; + } + + public HBaseSnapshotInputConfigBuilder setExportedSnapshotDir(String exportedSnapshotDir) { + this.exportedSnapshotDir = exportedSnapshotDir; + return this; + } + + public HBaseSnapshotInputConfigBuilder setSnapshotName(String snapshotName) { + this.snapshotName = snapshotName; + return this; + } + + public HBaseSnapshotInputConfigBuilder setRestoreDir(String restoreDir) { + this.restoreDir = restoreDir; + return this; + } + + public Configuration build() { + Preconditions.checkNotNull(projectId); + Preconditions.checkNotNull(exportedSnapshotDir); + Preconditions.checkNotNull(snapshotName); Preconditions.checkArgument( - snapshotDir.toString().startsWith("gs://"), - "snapshot folder must be hosted in a GCS bucket "); + exportedSnapshotDir.startsWith("gs://"), "snapshot folder must be hosted in a GCS bucket "); Configuration conf = HBaseConfiguration.create(); try { - conf.set("hbase.rootdir", snapshotDir.toString()); + conf.set("hbase.rootdir", exportedSnapshotDir); conf.set("fs.AbstractFileSystem.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS"); - conf.set("fs.gs.project.id", gcsProjectId.toString()); - conf.set("fs.defaultFS", snapshotDir.toString()); + conf.set("fs.gs.project.id", projectId); + conf.set("fs.defaultFS", exportedSnapshotDir); conf.set("google.cloud.auth.service.account.enable", "true"); conf.setClass( "mapreduce.job.inputformat.class", TableSnapshotInputFormat.class, InputFormat.class); @@ -80,16 +95,11 @@ class HBaseSnapshotInputConfiguration { // LOG.debug(conf); Job job = Job.getInstance(conf); // creates internal clone of hbaseConf - TableSnapshotInputFormat.setInput( - job, snapshotName.toString(), new 
Path(restoreDir.toString())); - conf = job.getConfiguration(); // extract the modified clone + TableSnapshotInputFormat.setInput(job, snapshotName, new Path(restoreDir)); + return job.getConfiguration(); // extract the modified clone } catch (Exception e) { LOG.fatal(e); } - this.hbaseConf = new SerializableConfiguration(conf); - } - - public Configuration getHbaseConf() { - return hbaseConf.get(); + return conf; } } diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/ImportJobFromHbaseSnapshot.java b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/ImportJobFromHbaseSnapshot.java index 072ee2fc09..6c48038fa2 100644 --- a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/ImportJobFromHbaseSnapshot.java +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/ImportJobFromHbaseSnapshot.java @@ -134,12 +134,12 @@ static Pipeline buildPipeline(ImportOptions opts) { "Read from HBase Snapshot", HadoopFormatIO.read() .withConfiguration( - new HBaseSnapshotInputConfiguration( - opts.getBigtableProject(), - opts.getHbaseRootDir(), - opts.getSnapshotName(), - opts.getRestoreDir()) - .getHbaseConf())) + new HBaseSnapshotInputConfigBuilder() + .setProjectId(opts.getBigtableProject().get()) + .setExportedSnapshotDir(opts.getHbaseRootDir().get()) + .setSnapshotName(opts.getSnapshotName().get()) + .setRestoreDir(opts.getRestoreDir().get()) + .build())) .apply("Create Mutations", ParDo.of(new HBaseResultToMutationFn())) .apply("Write to Bigtable", createSink(opts)); diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/sequencefiles/Main.java b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/sequencefiles/Main.java new file mode 100644 index 
0000000000..2d5cc71a9e --- /dev/null +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/sequencefiles/Main.java @@ -0,0 +1,72 @@ +/* + * Copyright 2017 Google Inc. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.google.cloud.bigtable.beam.sequencefiles; + +import com.google.bigtable.repackaged.com.google.api.core.InternalApi; +import com.google.bigtable.repackaged.com.google.api.core.InternalExtensionOnly; +import java.io.File; +import java.net.URISyntaxException; +import java.util.Arrays; + +/** Entry point for create-table/import/export job submission. */ +@InternalExtensionOnly +public class Main { + /** For internal use only - public for technical reasons. 
*/ + @InternalApi("For internal usage only") + public Main() {} + + public static void main(String[] args) throws Exception { + if (args.length < 1) { + usage(); + System.exit(1); + } + + String[] subArgs = Arrays.copyOfRange(args, 1, args.length); + + switch (args[0]) { + case "export": + ExportJob.main(subArgs); + break; + case "import": + ImportJob.main(subArgs); + break; + case "create-table": + CreateTableHelper.main(subArgs); + break; + default: + usage(); + System.exit(1); + } + } + + private static void usage() { + String jarName; + + try { + jarName = + new File(Main.class.getProtectionDomain().getCodeSource().getLocation().toURI().getPath()) + .getName(); + } catch (URISyntaxException e) { + jarName = ""; + } + + System.out.printf( + "java -jar %s \n" + + "Where can be 'export', 'import' or 'create-table'. To get further help, run: \n" + + "java -jar %s --help\n", + jarName, jarName); + } +} diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/hbasesnapshots/HBaseSnapshotInputConfigurationTest.java b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/hbasesnapshots/HBaseSnapshotInputConfigBuilderTest.java similarity index 76% rename from bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/hbasesnapshots/HBaseSnapshotInputConfigurationTest.java rename to bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/hbasesnapshots/HBaseSnapshotInputConfigBuilderTest.java index d5290a8562..0c8c6cd139 100644 --- a/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/hbasesnapshots/HBaseSnapshotInputConfigurationTest.java +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/hbasesnapshots/HBaseSnapshotInputConfigBuilderTest.java @@ -17,13 +17,12 @@ import static org.junit.Assert.assertEquals; -import 
org.apache.beam.sdk.options.ValueProvider; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hbase.mapreduce.TableSnapshotInputFormat; import org.apache.hadoop.mapreduce.InputFormat; import org.junit.Test; -public class HBaseSnapshotInputConfigurationTest { +public class HBaseSnapshotInputConfigBuilderTest { private static final String TEST_PROJECT = "test_project"; private static final String TEST_SNAPSHOT_DIR = "gs://test-bucket/hbase-export"; @@ -31,14 +30,14 @@ public class HBaseSnapshotInputConfigurationTest { private static final String TEST_RESTORE_DIR = "gs://test-bucket/hbase-restore"; @Test - public void testBuildingHBaseSnapshotInputConfiguration() { + public void testBuildingHBaseSnapshotInputConfigBuilder() { Configuration conf = - new HBaseSnapshotInputConfiguration( - ValueProvider.StaticValueProvider.of(TEST_PROJECT), - ValueProvider.StaticValueProvider.of(TEST_SNAPSHOT_DIR), - ValueProvider.StaticValueProvider.of(TEST_SNAPSHOT_NAME), - ValueProvider.StaticValueProvider.of(TEST_RESTORE_DIR)) - .getHbaseConf(); + new HBaseSnapshotInputConfigBuilder() + .setProjectId(TEST_PROJECT) + .setExportedSnapshotDir(TEST_SNAPSHOT_DIR) + .setSnapshotName(TEST_SNAPSHOT_NAME) + .setRestoreDir(TEST_RESTORE_DIR) + .build(); assertEquals( "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS", conf.get("fs.AbstractFileSystem.gs.impl")); assertEquals(TEST_PROJECT, conf.get("fs.gs.project.id")); From f6cdabf34f24b3d179742fb014d80adca9f62d7c Mon Sep 17 00:00:00 2001 From: Cheng Li Date: Wed, 30 Dec 2020 01:35:43 +0000 Subject: [PATCH 11/36] use DataflowRunner instead of DirectRunner for integration tests --- bigtable-dataflow-parent/bigtable-beam-import/pom.xml | 1 + .../cloud/bigtable/beam/hbasesnapshots/EndToEndIT.java | 8 ++++++-- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/bigtable-dataflow-parent/bigtable-beam-import/pom.xml b/bigtable-dataflow-parent/bigtable-beam-import/pom.xml index cb20c36528..235822c48b 100644 --- 
a/bigtable-dataflow-parent/bigtable-beam-import/pom.xml +++ b/bigtable-dataflow-parent/bigtable-beam-import/pom.xml @@ -26,6 +26,7 @@ limitations under the License. com.google.cloud.bigtable.beam.Main + false diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/hbasesnapshots/EndToEndIT.java b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/hbasesnapshots/EndToEndIT.java index cd142bd790..66ce467135 100644 --- a/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/hbasesnapshots/EndToEndIT.java +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/hbasesnapshots/EndToEndIT.java @@ -25,8 +25,8 @@ import java.util.ArrayList; import java.util.List; import java.util.UUID; +import org.apache.beam.runners.dataflow.DataflowRunner; import org.apache.beam.runners.dataflow.options.DataflowPipelineOptions; -import org.apache.beam.runners.direct.DirectRunner; import org.apache.beam.sdk.PipelineResult.State; import org.apache.beam.sdk.extensions.gcp.options.GcpOptions; import org.apache.beam.sdk.extensions.gcp.util.GcsUtil; @@ -64,6 +64,7 @@ public class EndToEndIT { private static final String TEST_SNAPSHOT_NAME = "test-snapshot"; // Location of test data hosted on Google Cloud Storage, for on-cloud dataflow tests. private static final String CLOUD_TEST_DATA_FOLDER = "cloud.test.data.folder"; + private static final String DATAFLOW_REGION = "region"; // Column family name used in all test bigtables. 
private static final String CF = "cf"; @@ -75,6 +76,7 @@ public class EndToEndIT { private String projectId; private String instanceId; private String tableId; + private String region; private GcsUtil gcsUtil; private String dataflowStagingLocation; @@ -90,6 +92,7 @@ public void setup() throws Exception { projectId = getTestProperty(BigtableOptionsFactory.PROJECT_ID_KEY); instanceId = getTestProperty(BigtableOptionsFactory.INSTANCE_ID_KEY); dataflowStagingLocation = getTestProperty(GOOGLE_DATAFLOW_STAGING_LOCATION); + region = getTestProperty(DATAFLOW_REGION); String cloudTestDataFolder = getTestProperty(CLOUD_TEST_DATA_FOLDER); if (!cloudTestDataFolder.endsWith(File.separator)) { cloudTestDataFolder = cloudTestDataFolder + File.separator; @@ -159,10 +162,11 @@ public void testHBaseSnapshotImport() throws Exception { System.out.println("DEBUG (import snapshot) ==>"); DataflowPipelineOptions importPipelineOpts = PipelineOptionsFactory.as(DataflowPipelineOptions.class); - importPipelineOpts.setRunner(DirectRunner.class); + importPipelineOpts.setRunner(DataflowRunner.class); importPipelineOpts.setGcpTempLocation(dataflowStagingLocation); importPipelineOpts.setNumWorkers(1); importPipelineOpts.setProject(projectId); + importPipelineOpts.setRegion(region); ImportJobFromHbaseSnapshot.ImportOptions importOpts = importPipelineOpts.as(ImportJobFromHbaseSnapshot.ImportOptions.class); From bf7409e6739f9fda5e8cbfc9e5ddc2691b41440a Mon Sep 17 00:00:00 2001 From: Cheng Li Date: Wed, 30 Dec 2020 01:37:49 +0000 Subject: [PATCH 12/36] revert pom file override --- bigtable-dataflow-parent/bigtable-beam-import/pom.xml | 1 - 1 file changed, 1 deletion(-) diff --git a/bigtable-dataflow-parent/bigtable-beam-import/pom.xml b/bigtable-dataflow-parent/bigtable-beam-import/pom.xml index 235822c48b..cb20c36528 100644 --- a/bigtable-dataflow-parent/bigtable-beam-import/pom.xml +++ b/bigtable-dataflow-parent/bigtable-beam-import/pom.xml @@ -26,7 +26,6 @@ limitations under the License. 
com.google.cloud.bigtable.beam.Main - false From fa0d8a8f1f2712358b18b2fc21330aa51bc61b4c Mon Sep 17 00:00:00 2001 From: Cheng Li Date: Wed, 30 Dec 2020 17:20:10 +0000 Subject: [PATCH 13/36] Remove all ValueProvider for now --- .../ImportJobFromHbaseSnapshot.java | 27 +++++++++++-------- .../beam/hbasesnapshots/EndToEndIT.java | 7 ++--- 2 files changed, 20 insertions(+), 14 deletions(-) diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/ImportJobFromHbaseSnapshot.java b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/ImportJobFromHbaseSnapshot.java index 6c48038fa2..cb3f444021 100644 --- a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/ImportJobFromHbaseSnapshot.java +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/ImportJobFromHbaseSnapshot.java @@ -29,7 +29,6 @@ import org.apache.beam.sdk.io.hadoop.format.HadoopFormatIO; import org.apache.beam.sdk.options.Description; import org.apache.beam.sdk.options.PipelineOptionsFactory; -import org.apache.beam.sdk.options.ValueProvider; import org.apache.beam.sdk.transforms.PTransform; import org.apache.beam.sdk.transforms.ParDo; import org.apache.beam.sdk.values.PCollection; @@ -90,23 +89,29 @@ public class ImportJobFromHbaseSnapshot { private static final Log LOG = LogFactory.getLog(ImportJobFromHbaseSnapshot.class); public interface ImportOptions extends ImportJob.ImportOptions { + @Description("The GCP project id for the GCS bucket") + String getGcsProject(); + + @SuppressWarnings("unused") + void setGcsProject(String gcsProjectId); + @Description("The HBase root dir where HBase snapshot files resides.") - ValueProvider getHbaseRootDir(); + String getHbaseRootDir(); @SuppressWarnings("unused") - void setHbaseRootDir(ValueProvider hbaseRootDir); + void setHbaseRootDir(String 
hbaseRootDir); @Description("Temp location for restoring snapshots") - ValueProvider getRestoreDir(); + String getRestoreDir(); @SuppressWarnings("unused") - void setRestoreDir(ValueProvider restoreDir); + void setRestoreDir(String restoreDir); @Description("Snapshot name") - ValueProvider getSnapshotName(); + String getSnapshotName(); @SuppressWarnings("unused") - void setSnapshotName(ValueProvider snapshotName); + void setSnapshotName(String snapshotName); } public static void main(String[] args) { @@ -135,10 +140,10 @@ static Pipeline buildPipeline(ImportOptions opts) { HadoopFormatIO.read() .withConfiguration( new HBaseSnapshotInputConfigBuilder() - .setProjectId(opts.getBigtableProject().get()) - .setExportedSnapshotDir(opts.getHbaseRootDir().get()) - .setSnapshotName(opts.getSnapshotName().get()) - .setRestoreDir(opts.getRestoreDir().get()) + .setProjectId(opts.getGcsProject()) + .setExportedSnapshotDir(opts.getHbaseRootDir()) + .setSnapshotName(opts.getSnapshotName()) + .setRestoreDir(opts.getRestoreDir()) .build())) .apply("Create Mutations", ParDo.of(new HBaseResultToMutationFn())) .apply("Write to Bigtable", createSink(opts)); diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/hbasesnapshots/EndToEndIT.java b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/hbasesnapshots/EndToEndIT.java index 66ce467135..79153de2ea 100644 --- a/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/hbasesnapshots/EndToEndIT.java +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/hbasesnapshots/EndToEndIT.java @@ -177,9 +177,10 @@ public void testHBaseSnapshotImport() throws Exception { importOpts.setBigtableAppProfileId(null); // setup Hbase snapshot info - importOpts.setHbaseRootDir(StaticValueProvider.of(hbaseSnapshotDir)); - importOpts.setRestoreDir(StaticValueProvider.of(restoreDir)); - 
importOpts.setSnapshotName(StaticValueProvider.of(TEST_SNAPSHOT_NAME)); + importOpts.setGcsProject(projectId); + importOpts.setHbaseRootDir(hbaseSnapshotDir); + importOpts.setRestoreDir(restoreDir); + importOpts.setSnapshotName(TEST_SNAPSHOT_NAME); // run pipeline State state = ImportJobFromHbaseSnapshot.buildPipeline(importOpts).run().waitUntilFinish(); From 1ca1fd81a88857c6bbf2cadb75db3b1ab7777dec Mon Sep 17 00:00:00 2001 From: Cheng Li Date: Wed, 30 Dec 2020 17:33:48 +0000 Subject: [PATCH 14/36] Add gcsProject parameter and remove template related document --- .../ImportJobFromHbaseSnapshot.java | 27 +------------------ 1 file changed, 1 insertion(+), 26 deletions(-) diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/ImportJobFromHbaseSnapshot.java b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/ImportJobFromHbaseSnapshot.java index cb3f444021..b931ed96bc 100644 --- a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/ImportJobFromHbaseSnapshot.java +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/ImportJobFromHbaseSnapshot.java @@ -53,36 +53,11 @@ * --project=$PROJECT \ * --bigtableInstanceId=$INSTANCE \ * --bigtableTableId=$TABLE \ + * --gcsProject=$PROJECT \ * --hbaseRootDir=gs://$HBASE_EXPORT_ROOT_PATH \ * --snapshotName=$SNAPSHOT_NAME \ * --restoreDir=gs://$RESTORE_PATH * - * - *

Execute the following command to create the Dataflow template: - * - *

- * mvn compile exec:java \
- *   -DmainClass=com.google.cloud.bigtable.beam.hbasesnapshots.ImportJobFromHbaseSnapshot \
- *   -Dexec.args="--runner=DataflowRunner \
- *                --project=$PROJECT \
- *                --stagingLocation=gs://$STAGING_PATH \
- *                --templateLocation=gs://$TEMPLATE_PATH \
- *                --wait=false"
- * 
- * - *

There are a few ways to run the pipeline using the template. See Dataflow doc for details: - * https://cloud.google.com/dataflow/docs/templates/executing-templates. Optionally, you can upload - * a metadata file that contains information about the runtime parameters that can be used for - * parameter validation purpose and more. A sample metadata file can be found at - * "src/main/resources/ImportJob_metadata". - * - *

An example using gcloud command line: - * - *

- * gcloud beta dataflow jobs run $JOB_NAME \
- *   --gcs-location gs://$TEMPLATE_PATH \
- *   --parameters bigtableProject=$PROJECT,bigtableInstanceId=$INSTANCE,bigtableTableId=$TABLE,hbaseRootDir=gs://$HBASE_EXPORT_ROOT_PATH,snapshotName=$SNAPSHOT_NAME,restoreDir=gs://$RESTORE_PATH
- * 
*/ @InternalExtensionOnly public class ImportJobFromHbaseSnapshot { From f5b086f8ea32d24b8831f29df0319f96f4f4e55a Mon Sep 17 00:00:00 2001 From: Cheng Li Date: Wed, 30 Dec 2020 18:38:05 +0000 Subject: [PATCH 15/36] recover the dependency missed in the rebase --- bigtable-dataflow-parent/bigtable-beam-import/pom.xml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/bigtable-dataflow-parent/bigtable-beam-import/pom.xml b/bigtable-dataflow-parent/bigtable-beam-import/pom.xml index cb20c36528..357b3e22fd 100644 --- a/bigtable-dataflow-parent/bigtable-beam-import/pom.xml +++ b/bigtable-dataflow-parent/bigtable-beam-import/pom.xml @@ -75,6 +75,13 @@ limitations under the License. beam-sdks-java-core ${beam.version}
+ + org.apache.beam + + beam-sdks-java-extensions-google-cloud-platform-core + + ${beam.version} + org.apache.beam beam-sdks-java-io-hadoop-common From d0389c06a805b21530066ac3356c41265e238b1f Mon Sep 17 00:00:00 2001 From: Cheng Li Date: Tue, 5 Jan 2021 21:12:49 +0000 Subject: [PATCH 16/36] Update new files using latest header comment format and update year to 2021 --- .../src/main/java/com/google/cloud/bigtable/beam/Main.java | 2 +- .../beam/hbasesnapshots/HBaseSnapshotInputConfigBuilder.java | 2 +- .../beam/hbasesnapshots/ImportJobFromHbaseSnapshot.java | 2 +- .../google/cloud/bigtable/beam/hbasesnapshots/EndToEndIT.java | 2 +- .../hbasesnapshots/HBaseSnapshotInputConfigBuilderTest.java | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/Main.java b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/Main.java index 52fee350d7..32e7eaa665 100644 --- a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/Main.java +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/Main.java @@ -1,5 +1,5 @@ /* - * Copyright 2017 Google Inc. All Rights Reserved. + * Copyright 2021 Google LLC * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/HBaseSnapshotInputConfigBuilder.java b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/HBaseSnapshotInputConfigBuilder.java index 21091c6e8b..46c0330550 100644 --- a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/HBaseSnapshotInputConfigBuilder.java +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/HBaseSnapshotInputConfigBuilder.java @@ -1,5 +1,5 @@ /* - * Copyright 2017 Google Inc. All Rights Reserved. + * Copyright 2021 Google LLC * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/ImportJobFromHbaseSnapshot.java b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/ImportJobFromHbaseSnapshot.java index b931ed96bc..8c271ff212 100644 --- a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/ImportJobFromHbaseSnapshot.java +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/ImportJobFromHbaseSnapshot.java @@ -1,5 +1,5 @@ /* - * Copyright 2017 Google Inc. All Rights Reserved. + * Copyright 2021 Google LLC * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/hbasesnapshots/EndToEndIT.java b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/hbasesnapshots/EndToEndIT.java index 79153de2ea..b30823eb3d 100644 --- a/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/hbasesnapshots/EndToEndIT.java +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/hbasesnapshots/EndToEndIT.java @@ -1,5 +1,5 @@ /* - * Copyright 2020 Google Inc. All Rights Reserved. + * Copyright 2021 Google LLC * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/hbasesnapshots/HBaseSnapshotInputConfigBuilderTest.java b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/hbasesnapshots/HBaseSnapshotInputConfigBuilderTest.java index 0c8c6cd139..8cd58a9da8 100644 --- a/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/hbasesnapshots/HBaseSnapshotInputConfigBuilderTest.java +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/hbasesnapshots/HBaseSnapshotInputConfigBuilderTest.java @@ -1,5 +1,5 @@ /* - * Copyright 2020 Google Inc. All Rights Reserved. + * Copyright 2021 Google LLC * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
From 27e36571f39b788e3f56e6fa0847d9f4111ffea8 Mon Sep 17 00:00:00 2001 From: Cheng Li Date: Fri, 8 Jan 2021 22:34:11 +0000 Subject: [PATCH 17/36] Remove workaround for BIGTABLE_BULK_AUTOFLUSH_MS_KEY --- .../beam/hbasesnapshots/ImportJobFromHbaseSnapshot.java | 6 ------ 1 file changed, 6 deletions(-) diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/ImportJobFromHbaseSnapshot.java b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/ImportJobFromHbaseSnapshot.java index 8c271ff212..c232f4826a 100644 --- a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/ImportJobFromHbaseSnapshot.java +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/ImportJobFromHbaseSnapshot.java @@ -22,7 +22,6 @@ import com.google.cloud.bigtable.beam.sequencefiles.HBaseResultToMutationFn; import com.google.cloud.bigtable.beam.sequencefiles.ImportJob; import com.google.cloud.bigtable.beam.sequencefiles.Utils; -import com.google.cloud.bigtable.hbase.BigtableOptionsFactory; import com.google.common.annotations.VisibleForTesting; import org.apache.beam.sdk.Pipeline; import org.apache.beam.sdk.PipelineResult; @@ -128,11 +127,6 @@ static Pipeline buildPipeline(ImportOptions opts) { static PTransform, PDone> createSink(ImportOptions opts) { CloudBigtableTableConfiguration config = TemplateUtils.BuildImportConfig(opts); - config = - config - .toBuilder() - .withConfiguration(BigtableOptionsFactory.BIGTABLE_BULK_AUTOFLUSH_MS_KEY, "0") - .build(); return CloudBigtableIO.writeToTable(config); } } From 0a5ced1272d60ca2e243df9f2afc502dcab552bc Mon Sep 17 00:00:00 2001 From: Cheng Li Date: Mon, 11 Jan 2021 16:54:51 +0000 Subject: [PATCH 18/36] Exclude hbase-shaded-client --- bigtable-dataflow-parent/bigtable-beam-import/pom.xml | 10 ++++------ 1 file changed, 4 
insertions(+), 6 deletions(-) diff --git a/bigtable-dataflow-parent/bigtable-beam-import/pom.xml b/bigtable-dataflow-parent/bigtable-beam-import/pom.xml index 357b3e22fd..9c13e1c6d7 100644 --- a/bigtable-dataflow-parent/bigtable-beam-import/pom.xml +++ b/bigtable-dataflow-parent/bigtable-beam-import/pom.xml @@ -67,6 +67,10 @@ limitations under the License. io.opencensus * + + org.apache.hbase + hbase-shaded-client + @@ -93,12 +97,6 @@ limitations under the License. ${beam.version} - - org.apache.hbase - hbase-shaded-client - ${hbase.version} - - From 19b2a5c0cfcde513157dabdf1b5505418996c1c2 Mon Sep 17 00:00:00 2001 From: Cheng Li Date: Tue, 12 Jan 2021 16:52:17 +0000 Subject: [PATCH 19/36] Clean up all transitive depdendencies on hbase-shaded-client --- bigtable-dataflow-parent/bigtable-beam-import/pom.xml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/bigtable-dataflow-parent/bigtable-beam-import/pom.xml b/bigtable-dataflow-parent/bigtable-beam-import/pom.xml index 9c13e1c6d7..d824e7ef22 100644 --- a/bigtable-dataflow-parent/bigtable-beam-import/pom.xml +++ b/bigtable-dataflow-parent/bigtable-beam-import/pom.xml @@ -49,6 +49,12 @@ limitations under the License. 
${project.groupId} bigtable-hbase-beam ${project.version} + + + org.apache.hbase + hbase-shaded-client + + com.google.cloud.bigtable From 61987c36a08d30ccad26ec016140d023c9b6d5b2 Mon Sep 17 00:00:00 2001 From: Cheng Li Date: Thu, 14 Jan 2021 00:05:22 +0000 Subject: [PATCH 20/36] Add document for integration test generation instructions remove unnecessary code --- .../com/google/cloud/bigtable/beam/Main.java | 2 +- .../HBaseSnapshotInputConfigBuilder.java | 18 +++- .../ImportJobFromHbaseSnapshot.java | 2 +- .../beam/hbasesnapshots/EndToEndIT.java | 90 +++++++++---------- .../src/test/resources/README.md | 18 ++++ 5 files changed, 78 insertions(+), 52 deletions(-) create mode 100644 bigtable-dataflow-parent/bigtable-beam-import/src/test/resources/README.md diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/Main.java b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/Main.java index 32e7eaa665..b346b90837 100644 --- a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/Main.java +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/Main.java @@ -27,7 +27,7 @@ /** Entry point for create-table/import/export job submission. */ @InternalExtensionOnly -public class Main { +final class Main { /** For internal use only - public for technical reasons. 
*/ @InternalApi("For internal usage only") public Main() {} diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/HBaseSnapshotInputConfigBuilder.java b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/HBaseSnapshotInputConfigBuilder.java index 46c0330550..055655b0c7 100644 --- a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/HBaseSnapshotInputConfigBuilder.java +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/HBaseSnapshotInputConfigBuilder.java @@ -15,8 +15,6 @@ */ package com.google.cloud.bigtable.beam.hbasesnapshots; -import static java.lang.System.*; - import com.google.common.base.Preconditions; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; @@ -52,21 +50,34 @@ class HBaseSnapshotInputConfigBuilder { public HBaseSnapshotInputConfigBuilder() {} + /* + * Set the project id use to access the GCS bucket with HBase snapshot data to be imported + */ public HBaseSnapshotInputConfigBuilder setProjectId(String projectId) { this.projectId = projectId; return this; } + /* + * Set the GCS path where the HBase snapshot data is located + */ public HBaseSnapshotInputConfigBuilder setExportedSnapshotDir(String exportedSnapshotDir) { this.exportedSnapshotDir = exportedSnapshotDir; return this; } + /* + * Set the name of the snapshot to be imported + */ public HBaseSnapshotInputConfigBuilder setSnapshotName(String snapshotName) { this.snapshotName = snapshotName; return this; } + /* + * Set the temporal GCS path used by TableSnapshotInputFormat while reading the HBase snapshot + * This path should not be under {@code exportedSnapshotDir} + */ public HBaseSnapshotInputConfigBuilder setRestoreDir(String restoreDir) { this.restoreDir = restoreDir; return this; @@ -76,7 +87,7 @@ public Configuration 
build() { Preconditions.checkNotNull(projectId); Preconditions.checkNotNull(exportedSnapshotDir); Preconditions.checkNotNull(snapshotName); - Preconditions.checkArgument( + Preconditions.checkState( exportedSnapshotDir.startsWith("gs://"), "snapshot folder must be hosted in a GCS bucket "); Configuration conf = HBaseConfiguration.create(); @@ -93,7 +104,6 @@ public Configuration build() { ClientProtos.Scan proto = ProtobufUtil.toScan(new Scan().setBatch(BATCH_SIZE)); conf.set(TableInputFormat.SCAN, Base64.encodeBytes(proto.toByteArray())); - // LOG.debug(conf); Job job = Job.getInstance(conf); // creates internal clone of hbaseConf TableSnapshotInputFormat.setInput(job, snapshotName, new Path(restoreDir)); return job.getConfiguration(); // extract the modified clone diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/ImportJobFromHbaseSnapshot.java b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/ImportJobFromHbaseSnapshot.java index c232f4826a..42bf6a3b04 100644 --- a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/ImportJobFromHbaseSnapshot.java +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/ImportJobFromHbaseSnapshot.java @@ -75,7 +75,7 @@ public interface ImportOptions extends ImportJob.ImportOptions { @SuppressWarnings("unused") void setHbaseRootDir(String hbaseRootDir); - @Description("Temp location for restoring snapshots") + @Description("Temporal location for restoring snapshots") String getRestoreDir(); @SuppressWarnings("unused") diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/hbasesnapshots/EndToEndIT.java b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/hbasesnapshots/EndToEndIT.java index 
b30823eb3d..d270e8b8db 100644 --- a/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/hbasesnapshots/EndToEndIT.java +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/hbasesnapshots/EndToEndIT.java @@ -148,51 +148,49 @@ public void teardown() throws IOException { @Test public void testHBaseSnapshotImport() throws Exception { - try (Connection connection = BigtableConfiguration.connect(projectId, instanceId)) { - // Crete table - System.out.println("DEBUG (create test table) ==>"); - TableName tableName = TableName.valueOf(tableId); - HTableDescriptor descriptor = new HTableDescriptor(tableName); - - descriptor.addFamily(new HColumnDescriptor(CF)); - - connection.getAdmin().createTable(descriptor, SnapshotTestingUtils.getSplitKeys()); - - // Start import - System.out.println("DEBUG (import snapshot) ==>"); - DataflowPipelineOptions importPipelineOpts = - PipelineOptionsFactory.as(DataflowPipelineOptions.class); - importPipelineOpts.setRunner(DataflowRunner.class); - importPipelineOpts.setGcpTempLocation(dataflowStagingLocation); - importPipelineOpts.setNumWorkers(1); - importPipelineOpts.setProject(projectId); - importPipelineOpts.setRegion(region); - - ImportJobFromHbaseSnapshot.ImportOptions importOpts = - importPipelineOpts.as(ImportJobFromHbaseSnapshot.ImportOptions.class); - // setup GCP and bigtable - importOpts.setBigtableProject(StaticValueProvider.of(projectId)); - importOpts.setBigtableInstanceId(StaticValueProvider.of(instanceId)); - importOpts.setBigtableTableId(StaticValueProvider.of(tableId)); - importOpts.setBigtableAppProfileId(null); - - // setup Hbase snapshot info - importOpts.setGcsProject(projectId); - importOpts.setHbaseRootDir(hbaseSnapshotDir); - importOpts.setRestoreDir(restoreDir); - importOpts.setSnapshotName(TEST_SNAPSHOT_NAME); - - // run pipeline - State state = ImportJobFromHbaseSnapshot.buildPipeline(importOpts).run().waitUntilFinish(); - 
Assert.assertEquals(State.DONE, state); - - // check data in bigtable - BigtableTableUtils destTable = new BigtableTableUtils(connection, tableId, CF); - Assert.assertEquals( - 100 /* There are 100 rows in test snapshot*/, - destTable.readAllCellsFromTable().toArray().length); - - // TODO(vermas2012): Add more validations after this. - } + // Crete table + System.out.println("DEBUG (create test table) ==>"); + TableName tableName = TableName.valueOf(tableId); + HTableDescriptor descriptor = new HTableDescriptor(tableName); + + descriptor.addFamily(new HColumnDescriptor(CF)); + + connection.getAdmin().createTable(descriptor, SnapshotTestingUtils.getSplitKeys()); + + // Start import + System.out.println("DEBUG (import snapshot) ==>"); + DataflowPipelineOptions importPipelineOpts = + PipelineOptionsFactory.as(DataflowPipelineOptions.class); + importPipelineOpts.setRunner(DataflowRunner.class); + importPipelineOpts.setGcpTempLocation(dataflowStagingLocation); + importPipelineOpts.setNumWorkers(1); + importPipelineOpts.setProject(projectId); + importPipelineOpts.setRegion(region); + + ImportJobFromHbaseSnapshot.ImportOptions importOpts = + importPipelineOpts.as(ImportJobFromHbaseSnapshot.ImportOptions.class); + + // setup GCP and bigtable + importOpts.setBigtableProject(StaticValueProvider.of(projectId)); + importOpts.setBigtableInstanceId(StaticValueProvider.of(instanceId)); + importOpts.setBigtableTableId(StaticValueProvider.of(tableId)); + + // setup HBase snapshot info + importOpts.setGcsProject(projectId); + importOpts.setHbaseRootDir(hbaseSnapshotDir); + importOpts.setRestoreDir(restoreDir); + importOpts.setSnapshotName(TEST_SNAPSHOT_NAME); + + // run pipeline + State state = ImportJobFromHbaseSnapshot.buildPipeline(importOpts).run().waitUntilFinish(); + Assert.assertEquals(State.DONE, state); + + // check data in bigtable + BigtableTableUtils destTable = new BigtableTableUtils(connection, tableId, CF); + Assert.assertEquals( + 100 /* There are 100 rows in test 
snapshot*/, + destTable.readAllCellsFromTable().toArray().length); + + // TODO(vermas2012): Add more validations after this. } } diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/resources/README.md b/bigtable-dataflow-parent/bigtable-beam-import/src/test/resources/README.md new file mode 100644 index 0000000000..3d9b722bb9 --- /dev/null +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/test/resources/README.md @@ -0,0 +1,18 @@ +# Generating the test HBase snapshot for HBase snapshot import integration tests + +The file `generate_test_data.txt` is an HBase command line command sequence +used to generated the testing HBase snapshot data. + +If you need to modify the test data used by `bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/hbasesnapshots/EndToEndIT.java`, +Please make sure you have HBase installed and export `/bin` to your PATH. + +Then: + + $ hbase shell ./generate_test_data.txt + $ hbase org.apache.hadoop.hbase.snapshot.ExportSnapshot -Dmapreduce.framework.name=local -snapshot test-snapshot -copy-to file:////data + + $ cd + $ gsutil -m cp -r ./data/ gs:///integration-test/ + +After this, you use be able to run the integration test with your new data by specifying +`-Dcloud.test.data.folder=gs:///integration-test/` \ No newline at end of file From 77f528bd00dabdc18bca817be1a20f5cf25ba1a7 Mon Sep 17 00:00:00 2001 From: Cheng Li Date: Fri, 15 Jan 2021 04:49:34 +0000 Subject: [PATCH 21/36] Update document --- .../beam/hbasesnapshots/HBaseSnapshotInputConfigBuilder.java | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/HBaseSnapshotInputConfigBuilder.java b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/HBaseSnapshotInputConfigBuilder.java index 055655b0c7..fe7434186c 100644 --- 
a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/HBaseSnapshotInputConfigBuilder.java +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/HBaseSnapshotInputConfigBuilder.java @@ -68,6 +68,9 @@ public HBaseSnapshotInputConfigBuilder setExportedSnapshotDir(String exportedSna /* * Set the name of the snapshot to be imported + * e.g when importing snapshot 'gs:///hbase-export/table_snapshot' + * put 'table_snapshot' as the {@code snapshotName} + * and 'gs:///hbase-export' as {@code exportedSnapshotDir} */ public HBaseSnapshotInputConfigBuilder setSnapshotName(String snapshotName) { this.snapshotName = snapshotName; @@ -75,7 +78,7 @@ public HBaseSnapshotInputConfigBuilder setSnapshotName(String snapshotName) { } /* - * Set the temporal GCS path used by TableSnapshotInputFormat while reading the HBase snapshot + * Set the temporary restore GCS path used by TableSnapshotInputFormat while reading the HBase snapshot * This path should not be under {@code exportedSnapshotDir} */ public HBaseSnapshotInputConfigBuilder setRestoreDir(String restoreDir) { From 703fe6a8554bf36884f28fd346ce4b9490115ecb Mon Sep 17 00:00:00 2001 From: Cheng Li Date: Tue, 19 Jan 2021 05:26:44 +0000 Subject: [PATCH 22/36] Fail the pipeline building when there is an exception configuring input updated unit test more comments --- .../bigtable-beam-import/pom.xml | 1 + .../HBaseSnapshotInputConfigBuilder.java | 51 +++++++++++-------- .../ImportJobFromHbaseSnapshot.java | 45 ++++++++-------- .../beam/hbasesnapshots/EndToEndIT.java | 1 - .../HBaseSnapshotInputConfigBuilderTest.java | 2 +- 5 files changed, 57 insertions(+), 43 deletions(-) diff --git a/bigtable-dataflow-parent/bigtable-beam-import/pom.xml b/bigtable-dataflow-parent/bigtable-beam-import/pom.xml index d824e7ef22..6b300b0426 100644 --- a/bigtable-dataflow-parent/bigtable-beam-import/pom.xml +++ 
b/bigtable-dataflow-parent/bigtable-beam-import/pom.xml @@ -26,6 +26,7 @@ limitations under the License. com.google.cloud.bigtable.beam.Main + false diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/HBaseSnapshotInputConfigBuilder.java b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/HBaseSnapshotInputConfigBuilder.java index fe7434186c..d959c914d8 100644 --- a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/HBaseSnapshotInputConfigBuilder.java +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/HBaseSnapshotInputConfigBuilder.java @@ -41,6 +41,7 @@ class HBaseSnapshotInputConfigBuilder { private static final Log LOG = LogFactory.getLog(HBaseSnapshotInputConfigBuilder.class); + // Batch size used for HBase snapshot scans private static final int BATCH_SIZE = 1000; private String projectId; @@ -86,33 +87,43 @@ public HBaseSnapshotInputConfigBuilder setRestoreDir(String restoreDir) { return this; } - public Configuration build() { + public Configuration build() throws Exception { Preconditions.checkNotNull(projectId); Preconditions.checkNotNull(exportedSnapshotDir); Preconditions.checkNotNull(snapshotName); Preconditions.checkState( exportedSnapshotDir.startsWith("gs://"), "snapshot folder must be hosted in a GCS bucket "); + Configuration conf = createHBaseConfiguration(); + + // Configuring a MapReduce Job base on HBaseConfiguration + // and return the job Configuration + ClientProtos.Scan proto = ProtobufUtil.toScan(new Scan().setBatch(BATCH_SIZE)); + conf.set(TableInputFormat.SCAN, Base64.encodeBytes(proto.toByteArray())); + Job job = Job.getInstance(conf); // creates internal clone of hbaseConf + TableSnapshotInputFormat.setInput(job, snapshotName, new Path(restoreDir)); + return job.getConfiguration(); // extract the 
modified clone + } + + // separate static part for unit testing + public Configuration createHBaseConfiguration() { Configuration conf = HBaseConfiguration.create(); - try { - conf.set("hbase.rootdir", exportedSnapshotDir); - conf.set("fs.AbstractFileSystem.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS"); - conf.set("fs.gs.project.id", projectId); - conf.set("fs.defaultFS", exportedSnapshotDir); - conf.set("google.cloud.auth.service.account.enable", "true"); - conf.setClass( - "mapreduce.job.inputformat.class", TableSnapshotInputFormat.class, InputFormat.class); - conf.setClass("key.class", ImmutableBytesWritable.class, Writable.class); - conf.setClass("value.class", Result.class, Object.class); - ClientProtos.Scan proto = ProtobufUtil.toScan(new Scan().setBatch(BATCH_SIZE)); - conf.set(TableInputFormat.SCAN, Base64.encodeBytes(proto.toByteArray())); - - Job job = Job.getInstance(conf); // creates internal clone of hbaseConf - TableSnapshotInputFormat.setInput(job, snapshotName, new Path(restoreDir)); - return job.getConfiguration(); // extract the modified clone - } catch (Exception e) { - LOG.fatal(e); - } + + // Setup the input data location for HBase snapshot import + // exportedSnapshotDir should be a GCS Bucket path. 
+ conf.set("hbase.rootdir", exportedSnapshotDir); + conf.set("fs.defaultFS", exportedSnapshotDir); + + // Setup GCS connector to use GCS as Hadoop filesystem + conf.set("fs.AbstractFileSystem.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS"); + conf.set("fs.gs.project.id", projectId); + conf.set("google.cloud.auth.service.account.enable", "true"); + + // Setup MapReduce config for TableSnapshotInputFormat + conf.setClass( + "mapreduce.job.inputformat.class", TableSnapshotInputFormat.class, InputFormat.class); + conf.setClass("key.class", ImmutableBytesWritable.class, Writable.class); + conf.setClass("value.class", Result.class, Object.class); return conf; } } diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/ImportJobFromHbaseSnapshot.java b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/ImportJobFromHbaseSnapshot.java index 42bf6a3b04..3529ee5b1d 100644 --- a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/ImportJobFromHbaseSnapshot.java +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/ImportJobFromHbaseSnapshot.java @@ -34,13 +34,14 @@ import org.apache.beam.sdk.values.PDone; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hbase.client.Mutation; import org.apache.hadoop.hbase.client.Result; import org.apache.hadoop.hbase.io.ImmutableBytesWritable; /** * A job that imports data from HBase snapshot exports hosted in Cloud Storage bucket into Cloud - * Bigtable. This job can be run directly or as a Dataflow template. + * Bigtable. * *

Execute the following command to run the job directly: * @@ -63,12 +64,6 @@ public class ImportJobFromHbaseSnapshot { private static final Log LOG = LogFactory.getLog(ImportJobFromHbaseSnapshot.class); public interface ImportOptions extends ImportJob.ImportOptions { - @Description("The GCP project id for the GCS bucket") - String getGcsProject(); - - @SuppressWarnings("unused") - void setGcsProject(String gcsProjectId); - @Description("The HBase root dir where HBase snapshot files resides.") String getHbaseRootDir(); @@ -107,21 +102,29 @@ public static void main(String[] args) { @VisibleForTesting static Pipeline buildPipeline(ImportOptions opts) { - Pipeline pipeline = Pipeline.create(Utils.tweakOptions(opts)); - pipeline - .apply( - "Read from HBase Snapshot", - HadoopFormatIO.read() - .withConfiguration( - new HBaseSnapshotInputConfigBuilder() - .setProjectId(opts.getGcsProject()) - .setExportedSnapshotDir(opts.getHbaseRootDir()) - .setSnapshotName(opts.getSnapshotName()) - .setRestoreDir(opts.getRestoreDir()) - .build())) - .apply("Create Mutations", ParDo.of(new HBaseResultToMutationFn())) - .apply("Write to Bigtable", createSink(opts)); + Pipeline pipeline = Pipeline.create(Utils.tweakOptions(opts)); + try { + Configuration configuration = + new HBaseSnapshotInputConfigBuilder() + .setProjectId(opts.getProject()) + .setExportedSnapshotDir(opts.getHbaseRootDir()) + .setSnapshotName(opts.getSnapshotName()) + .setRestoreDir(opts.getRestoreDir()) + .build(); + pipeline + .apply( + "Read from HBase Snapshot", + HadoopFormatIO.read() + .withConfiguration(configuration)) + .apply("Create Mutations", ParDo.of(new HBaseResultToMutationFn())) + .apply("Write to Bigtable", createSink(opts)); + + } catch (Exception e) { + LOG.fatal("Failed to create HBaseConfiguration for HadoopFormatIO"); + LOG.fatal(e); + System.exit(-1); + } return pipeline; } diff --git 
a/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/hbasesnapshots/EndToEndIT.java b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/hbasesnapshots/EndToEndIT.java index d270e8b8db..5a32f2de35 100644 --- a/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/hbasesnapshots/EndToEndIT.java +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/hbasesnapshots/EndToEndIT.java @@ -176,7 +176,6 @@ public void testHBaseSnapshotImport() throws Exception { importOpts.setBigtableTableId(StaticValueProvider.of(tableId)); // setup HBase snapshot info - importOpts.setGcsProject(projectId); importOpts.setHbaseRootDir(hbaseSnapshotDir); importOpts.setRestoreDir(restoreDir); importOpts.setSnapshotName(TEST_SNAPSHOT_NAME); diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/hbasesnapshots/HBaseSnapshotInputConfigBuilderTest.java b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/hbasesnapshots/HBaseSnapshotInputConfigBuilderTest.java index 8cd58a9da8..e806125cba 100644 --- a/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/hbasesnapshots/HBaseSnapshotInputConfigBuilderTest.java +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/hbasesnapshots/HBaseSnapshotInputConfigBuilderTest.java @@ -37,7 +37,7 @@ public void testBuildingHBaseSnapshotInputConfigBuilder() { .setExportedSnapshotDir(TEST_SNAPSHOT_DIR) .setSnapshotName(TEST_SNAPSHOT_NAME) .setRestoreDir(TEST_RESTORE_DIR) - .build(); + .createHBaseConfiguration(); assertEquals( "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS", conf.get("fs.AbstractFileSystem.gs.impl")); assertEquals(TEST_PROJECT, conf.get("fs.gs.project.id")); From 049cc433d051c70bd7959dd89e85d375631f1362 Mon Sep 17 00:00:00 
2001 From: Cheng Li Date: Tue, 19 Jan 2021 05:51:20 +0000 Subject: [PATCH 23/36] renaming according to review comments --- .../bigtable-beam-import/README.md | 2 +- .../HBaseSnapshotInputConfigBuilder.java | 15 ++++++++------- .../ImportJobFromHbaseSnapshot.java | 6 +++--- .../src/main/resources/log4j.properties | 2 -- .../bigtable/beam/hbasesnapshots/EndToEndIT.java | 2 +- .../HBaseSnapshotInputConfigBuilderTest.java | 2 +- 6 files changed, 14 insertions(+), 15 deletions(-) diff --git a/bigtable-dataflow-parent/bigtable-beam-import/README.md b/bigtable-dataflow-parent/bigtable-beam-import/README.md index abe8037f30..ccbc627603 100644 --- a/bigtable-dataflow-parent/bigtable-beam-import/README.md +++ b/bigtable-dataflow-parent/bigtable-beam-import/README.md @@ -42,4 +42,4 @@ java -jar bigtable-beam-import-1.14.1-shaded.jar import \ --maxNumWorkers=[3x number of nodes] \ --zone=[zone of your cluster] ``` -[//]: # ({x-version-update-end}) \ No newline at end of file +[//]: # ({x-version-update-end}) diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/HBaseSnapshotInputConfigBuilder.java b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/HBaseSnapshotInputConfigBuilder.java index d959c914d8..1fdf57433c 100644 --- a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/HBaseSnapshotInputConfigBuilder.java +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/HBaseSnapshotInputConfigBuilder.java @@ -45,7 +45,7 @@ class HBaseSnapshotInputConfigBuilder { private static final int BATCH_SIZE = 1000; private String projectId; - private String exportedSnapshotDir; + private String hbaseSnapshotSourceDir; private String snapshotName; private String restoreDir; @@ -62,8 +62,8 @@ public HBaseSnapshotInputConfigBuilder setProjectId(String projectId) { 
/* * Set the GCS path where the HBase snapshot data is located */ - public HBaseSnapshotInputConfigBuilder setExportedSnapshotDir(String exportedSnapshotDir) { - this.exportedSnapshotDir = exportedSnapshotDir; + public HBaseSnapshotInputConfigBuilder setHbaseSnapshotSourceDir(String hbaseSnapshotSourceDir) { + this.hbaseSnapshotSourceDir = hbaseSnapshotSourceDir; return this; } @@ -89,10 +89,11 @@ public HBaseSnapshotInputConfigBuilder setRestoreDir(String restoreDir) { public Configuration build() throws Exception { Preconditions.checkNotNull(projectId); - Preconditions.checkNotNull(exportedSnapshotDir); + Preconditions.checkNotNull(hbaseSnapshotSourceDir); Preconditions.checkNotNull(snapshotName); Preconditions.checkState( - exportedSnapshotDir.startsWith("gs://"), "snapshot folder must be hosted in a GCS bucket "); + hbaseSnapshotSourceDir.startsWith("gs://"), + "snapshot folder must be hosted in a GCS bucket "); Configuration conf = createHBaseConfiguration(); @@ -111,8 +112,8 @@ public Configuration createHBaseConfiguration() { // Setup the input data location for HBase snapshot import // exportedSnapshotDir should be a GCS Bucket path. 
- conf.set("hbase.rootdir", exportedSnapshotDir); - conf.set("fs.defaultFS", exportedSnapshotDir); + conf.set("hbase.rootdir", hbaseSnapshotSourceDir); + conf.set("fs.defaultFS", hbaseSnapshotSourceDir); // Setup GCS connector to use GCS as Hadoop filesystem conf.set("fs.AbstractFileSystem.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS"); diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/ImportJobFromHbaseSnapshot.java b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/ImportJobFromHbaseSnapshot.java index 3529ee5b1d..3177b53836 100644 --- a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/ImportJobFromHbaseSnapshot.java +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/ImportJobFromHbaseSnapshot.java @@ -65,10 +65,10 @@ public class ImportJobFromHbaseSnapshot { public interface ImportOptions extends ImportJob.ImportOptions { @Description("The HBase root dir where HBase snapshot files resides.") - String getHbaseRootDir(); + String getHbaseSnapshotSourceDir(); @SuppressWarnings("unused") - void setHbaseRootDir(String hbaseRootDir); + void setHbaseSnapshotSourceDir(String hbaseSnapshotSourceDir); @Description("Temporal location for restoring snapshots") String getRestoreDir(); @@ -108,7 +108,7 @@ static Pipeline buildPipeline(ImportOptions opts) { Configuration configuration = new HBaseSnapshotInputConfigBuilder() .setProjectId(opts.getProject()) - .setExportedSnapshotDir(opts.getHbaseRootDir()) + .setHbaseSnapshotSourceDir(opts.getHbaseSnapshotSourceDir()) .setSnapshotName(opts.getSnapshotName()) .setRestoreDir(opts.getRestoreDir()) .build(); diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/main/resources/log4j.properties 
b/bigtable-dataflow-parent/bigtable-beam-import/src/main/resources/log4j.properties index cff29c2435..04f0fab6bb 100644 --- a/bigtable-dataflow-parent/bigtable-beam-import/src/main/resources/log4j.properties +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/main/resources/log4j.properties @@ -22,5 +22,3 @@ log4j.appender.stdout.layout=org.apache.log4j.PatternLayout log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n log4j.category.org.apache.hadoop.io.compress.CodecPool=WARN -log4j.logger.org.apache.hadoop.io.compress.Compression=TRACE -log4j.logger.org.apache.hadoop.util.NativeCodeLoader=DEBUG diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/hbasesnapshots/EndToEndIT.java b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/hbasesnapshots/EndToEndIT.java index 5a32f2de35..eae4a33e44 100644 --- a/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/hbasesnapshots/EndToEndIT.java +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/hbasesnapshots/EndToEndIT.java @@ -176,7 +176,7 @@ public void testHBaseSnapshotImport() throws Exception { importOpts.setBigtableTableId(StaticValueProvider.of(tableId)); // setup HBase snapshot info - importOpts.setHbaseRootDir(hbaseSnapshotDir); + importOpts.setHbaseSnapshotSourceDir(hbaseSnapshotDir); importOpts.setRestoreDir(restoreDir); importOpts.setSnapshotName(TEST_SNAPSHOT_NAME); diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/hbasesnapshots/HBaseSnapshotInputConfigBuilderTest.java b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/hbasesnapshots/HBaseSnapshotInputConfigBuilderTest.java index e806125cba..67aaf6a445 100644 --- 
a/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/hbasesnapshots/HBaseSnapshotInputConfigBuilderTest.java +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/hbasesnapshots/HBaseSnapshotInputConfigBuilderTest.java @@ -34,7 +34,7 @@ public void testBuildingHBaseSnapshotInputConfigBuilder() { Configuration conf = new HBaseSnapshotInputConfigBuilder() .setProjectId(TEST_PROJECT) - .setExportedSnapshotDir(TEST_SNAPSHOT_DIR) + .setHbaseSnapshotSourceDir(TEST_SNAPSHOT_DIR) .setSnapshotName(TEST_SNAPSHOT_NAME) .setRestoreDir(TEST_RESTORE_DIR) .createHBaseConfiguration(); From 2f44f4de2b9169be5be4f2407f3b7ff1d1018aba Mon Sep 17 00:00:00 2001 From: Cheng Li Date: Tue, 19 Jan 2021 05:56:32 +0000 Subject: [PATCH 24/36] update comments --- .../beam/hbasesnapshots/ImportJobFromHbaseSnapshot.java | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/ImportJobFromHbaseSnapshot.java b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/ImportJobFromHbaseSnapshot.java index 3177b53836..a091bea89f 100644 --- a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/ImportJobFromHbaseSnapshot.java +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/ImportJobFromHbaseSnapshot.java @@ -53,8 +53,7 @@ * --project=$PROJECT \ * --bigtableInstanceId=$INSTANCE \ * --bigtableTableId=$TABLE \ - * --gcsProject=$PROJECT \ - * --hbaseRootDir=gs://$HBASE_EXPORT_ROOT_PATH \ + * --hbaseSnapshotSourceDir=gs://$HBASE_EXPORT_ROOT_PATH \ * --snapshotName=$SNAPSHOT_NAME \ * --restoreDir=gs://$RESTORE_PATH * From 39b003e0f95fe50404df7e93904bd72bf9bd0e3c Mon Sep 17 00:00:00 2001 From: Cheng Li Date: Tue, 19 Jan 2021 06:05:26 +0000 Subject: 
[PATCH 25/36] More document about hbase snapshot file structure --- .../beam/hbasesnapshots/ImportJobFromHbaseSnapshot.java | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/ImportJobFromHbaseSnapshot.java b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/ImportJobFromHbaseSnapshot.java index a091bea89f..f204d1ba64 100644 --- a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/ImportJobFromHbaseSnapshot.java +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/ImportJobFromHbaseSnapshot.java @@ -43,7 +43,10 @@ * A job that imports data from HBase snapshot exports hosted in Cloud Storage bucket into Cloud * Bigtable. * - *

Execute the following command to run the job directly: + * Example: + * If you have exported your HBase Snapshot to GCS bucket gs://$HBASE_EXPORT_ROOT_PATH and want to + * import snapshot gs://$HBASE_EXPORT_ROOT_PATH/.hbase-snapshot/$SNAPSHOT_NAME into Cloud Bigtable + * $TABLE in $INSTANCE, execute the following command to run the job directly: * *

  * mvn compile exec:java \
@@ -57,6 +60,9 @@
  *                --snapshotName=$SNAPSHOT_NAME  \
  *                --restoreDir=gs://$RESTORE_PATH
  * 
+ * + * Note: restoreDir is a GCS path used to save temporal files generated by Snapshot scans. + * and it must not be a sub-folder of the hbaseSnapshotSourceDir. */ @InternalExtensionOnly public class ImportJobFromHbaseSnapshot { From d96cffbafdb763afb2c7c38e4c77f78adb25a1c9 Mon Sep 17 00:00:00 2001 From: Cheng Li Date: Tue, 19 Jan 2021 15:00:55 +0000 Subject: [PATCH 26/36] System.out -> LOG --- .../hbasesnapshots/ImportJobFromHbaseSnapshot.java | 11 +++++------ .../bigtable/beam/hbasesnapshots/EndToEndIT.java | 9 ++++++--- 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/ImportJobFromHbaseSnapshot.java b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/ImportJobFromHbaseSnapshot.java index f204d1ba64..46beeb60f2 100644 --- a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/ImportJobFromHbaseSnapshot.java +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/ImportJobFromHbaseSnapshot.java @@ -43,10 +43,9 @@ * A job that imports data from HBase snapshot exports hosted in Cloud Storage bucket into Cloud * Bigtable. * - * Example: - * If you have exported your HBase Snapshot to GCS bucket gs://$HBASE_EXPORT_ROOT_PATH and want to - * import snapshot gs://$HBASE_EXPORT_ROOT_PATH/.hbase-snapshot/$SNAPSHOT_NAME into Cloud Bigtable - * $TABLE in $INSTANCE, execute the following command to run the job directly: + *

Example: If you have exported your HBase Snapshot to GCS bucket gs://$HBASE_EXPORT_ROOT_PATH + * and want to import snapshot gs://$HBASE_EXPORT_ROOT_PATH/.hbase-snapshot/$SNAPSHOT_NAME into + * Cloud Bigtable $TABLE in $INSTANCE, execute the following command to run the job directly: * *

  * mvn compile exec:java \
@@ -61,8 +60,8 @@
  *                --restoreDir=gs://$RESTORE_PATH
  * 
* - * Note: restoreDir is a GCS path used to save temporal files generated by Snapshot scans. - * and it must not be a sub-folder of the hbaseSnapshotSourceDir. + * Note: restoreDir is a GCS path used to save temporal files generated by Snapshot scans. and it + * must not be a sub-folder of the hbaseSnapshotSourceDir. */ @InternalExtensionOnly public class ImportJobFromHbaseSnapshot { diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/hbasesnapshots/EndToEndIT.java b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/hbasesnapshots/EndToEndIT.java index eae4a33e44..4dc924f62e 100644 --- a/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/hbasesnapshots/EndToEndIT.java +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/hbasesnapshots/EndToEndIT.java @@ -33,6 +33,8 @@ import org.apache.beam.sdk.extensions.gcp.util.gcsfs.GcsPath; import org.apache.beam.sdk.options.PipelineOptionsFactory; import org.apache.beam.sdk.options.ValueProvider.StaticValueProvider; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; import org.apache.hadoop.hbase.HColumnDescriptor; import org.apache.hadoop.hbase.HTableDescriptor; import org.apache.hadoop.hbase.TableName; @@ -61,6 +63,7 @@ */ public class EndToEndIT { + private final Log LOG = LogFactory.getLog(getClass()); private static final String TEST_SNAPSHOT_NAME = "test-snapshot"; // Location of test data hosted on Google Cloud Storage, for on-cloud dataflow tests. 
private static final String CLOUD_TEST_DATA_FOLDER = "cloud.test.data.folder"; @@ -111,7 +114,7 @@ public void setup() throws Exception { connection = BigtableConfiguration.connect(projectId, instanceId); tableId = "test_" + UUID.randomUUID().toString(); - System.out.println("Setting up integration tests"); + LOG.info("Setting up integration tests"); String[] keys = new String[] {"1", "2", "3", "4", "5", "6", "7", "8", "9"}; keySplits = new byte[keys.length][]; @@ -149,7 +152,7 @@ public void teardown() throws IOException { public void testHBaseSnapshotImport() throws Exception { // Crete table - System.out.println("DEBUG (create test table) ==>"); + LOG.debug("DEBUG (create test table) ==>"); TableName tableName = TableName.valueOf(tableId); HTableDescriptor descriptor = new HTableDescriptor(tableName); @@ -158,7 +161,7 @@ public void testHBaseSnapshotImport() throws Exception { connection.getAdmin().createTable(descriptor, SnapshotTestingUtils.getSplitKeys()); // Start import - System.out.println("DEBUG (import snapshot) ==>"); + LOG.debug("DEBUG (import snapshot) ==>"); DataflowPipelineOptions importPipelineOpts = PipelineOptionsFactory.as(DataflowPipelineOptions.class); importPipelineOpts.setRunner(DataflowRunner.class); From ce4e2cfe6f1919892b6fccc4a3453d8c18d0f0b1 Mon Sep 17 00:00:00 2001 From: Cheng Li Date: Thu, 21 Jan 2021 14:45:56 +0000 Subject: [PATCH 27/36] throw out exception instead of terminating JVM --- .../ImportJobFromHbaseSnapshot.java | 54 +++++++------------ 1 file changed, 19 insertions(+), 35 deletions(-) diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/ImportJobFromHbaseSnapshot.java b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/ImportJobFromHbaseSnapshot.java index 46beeb60f2..4b31f4a709 100644 --- 
a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/ImportJobFromHbaseSnapshot.java +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/ImportJobFromHbaseSnapshot.java @@ -17,7 +17,6 @@ import com.google.bigtable.repackaged.com.google.api.core.InternalExtensionOnly; import com.google.cloud.bigtable.beam.CloudBigtableIO; -import com.google.cloud.bigtable.beam.CloudBigtableTableConfiguration; import com.google.cloud.bigtable.beam.TemplateUtils; import com.google.cloud.bigtable.beam.sequencefiles.HBaseResultToMutationFn; import com.google.cloud.bigtable.beam.sequencefiles.ImportJob; @@ -28,14 +27,10 @@ import org.apache.beam.sdk.io.hadoop.format.HadoopFormatIO; import org.apache.beam.sdk.options.Description; import org.apache.beam.sdk.options.PipelineOptionsFactory; -import org.apache.beam.sdk.transforms.PTransform; import org.apache.beam.sdk.transforms.ParDo; -import org.apache.beam.sdk.values.PCollection; -import org.apache.beam.sdk.values.PDone; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.hbase.client.Mutation; import org.apache.hadoop.hbase.client.Result; import org.apache.hadoop.hbase.io.ImmutableBytesWritable; @@ -87,16 +82,15 @@ public interface ImportOptions extends ImportJob.ImportOptions { void setSnapshotName(String snapshotName); } - public static void main(String[] args) { + public static void main(String[] args) throws Exception { PipelineOptionsFactory.register(ImportOptions.class); ImportOptions opts = PipelineOptionsFactory.fromArgs(args).withValidation().as(ImportOptions.class); - LOG.info("DEBUG===> Building Pipeline"); + LOG.info("Building Pipeline"); Pipeline pipeline = buildPipeline(opts); - - LOG.info("DEBUG===> Running Pipeline"); + LOG.info("Running Pipeline"); PipelineResult result = pipeline.run(); if (opts.getWait()) 
{ @@ -105,35 +99,25 @@ public static void main(String[] args) { } @VisibleForTesting - static Pipeline buildPipeline(ImportOptions opts) { + static Pipeline buildPipeline(ImportOptions opts) throws Exception { Pipeline pipeline = Pipeline.create(Utils.tweakOptions(opts)); - try { - Configuration configuration = - new HBaseSnapshotInputConfigBuilder() - .setProjectId(opts.getProject()) - .setHbaseSnapshotSourceDir(opts.getHbaseSnapshotSourceDir()) - .setSnapshotName(opts.getSnapshotName()) - .setRestoreDir(opts.getRestoreDir()) - .build(); - pipeline - .apply( - "Read from HBase Snapshot", - HadoopFormatIO.read() - .withConfiguration(configuration)) - .apply("Create Mutations", ParDo.of(new HBaseResultToMutationFn())) - .apply("Write to Bigtable", createSink(opts)); + Configuration configuration = + new HBaseSnapshotInputConfigBuilder() + .setProjectId(opts.getProject()) + .setHbaseSnapshotSourceDir(opts.getHbaseSnapshotSourceDir()) + .setSnapshotName(opts.getSnapshotName()) + .setRestoreDir(opts.getRestoreDir()) + .build(); + pipeline + .apply( + "Read from HBase Snapshot", + HadoopFormatIO.read().withConfiguration(configuration)) + .apply("Create Mutations", ParDo.of(new HBaseResultToMutationFn())) + .apply( + "Write to Bigtable", + CloudBigtableIO.writeToTable(TemplateUtils.BuildImportConfig(opts))); - } catch (Exception e) { - LOG.fatal("Failed to create HBaseConfiguration for HadoopFormatIO"); - LOG.fatal(e); - System.exit(-1); - } return pipeline; } - - static PTransform, PDone> createSink(ImportOptions opts) { - CloudBigtableTableConfiguration config = TemplateUtils.BuildImportConfig(opts); - return CloudBigtableIO.writeToTable(config); - } } From fc6e1e415ab3f3376a2922b2e7906acff7eaf31f Mon Sep 17 00:00:00 2001 From: Cheng Li Date: Fri, 22 Jan 2021 21:40:17 +0000 Subject: [PATCH 28/36] Remove outside visible parameter restoreDir, use a default dir instead Add cleanup phase --- .../CleanupHBaseSnapshotRestoreFilesFn.java | 68 +++++++++++++++++++ 
.../HBaseSnapshotInputConfigBuilder.java | 17 ++--- .../ImportJobFromHbaseSnapshot.java | 35 ++++++---- ...leanupHBaseSnapshotRestoreFilesFnTest.java | 48 +++++++++++++ .../beam/hbasesnapshots/EndToEndIT.java | 19 ------ .../HBaseSnapshotInputConfigBuilderTest.java | 2 - 6 files changed, 142 insertions(+), 47 deletions(-) create mode 100644 bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/CleanupHBaseSnapshotRestoreFilesFn.java create mode 100644 bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/hbasesnapshots/CleanupHBaseSnapshotRestoreFilesFnTest.java diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/CleanupHBaseSnapshotRestoreFilesFn.java b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/CleanupHBaseSnapshotRestoreFilesFn.java new file mode 100644 index 0000000000..24ec52e352 --- /dev/null +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/CleanupHBaseSnapshotRestoreFilesFn.java @@ -0,0 +1,68 @@ +/* + * Copyright 2021 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package com.google.cloud.bigtable.beam.hbasesnapshots; + +import com.google.common.base.Preconditions; +import java.io.IOException; +import java.util.Collections; +import java.util.List; +import java.util.stream.Collectors; +import org.apache.beam.sdk.io.FileSystems; +import org.apache.beam.sdk.io.fs.MoveOptions; +import org.apache.beam.sdk.io.fs.ResourceId; +import org.apache.beam.sdk.transforms.DoFn; +import org.apache.beam.sdk.values.KV; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; + +public class CleanupHBaseSnapshotRestoreFilesFn extends DoFn, Boolean> { + private static final Log LOG = LogFactory.getLog(CleanupHBaseSnapshotRestoreFilesFn.class); + + @ProcessElement + public void processElement(ProcessContext context) throws IOException { + KV elem = context.element(); + + String hbaseSnapshotDir = elem.getKey(); + String restorePath = elem.getValue(); + String restoreDir = getRestoreDir(hbaseSnapshotDir, restorePath); + List paths = + FileSystems.match(restoreDir + "**").metadata().stream() + .map(metadata -> metadata.resourceId()) + .collect(Collectors.toList()); + FileSystems.delete(paths, MoveOptions.StandardMoveOptions.IGNORE_MISSING_FILES); + FileSystems.delete( + Collections.singletonList(FileSystems.matchSingleFileSpec(restoreDir).resourceId()), + MoveOptions.StandardMoveOptions.IGNORE_MISSING_FILES); + context.output(true); + } + + public static String getRestoreDir(String hbaseSnapshotDir, String restoreDir) { + Preconditions.checkState( + hbaseSnapshotDir.startsWith("gs://"), "snapshot folder must be hosted in a GCS bucket "); + Preconditions.checkState( + restoreDir.startsWith("/"), + "restore folder must be an absolute path in current filesystem"); + int bucketNameEndIndex = hbaseSnapshotDir.indexOf('/', 5); // "offset gs://" + String bucketName; + if (bucketNameEndIndex > 0) { + bucketName = hbaseSnapshotDir.substring(0, bucketNameEndIndex); + } else { + bucketName = hbaseSnapshotDir; + } + + return 
String.format("%s%s", bucketName, restoreDir); + } +} diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/HBaseSnapshotInputConfigBuilder.java b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/HBaseSnapshotInputConfigBuilder.java index 1fdf57433c..e82e2b9d42 100644 --- a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/HBaseSnapshotInputConfigBuilder.java +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/HBaseSnapshotInputConfigBuilder.java @@ -44,10 +44,12 @@ class HBaseSnapshotInputConfigBuilder { // Batch size used for HBase snapshot scans private static final int BATCH_SIZE = 1000; + // a temp location to store metadata extracted from snapshot + public static final String RESTORE_DIR = "/.restore"; + private String projectId; private String hbaseSnapshotSourceDir; private String snapshotName; - private String restoreDir; public HBaseSnapshotInputConfigBuilder() {} @@ -78,15 +80,6 @@ public HBaseSnapshotInputConfigBuilder setSnapshotName(String snapshotName) { return this; } - /* - * Set the temporary restore GCS path used by TableSnapshotInputFormat while reading the HBase snapshot - * This path should not be under {@code exportedSnapshotDir} - */ - public HBaseSnapshotInputConfigBuilder setRestoreDir(String restoreDir) { - this.restoreDir = restoreDir; - return this; - } - public Configuration build() throws Exception { Preconditions.checkNotNull(projectId); Preconditions.checkNotNull(hbaseSnapshotSourceDir); @@ -102,7 +95,9 @@ public Configuration build() throws Exception { ClientProtos.Scan proto = ProtobufUtil.toScan(new Scan().setBatch(BATCH_SIZE)); conf.set(TableInputFormat.SCAN, Base64.encodeBytes(proto.toByteArray())); Job job = Job.getInstance(conf); // creates internal clone of hbaseConf - 
TableSnapshotInputFormat.setInput(job, snapshotName, new Path(restoreDir)); + // the restore folder need to under current bucket root so to be considered + // within the same filesystem with the hbaseSnapshotSourceDir + TableSnapshotInputFormat.setInput(job, snapshotName, new Path(RESTORE_DIR)); return job.getConfiguration(); // extract the modified clone } diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/ImportJobFromHbaseSnapshot.java b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/ImportJobFromHbaseSnapshot.java index 4b31f4a709..63e08b5a74 100644 --- a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/ImportJobFromHbaseSnapshot.java +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/ImportJobFromHbaseSnapshot.java @@ -22,12 +22,18 @@ import com.google.cloud.bigtable.beam.sequencefiles.ImportJob; import com.google.cloud.bigtable.beam.sequencefiles.Utils; import com.google.common.annotations.VisibleForTesting; +import java.util.Arrays; +import java.util.List; import org.apache.beam.sdk.Pipeline; import org.apache.beam.sdk.PipelineResult; import org.apache.beam.sdk.io.hadoop.format.HadoopFormatIO; import org.apache.beam.sdk.options.Description; import org.apache.beam.sdk.options.PipelineOptionsFactory; +import org.apache.beam.sdk.transforms.Create; import org.apache.beam.sdk.transforms.ParDo; +import org.apache.beam.sdk.transforms.Wait; +import org.apache.beam.sdk.values.KV; +import org.apache.beam.sdk.values.PCollection; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configuration; @@ -51,12 +57,8 @@ * --bigtableInstanceId=$INSTANCE \ * --bigtableTableId=$TABLE \ * --hbaseSnapshotSourceDir=gs://$HBASE_EXPORT_ROOT_PATH \ - * --snapshotName=$SNAPSHOT_NAME 
\ - * --restoreDir=gs://$RESTORE_PATH + * --snapshotName=$SNAPSHOT_NAME * - * - * Note: restoreDir is a GCS path used to save temporal files generated by Snapshot scans. and it - * must not be a sub-folder of the hbaseSnapshotSourceDir. */ @InternalExtensionOnly public class ImportJobFromHbaseSnapshot { @@ -69,12 +71,6 @@ public interface ImportOptions extends ImportJob.ImportOptions { @SuppressWarnings("unused") void setHbaseSnapshotSourceDir(String hbaseSnapshotSourceDir); - @Description("Temporal location for restoring snapshots") - String getRestoreDir(); - - @SuppressWarnings("unused") - void setRestoreDir(String restoreDir); - @Description("Snapshot name") String getSnapshotName(); @@ -107,17 +103,26 @@ static Pipeline buildPipeline(ImportOptions opts) throws Exception { .setProjectId(opts.getProject()) .setHbaseSnapshotSourceDir(opts.getHbaseSnapshotSourceDir()) .setSnapshotName(opts.getSnapshotName()) - .setRestoreDir(opts.getRestoreDir()) .build(); - pipeline - .apply( + PCollection> readResult = + pipeline.apply( "Read from HBase Snapshot", - HadoopFormatIO.read().withConfiguration(configuration)) + HadoopFormatIO.read().withConfiguration(configuration)); + + readResult .apply("Create Mutations", ParDo.of(new HBaseResultToMutationFn())) .apply( "Write to Bigtable", CloudBigtableIO.writeToTable(TemplateUtils.BuildImportConfig(opts))); + final List> tempFiles = + Arrays.asList( + KV.of(opts.getHbaseSnapshotSourceDir(), HBaseSnapshotInputConfigBuilder.RESTORE_DIR)); + pipeline + .apply(Create.of(tempFiles)) + .apply(Wait.on(readResult)) + .apply(ParDo.of(new CleanupHBaseSnapshotRestoreFilesFn())); + return pipeline; } } diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/hbasesnapshots/CleanupHBaseSnapshotRestoreFilesFnTest.java b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/hbasesnapshots/CleanupHBaseSnapshotRestoreFilesFnTest.java new file mode 100644 index 
0000000000..64127badec --- /dev/null +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/hbasesnapshots/CleanupHBaseSnapshotRestoreFilesFnTest.java @@ -0,0 +1,48 @@ +/* + * Copyright 2021 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.google.cloud.bigtable.beam.hbasesnapshots; + +import static org.junit.Assert.assertEquals; + +import org.junit.Test; + +public class CleanupHBaseSnapshotRestoreFilesFnTest { + private static final String TEST_BUCKET_PATH = "gs://test-bucket"; + private static final String TEST_SNAPSHOT_PATH = TEST_BUCKET_PATH + "/hbase-export"; + private static final String TEST_RESTORE_PATH = HBaseSnapshotInputConfigBuilder.RESTORE_DIR; + + @Test + public void testGetRestorePath() { + assertEquals( + "gs://test-bucket" + TEST_RESTORE_PATH + '/', + CleanupHBaseSnapshotRestoreFilesFn.getRestoreDir(TEST_SNAPSHOT_PATH, TEST_RESTORE_PATH)); + + assertEquals( + "gs://test-bucket" + TEST_RESTORE_PATH + '/', + CleanupHBaseSnapshotRestoreFilesFn.getRestoreDir( + TEST_SNAPSHOT_PATH + '/', TEST_RESTORE_PATH)); + + // These are not valid case as one could not use bucket root as hbase snapshot folder. 
+ assertEquals( + "gs://test-bucket" + TEST_RESTORE_PATH + '/', + CleanupHBaseSnapshotRestoreFilesFn.getRestoreDir( + TEST_BUCKET_PATH + '/', TEST_RESTORE_PATH)); + + assertEquals( + "gs://test-bucket" + TEST_RESTORE_PATH + '/', + CleanupHBaseSnapshotRestoreFilesFn.getRestoreDir(TEST_BUCKET_PATH, TEST_RESTORE_PATH)); + } +} diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/hbasesnapshots/EndToEndIT.java b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/hbasesnapshots/EndToEndIT.java index 4dc924f62e..d49d5500db 100644 --- a/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/hbasesnapshots/EndToEndIT.java +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/hbasesnapshots/EndToEndIT.java @@ -22,15 +22,12 @@ import com.google.cloud.bigtable.hbase.BigtableOptionsFactory; import java.io.File; import java.io.IOException; -import java.util.ArrayList; -import java.util.List; import java.util.UUID; import org.apache.beam.runners.dataflow.DataflowRunner; import org.apache.beam.runners.dataflow.options.DataflowPipelineOptions; import org.apache.beam.sdk.PipelineResult.State; import org.apache.beam.sdk.extensions.gcp.options.GcpOptions; import org.apache.beam.sdk.extensions.gcp.util.GcsUtil; -import org.apache.beam.sdk.extensions.gcp.util.gcsfs.GcsPath; import org.apache.beam.sdk.options.PipelineOptionsFactory; import org.apache.beam.sdk.options.ValueProvider.StaticValueProvider; import org.apache.commons.logging.Log; @@ -88,7 +85,6 @@ public class EndToEndIT { // Snapshot data setup private String hbaseSnapshotDir; - private String restoreDir; @Before public void setup() throws Exception { @@ -103,7 +99,6 @@ public void setup() throws Exception { hbaseSnapshotDir = cloudTestDataFolder + "data/"; UUID test_uuid = UUID.randomUUID(); - restoreDir = cloudTestDataFolder + "restore/" + test_uuid; // 
Cloud Storage config GcpOptions gcpOptions = PipelineOptionsFactory.create().as(GcpOptions.class); @@ -129,17 +124,6 @@ private static String getTestProperty(String name) { @After public void teardown() throws IOException { - final List paths = gcsUtil.expand(GcsPath.fromUri(restoreDir + "/*")); - - if (!paths.isEmpty()) { - final List pathStrs = new ArrayList<>(); - - for (GcsPath path : paths) { - pathStrs.add(path.toString()); - } - this.gcsUtil.remove(pathStrs); - } - connection.close(); // delete test table @@ -152,7 +136,6 @@ public void teardown() throws IOException { public void testHBaseSnapshotImport() throws Exception { // Crete table - LOG.debug("DEBUG (create test table) ==>"); TableName tableName = TableName.valueOf(tableId); HTableDescriptor descriptor = new HTableDescriptor(tableName); @@ -161,7 +144,6 @@ public void testHBaseSnapshotImport() throws Exception { connection.getAdmin().createTable(descriptor, SnapshotTestingUtils.getSplitKeys()); // Start import - LOG.debug("DEBUG (import snapshot) ==>"); DataflowPipelineOptions importPipelineOpts = PipelineOptionsFactory.as(DataflowPipelineOptions.class); importPipelineOpts.setRunner(DataflowRunner.class); @@ -180,7 +162,6 @@ public void testHBaseSnapshotImport() throws Exception { // setup HBase snapshot info importOpts.setHbaseSnapshotSourceDir(hbaseSnapshotDir); - importOpts.setRestoreDir(restoreDir); importOpts.setSnapshotName(TEST_SNAPSHOT_NAME); // run pipeline diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/hbasesnapshots/HBaseSnapshotInputConfigBuilderTest.java b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/hbasesnapshots/HBaseSnapshotInputConfigBuilderTest.java index 67aaf6a445..579a57c238 100644 --- a/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/hbasesnapshots/HBaseSnapshotInputConfigBuilderTest.java +++ 
b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/hbasesnapshots/HBaseSnapshotInputConfigBuilderTest.java @@ -27,7 +27,6 @@ public class HBaseSnapshotInputConfigBuilderTest { private static final String TEST_PROJECT = "test_project"; private static final String TEST_SNAPSHOT_DIR = "gs://test-bucket/hbase-export"; private static final String TEST_SNAPSHOT_NAME = "test_snapshot"; - private static final String TEST_RESTORE_DIR = "gs://test-bucket/hbase-restore"; @Test public void testBuildingHBaseSnapshotInputConfigBuilder() { @@ -36,7 +35,6 @@ public void testBuildingHBaseSnapshotInputConfigBuilder() { .setProjectId(TEST_PROJECT) .setHbaseSnapshotSourceDir(TEST_SNAPSHOT_DIR) .setSnapshotName(TEST_SNAPSHOT_NAME) - .setRestoreDir(TEST_RESTORE_DIR) .createHBaseConfiguration(); assertEquals( "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS", conf.get("fs.AbstractFileSystem.gs.impl")); From e871418c32edc87e080250e0ac07a4450788415f Mon Sep 17 00:00:00 2001 From: Cheng Li Date: Sun, 24 Jan 2021 04:13:01 +0000 Subject: [PATCH 29/36] use pattern without ending '/' --- .../CleanupHBaseSnapshotRestoreFilesFn.java | 4 ---- .../CleanupHBaseSnapshotRestoreFilesFnTest.java | 8 ++++---- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/CleanupHBaseSnapshotRestoreFilesFn.java b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/CleanupHBaseSnapshotRestoreFilesFn.java index 24ec52e352..e9ac467904 100644 --- a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/CleanupHBaseSnapshotRestoreFilesFn.java +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/CleanupHBaseSnapshotRestoreFilesFn.java @@ -17,7 +17,6 @@ import com.google.common.base.Preconditions; import 
java.io.IOException; -import java.util.Collections; import java.util.List; import java.util.stream.Collectors; import org.apache.beam.sdk.io.FileSystems; @@ -43,9 +42,6 @@ public void processElement(ProcessContext context) throws IOException { .map(metadata -> metadata.resourceId()) .collect(Collectors.toList()); FileSystems.delete(paths, MoveOptions.StandardMoveOptions.IGNORE_MISSING_FILES); - FileSystems.delete( - Collections.singletonList(FileSystems.matchSingleFileSpec(restoreDir).resourceId()), - MoveOptions.StandardMoveOptions.IGNORE_MISSING_FILES); context.output(true); } diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/hbasesnapshots/CleanupHBaseSnapshotRestoreFilesFnTest.java b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/hbasesnapshots/CleanupHBaseSnapshotRestoreFilesFnTest.java index 64127badec..628eaeac96 100644 --- a/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/hbasesnapshots/CleanupHBaseSnapshotRestoreFilesFnTest.java +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/hbasesnapshots/CleanupHBaseSnapshotRestoreFilesFnTest.java @@ -27,22 +27,22 @@ public class CleanupHBaseSnapshotRestoreFilesFnTest { @Test public void testGetRestorePath() { assertEquals( - "gs://test-bucket" + TEST_RESTORE_PATH + '/', + "gs://test-bucket" + TEST_RESTORE_PATH, CleanupHBaseSnapshotRestoreFilesFn.getRestoreDir(TEST_SNAPSHOT_PATH, TEST_RESTORE_PATH)); assertEquals( - "gs://test-bucket" + TEST_RESTORE_PATH + '/', + "gs://test-bucket" + TEST_RESTORE_PATH, CleanupHBaseSnapshotRestoreFilesFn.getRestoreDir( TEST_SNAPSHOT_PATH + '/', TEST_RESTORE_PATH)); // These are not valid case as one could not use bucket root as hbase snapshot folder. 
assertEquals( - "gs://test-bucket" + TEST_RESTORE_PATH + '/', + "gs://test-bucket" + TEST_RESTORE_PATH, CleanupHBaseSnapshotRestoreFilesFn.getRestoreDir( TEST_BUCKET_PATH + '/', TEST_RESTORE_PATH)); assertEquals( - "gs://test-bucket" + TEST_RESTORE_PATH + '/', + "gs://test-bucket" + TEST_RESTORE_PATH, CleanupHBaseSnapshotRestoreFilesFn.getRestoreDir(TEST_BUCKET_PATH, TEST_RESTORE_PATH)); } } From 6f15d81ea7300c7503cf4fa9c86ced7e1afa941b Mon Sep 17 00:00:00 2001 From: Cheng Li Date: Tue, 26 Jan 2021 06:31:51 +0000 Subject: [PATCH 30/36] use listObject instead of match since GcsUtil expand intentionally filter out directories --- .../bigtable-beam-import/pom.xml | 1 - .../CleanupHBaseSnapshotRestoreFilesFn.java | 60 ++++++++++++------- ...leanupHBaseSnapshotRestoreFilesFnTest.java | 38 +++++++----- .../beam/hbasesnapshots/EndToEndIT.java | 11 ++++ 4 files changed, 71 insertions(+), 39 deletions(-) diff --git a/bigtable-dataflow-parent/bigtable-beam-import/pom.xml b/bigtable-dataflow-parent/bigtable-beam-import/pom.xml index 6b300b0426..d824e7ef22 100644 --- a/bigtable-dataflow-parent/bigtable-beam-import/pom.xml +++ b/bigtable-dataflow-parent/bigtable-beam-import/pom.xml @@ -26,7 +26,6 @@ limitations under the License. 
com.google.cloud.bigtable.beam.Main - false diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/CleanupHBaseSnapshotRestoreFilesFn.java b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/CleanupHBaseSnapshotRestoreFilesFn.java index e9ac467904..57774ea252 100644 --- a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/CleanupHBaseSnapshotRestoreFilesFn.java +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/CleanupHBaseSnapshotRestoreFilesFn.java @@ -15,13 +15,15 @@ */ package com.google.cloud.bigtable.beam.hbasesnapshots; +import com.google.api.services.storage.model.Objects; import com.google.common.base.Preconditions; import java.io.IOException; +import java.util.ArrayList; import java.util.List; import java.util.stream.Collectors; -import org.apache.beam.sdk.io.FileSystems; -import org.apache.beam.sdk.io.fs.MoveOptions; -import org.apache.beam.sdk.io.fs.ResourceId; +import org.apache.beam.sdk.extensions.gcp.options.GcpOptions; +import org.apache.beam.sdk.extensions.gcp.util.GcsUtil; +import org.apache.beam.sdk.extensions.gcp.util.gcsfs.GcsPath; import org.apache.beam.sdk.transforms.DoFn; import org.apache.beam.sdk.values.KV; import org.apache.commons.logging.Log; @@ -36,29 +38,43 @@ public void processElement(ProcessContext context) throws IOException { String hbaseSnapshotDir = elem.getKey(); String restorePath = elem.getValue(); - String restoreDir = getRestoreDir(hbaseSnapshotDir, restorePath); - List paths = - FileSystems.match(restoreDir + "**").metadata().stream() - .map(metadata -> metadata.resourceId()) - .collect(Collectors.toList()); - FileSystems.delete(paths, MoveOptions.StandardMoveOptions.IGNORE_MISSING_FILES); + String prefix = getListPrefix(restorePath); + String bucketName = 
getWorkingBucketName(hbaseSnapshotDir); + Preconditions.checkState( + !prefix.isEmpty() && !hbaseSnapshotDir.contains(String.format("%s/%s", bucketName, prefix)), + "restore folder should not be empty or a subfolder of hbaseSnapshotSourceDir"); + GcpOptions gcpOptions = context.getPipelineOptions().as(GcpOptions.class); + GcsUtil gcsUtil = new GcsUtil.GcsUtilFactory().create(gcpOptions); + + String pageToken = null; + List results = new ArrayList<>(); + do { + Objects objects = gcsUtil.listObjects(bucketName, prefix, pageToken); + if (objects.getItems() == null) { + break; + } + results.addAll( + objects.getItems().stream() + .map(storageObject -> GcsPath.fromObject(storageObject).toString()) + .collect(Collectors.toList())); + pageToken = objects.getNextPageToken(); + } while (pageToken != null); + gcsUtil.remove(results); context.output(true); } - public static String getRestoreDir(String hbaseSnapshotDir, String restoreDir) { - Preconditions.checkState( + public static String getWorkingBucketName(String hbaseSnapshotDir) { + Preconditions.checkArgument( hbaseSnapshotDir.startsWith("gs://"), "snapshot folder must be hosted in a GCS bucket "); - Preconditions.checkState( - restoreDir.startsWith("/"), - "restore folder must be an absolute path in current filesystem"); - int bucketNameEndIndex = hbaseSnapshotDir.indexOf('/', 5); // "offset gs://" - String bucketName; - if (bucketNameEndIndex > 0) { - bucketName = hbaseSnapshotDir.substring(0, bucketNameEndIndex); - } else { - bucketName = hbaseSnapshotDir; - } - return String.format("%s%s", bucketName, restoreDir); + return GcsPath.fromUri(hbaseSnapshotDir).getBucket(); + } + // getListPrefix convert absolut restorePath in a Hadoop filesystem + // to a match prefix in a GCS bucket + public static String getListPrefix(String restorePath) { + Preconditions.checkArgument( + restorePath.startsWith("/"), + "restore folder must be an absolute path in current filesystem"); + return restorePath.substring(1); } } diff --git 
a/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/hbasesnapshots/CleanupHBaseSnapshotRestoreFilesFnTest.java b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/hbasesnapshots/CleanupHBaseSnapshotRestoreFilesFnTest.java index 628eaeac96..42093b168c 100644 --- a/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/hbasesnapshots/CleanupHBaseSnapshotRestoreFilesFnTest.java +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/hbasesnapshots/CleanupHBaseSnapshotRestoreFilesFnTest.java @@ -16,33 +16,39 @@ package com.google.cloud.bigtable.beam.hbasesnapshots; import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertThrows; import org.junit.Test; public class CleanupHBaseSnapshotRestoreFilesFnTest { - private static final String TEST_BUCKET_PATH = "gs://test-bucket"; - private static final String TEST_SNAPSHOT_PATH = TEST_BUCKET_PATH + "/hbase-export"; + private static final String TEST_BUCKET_NAME = "test-bucket"; + private static final String TEST_SNAPSHOT_PATH = "gs://" + TEST_BUCKET_NAME + "/hbase-export"; private static final String TEST_RESTORE_PATH = HBaseSnapshotInputConfigBuilder.RESTORE_DIR; + private static final String TEST_RESTORE_PREFIX = + HBaseSnapshotInputConfigBuilder.RESTORE_DIR.substring(1); @Test - public void testGetRestorePath() { + public void testGetWorkingBucketName() { assertEquals( - "gs://test-bucket" + TEST_RESTORE_PATH, - CleanupHBaseSnapshotRestoreFilesFn.getRestoreDir(TEST_SNAPSHOT_PATH, TEST_RESTORE_PATH)); + TEST_BUCKET_NAME, + CleanupHBaseSnapshotRestoreFilesFn.getWorkingBucketName(TEST_SNAPSHOT_PATH)); - assertEquals( - "gs://test-bucket" + TEST_RESTORE_PATH, - CleanupHBaseSnapshotRestoreFilesFn.getRestoreDir( - TEST_SNAPSHOT_PATH + '/', TEST_RESTORE_PATH)); + assertThrows( + IllegalArgumentException.class, + () -> { + 
CleanupHBaseSnapshotRestoreFilesFn.getWorkingBucketName(TEST_BUCKET_NAME); + }); + } - // These are not valid case as one could not use bucket root as hbase snapshot folder. + @Test + public void testGetListPrefix() { assertEquals( - "gs://test-bucket" + TEST_RESTORE_PATH, - CleanupHBaseSnapshotRestoreFilesFn.getRestoreDir( - TEST_BUCKET_PATH + '/', TEST_RESTORE_PATH)); + TEST_RESTORE_PREFIX, CleanupHBaseSnapshotRestoreFilesFn.getListPrefix(TEST_RESTORE_PATH)); - assertEquals( - "gs://test-bucket" + TEST_RESTORE_PATH, - CleanupHBaseSnapshotRestoreFilesFn.getRestoreDir(TEST_BUCKET_PATH, TEST_RESTORE_PATH)); + assertThrows( + IllegalArgumentException.class, + () -> { + CleanupHBaseSnapshotRestoreFilesFn.getWorkingBucketName(TEST_RESTORE_PREFIX); + }); } } diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/hbasesnapshots/EndToEndIT.java b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/hbasesnapshots/EndToEndIT.java index d49d5500db..62f1cdced2 100644 --- a/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/hbasesnapshots/EndToEndIT.java +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/hbasesnapshots/EndToEndIT.java @@ -17,6 +17,7 @@ import static com.google.common.base.Preconditions.checkNotNull; +import com.google.api.services.storage.model.Objects; import com.google.cloud.bigtable.beam.sequencefiles.testing.BigtableTableUtils; import com.google.cloud.bigtable.hbase.BigtableConfiguration; import com.google.cloud.bigtable.hbase.BigtableOptionsFactory; @@ -28,6 +29,7 @@ import org.apache.beam.sdk.PipelineResult.State; import org.apache.beam.sdk.extensions.gcp.options.GcpOptions; import org.apache.beam.sdk.extensions.gcp.util.GcsUtil; +import org.apache.beam.sdk.extensions.gcp.util.gcsfs.GcsPath; import org.apache.beam.sdk.options.PipelineOptionsFactory; import 
org.apache.beam.sdk.options.ValueProvider.StaticValueProvider; import org.apache.commons.logging.Log; @@ -174,6 +176,15 @@ public void testHBaseSnapshotImport() throws Exception { 100 /* There are 100 rows in test snapshot*/, destTable.readAllCellsFromTable().toArray().length); + // check that the .restore dir used for temp files has been removed + Objects objects = + gcsUtil.listObjects( + GcsPath.fromUri(hbaseSnapshotDir).getBucket(), + CleanupHBaseSnapshotRestoreFilesFn.getListPrefix( + HBaseSnapshotInputConfigBuilder.RESTORE_DIR), + null); + Assert.assertNull(objects.getItems()); + // TODO(vermas2012): Add more validations after this. } } From 5db7ce60b029cb90657c6eee2ba0a01337fd8b39 Mon Sep 17 00:00:00 2001 From: Cheng Li Date: Tue, 26 Jan 2021 15:21:35 +0000 Subject: [PATCH 31/36] Using a unique suffix for restore dir to avoid conflict --- .../HBaseSnapshotInputConfigBuilder.java | 15 ++++++++++++++- .../ImportJobFromHbaseSnapshot.java | 7 ++++++- .../CleanupHBaseSnapshotRestoreFilesFnTest.java | 7 ++++--- 3 files changed, 24 insertions(+), 5 deletions(-) diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/HBaseSnapshotInputConfigBuilder.java b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/HBaseSnapshotInputConfigBuilder.java index e82e2b9d42..bc36231f08 100644 --- a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/HBaseSnapshotInputConfigBuilder.java +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/HBaseSnapshotInputConfigBuilder.java @@ -50,6 +50,7 @@ class HBaseSnapshotInputConfigBuilder { private String projectId; private String hbaseSnapshotSourceDir; private String snapshotName; + private String restoreDirSuffix; public HBaseSnapshotInputConfigBuilder() {} @@ -80,6 +81,18 @@ public 
HBaseSnapshotInputConfigBuilder setSnapshotName(String snapshotName) { return this; } + /* + * Set the unique suffix to be used for restore folder to avoid conflicts + */ + public HBaseSnapshotInputConfigBuilder setRestoreDirSuffix(String suffix) { + this.restoreDirSuffix = suffix; + return this; + } + + public String getRestoreDir() { + return RESTORE_DIR + this.restoreDirSuffix; + } + public Configuration build() throws Exception { Preconditions.checkNotNull(projectId); Preconditions.checkNotNull(hbaseSnapshotSourceDir); @@ -97,7 +110,7 @@ public Configuration build() throws Exception { Job job = Job.getInstance(conf); // creates internal clone of hbaseConf // the restore folder need to under current bucket root so to be considered // within the same filesystem with the hbaseSnapshotSourceDir - TableSnapshotInputFormat.setInput(job, snapshotName, new Path(RESTORE_DIR)); + TableSnapshotInputFormat.setInput(job, snapshotName, new Path(getRestoreDir())); return job.getConfiguration(); // extract the modified clone } diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/ImportJobFromHbaseSnapshot.java b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/ImportJobFromHbaseSnapshot.java index 63e08b5a74..e15f203086 100644 --- a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/ImportJobFromHbaseSnapshot.java +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/ImportJobFromHbaseSnapshot.java @@ -103,6 +103,7 @@ static Pipeline buildPipeline(ImportOptions opts) throws Exception { .setProjectId(opts.getProject()) .setHbaseSnapshotSourceDir(opts.getHbaseSnapshotSourceDir()) .setSnapshotName(opts.getSnapshotName()) + .setRestoreDirSuffix(opts.getJobName()) .build(); PCollection> readResult = pipeline.apply( @@ -117,7 +118,11 @@ static 
Pipeline buildPipeline(ImportOptions opts) throws Exception { final List> tempFiles = Arrays.asList( - KV.of(opts.getHbaseSnapshotSourceDir(), HBaseSnapshotInputConfigBuilder.RESTORE_DIR)); + KV.of( + opts.getHbaseSnapshotSourceDir(), + new HBaseSnapshotInputConfigBuilder() + .setRestoreDirSuffix(opts.getJobName()) + .getRestoreDir())); pipeline .apply(Create.of(tempFiles)) .apply(Wait.on(readResult)) diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/hbasesnapshots/CleanupHBaseSnapshotRestoreFilesFnTest.java b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/hbasesnapshots/CleanupHBaseSnapshotRestoreFilesFnTest.java index 42093b168c..0183f856f1 100644 --- a/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/hbasesnapshots/CleanupHBaseSnapshotRestoreFilesFnTest.java +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/hbasesnapshots/CleanupHBaseSnapshotRestoreFilesFnTest.java @@ -18,14 +18,15 @@ import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertThrows; +import java.util.UUID; import org.junit.Test; public class CleanupHBaseSnapshotRestoreFilesFnTest { private static final String TEST_BUCKET_NAME = "test-bucket"; private static final String TEST_SNAPSHOT_PATH = "gs://" + TEST_BUCKET_NAME + "/hbase-export"; - private static final String TEST_RESTORE_PATH = HBaseSnapshotInputConfigBuilder.RESTORE_DIR; - private static final String TEST_RESTORE_PREFIX = - HBaseSnapshotInputConfigBuilder.RESTORE_DIR.substring(1); + private static final String TEST_RESTORE_PATH = + HBaseSnapshotInputConfigBuilder.RESTORE_DIR + UUID.randomUUID(); + private static final String TEST_RESTORE_PREFIX = TEST_RESTORE_PATH.substring(1); @Test public void testGetWorkingBucketName() { From 87b0efff3e5dea8e7d174ed513f6b8ee13b9fe1d Mon Sep 17 00:00:00 2001 From: Cheng Li Date: Tue, 26 Jan 2021 
15:43:51 +0000 Subject: [PATCH 32/36] Add dependency to pom.xml --- bigtable-dataflow-parent/bigtable-beam-import/pom.xml | 8 ++++++++ .../hbasesnapshots/HBaseSnapshotInputConfigBuilder.java | 2 +- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/bigtable-dataflow-parent/bigtable-beam-import/pom.xml b/bigtable-dataflow-parent/bigtable-beam-import/pom.xml index d824e7ef22..c35c5738e2 100644 --- a/bigtable-dataflow-parent/bigtable-beam-import/pom.xml +++ b/bigtable-dataflow-parent/bigtable-beam-import/pom.xml @@ -175,6 +175,14 @@ limitations under the License. shaded
+ + + com.google.apis + google-api-services-storage + v1-rev171-1.25.0 + + + org.apache.beam diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/HBaseSnapshotInputConfigBuilder.java b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/HBaseSnapshotInputConfigBuilder.java index bc36231f08..b1e15b7aca 100644 --- a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/HBaseSnapshotInputConfigBuilder.java +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/HBaseSnapshotInputConfigBuilder.java @@ -126,7 +126,7 @@ public Configuration createHBaseConfiguration() { // Setup GCS connector to use GCS as Hadoop filesystem conf.set("fs.AbstractFileSystem.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS"); conf.set("fs.gs.project.id", projectId); - conf.set("google.cloud.auth.service.account.enable", "true"); + conf.setBoolean("google.cloud.auth.service.account.enable", true); // Setup MapReduce config for TableSnapshotInputFormat conf.setClass( From c967d17c35b0e827d1da016a2583c92f2ab73546 Mon Sep 17 00:00:00 2001 From: Cheng Li Date: Thu, 28 Jan 2021 03:13:20 +0000 Subject: [PATCH 33/36] minimize accessibility for class --- .../beam/hbasesnapshots/CleanupHBaseSnapshotRestoreFilesFn.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/CleanupHBaseSnapshotRestoreFilesFn.java b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/CleanupHBaseSnapshotRestoreFilesFn.java index 57774ea252..326caeef5e 100644 --- a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/CleanupHBaseSnapshotRestoreFilesFn.java +++ 
b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/CleanupHBaseSnapshotRestoreFilesFn.java @@ -29,7 +29,7 @@ import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; -public class CleanupHBaseSnapshotRestoreFilesFn extends DoFn, Boolean> { +class CleanupHBaseSnapshotRestoreFilesFn extends DoFn, Boolean> { private static final Log LOG = LogFactory.getLog(CleanupHBaseSnapshotRestoreFilesFn.class); @ProcessElement From 59e7a551d77d5a66895d0f22184aa7bfe64a57c2 Mon Sep 17 00:00:00 2001 From: Cheng Li Date: Thu, 28 Jan 2021 14:52:19 +0000 Subject: [PATCH 34/36] Fix typo and Add header comment for CleanupHBaseSnapshotRestoreFilesFn --- .../CleanupHBaseSnapshotRestoreFilesFn.java | 9 +++++++-- .../hbasesnapshots/HBaseSnapshotInputConfigBuilder.java | 3 ++- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/CleanupHBaseSnapshotRestoreFilesFn.java b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/CleanupHBaseSnapshotRestoreFilesFn.java index 326caeef5e..c862347322 100644 --- a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/CleanupHBaseSnapshotRestoreFilesFn.java +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/CleanupHBaseSnapshotRestoreFilesFn.java @@ -29,6 +29,10 @@ import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; +/** + * A {@link DoFn} that could be used for cleaning up temp files generated during HBase snapshot + * scans in Google Cloud Storage(GCS) bucket via GCS connector. 
+ */ class CleanupHBaseSnapshotRestoreFilesFn extends DoFn, Boolean> { private static final Log LOG = LogFactory.getLog(CleanupHBaseSnapshotRestoreFilesFn.class); @@ -65,11 +69,12 @@ public void processElement(ProcessContext context) throws IOException { public static String getWorkingBucketName(String hbaseSnapshotDir) { Preconditions.checkArgument( - hbaseSnapshotDir.startsWith("gs://"), "snapshot folder must be hosted in a GCS bucket "); + hbaseSnapshotDir.startsWith(GcsPath.SCHEME), + "snapshot folder must be hosted in a GCS bucket "); return GcsPath.fromUri(hbaseSnapshotDir).getBucket(); } - // getListPrefix convert absolut restorePath in a Hadoop filesystem + // getListPrefix convert absolute restorePath in a Hadoop filesystem // to a match prefix in a GCS bucket public static String getListPrefix(String restorePath) { Preconditions.checkArgument( diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/HBaseSnapshotInputConfigBuilder.java b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/HBaseSnapshotInputConfigBuilder.java index b1e15b7aca..e3ea3342af 100644 --- a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/HBaseSnapshotInputConfigBuilder.java +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/HBaseSnapshotInputConfigBuilder.java @@ -16,6 +16,7 @@ package com.google.cloud.bigtable.beam.hbasesnapshots; import com.google.common.base.Preconditions; +import org.apache.beam.sdk.extensions.gcp.util.gcsfs.GcsPath; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configuration; @@ -98,7 +99,7 @@ public Configuration build() throws Exception { Preconditions.checkNotNull(hbaseSnapshotSourceDir); Preconditions.checkNotNull(snapshotName); Preconditions.checkState( - 
hbaseSnapshotSourceDir.startsWith("gs://"), + hbaseSnapshotSourceDir.startsWith(GcsPath.SCHEME), "snapshot folder must be hosted in a GCS bucket "); Configuration conf = createHBaseConfiguration(); From 2596d882081559df8412e790b7b64ec5b0a5a3b8 Mon Sep 17 00:00:00 2001 From: Cheng Li Date: Mon, 1 Feb 2021 18:38:21 +0000 Subject: [PATCH 35/36] Adding more error messages for HBaseSnapshotInputConfigBuilder Minor code refactoring to reduce confusion --- .../CleanupHBaseSnapshotRestoreFilesFn.java | 9 ++++----- .../HBaseSnapshotInputConfigBuilder.java | 7 ++++--- .../ImportJobFromHbaseSnapshot.java | 18 +++++++----------- 3 files changed, 15 insertions(+), 19 deletions(-) diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/CleanupHBaseSnapshotRestoreFilesFn.java b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/CleanupHBaseSnapshotRestoreFilesFn.java index c862347322..e0bdca69d5 100644 --- a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/CleanupHBaseSnapshotRestoreFilesFn.java +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/CleanupHBaseSnapshotRestoreFilesFn.java @@ -20,7 +20,6 @@ import java.io.IOException; import java.util.ArrayList; import java.util.List; -import java.util.stream.Collectors; import org.apache.beam.sdk.extensions.gcp.options.GcpOptions; import org.apache.beam.sdk.extensions.gcp.util.GcsUtil; import org.apache.beam.sdk.extensions.gcp.util.gcsfs.GcsPath; @@ -57,10 +56,10 @@ public void processElement(ProcessContext context) throws IOException { if (objects.getItems() == null) { break; } - results.addAll( - objects.getItems().stream() - .map(storageObject -> GcsPath.fromObject(storageObject).toString()) - .collect(Collectors.toList())); + + objects.getItems().stream() + .map(storageObject -> 
GcsPath.fromObject(storageObject).toString()) + .forEach(results::add); pageToken = objects.getNextPageToken(); } while (pageToken != null); gcsUtil.remove(results); diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/HBaseSnapshotInputConfigBuilder.java b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/HBaseSnapshotInputConfigBuilder.java index e3ea3342af..63ebddb20a 100644 --- a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/HBaseSnapshotInputConfigBuilder.java +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/HBaseSnapshotInputConfigBuilder.java @@ -95,9 +95,10 @@ public String getRestoreDir() { } public Configuration build() throws Exception { - Preconditions.checkNotNull(projectId); - Preconditions.checkNotNull(hbaseSnapshotSourceDir); - Preconditions.checkNotNull(snapshotName); + Preconditions.checkNotNull(projectId, "Required value projectId must be set"); + Preconditions.checkNotNull( + hbaseSnapshotSourceDir, "Required value hbaseSnapshotSourceDir must be set"); + Preconditions.checkNotNull(snapshotName, "Required value snapshotName must be set"); Preconditions.checkState( hbaseSnapshotSourceDir.startsWith(GcsPath.SCHEME), "snapshot folder must be hosted in a GCS bucket "); diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/ImportJobFromHbaseSnapshot.java b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/ImportJobFromHbaseSnapshot.java index e15f203086..ce994ffe9c 100644 --- a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/ImportJobFromHbaseSnapshot.java +++ 
b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/ImportJobFromHbaseSnapshot.java @@ -36,7 +36,6 @@ import org.apache.beam.sdk.values.PCollection; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; -import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hbase.client.Result; import org.apache.hadoop.hbase.io.ImmutableBytesWritable; @@ -98,17 +97,18 @@ public static void main(String[] args) throws Exception { static Pipeline buildPipeline(ImportOptions opts) throws Exception { Pipeline pipeline = Pipeline.create(Utils.tweakOptions(opts)); - Configuration configuration = + HBaseSnapshotInputConfigBuilder configurationBuilder = new HBaseSnapshotInputConfigBuilder() .setProjectId(opts.getProject()) .setHbaseSnapshotSourceDir(opts.getHbaseSnapshotSourceDir()) .setSnapshotName(opts.getSnapshotName()) .setRestoreDirSuffix(opts.getJobName()) - .build(); + .setRestoreDirSuffix(opts.getJobName()); PCollection> readResult = pipeline.apply( "Read from HBase Snapshot", - HadoopFormatIO.read().withConfiguration(configuration)); + HadoopFormatIO.read() + .withConfiguration(configurationBuilder.build())); readResult .apply("Create Mutations", ParDo.of(new HBaseResultToMutationFn())) @@ -116,15 +116,11 @@ static Pipeline buildPipeline(ImportOptions opts) throws Exception { "Write to Bigtable", CloudBigtableIO.writeToTable(TemplateUtils.BuildImportConfig(opts))); - final List> tempFiles = + final List> sourceAndRestoreFolders = Arrays.asList( - KV.of( - opts.getHbaseSnapshotSourceDir(), - new HBaseSnapshotInputConfigBuilder() - .setRestoreDirSuffix(opts.getJobName()) - .getRestoreDir())); + KV.of(opts.getHbaseSnapshotSourceDir(), configurationBuilder.getRestoreDir())); pipeline - .apply(Create.of(tempFiles)) + .apply(Create.of(sourceAndRestoreFolders)) .apply(Wait.on(readResult)) .apply(ParDo.of(new CleanupHBaseSnapshotRestoreFilesFn())); From 
94270af1b1867c2f601fd088814d687198cb19c9 Mon Sep 17 00:00:00 2001 From: Cheng Li Date: Mon, 1 Feb 2021 19:18:16 +0000 Subject: [PATCH 36/36] Add document about how to handle temp files during job failures --- .../beam/hbasesnapshots/ImportJobFromHbaseSnapshot.java | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/ImportJobFromHbaseSnapshot.java b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/ImportJobFromHbaseSnapshot.java index ce994ffe9c..2d8ce7c31f 100644 --- a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/ImportJobFromHbaseSnapshot.java +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/ImportJobFromHbaseSnapshot.java @@ -58,6 +58,10 @@ * --hbaseSnapshotSourceDir=gs://$HBASE_EXPORT_ROOT_PATH \ * --snapshotName=$SNAPSHOT_NAME * + * + * Note that in the case of job failures, the temp files generated in the .restore-$JOB_NAME + * directory under the snapshot export bucket will not get deleted. Hence one needs to either launch + * a replacement job with the same jobName to re-run the job or manually delete this directory. */ @InternalExtensionOnly public class ImportJobFromHbaseSnapshot {