diff --git a/.kokoro/nightly/integration-beam.cfg b/.kokoro/nightly/integration-beam.cfg new file mode 100644 index 0000000000..f91f157259 --- /dev/null +++ b/.kokoro/nightly/integration-beam.cfg @@ -0,0 +1,38 @@ +# Format: //devtools/kokoro/config/proto/build.proto + +# Configure the docker image for kokoro-trampoline. +env_vars: { + key: "TRAMPOLINE_IMAGE" + value: "gcr.io/cloud-devrel-kokoro-resources/java8" +} + +env_vars: { + key: "INTEGRATION_TEST_ARGS" + value: "-PbeamIntegrationTest -Dgoogle.bigtable.project.id=gcloud-devel -Dgoogle.bigtable.instance.id=google-cloud-bigtable -Dgoogle.dataflow.stagingLocation=gs://java-bigtable-hbase-testing/staging -Dcloud.test.data.folder=gs://java-bigtable-hbase-testing/hbase-snapshot-import-integration-tests -Dregion=us-central1" +} + +env_vars: { + key: "JOB_TYPE" + value: "integration" +} + +# TODO: remove this after we've migrated all tests and scripts +env_vars: { + key: "GCLOUD_PROJECT" + value: "gcloud-devel" +} + +env_vars: { + key: "GOOGLE_CLOUD_PROJECT" + value: "gcloud-devel" +} + +env_vars: { + key: "GOOGLE_APPLICATION_CREDENTIALS" + value: "secret_manager/java-it-service-account" +} + +env_vars: { + key: "SECRET_MANAGER_KEYS" + value: "java-it-service-account" +} diff --git a/.kokoro/presubmit/integration-beam.cfg b/.kokoro/presubmit/integration-beam.cfg new file mode 100644 index 0000000000..f91f157259 --- /dev/null +++ b/.kokoro/presubmit/integration-beam.cfg @@ -0,0 +1,38 @@ +# Format: //devtools/kokoro/config/proto/build.proto + +# Configure the docker image for kokoro-trampoline. 
+env_vars: { + key: "TRAMPOLINE_IMAGE" + value: "gcr.io/cloud-devrel-kokoro-resources/java8" +} + +env_vars: { + key: "INTEGRATION_TEST_ARGS" + value: "-PbeamIntegrationTest -Dgoogle.bigtable.project.id=gcloud-devel -Dgoogle.bigtable.instance.id=google-cloud-bigtable -Dgoogle.dataflow.stagingLocation=gs://java-bigtable-hbase-testing/staging -Dcloud.test.data.folder=gs://java-bigtable-hbase-testing/hbase-snapshot-import-integration-tests -Dregion=us-central1" +} + +env_vars: { + key: "JOB_TYPE" + value: "integration" +} + +# TODO: remove this after we've migrated all tests and scripts +env_vars: { + key: "GCLOUD_PROJECT" + value: "gcloud-devel" +} + +env_vars: { + key: "GOOGLE_CLOUD_PROJECT" + value: "gcloud-devel" +} + +env_vars: { + key: "GOOGLE_APPLICATION_CREDENTIALS" + value: "secret_manager/java-it-service-account" +} + +env_vars: { + key: "SECRET_MANAGER_KEYS" + value: "java-it-service-account" +} diff --git a/bigtable-dataflow-parent/bigtable-beam-import/README.md b/bigtable-dataflow-parent/bigtable-beam-import/README.md index 7d7b4025ec..783de14443 100644 --- a/bigtable-dataflow-parent/bigtable-beam-import/README.md +++ b/bigtable-dataflow-parent/bigtable-beam-import/README.md @@ -4,8 +4,8 @@ This project supports importing and exporting HBase Sequence Files to Google Clo Cloud Dataflow. ## Instructions - -Download [the import/export jar](http://search.maven.org/remotecontent?filepath=com/google/cloud/bigtable/bigtable-beam-import/1.1.0/bigtable-beam-import-1.1.0-shaded.jar), which is an aggregation of all required jars. +[//]: # ({x-version-update-start:bigtable-dataflow-parent:released}) +Download [the import/export jar](http://search.maven.org/remotecontent?filepath=com/google/cloud/bigtable/bigtable-beam-import/1.19.3/bigtable-beam-import-1.19.3-shaded.jar), which is an aggregation of all required jars. Please pay attention to the Cluster CPU usage and adjust the number of Dataflow workers accordingly. 
@@ -14,7 +14,7 @@ Please pay attention to the Cluster CPU usage and adjust the number of Dataflow On the command line: ``` -java -jar bigtable-beam-import-1.1.0-shaded.jar export \ +java -jar bigtable-beam-import-1.19.3-shaded.jar export \ --runner=dataflow \ --project=[your_project_id] \ --bigtableInstanceId=[your_instance_id] \ @@ -32,7 +32,7 @@ Create the table in your cluster. On the command line: ``` -java -jar bigtable-beam-import-1.1.0-shaded.jar import \ +java -jar bigtable-beam-import-1.19.3-shaded.jar import \ --runner=dataflow \ --project=[your_project_id] \ --bigtableInstanceId=[your_instance_id] \ @@ -42,3 +42,4 @@ java -jar bigtable-beam-import-1.1.0-shaded.jar import \ --maxNumWorkers=[3x number of nodes] \ --zone=[zone of your cluster] ``` +[//]: # ({x-version-update-end}) diff --git a/bigtable-dataflow-parent/bigtable-beam-import/pom.xml b/bigtable-dataflow-parent/bigtable-beam-import/pom.xml index ed488f86c4..2448fdea12 100644 --- a/bigtable-dataflow-parent/bigtable-beam-import/pom.xml +++ b/bigtable-dataflow-parent/bigtable-beam-import/pom.xml @@ -25,7 +25,7 @@ limitations under the License. bigtable-beam-import - com.google.cloud.bigtable.beam.sequencefiles.Main + com.google.cloud.bigtable.beam.Main @@ -46,6 +46,12 @@ limitations under the License. ${project.groupId} bigtable-hbase-beam ${project.version} + + + org.apache.hbase + hbase-shaded-client + + com.google.cloud.bigtable @@ -64,9 +70,12 @@ limitations under the License. io.opencensus * + + org.apache.hbase + hbase-shaded-client + - org.apache.beam beam-sdks-java-core @@ -84,10 +93,18 @@ limitations under the License. beam-sdks-java-io-hadoop-common ${beam.version} + + org.apache.beam + beam-sdks-java-io-hadoop-format + ${beam.version} + + org.apache.hbase - hbase-shaded-client + hbase-shaded-server ${hbase.version} @@ -101,7 +118,7 @@ limitations under the License. com.google.guava guava - ${beam-guava.version} + ${gcs-guava.version} @@ -121,6 +138,12 @@ limitations under the License. 
+ + com.google.code.findbugs + jsr305 + ${jsr305.version} + + @@ -146,6 +169,21 @@ limitations under the License. slf4j-api ${slf4j.version} + + + com.google.cloud.bigdataoss + gcs-connector + hadoop2-2.1.4 + shaded + + + + + com.google.apis + google-api-services-storage + v1-rev171-1.25.0 + + @@ -178,6 +216,24 @@ limitations under the License. ${junit.version} test + + org.apache.hbase + hbase-shaded-testing-util + ${hbase.version} + test + + + com.google.truth + truth + 1.0.1 + test + + + com.google.cloud + google-cloud-bigtable-emulator + 0.124.0 + test + @@ -213,7 +269,7 @@ limitations under the License. org.codehaus.mojo exec-maven-plugin - 1.6.0 + 3.0.0 @@ -229,7 +285,7 @@ limitations under the License. org.codehaus.mojo build-helper-maven-plugin - 3.0.0 + 3.2.0 add-source @@ -262,6 +318,16 @@ limitations under the License. + + + *:* + + META-INF/*.SF + META-INF/*.DSA + META-INF/*.RSA + + + @@ -332,7 +398,7 @@ limitations under the License. 1 - **/*IT.java + **/sequencefiles/*IT.java beamIntegrationTest @@ -364,6 +431,7 @@ limitations under the License. 1 **/CloudBigtableBeamITTest.java + **/*IT.java bigtable-beam @@ -373,5 +441,38 @@ limitations under the License. 
+ + + hbasesnapshotsIntegrationTest + + + + false + + + + + org.apache.maven.plugins + maven-failsafe-plugin + + + hbasesnapshots-integration-test + + integration-test + + integration-test + + 1 + + **/hbasesnapshots/*IT.java + + false + + + + + + + diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/Main.java b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/Main.java new file mode 100644 index 0000000000..1f52f5125a --- /dev/null +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/Main.java @@ -0,0 +1,83 @@ +/* + * Copyright 2021 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.google.cloud.bigtable.beam; + +import com.google.bigtable.repackaged.com.google.api.core.InternalApi; +import com.google.bigtable.repackaged.com.google.api.core.InternalExtensionOnly; +import com.google.cloud.bigtable.beam.hbasesnapshots.ImportJobFromHbaseSnapshot; +import com.google.cloud.bigtable.beam.sequencefiles.CreateTableHelper; +import com.google.cloud.bigtable.beam.sequencefiles.ExportJob; +import com.google.cloud.bigtable.beam.sequencefiles.ImportJob; +import com.google.cloud.bigtable.beam.validation.SyncTableJob; +import java.io.File; +import java.net.URISyntaxException; +import java.util.Arrays; + +/** Entry point for create-table/import/export job submission. 
*/ +@InternalExtensionOnly +final class Main { + /** For internal use only - public for technical reasons. */ + @InternalApi("For internal usage only") + public Main() {} + + public static void main(String[] args) throws Exception { + if (args.length < 1) { + usage(); + System.exit(1); + } + + String[] subArgs = Arrays.copyOfRange(args, 1, args.length); // args[0] picks the job; the rest is forwarded + + switch (args[0]) { + case "export": + ExportJob.main(subArgs); + break; + case "import": + ImportJob.main(subArgs); + break; + case "importsnapshot": + ImportJobFromHbaseSnapshot.main(subArgs); + break; + case "create-table": + CreateTableHelper.main(subArgs); + break; + case "sync-table": + SyncTableJob.main(subArgs); + break; + default: // unknown action: print usage and fail + usage(); + System.exit(1); + } + } + + private static void usage() { + String jarName; + + try { + jarName = + new File(Main.class.getProtectionDomain().getCodeSource().getLocation().toURI().getPath()) + .getName(); + } catch (URISyntaxException e) { + jarName = "<jar>"; // placeholder when the jar location cannot be resolved + } + + System.out.printf(
+ "java -jar %s <action> <action_params>\n" + + "Where <action> can be 'export', 'import', 'importsnapshot', 'create-table' or 'sync-table'. To get further help, run: \n" + + "java -jar %s <action> --help\n", + jarName, jarName); + } +} diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/TemplateUtils.java b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/TemplateUtils.java index e64507317b..f839a50b23 100644 --- a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/TemplateUtils.java +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/TemplateUtils.java @@ -26,6 +26,7 @@ import com.google.bigtable.repackaged.com.google.cloud.bigtable.data.v2.models.Query; import com.google.cloud.bigtable.beam.sequencefiles.ExportJob.ExportOptions; import com.google.cloud.bigtable.beam.sequencefiles.ImportJob.ImportOptions; +import com.google.cloud.bigtable.beam.validation.SyncTableJob.SyncTableOptions; import com.google.cloud.bigtable.hbase.BigtableOptionsFactory; import com.google.cloud.bigtable.hbase.adapters.Adapters; import com.google.cloud.bigtable.hbase.adapters.read.DefaultReadHooks; @@ -72,6 +73,19 @@ public static CloudBigtableTableConfiguration BuildImportConfig(ImportOptions op return builder.build(); } + /** Builds CloudBigtableTableConfiguration from input runtime parameters for sync-table job. */ + public static CloudBigtableTableConfiguration BuildSyncTableConfig(SyncTableOptions opts) { + CloudBigtableTableConfiguration.Builder builder = + new CloudBigtableTableConfiguration.Builder() + .withProjectId(opts.getBigtableProject()) + .withInstanceId(opts.getBigtableInstanceId()) + .withTableId(opts.getBigtableTableId()); + if (opts.getBigtableAppProfileId() != null) { + builder.withAppProfileId(opts.getBigtableAppProfileId()); + } + return builder.build(); + } + /** Provides a request that is constructed with some attributes. 
*/ private static class RequestValueProvider implements ValueProvider, Serializable { diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/CleanupHBaseSnapshotRestoreFilesFn.java b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/CleanupHBaseSnapshotRestoreFilesFn.java new file mode 100644 index 0000000000..e0bdca69d5 --- /dev/null +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/CleanupHBaseSnapshotRestoreFilesFn.java @@ -0,0 +1,84 @@ +/* + * Copyright 2021 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.google.cloud.bigtable.beam.hbasesnapshots; + +import com.google.api.services.storage.model.Objects; +import com.google.common.base.Preconditions; +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; +import org.apache.beam.sdk.extensions.gcp.options.GcpOptions; +import org.apache.beam.sdk.extensions.gcp.util.GcsUtil; +import org.apache.beam.sdk.extensions.gcp.util.gcsfs.GcsPath; +import org.apache.beam.sdk.transforms.DoFn; +import org.apache.beam.sdk.values.KV; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; + +/** + * A {@link DoFn} that could be used for cleaning up temp files generated during HBase snapshot + * scans in Google Cloud Storage(GCS) bucket via GCS connector. 
+ */ +class CleanupHBaseSnapshotRestoreFilesFn extends DoFn<KV<String, String>, Boolean> { + private static final Log LOG = LogFactory.getLog(CleanupHBaseSnapshotRestoreFilesFn.class); + + @ProcessElement + public void processElement(ProcessContext context) throws IOException { + KV<String, String> elem = context.element(); // key: snapshot source dir, value: restore path + + String hbaseSnapshotDir = elem.getKey(); + String restorePath = elem.getValue(); + String prefix = getListPrefix(restorePath); + String bucketName = getWorkingBucketName(hbaseSnapshotDir); + Preconditions.checkState( + !prefix.isEmpty() && !hbaseSnapshotDir.contains(String.format("%s/%s", bucketName, prefix)), + "restore folder should not be empty or a subfolder of hbaseSnapshotSourceDir"); + GcpOptions gcpOptions = context.getPipelineOptions().as(GcpOptions.class); + GcsUtil gcsUtil = new GcsUtil.GcsUtilFactory().create(gcpOptions); + + String pageToken = null; + List<String> results = new ArrayList<>(); // paths of every object listed under the prefix + do { + Objects objects = gcsUtil.listObjects(bucketName, prefix, pageToken); + if (objects.getItems() == null) { // page carries no items: stop listing + break; + } + + objects.getItems().stream() + .map(storageObject -> GcsPath.fromObject(storageObject).toString()) + .forEach(results::add); + pageToken = objects.getNextPageToken(); + } while (pageToken != null); + gcsUtil.remove(results); + context.output(true); + } + + public static String getWorkingBucketName(String hbaseSnapshotDir) { + Preconditions.checkArgument( + hbaseSnapshotDir.startsWith(GcsPath.SCHEME), + "snapshot folder must be hosted in a GCS bucket "); + + return GcsPath.fromUri(hbaseSnapshotDir).getBucket(); + } + // getListPrefix converts an absolute restorePath in a Hadoop filesystem + // into a match prefix in a GCS bucket by dropping the leading '/' + public static String getListPrefix(String restorePath) { + Preconditions.checkArgument( + restorePath.startsWith("/"), + "restore folder must be an absolute path in current filesystem"); + return restorePath.substring(1); + } +} diff --git
a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/HBaseSnapshotInputConfigBuilder.java b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/HBaseSnapshotInputConfigBuilder.java new file mode 100644 index 0000000000..63ebddb20a --- /dev/null +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/HBaseSnapshotInputConfigBuilder.java @@ -0,0 +1,140 @@ +/* + * Copyright 2021 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package com.google.cloud.bigtable.beam.hbasesnapshots; + +import com.google.common.base.Preconditions; +import org.apache.beam.sdk.extensions.gcp.util.gcsfs.GcsPath; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hbase.HBaseConfiguration; +import org.apache.hadoop.hbase.client.Result; +import org.apache.hadoop.hbase.client.Scan; +import org.apache.hadoop.hbase.io.ImmutableBytesWritable; +import org.apache.hadoop.hbase.mapreduce.TableInputFormat; +import org.apache.hadoop.hbase.mapreduce.TableSnapshotInputFormat; +import org.apache.hadoop.hbase.protobuf.ProtobufUtil; +import org.apache.hadoop.hbase.protobuf.generated.ClientProtos; +import org.apache.hadoop.hbase.util.Base64; +import org.apache.hadoop.io.Writable; +import org.apache.hadoop.mapreduce.InputFormat; +import org.apache.hadoop.mapreduce.Job; + +/** + * A {@link Configuration} that could be used in {@link HadoopFormatIO} for reading HBase snapshot + * hosted in Google Cloud Storage(GCS) bucket via GCS connector. It uses {@link + * TableSnapshotInputFormat} for reading HBase snapshots. 
+ */ +class HBaseSnapshotInputConfigBuilder { + + private static final Log LOG = LogFactory.getLog(HBaseSnapshotInputConfigBuilder.class); + // Batch size used for HBase snapshot scans + private static final int BATCH_SIZE = 1000; + + // Base path of the temp location for metadata extracted from the snapshot (suffix is appended) + public static final String RESTORE_DIR = "/.restore"; + + private String projectId; + private String hbaseSnapshotSourceDir; + private String snapshotName; + private String restoreDirSuffix; + + public HBaseSnapshotInputConfigBuilder() {} + + /** + * Set the project id used to access the GCS bucket with HBase snapshot data to be imported + */ + public HBaseSnapshotInputConfigBuilder setProjectId(String projectId) { + this.projectId = projectId; + return this; + } + + /** + * Set the GCS path where the HBase snapshot data is located + */ + public HBaseSnapshotInputConfigBuilder setHbaseSnapshotSourceDir(String hbaseSnapshotSourceDir) { + this.hbaseSnapshotSourceDir = hbaseSnapshotSourceDir; + return this; + } + + /** + * Set the name of the snapshot to be imported + * e.g. when importing snapshot 'gs://<bucket>/hbase-export/table_snapshot' + * put 'table_snapshot' as the {@code snapshotName} + * and 'gs://<bucket>/hbase-export' as {@code hbaseSnapshotSourceDir} + */ + public HBaseSnapshotInputConfigBuilder setSnapshotName(String snapshotName) { + this.snapshotName = snapshotName; + return this; + } + + /** + * Set the unique suffix to be used for the restore folder to avoid conflicts + */ + public HBaseSnapshotInputConfigBuilder setRestoreDirSuffix(String suffix) { + this.restoreDirSuffix = suffix; + return this; + } + + public String getRestoreDir() { + return RESTORE_DIR + this.restoreDirSuffix; + } + + public Configuration build() throws Exception { + Preconditions.checkNotNull(projectId, "Required value projectId must be set"); + Preconditions.checkNotNull( + hbaseSnapshotSourceDir, "Required value hbaseSnapshotSourceDir must be set");
+ Preconditions.checkNotNull(snapshotName, "Required value snapshotName must be set"); + Preconditions.checkState( + hbaseSnapshotSourceDir.startsWith(GcsPath.SCHEME), + "snapshot folder must be hosted in a GCS bucket "); + + Configuration conf = createHBaseConfiguration(); + + // Configure a MapReduce Job based on the HBaseConfiguration + // and return the job's Configuration + ClientProtos.Scan proto = ProtobufUtil.toScan(new Scan().setBatch(BATCH_SIZE)); + conf.set(TableInputFormat.SCAN, Base64.encodeBytes(proto.toByteArray())); + Job job = Job.getInstance(conf); // creates internal clone of conf + // the restore folder needs to be under the current bucket root so that it is considered + // part of the same filesystem as the hbaseSnapshotSourceDir + TableSnapshotInputFormat.setInput(job, snapshotName, new Path(getRestoreDir())); + return job.getConfiguration(); // extract the modified clone + } + + // split out of build() so the static part of the configuration can be unit tested + public Configuration createHBaseConfiguration() { + Configuration conf = HBaseConfiguration.create(); + + // Setup the input data location for HBase snapshot import + // hbaseSnapshotSourceDir should be a GCS bucket path.
+ conf.set("hbase.rootdir", hbaseSnapshotSourceDir); + conf.set("fs.defaultFS", hbaseSnapshotSourceDir); + + // Setup GCS connector to use GCS as Hadoop filesystem + conf.set("fs.AbstractFileSystem.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS"); + conf.set("fs.gs.project.id", projectId); + conf.setBoolean("google.cloud.auth.service.account.enable", true); + + // Setup MapReduce config for TableSnapshotInputFormat + conf.setClass( + "mapreduce.job.inputformat.class", TableSnapshotInputFormat.class, InputFormat.class); + conf.setClass("key.class", ImmutableBytesWritable.class, Writable.class); + conf.setClass("value.class", Result.class, Object.class); + return conf; + } +} diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/ImportJobFromHbaseSnapshot.java b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/ImportJobFromHbaseSnapshot.java new file mode 100644 index 0000000000..2d8ce7c31f --- /dev/null +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/ImportJobFromHbaseSnapshot.java @@ -0,0 +1,133 @@ +/* + * Copyright 2021 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package com.google.cloud.bigtable.beam.hbasesnapshots; + +import com.google.bigtable.repackaged.com.google.api.core.InternalExtensionOnly; +import com.google.cloud.bigtable.beam.CloudBigtableIO; +import com.google.cloud.bigtable.beam.TemplateUtils; +import com.google.cloud.bigtable.beam.sequencefiles.HBaseResultToMutationFn; +import com.google.cloud.bigtable.beam.sequencefiles.ImportJob; +import com.google.cloud.bigtable.beam.sequencefiles.Utils; +import com.google.common.annotations.VisibleForTesting; +import java.util.Arrays; +import java.util.List; +import org.apache.beam.sdk.Pipeline; +import org.apache.beam.sdk.PipelineResult; +import org.apache.beam.sdk.io.hadoop.format.HadoopFormatIO; +import org.apache.beam.sdk.options.Description; +import org.apache.beam.sdk.options.PipelineOptionsFactory; +import org.apache.beam.sdk.transforms.Create; +import org.apache.beam.sdk.transforms.ParDo; +import org.apache.beam.sdk.transforms.Wait; +import org.apache.beam.sdk.values.KV; +import org.apache.beam.sdk.values.PCollection; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.hbase.client.Result; +import org.apache.hadoop.hbase.io.ImmutableBytesWritable; + +/** + * A job that imports data from HBase snapshot exports hosted in Cloud Storage bucket into Cloud + * Bigtable. + * + *

Example: If you have exported your HBase Snapshot to GCS bucket gs://$HBASE_EXPORT_ROOT_PATH + * and want to import snapshot gs://$HBASE_EXPORT_ROOT_PATH/.hbase-snapshot/$SNAPSHOT_NAME into + * Cloud Bigtable $TABLE in $INSTANCE, execute the following command to run the job directly: + * + *

+ * mvn compile exec:java \
+ *   -DmainClass=com.google.cloud.bigtable.beam.hbasesnapshots.ImportJobFromHbaseSnapshot \
+ *   -Dexec.args="--runner=DataflowRunner \
+ *                --stagingLocation=gs://$STAGING_PATH \
+ *                --project=$PROJECT \
+ *                --bigtableInstanceId=$INSTANCE \
+ *                --bigtableTableId=$TABLE \
+ *                --hbaseSnapshotSourceDir=gs://$HBASE_EXPORT_ROOT_PATH \
+ *                --snapshotName=$SNAPSHOT_NAME"
+ * 
+ * + * Note that in the case of job failures, the temp files generated in the .restore$JOB_NAME + * directory under the snapshot export bucket will not get deleted. Hence one must either launch + * a replacement job with the same jobName to re-run the job or manually delete this directory. + */ +@InternalExtensionOnly +public class ImportJobFromHbaseSnapshot { + private static final Log LOG = LogFactory.getLog(ImportJobFromHbaseSnapshot.class); + + public interface ImportOptions extends ImportJob.ImportOptions { + @Description("The HBase root dir where HBase snapshot files resides.") + String getHbaseSnapshotSourceDir(); + + @SuppressWarnings("unused") + void setHbaseSnapshotSourceDir(String hbaseSnapshotSourceDir); + + @Description("Snapshot name") + String getSnapshotName(); + + @SuppressWarnings("unused") + void setSnapshotName(String snapshotName); + } + + public static void main(String[] args) throws Exception { + PipelineOptionsFactory.register(ImportOptions.class); + + ImportOptions opts = + PipelineOptionsFactory.fromArgs(args).withValidation().as(ImportOptions.class); + + LOG.info("Building Pipeline"); + Pipeline pipeline = buildPipeline(opts); + LOG.info("Running Pipeline"); + PipelineResult result = pipeline.run(); + + if (opts.getWait()) { + Utils.waitForPipelineToFinish(result); + } + } + + @VisibleForTesting + static Pipeline buildPipeline(ImportOptions opts) throws Exception { + + Pipeline pipeline = Pipeline.create(Utils.tweakOptions(opts)); + HBaseSnapshotInputConfigBuilder configurationBuilder = + new HBaseSnapshotInputConfigBuilder() + .setProjectId(opts.getProject()) + .setHbaseSnapshotSourceDir(opts.getHbaseSnapshotSourceDir()) + .setSnapshotName(opts.getSnapshotName()) + // the job name is the suffix that makes this job's restore folder unique + .setRestoreDirSuffix(opts.getJobName()); + PCollection<KV<ImmutableBytesWritable, Result>> readResult = + pipeline.apply( + "Read from HBase Snapshot", + HadoopFormatIO.<ImmutableBytesWritable, Result>read() + .withConfiguration(configurationBuilder.build()));
+ + readResult + .apply("Create Mutations", ParDo.of(new HBaseResultToMutationFn())) + .apply( + "Write to Bigtable", + CloudBigtableIO.writeToTable(TemplateUtils.BuildImportConfig(opts))); + + final List<KV<String, String>> sourceAndRestoreFolders = + Arrays.asList( + KV.of(opts.getHbaseSnapshotSourceDir(), configurationBuilder.getRestoreDir())); + pipeline + .apply(Create.of(sourceAndRestoreFolders)) + .apply(Wait.on(readResult)) // hold the cleanup back until the snapshot read is done + .apply(ParDo.of(new CleanupHBaseSnapshotRestoreFilesFn())); + + return pipeline; + } +} diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/sequencefiles/CreateTableHelper.java b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/sequencefiles/CreateTableHelper.java index b4b3862817..4c794ed7eb 100644 --- a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/sequencefiles/CreateTableHelper.java +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/sequencefiles/CreateTableHelper.java @@ -57,7 +57,7 @@ * intended to be a preparation step before running an {@link ImportJob}. 
*/ @InternalApi -class CreateTableHelper { +public class CreateTableHelper { private static final Log LOG = LogFactory.getLog(CreateTableHelper.class); @InternalApi diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/sequencefiles/HBaseResultToMutationFn.java b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/sequencefiles/HBaseResultToMutationFn.java index 6b2e628a5d..45954c7762 100644 --- a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/sequencefiles/HBaseResultToMutationFn.java +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/sequencefiles/HBaseResultToMutationFn.java @@ -15,6 +15,7 @@ */ package com.google.cloud.bigtable.beam.sequencefiles; +import com.google.bigtable.repackaged.com.google.api.core.InternalApi; import com.google.common.annotations.VisibleForTesting; import com.google.common.base.Function; import com.google.common.base.Predicate; @@ -43,7 +44,8 @@ * A {@link DoFn} function that converts a {@link Result} in the pipeline input to a {@link * Mutation} for output. 
*/ -class HBaseResultToMutationFn extends DoFn, Mutation> { +@InternalApi +public class HBaseResultToMutationFn extends DoFn, Mutation> { private static Logger logger = LoggerFactory.getLogger(HBaseResultToMutationFn.class); private static final long serialVersionUID = 1L; diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/sequencefiles/Utils.java b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/sequencefiles/Utils.java index 62bad8d92b..7098a239d8 100644 --- a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/sequencefiles/Utils.java +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/sequencefiles/Utils.java @@ -15,6 +15,7 @@ */ package com.google.cloud.bigtable.beam.sequencefiles; +import com.google.bigtable.repackaged.com.google.api.core.InternalApi; import org.apache.beam.runners.dataflow.DataflowRunner; import org.apache.beam.runners.dataflow.options.DataflowPipelineDebugOptions; import org.apache.beam.runners.dataflow.options.DataflowPipelineOptions; @@ -29,7 +30,8 @@ import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; -class Utils { +@InternalApi +public class Utils { private static final Log LOG = LogFactory.getLog(Utils.class); /** @@ -98,7 +100,7 @@ public ResourceId apply(String input) { * * @param result */ - static void waitForPipelineToFinish(PipelineResult result) { + public static void waitForPipelineToFinish(PipelineResult result) { try { // Check to see if we are creating a template. // This should throw {@link UnsupportedOperationException} when creating a template. 
diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/BufferedHadoopHashTableSource.java b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/BufferedHadoopHashTableSource.java
new file mode 100644
index 0000000000..e62b3c8215
--- /dev/null
+++ b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/BufferedHadoopHashTableSource.java
@@ -0,0 +1,199 @@
+/*
+ * Copyright 2021 Google Inc. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package com.google.cloud.bigtable.beam.validation;
+
+import static com.google.cloud.bigtable.beam.validation.SyncTableUtils.immutableBytesToString;
+
+import com.google.cloud.bigtable.beam.validation.HadoopHashTableSource.RangeHash;
+import com.google.common.base.Objects;
+import com.google.common.base.Preconditions;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import org.apache.beam.sdk.coders.Coder;
+import org.apache.beam.sdk.coders.KvCoder;
+import org.apache.beam.sdk.coders.ListCoder;
+import org.apache.beam.sdk.coders.StringUtf8Coder;
+import org.apache.beam.sdk.io.BoundedSource;
+import org.apache.beam.sdk.options.PipelineOptions;
+import org.apache.beam.sdk.values.KV;
+import org.apache.hadoop.hbase.util.Bytes;
+
+/**
+ * Buffers the RangeHashes generated by {@link HadoopHashTableSource}. This is an optimization that
+ * allows {@link ComputeAndValidateHashFromBigtableDoFn} to issue fewer ReadRow APIs with larger row
+ * ranges.
+ *
+ * <p>Hadoop HashTable output is sorted by row-key and contains a row-range and hash. Beam
+ * PCollections do not guarantee any ordering. To fetch a batch of ranges in 1 ReadRows operation,
+ * this source buffers them and outputs a {@code List<RangeHash>} guaranteeing the sorted order of
+ * ranges.
+ *
+ * <p>Emits a batch of sorted RangeHashes keyed by the start key of the first range.
+ */
+class BufferedHadoopHashTableSource extends BoundedSource<KV<String, List<RangeHash>>> {
+
+  private static final long serialVersionUID = 39842743L;
+
+  private static final int DEFAULT_BATCH_SIZE = 50;
+  private static final Coder<KV<String, List<RangeHash>>> CODER =
+      KvCoder.of(StringUtf8Coder.of(), ListCoder.of(RangeHashCoder.of()));
+
+  // Max number of RangeHashes to buffer.
+  private final int maxBufferSize;
+  private final HadoopHashTableSource hashTableSource;
+
+  /** Wraps {@code source}, emitting batches of at most {@code DEFAULT_BATCH_SIZE} RangeHashes. */
+  public BufferedHadoopHashTableSource(HadoopHashTableSource source) {
+    this(source, DEFAULT_BATCH_SIZE);
+  }
+
+  /**
+   * Wraps {@code hashTableSource}, emitting batches of at most {@code maxBufferSize} RangeHashes.
+   *
+   * @param hashTableSource source whose RangeHashes are buffered
+   * @param maxBufferSize maximum number of RangeHashes per emitted batch, must be positive
+   * @throws IllegalArgumentException if {@code maxBufferSize} is not positive
+   */
+  public BufferedHadoopHashTableSource(HadoopHashTableSource hashTableSource, int maxBufferSize) {
+    // A non-positive size would make the reader stop after the first RangeHash and silently drop
+    // the remainder of the workitem, so reject it up front.
+    Preconditions.checkArgument(
+        maxBufferSize > 0, "maxBufferSize must be positive, got %s", maxBufferSize);
+    this.hashTableSource = hashTableSource;
+    this.maxBufferSize = maxBufferSize;
+  }
+
+  /**
+   * Splits the wrapped {@link HadoopHashTableSource} and wraps each resulting source in a
+   * BufferedHadoopHashTableSource that uses the same {@code maxBufferSize} as this source.
+   */
+  @Override
+  public List<? extends BoundedSource<KV<String, List<RangeHash>>>> split(
+      long desiredBundleSizeBytes, PipelineOptions options) throws IOException {
+
+    @SuppressWarnings("unchecked")
+    List<HadoopHashTableSource> splitHashTableSources =
+        (List<HadoopHashTableSource>) hashTableSource.split(desiredBundleSizeBytes, options);
+
+    List<BufferedHadoopHashTableSource> splitSources =
+        new ArrayList<>(splitHashTableSources.size());
+    // Keep the splits same as HashTableSource.
+    for (HadoopHashTableSource splitHashTableSource : splitHashTableSources) {
+      // Propagate the configured buffer size; otherwise every split would fall back to the
+      // default batch size.
+      splitSources.add(new BufferedHadoopHashTableSource(splitHashTableSource, maxBufferSize));
+    }
+    return splitSources;
+  }
+
+  @Override
+  public Coder<KV<String, List<RangeHash>>> getOutputCoder() {
+    return CODER;
+  }
+
+  @Override
+  public long getEstimatedSizeBytes(PipelineOptions options) throws Exception {
+    // Buffering does not change the amount of data read, delegate to the wrapped source.
+    return hashTableSource.getEstimatedSizeBytes(options);
+  }
+
+  @Override
+  public BoundedReader<KV<String, List<RangeHash>>> createReader(PipelineOptions options)
+      throws IOException {
+    return new BufferedHashBasedReader(this, hashTableSource.createReader(options));
+  }
+
+  @Override
+  public boolean equals(Object o) {
+    if (this == o) {
+      return true;
+    }
+    if (!(o instanceof BufferedHadoopHashTableSource)) {
+      return false;
+    }
+    BufferedHadoopHashTableSource that = (BufferedHadoopHashTableSource) o;
+    return maxBufferSize == that.maxBufferSize
+        && Objects.equal(hashTableSource, that.hashTableSource);
+  }
+
+  @Override
+  public int hashCode() {
+    return Objects.hashCode(maxBufferSize, hashTableSource);
+  }
+
+  @Override
+  public String toString() {
+    return "BufferedHadoopHashTableSource ["
+        + immutableBytesToString(hashTableSource.startRowInclusive)
+        + ", "
+        + immutableBytesToString(hashTableSource.stopRowExclusive)
+        + "), maxBufferSize="
+        + maxBufferSize;
+  }
+
+  /** Reader that groups the RangeHashes of the wrapped reader into batches. */
+  private static class BufferedHashBasedReader extends BoundedReader<KV<String, List<RangeHash>>> {
+
+    private final BoundedReader<RangeHash> hashReader;
+    private final BufferedHadoopHashTableSource source;
+
+    // Current batch. Replaced (not cleared) on every advance() because the previous list has
+    // already been handed out by getCurrent().
+    private List<RangeHash> buffer;
+
+    public BufferedHashBasedReader(
+        BufferedHadoopHashTableSource source, BoundedReader<RangeHash> hashReader) {
+      this.source = source;
+      this.hashReader = hashReader;
+      this.buffer = new ArrayList<>(source.maxBufferSize);
+    }
+
+    @Override
+    public boolean start() throws IOException {
+      if (!hashReader.start()) {
+        // HashReader does not have any hashes, return empty reader.
+        return false;
+      }
+      // Start returned true, consume the current RangeHash.
+      buffer.add(hashReader.getCurrent());
+      bufferRangeHashes();
+      // Buffer is not empty, return true to consume the current buffer.
+      return true;
+    }
+
+    // Reads from hashReader and buffers the RangeHashes until the buffer holds maxBufferSize
+    // entries or hashReader is exhausted.
+    // Returns true if any RangeHashes were read from hashReader.
+    private boolean bufferRangeHashes() throws IOException {
+      boolean readRangeHashes = false;
+      while (buffer.size() < source.maxBufferSize && hashReader.advance()) {
+        readRangeHashes = true;
+        buffer.add(hashReader.getCurrent());
+      }
+      return readRangeHashes;
+    }
+
+    @Override
+    public boolean advance() throws IOException {
+      // Reset the buffer for next batch.
+      buffer = new ArrayList<>(source.maxBufferSize);
+
+      return bufferRangeHashes();
+    }
+
+    @Override
+    public KV<String, List<RangeHash>> getCurrent() {
+      // getCurrent only gets called when buffer is not empty.
+      Preconditions.checkState(
+          !buffer.isEmpty(), "getCurrent() should only be called when start/advance return true.");
+      // GroupBy key is a string and not ImmutableBytesWritable because the WritableCoder is not
+      // deterministic. The outputted PCollection is grouped by the K and needs a deterministic
+      // coder. Having a String K leads to an unfortunate double encoding, ImmutableBytesWritable->
+      // HEX string -> UTF8 encoded string. The number of batches are significantly smaller than
+      // data fetched from Bigtable and should not have meaningful impact on the job performance.
+      return KV.of(Bytes.toStringBinary(buffer.get(0).startInclusive.copyBytes()), buffer);
+    }
+
+    @Override
+    public void close() throws IOException {
+      hashReader.close();
+    }
+
+    @Override
+    public BoundedSource<KV<String, List<RangeHash>>> getCurrentSource() {
+      return source;
+    }
+  }
+}
diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/ComputeAndValidateHashFromBigtableDoFn.java b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/ComputeAndValidateHashFromBigtableDoFn.java
new file mode 100644
index 0000000000..82e24b55ef
--- /dev/null
+++ b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/ComputeAndValidateHashFromBigtableDoFn.java
@@ -0,0 +1,222 @@
+/*
+ * Copyright 2021 Google Inc.
All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package com.google.cloud.bigtable.beam.validation;
+
+import static com.google.cloud.bigtable.beam.validation.SyncTableUtils.immutableBytesToString;
+
+import com.google.bigtable.repackaged.com.google.common.base.Preconditions;
+import com.google.bigtable.repackaged.com.google.common.collect.Lists;
+import com.google.cloud.bigtable.beam.AbstractCloudBigtableTableDoFn;
+import com.google.cloud.bigtable.beam.CloudBigtableConfiguration;
+import com.google.cloud.bigtable.beam.TemplateUtils;
+import com.google.cloud.bigtable.beam.validation.HadoopHashTableSource.RangeHash;
+import com.google.cloud.bigtable.beam.validation.SyncTableJob.SyncTableOptions;
+import com.google.common.annotations.VisibleForTesting;
+import java.io.IOException;
+import java.util.Iterator;
+import java.util.List;
+import org.apache.beam.sdk.metrics.Counter;
+import org.apache.beam.sdk.metrics.Metrics;
+import org.apache.beam.sdk.options.ValueProvider;
+import org.apache.beam.sdk.transforms.DoFn;
+import org.apache.beam.sdk.values.KV;
+import org.apache.hadoop.hbase.HConstants;
+import org.apache.hadoop.hbase.TableName;
+import org.apache.hadoop.hbase.client.Result;
+import org.apache.hadoop.hbase.client.ResultScanner;
+import org.apache.hadoop.hbase.client.Scan;
+import org.apache.hadoop.hbase.client.Table;
+import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
+import org.apache.hadoop.hbase.mapreduce.BigtableTableHashAccessor.BigtableResultHasher;
+
+/**
+ * A {@link DoFn} that takes a row range and hash from HBase and validates the hash from rows read
+ * from Cloud Bigtable.
+ *
+ * <p>Each input element is a batch of sorted {@link RangeHash}es. The whole batch is read with a
+ * single scan, a hash is computed per range and every range whose hash differs from the expected
+ * one is emitted as output.
+ */
+class ComputeAndValidateHashFromBigtableDoFn
+    extends AbstractCloudBigtableTableDoFn<KV<String, Iterable<List<RangeHash>>>, RangeHash> {
+
+  private static final long serialVersionUID = 2349094L;
+  private final ValueProvider<String> tableName;
+  private final ValueProvider<String> projectId;
+  private final ValueProvider<String> sourceHashDir;
+
+  private final TableHashWrapperFactory tableHashWrapperFactory;
+
+  // Counter for reporting matching and mismatching ranges. Names are similar to HBase sync-table
+  // job.
+  private final Counter matches = Metrics.counter("cbt-dataflow-validate", "ranges_matched");
+  private final Counter mismatches = Metrics.counter("cbt-dataflow-validate", "ranges_not_matched");
+
+  public ComputeAndValidateHashFromBigtableDoFn(SyncTableOptions options) {
+    super(TemplateUtils.BuildSyncTableConfig(options));
+    this.tableName = options.getBigtableTableId();
+    // Create a local copy of ValueProviders, PipelineOptions are not serializable.
+    projectId = options.getBigtableProject();
+    sourceHashDir = options.getHashTableOutputDir();
+    tableHashWrapperFactory = new TableHashWrapperFactory();
+  }
+
+  @VisibleForTesting
+  ComputeAndValidateHashFromBigtableDoFn(
+      CloudBigtableConfiguration config,
+      ValueProvider<String> tableName,
+      ValueProvider<String> projectId,
+      ValueProvider<String> sourceHashDir,
+      TableHashWrapperFactory factory) {
+    super(config);
+    this.tableName = tableName;
+    this.tableHashWrapperFactory = factory;
+    // Each parameter is assigned to the field of the same name.
+    this.projectId = projectId;
+    this.sourceHashDir = sourceHashDir;
+  }
+
+  /**
+   * Scans Cloud Bigtable for the row range covered by the batch of RangeHashes in the element,
+   * hashes the rows of every range and outputs the ranges whose hash does not match.
+   */
+  @ProcessElement
+  public void processElement(ProcessContext context) throws Exception {
+    List<List<RangeHash>> wrappedRangeHashes = Lists.newArrayList(context.element().getValue());
+    // BufferedHadoopHashTableSource generates only 1 item per groupby key, key is startKey for the
+    // Sorted ranges.
+    Preconditions.checkState(
+        wrappedRangeHashes.size() == 1, "Can not have multiple entries for a key");
+    List<RangeHash> rangeHashes = wrappedRangeHashes.get(0);
+    Preconditions.checkState(!rangeHashes.isEmpty(), "Can not have empty ranges in DO_FN");
+
+    // If a metric is not logged, it is absent from all the metrics (as opposed to being
+    // 0). By logging a 0 value for the metrics we guarantee that they show up on Dataflow UIs.
+    mismatches.inc(0);
+    matches.inc(0);
+
+    ImmutableBytesWritable rangeStartInclusive = rangeHashes.get(0).startInclusive;
+    ImmutableBytesWritable rangeEndExclusive =
+        rangeHashes.get(rangeHashes.size() - 1).stopExclusive;
+
+    BigtableResultHasher resultHasher = new BigtableResultHasher();
+    resultHasher.startBatch(rangeStartInclusive);
+
+    Iterator<RangeHash> rangeHashIterator = rangeHashes.iterator();
+    long numRows = 0;
+
+    RangeHash currentRangeHash = rangeHashIterator.next();
+
+    // Since all the row-ranges are sorted in HashTable's data files, 1 big scan can be used
+    // to read all the row ranges. Parallelism is achieved by splitting the HashTable's data
+    // files into smaller bundle of row-ranges in GroupBy.
+    // Table and ResultScanner are Closeable; release them even when validation throws.
+    try (Table table = getConnection().getTable(TableName.valueOf(tableName.get()));
+        ResultScanner scanner =
+            createBigtableScan(
+                table, rangeStartInclusive.copyBytes(), rangeEndExclusive.copyBytes())) {
+
+      // Process each row and validate hashes
+      for (Result result : scanner) {
+        numRows++;
+        if (numRows % 10_000 == 0) {
+          // Heartbeat in logs in case a large scan gets hung.
+          DOFN_LOG.debug("Processed " + numRows + " rows ");
+        }
+
+        ImmutableBytesWritable rowKey = new ImmutableBytesWritable(result.getRow());
+
+        // Check if the rowKey belongs to current range, if not keep iterating through the
+        // rangeHashes until rowKey's range is found.
+        while (!isWithinUpperBound(currentRangeHash.stopExclusive, rowKey)) {
+          validateBatchHash(context, resultHasher, currentRangeHash);
+          // THIS SHOULD NEVER HAPPEN. Bigtable is being scanned till the last
+          // RangeHash.endKeyExclusive(), so bigtable's result should not outlast the
+          // rangeHashes.
+          // The values are passed as template arguments so that the %s placeholders are filled.
+          Preconditions.checkState(
+              rangeHashIterator.hasNext(),
+              "Buffer reached to end while scan is still active at row : %s. "
+                  + "Affected Range: [%s, %s).",
+              immutableBytesToString(result.getRow()),
+              immutableBytesToString(rangeStartInclusive),
+              immutableBytesToString(rangeEndExclusive));
+          currentRangeHash = rangeHashIterator.next();
+        }
+
+        // Always Hash the current row.
+        resultHasher.hashResult(result);
+      }
+    }
+
+    // Bigtable scan is finished at this point and rangeHashes may contain additional row ranges.
+    // Last range will always be unverified as the range end is exclusive and
+    // currentRow > rangeEndExclusive will never be true. Verify the last range.
+    validateBatchHash(context, resultHasher, currentRangeHash);
+
+    // If there are remaining ranges in the rangeHashes they all need to be reported as mismatched
+    // as there is nothing in Cloud Bigtable for those row ranges.
+    while (rangeHashIterator.hasNext()) {
+      currentRangeHash = rangeHashIterator.next();
+      reportMismatch(context, currentRangeHash);
+    }
+
+    DOFN_LOG.debug(
+        "Finishing context by outputting {} keys in range [{}, {}).",
+        rangeHashes.size(),
+        immutableBytesToString(rangeStartInclusive),
+        immutableBytesToString(rangeEndExclusive));
+  }
+
+  /**
+   * Opens a scanner on {@code table} for [startKeyInclusive, stopKeyExclusive). An empty key
+   * leaves the corresponding bound of the scan untouched. The caller owns both the table and the
+   * returned scanner and must close them.
+   */
+  private ResultScanner createBigtableScan(
+      Table table, byte[] startKeyInclusive, byte[] stopKeyExclusive) throws IOException {
+    // Get the scan from TableHash, HashTable can be run to hash a small part of data (selected
+    // column families, timestamp range, maxVersions etc), this scan allows us to fetch the same
+    // data from Cloud Bigtable to match.
+    TableHashWrapper tableHash =
+        tableHashWrapperFactory.getTableHash(projectId.get(), sourceHashDir.get());
+    Scan scan = tableHash.getScan();
+    // Set the workitem boundaries on the scan.
+    if (startKeyInclusive.length > 0) {
+      scan.withStartRow(startKeyInclusive, true);
+    }
+    if (stopKeyExclusive.length > 0) {
+      scan.withStopRow(stopKeyExclusive, false);
+    }
+
+    return table.getScanner(scan);
+  }
+
+  /**
+   * Determines if row < stopExclusive for a row range [start, stopExclusive), i.e. the row is
+   * below the upper bound of the range. Empty stopExclusive represents a range with no upper
+   * bound, every row is within such a bound.
+   */
+  private static boolean isWithinUpperBound(
+      ImmutableBytesWritable stopExclusive, ImmutableBytesWritable row) {
+    return stopExclusive.equals(HConstants.EMPTY_END_ROW) || row.compareTo(stopExclusive) < 0;
+  }
+
+  /**
+   * Finishes the current hash batch, compares it with the expected hash of {@code
+   * currentRangeHash} and starts the next batch at the end of that range.
+   */
+  private void validateBatchHash(
+      ProcessContext context, BigtableResultHasher resultHasher, RangeHash currentRangeHash) {
+    // The batch is always started, so its safe to finish the batch. If there were no rows, we will
+    // get a hash for empty batch.
+    resultHasher.finishBatch();
+    if (!resultHasher.getBatchHash().equals(currentRangeHash.hash)) {
+      reportMismatch(context, currentRangeHash);
+    } else {
+      matches.inc();
+    }
+    // Start a new batch
+    resultHasher.startBatch(currentRangeHash.stopExclusive);
+  }
+
+  /** Counts, logs and outputs {@code currentRangeHash} as a mismatched range. */
+  private void reportMismatch(ProcessContext context, RangeHash currentRangeHash) {
+    mismatches.inc();
+    DOFN_LOG.info(
+        "MISMATCH ON RANGE [{}, {}).",
+        immutableBytesToString(currentRangeHash.startInclusive),
+        immutableBytesToString(currentRangeHash.stopExclusive));
+    context.output(currentRangeHash);
+  }
+}
diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/HadoopHashTableSource.java b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/HadoopHashTableSource.java
new file mode 100644
index 0000000000..f6ecf21e24
--- /dev/null
+++ b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/HadoopHashTableSource.java
@@ -0,0 +1,440 @@
+/*
+ * Copyright 2021 Google Inc. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package com.google.cloud.bigtable.beam.validation;
+
+import static com.google.cloud.bigtable.beam.validation.SyncTableUtils.immutableBytesToString;
+
+import com.google.bigtable.repackaged.com.google.api.core.InternalApi;
+import com.google.bigtable.repackaged.com.google.common.annotations.VisibleForTesting;
+import com.google.cloud.bigtable.beam.validation.HadoopHashTableSource.RangeHash;
+import com.google.cloud.bigtable.beam.validation.TableHashWrapper.TableHashReader;
+import com.google.common.base.Objects;
+import com.google.common.base.Preconditions;
+import com.google.common.collect.ImmutableList;
+import java.io.IOException;
+import java.io.ObjectInputStream;
+import java.io.ObjectOutputStream;
+import java.io.Serializable;
+import java.util.ArrayList;
+import java.util.List;
+import javax.annotation.Nullable;
+import org.apache.beam.sdk.coders.Coder;
+import org.apache.beam.sdk.coders.DefaultCoder;
+import org.apache.beam.sdk.io.BoundedSource;
+import org.apache.beam.sdk.options.PipelineOptions;
+import org.apache.beam.sdk.options.ValueProvider;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.hbase.HConstants;
+import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
+
+/**
+ * A beam source to read output of Hadoop HashTable job. The source creates 1 workitem per HashTable
+ * data file and emits a row-range/hash pair.
+ */
+@InternalApi
+public class HadoopHashTableSource extends BoundedSource<RangeHash> implements Serializable {
+
+  private static final long serialVersionUID = 2383724L;
+
+  private static final Coder<RangeHash> CODER = RangeHashCoder.of();
+
+  /**
+   * A simple POJO encapsulating a row range and the corresponding hash generated by HashTable job.
+   * TODO Evaluate if we can use AutoValue for this class.
+   */
+  @DefaultCoder(RangeHashCoder.class)
+  public static class RangeHash {
+
+    public final ImmutableBytesWritable startInclusive;
+    public final ImmutableBytesWritable stopExclusive;
+    public final ImmutableBytesWritable hash;
+
+    private RangeHash(
+        ImmutableBytesWritable startInclusive,
+        ImmutableBytesWritable stopExclusive,
+        ImmutableBytesWritable hash) {
+      this.startInclusive = startInclusive;
+      this.stopExclusive = stopExclusive;
+      this.hash = hash;
+    }
+
+    /**
+     * Creates a RangeHash for [startInclusive, stopExclusive) with the given hash.
+     *
+     * @throws NullPointerException if any argument is null
+     */
+    static RangeHash of(
+        ImmutableBytesWritable startInclusive,
+        ImmutableBytesWritable stopExclusive,
+        ImmutableBytesWritable hash) {
+      Preconditions.checkNotNull(startInclusive);
+      Preconditions.checkNotNull(stopExclusive);
+      Preconditions.checkNotNull(hash);
+      return new RangeHash(startInclusive, stopExclusive, hash);
+    }
+
+    @Override
+    public String toString() {
+      return String.format(
+          "RangeHash{ range = [ %s, %s), hash: %s }",
+          immutableBytesToString(startInclusive),
+          immutableBytesToString(stopExclusive),
+          immutableBytesToString(hash));
+    }
+
+    @Override
+    public boolean equals(Object o) {
+      if (this == o) {
+        return true;
+      }
+      if (!(o instanceof RangeHash)) {
+        return false;
+      }
+      RangeHash rangeHash = (RangeHash) o;
+      return Objects.equal(startInclusive, rangeHash.startInclusive)
+          && Objects.equal(stopExclusive, rangeHash.stopExclusive)
+          && Objects.equal(hash, rangeHash.hash);
+    }
+
+    @Override
+    public int hashCode() {
+      return Objects.hashCode(startInclusive, stopExclusive, hash);
+    }
+  }
+
+  public static final Log LOG = LogFactory.getLog(HadoopHashTableSource.class);
+
+  private final ValueProvider<String> projectId;
+
+  // Path to the output of HashTable job. Usually in GCS.
+  private final ValueProvider<String> sourceHashDir;
+
+  // Row range owned by this source.
+  // The Start and Stop row are serialized in a custom way, see writeObject/readObject.
+  @VisibleForTesting @Nullable transient ImmutableBytesWritable startRowInclusive;
+
+  @VisibleForTesting @Nullable transient ImmutableBytesWritable stopRowExclusive;
+
+  private final TableHashWrapperFactory tableHashWrapperFactory;
+
+  /**
+   * Creates a HadoopHashTableSource that reads HashTable data from hashTableOutputDir in GCS bucket
+   * in project $(projectId).
+   */
+  public HadoopHashTableSource(
+      ValueProvider<String> projectId, ValueProvider<String> sourceHashDir) {
+    this(projectId, sourceHashDir, /*startRowInclusive*/ null, /*stopRowExclusive*/ null);
+  }
+
+  /**
+   * Constructor to initialize a HadoopHashTableSource for a given row-range. Used for creating
+   * split sources.
+   */
+  @VisibleForTesting
+  HadoopHashTableSource(
+      ValueProvider<String> projectId,
+      ValueProvider<String> sourceHashDir,
+      @Nullable ImmutableBytesWritable startRowInclusive,
+      @Nullable ImmutableBytesWritable stopRowExclusive) {
+    this(
+        projectId,
+        sourceHashDir,
+        startRowInclusive,
+        stopRowExclusive,
+        new TableHashWrapperFactory());
+  }
+
+  @VisibleForTesting
+  HadoopHashTableSource(
+      ValueProvider<String> projectId,
+      ValueProvider<String> hadoopHashTableOutputDir,
+      @Nullable ImmutableBytesWritable startRowInclusive,
+      @Nullable ImmutableBytesWritable stopRowExclusive,
+      TableHashWrapperFactory tableHashWrapperFactory) {
+    this.projectId = projectId;
+    this.sourceHashDir = hadoopHashTableOutputDir;
+    // startRow and stopRow will be null when the template is initialized. startRow and stopRow are
+    // read from the hashTableOutputDir, which is only available at pipeline runtime.
+    this.startRowInclusive = startRowInclusive;
+    this.stopRowExclusive = stopRowExclusive;
+    this.tableHashWrapperFactory = tableHashWrapperFactory;
+  }
+
+  /**
+   * Creates one source per HashTable data file: numPartitions + 1 sources, or a single source
+   * covering the full key range when the HashTable output has no partitions.
+   */
+  @Override
+  public List<? extends BoundedSource<RangeHash>> split(
+      long desiredBundleSizeBytes, PipelineOptions options) throws IOException {
+    // This method relies on the partitioning done by HBase-HashTable job. There is a possibility
+    // of stragglers. SyncTable handles it by using a group by and further splitting workitems.
+    TableHashWrapper hash =
+        tableHashWrapperFactory.getTableHash(projectId.get(), sourceHashDir.get());
+
+    ImmutableList<ImmutableBytesWritable> partitions = hash.getPartitions();
+    int numPartitions = partitions.size();
+
+    List<HadoopHashTableSource> splitSources = new ArrayList<>(numPartitions + 1);
+    if (numPartitions == 0) {
+      // There are 0 partitions and 1 hashfile, return single source with full key range.
+      splitSources.add(
+          new HadoopHashTableSource(
+              projectId,
+              sourceHashDir,
+              hash.getStartRow(),
+              hash.getStopRow(),
+              tableHashWrapperFactory));
+      return splitSources;
+    }
+
+    // Use the HashTable start key. The value is HConstants.EMPTY_START_ROW for full table scan.
+    ImmutableBytesWritable nextStartRow = hash.getStartRow();
+    ImmutableBytesWritable stopRow = hash.getStopRow();
+
+    // The output of HashTable is organized as partition file and a set of datafiles.
+    // Partition file contains a list of partitions, these partitions split the key-range of a table
+    // into roughly equal row-ranges and hashes for these row-ranges are stored in a single
+    // datafile.
+    //
+    // There are always numPartitions +1 data files. Datafile(i) covers hashes for [partition{i-1},
+    // partition{i}).
+    // So a partition file containing entries [b,f] for a table with row range [a,z] will have 3
+    // data files containing hashes.
+    // file0 will contain [a(nextStartRow), b), file1 will contain [b,f), and file2 will contain
+    // [f,z(stopRow))
+    for (int i = 0; i < numPartitions; i++) {
+      // TODO make a utility function that generates [start, end) format from start/end.
+      LOG.debug(
+          "Adding: ["
+              + immutableBytesToString(nextStartRow.get())
+              + ", "
+              + immutableBytesToString(partitions.get(i).get())
+              + ")");
+      splitSources.add(
+          new HadoopHashTableSource(
+              projectId, sourceHashDir, nextStartRow, partitions.get(i), tableHashWrapperFactory));
+      nextStartRow = partitions.get(i);
+    }
+    // Add the last range for [lastPartition, stopRow).
+    LOG.debug(
+        "Adding: ["
+            + immutableBytesToString(nextStartRow.get())
+            + ", "
+            + immutableBytesToString(stopRow.get())
+            + ")");
+    splitSources.add(
+        new HadoopHashTableSource(
+            projectId, sourceHashDir, nextStartRow, stopRow, tableHashWrapperFactory));
+    LOG.info("Returning " + splitSources.size() + " sources from " + numPartitions + " partitions");
+    return splitSources;
+  }
+
+  @Override
+  public Coder<RangeHash> getOutputCoder() {
+    return CODER;
+  }
+
+  @Override
+  public long getEstimatedSizeBytes(PipelineOptions options) throws Exception {
+    // HashTable data files don't expose a method to estimate size or lineCount.
+    return 0;
+  }
+
+  @Override
+  public BoundedReader<RangeHash> createReader(PipelineOptions options) throws IOException {
+    TableHashWrapper hash =
+        tableHashWrapperFactory.getTableHash(projectId.get(), sourceHashDir.get());
+
+    // The row range for an un-split source is determined from the output of HashTable job.
+    // HashTableOutputDir is a runtime parameter and hence not available at construction time, so
+    // populate the start and stop here.
+    if (startRowInclusive == null || stopRowExclusive == null) {
+      startRowInclusive = hash.getStartRow();
+      stopRowExclusive = hash.getStopRow();
+    }
+
+    return new HashBasedReader(
+        this,
+        startRowInclusive,
+        stopRowExclusive,
+        hash.newReader(
+            SyncTableUtils.createConfiguration(this.projectId.get(), this.sourceHashDir.get()),
+            startRowInclusive));
+  }
+
+  @Override
+  public boolean equals(Object o) {
+    if (this == o) {
+      return true;
+    }
+    if (!(o instanceof HadoopHashTableSource)) {
+      return false;
+    }
+    HadoopHashTableSource that = (HadoopHashTableSource) o;
+    return Objects.equal(projectId, that.projectId)
+        && Objects.equal(sourceHashDir, that.sourceHashDir)
+        && Objects.equal(startRowInclusive, that.startRowInclusive)
+        && Objects.equal(stopRowExclusive, that.stopRowExclusive);
+  }
+
+  @Override
+  public int hashCode() {
+    return Objects.hashCode(projectId, sourceHashDir, startRowInclusive, stopRowExclusive);
+  }
+
+  @Override
+  public String toString() {
+    return "HadoopHashTableSource ["
+        + immutableBytesToString(startRowInclusive)
+        + ", "
+        + immutableBytesToString(stopRowExclusive)
+        + ')';
+  }
+
+  // Custom serialization: the default fields are followed by the transient start and stop rows,
+  // each written as a presence flag and, when present, a copy of its bytes.
+  private void writeObject(ObjectOutputStream s) throws IOException {
+    s.defaultWriteObject();
+    // Start and Stop can be null, write a boolean to indicate if start/stop is expected.
+    if (startRowInclusive == null) {
+      s.writeBoolean(false);
+    } else {
+      s.writeBoolean(true);
+      s.writeObject(startRowInclusive.copyBytes());
+    }
+
+    if (stopRowExclusive == null) {
+      s.writeBoolean(false);
+    } else {
+      s.writeBoolean(true);
+      s.writeObject(stopRowExclusive.copyBytes());
+    }
+  }
+
+  // Mirror of writeObject, the read order must match the write order.
+  private void readObject(ObjectInputStream s) throws IOException, ClassNotFoundException {
+    s.defaultReadObject();
+    // start/stop can be null, they are preceded by a boolean indicating their presence.
+    if (s.readBoolean()) {
+      startRowInclusive = new ImmutableBytesWritable((byte[]) s.readObject());
+    }
+    if (s.readBoolean()) {
+      stopRowExclusive = new ImmutableBytesWritable((byte[]) s.readObject());
+    }
+  }
+
+  /**
+   * Reader that turns consecutive (key, hash) entries read from a {@link TableHashReader} into
+   * {@link RangeHash}es bounded by the row range of the workitem.
+   */
+  @VisibleForTesting
+  static class HashBasedReader extends BoundedReader<RangeHash> {
+
+    private final HadoopHashTableSource source;
+    private final TableHashReader reader;
+
+    @VisibleForTesting final ImmutableBytesWritable startRowInclusive;
+    @VisibleForTesting final ImmutableBytesWritable stopRowExclusive;
+
+    // Flag indicating that this workitem is finished.
+    private boolean isDone = false;
+    private ImmutableBytesWritable currentRangeStartKey;
+    // Hash for the current range.
+    private ImmutableBytesWritable currentHash;
+    private RangeHash currentRangeHash;
+
+    public HashBasedReader(
+        HadoopHashTableSource source,
+        ImmutableBytesWritable startRowInclusive,
+        ImmutableBytesWritable stopRowExclusive,
+        TableHashReader reader) {
+      this.source = source;
+      this.startRowInclusive = startRowInclusive;
+      this.stopRowExclusive = stopRowExclusive;
+      this.reader = reader;
+    }
+
+    @Override
+    public boolean start() throws IOException {
+      LOG.debug(
+          "Starting a new reader at key range ["
+              + immutableBytesToString(startRowInclusive)
+              + " ,"
+              + immutableBytesToString(stopRowExclusive)
+              + ").");
+
+      if (readNextKey()) {
+        // Dataflow calls start, followed by getCurrent. HashBased reader needs to read on TableHash
+        // twice to return a RangeHash since it specifies both range-start and range-end.
+        advance();
+        return true;
+      }
+
+      isDone = true;
+      return false;
+    }
+
+    @Override
+    public boolean advance() throws IOException {
+      if (isDone) {
+        LOG.debug("Ending workitem at key " + immutableBytesToString(currentRangeStartKey) + " .");
+        return false;
+      }
+
+      // Remember the start key and hash of the range being emitted before reading ahead.
+      ImmutableBytesWritable startKey = this.currentRangeStartKey;
+      ImmutableBytesWritable hash = this.currentHash;
+
+      // if there is nothing to read, we are done. readNextKey advances the currentRangeStartKey.
+      isDone = !readNextKey();
+      currentRangeHash = RangeHash.of(startKey, currentRangeStartKey, hash);
+
+      return true;
+    }
+
+    // Returns true if a key can be read for this workitem.
+    private boolean readNextKey() throws IOException {
+      if (reader.next()) {
+        currentRangeStartKey = reader.getCurrentKey();
+        if ( // StopRow is not set, everything is in bounds.
+        (stopRowExclusive.equals(HConstants.EMPTY_END_ROW)
+            || currentRangeStartKey.compareTo(stopRowExclusive) < 0)) { // currentKey < stopKey
+          // There is a key to read and the key is within the bounds of this workitem. Return true.
+          currentHash = reader.getCurrentHash();
+          return true;
+        } else {
+          // There is a key to read but its outside of the bounds of this workitem.
+          currentHash = null;
+          return false;
+        }
+      }
+
+      // Nothing left to read for this workitem. Next range would have started from
+      // stopRowExclusive.
+      currentRangeStartKey = stopRowExclusive;
+      currentHash = null;
+      return false;
+    }
+
+    @Override
+    public RangeHash getCurrent() {
+      return currentRangeHash;
+    }
+
+    @Override
+    public void close() throws IOException {
+      LOG.info(
+          "Finishing a reader for key range ["
+              + immutableBytesToString(startRowInclusive)
+              + " ,"
+              + immutableBytesToString(stopRowExclusive)
+              + "). Ending at "
+              + immutableBytesToString(currentRangeStartKey));
+      reader.close();
+    }
+
+    @Override
+    public BoundedSource<RangeHash> getCurrentSource() {
+      return source;
+    }
+  }
+}
diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/RangeHashCoder.java b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/RangeHashCoder.java
new file mode 100644
index 0000000000..d6341a08f2
--- /dev/null
+++ b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/RangeHashCoder.java
@@ -0,0 +1,105 @@
+/*
+ * Copyright 2021 Google Inc.
All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.google.cloud.bigtable.beam.validation; + +import com.google.cloud.bigtable.beam.validation.HadoopHashTableSource.RangeHash; +import java.io.DataInputStream; +import java.io.DataOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.InvalidObjectException; +import java.io.OutputStream; +import java.util.Collections; +import java.util.List; +import org.apache.beam.sdk.coders.Coder; +import org.apache.beam.sdk.coders.CoderException; +import org.apache.hadoop.hbase.io.ImmutableBytesWritable; + +/** Coder used by Beam to encode/decode {@link RangeHash} objects. */ +public class RangeHashCoder extends Coder { + + public static Coder of() { + return new RangeHashCoder(); + } + + @Override + public void encode(RangeHash value, OutputStream outStream) throws IOException { + if (value == null) { + throw new CoderException("Can not encode null objects."); + } + DataOutputStream dataOutputStream = new DataOutputStream(outStream); + // RangeHash fields can never be null. 
+ value.startInclusive.write(dataOutputStream); + value.stopExclusive.write(dataOutputStream); + value.hash.write(dataOutputStream); + } + + @Override + public RangeHash decode(InputStream inStream) throws IOException { + DataInputStream dataInputStream = new DataInputStream(inStream); + + ImmutableBytesWritable startInclusive = new ImmutableBytesWritable(); + startInclusive.readFields(dataInputStream); + + ImmutableBytesWritable stopExclusive = new ImmutableBytesWritable(); + stopExclusive.readFields(dataInputStream); + + ImmutableBytesWritable hash = new ImmutableBytesWritable(); + hash.readFields(dataInputStream); + + return RangeHash.of(startInclusive, stopExclusive, hash); + } + + @Override + public List> getCoderArguments() { + return Collections.emptyList(); + } + + @Override + public void verifyDeterministic() throws NonDeterministicException { + // This is a deterministic coder as it writes the byte[] in order. + } + + /** + * !!! DO NOT DELETE !!! + * + *

See readObjectNoData method in: + * https://docs.oracle.com/javase/7/docs/platform/serialization/spec/input.html#6053. + * + *

Disable backwards compatibility with previous versions that were serialized. + * + * @throws InvalidObjectException always; this method unconditionally rejects the stream + */ + @SuppressWarnings("unused") + private void readObjectNoData() throws InvalidObjectException { + throw new InvalidObjectException("Hash data required"); + } + + @Override + protected Object clone() throws CloneNotSupportedException { + return super.clone(); + } + + @Override + public boolean equals(Object other) { + return other instanceof RangeHashCoder; + } + + @Override + public int hashCode() { + return RangeHashCoder.class.hashCode(); + } +} diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/SyncTableJob.java b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/SyncTableJob.java new file mode 100644 index 0000000000..56b38fc3cb --- /dev/null +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/SyncTableJob.java @@ -0,0 +1,193 @@ +/* + * Copyright 2021 Google Inc. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package com.google.cloud.bigtable.beam.validation; + +import com.google.bigtable.repackaged.com.google.api.core.InternalExtensionOnly; +import com.google.bigtable.repackaged.com.google.gson.Gson; +import com.google.cloud.bigtable.beam.sequencefiles.Utils; +import com.google.cloud.bigtable.beam.validation.HadoopHashTableSource.RangeHash; +import com.google.common.annotations.VisibleForTesting; +import java.util.List; +import org.apache.beam.sdk.Pipeline; +import org.apache.beam.sdk.PipelineResult; +import org.apache.beam.sdk.extensions.gcp.options.GcpOptions; +import org.apache.beam.sdk.io.Read; +import org.apache.beam.sdk.io.TextIO; +import org.apache.beam.sdk.metrics.MetricQueryResults; +import org.apache.beam.sdk.metrics.MetricResult; +import org.apache.beam.sdk.options.Default; +import org.apache.beam.sdk.options.Description; +import org.apache.beam.sdk.options.PipelineOptionsFactory; +import org.apache.beam.sdk.options.ValueProvider; +import org.apache.beam.sdk.transforms.GroupByKey; +import org.apache.beam.sdk.transforms.MapElements; +import org.apache.beam.sdk.transforms.ParDo; +import org.apache.beam.sdk.transforms.SimpleFunction; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; + +/** + * A job that takes HBase HashTable output and compares the hashes from Cloud Bigtable table. + * + *

Execute the following command to run the job directly: + * + *

+ *   mvn compile exec:java \
+ *      -DmainClass=com.google.cloud.bigtable.beam.validation.SyncTableJob \
+ *      -Dexec.args="--runner=DataflowRunner \
+ *            --project=$PROJECT \
+ *            --bigtableInstanceId=$INSTANCE \
+ *            --bigtableTableId=$TABLE \
+ *            --hashTableOutputDir=$SOURCE_HASH_DIR \
+ *            --outputPrefix=$OUTPUT_PREFIX \
+ *            --stagingLocation=$STAGING_LOC \
+ *            --tempLocation=$TMP_LOC \
+ *            --region=$REGION \
+ *            --workerZone=$WORKER_ZONE"
+ * 
+ * + *

Execute the following command to create the Dataflow template: + * + *

+ * mvn compile exec:java \
+ *   -DmainClass=com.google.cloud.bigtable.beam.validation.SyncTableJob \
+ *   -Dexec.args="--runner=DataflowRunner \
+ *                --project=$PROJECT \
+ *                --stagingLocation=gs://$STAGING_PATH \
+ *                --templateLocation=gs://$TEMPLATE_PATH \
+ *                --wait=false"
+ * 
+ * + *

There are a few ways to run the pipeline using the template. See Dataflow doc for details: + * https://cloud.google.com/dataflow/docs/templates/executing-templates. Optionally, you can upload + * a metadata file that contains information about the runtime parameters that can be used for + * parameter validation purpose and more. A sample metadata file can be found at + * "src/main/resources/SyncTableJob_metadata". + * + *

An example using gcloud command line: + * + *

+ * gcloud beta dataflow jobs run $JOB_NAME \
+ *   --gcs-location gs://$TEMPLATE_PATH \
+ *   --parameters bigtableProject=$PROJECT,bigtableInstanceId=$INSTANCE,bigtableTableId=$TABLE,hashTableOutputDir=gs://$SOURCE_HASH_DIR,outputPrefix=$OUTPUT_PREFIX
+ * 
+ */ +@InternalExtensionOnly +public class SyncTableJob { + + private static final Log LOG = LogFactory.getLog(SyncTableJob.class); + + public interface SyncTableOptions extends GcpOptions { + + @Description("This Bigtable App Profile id.") + ValueProvider getBigtableAppProfileId(); + + @SuppressWarnings("unused") + void setBigtableAppProfileId(ValueProvider appProfileId); + + @Description("The project that contains the table to export. Defaults to --project.") + @Default.InstanceFactory(Utils.DefaultBigtableProjectFactory.class) + ValueProvider getBigtableProject(); + + @SuppressWarnings("unused") + void setBigtableProject(ValueProvider projectId); + + @Description("The Bigtable instance id that contains the table to export.") + ValueProvider getBigtableInstanceId(); + + @SuppressWarnings("unused") + void setBigtableInstanceId(ValueProvider instanceId); + + @Description("The Bigtable table id to export.") + ValueProvider getBigtableTableId(); + + @SuppressWarnings("unused") + void setBigtableTableId(ValueProvider tableId); + + @Description("HBase HashTable job output dir.") + ValueProvider getHashTableOutputDir(); + + @SuppressWarnings("unused") + // Rename it to sourceHashDir as in HBase sync table job. + void setHashTableOutputDir(ValueProvider hashTableOutputDir); + + @Description("File pattern for files containing mismatched row ranges.") + ValueProvider getOutputPrefix(); + + @SuppressWarnings("unused") + void setOutputPrefix(ValueProvider outputPrefix); + + // When creating a template, this flag must be set to false. 
+ @Description("Wait for pipeline to finish.") + @Default.Boolean(true) + boolean getWait(); + + @SuppressWarnings("unused") + void setWait(boolean wait); + } + + public static void main(String[] args) { + PipelineOptionsFactory.register(SyncTableOptions.class); + + SyncTableOptions opts = + PipelineOptionsFactory.fromArgs(args).withValidation().as(SyncTableOptions.class); + + LOG.info("===> Building Pipeline"); + Pipeline pipeline = buildPipeline(opts); + + LOG.info("===> Running Pipeline"); + PipelineResult result = pipeline.run(); + + if (opts.getWait()) { + Utils.waitForPipelineToFinish(result); + } + + // Log all the counters for number of matches and number of mismatches. + MetricQueryResults metrics = result.metrics().allMetrics(); + for (MetricResult counter : metrics.getCounters()) { + LOG.warn(counter.getName() + ":" + counter.getAttempted()); + } + } + + @VisibleForTesting + public static Pipeline buildPipeline(SyncTableOptions opts) { + Pipeline pipeline = Pipeline.create(Utils.tweakOptions(opts)); + pipeline + .apply( + "Read HBase HashTable output", + Read.from( + new BufferedHadoopHashTableSource( + new HadoopHashTableSource( + opts.getBigtableProject(), opts.getHashTableOutputDir())))) + .apply( + "group by and create granular workitems", GroupByKey.>create()) + .apply("validate hash", ParDo.of(new ComputeAndValidateHashFromBigtableDoFn(opts))) + .apply("Serialize the ranges", MapElements.via(new RangeHashToString())) + .apply("Write to file", TextIO.write().to(opts.getOutputPrefix()).withSuffix(".txt")); + return pipeline; + } + + static class RangeHashToString extends SimpleFunction { + // TODO maybe explore a sequenceFile sink for RangeHash. Hadoop jobs using this output may be + // easier to write for sequence file. 
+ private static final Gson GSON = new Gson(); + + @Override + public String apply(RangeHash input) { + return GSON.toJson(input); + } + } +} diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/SyncTableUtils.java b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/SyncTableUtils.java new file mode 100644 index 0000000000..cc92bea6a4 --- /dev/null +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/SyncTableUtils.java @@ -0,0 +1,57 @@ +/* + * Copyright 2021 Google Inc. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.google.cloud.bigtable.beam.validation; + +import com.google.bigtable.repackaged.com.google.api.core.InternalApi; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hbase.HBaseConfiguration; +import org.apache.hadoop.hbase.io.ImmutableBytesWritable; +import org.apache.hadoop.hbase.util.Bytes; + +/** Utility class for SyncTable job. 
*/ +@InternalApi +public class SyncTableUtils { + + private SyncTableUtils() {} + + public static String immutableBytesToString(ImmutableBytesWritable bytes) { + if (bytes == null) { + return ""; + } + return Bytes.toStringBinary(bytes.get(), bytes.getOffset(), bytes.getLength()); + } + + public static String immutableBytesToString(byte[] bytes) { + return Bytes.toStringBinary(bytes); + } + + /** + * Creates a HBase configuration for reading HashTable output from GCS bucket located in + * projectId. + * + * @param projectId project containing the GCS bucket holding hashtable output. + * @param sourceHashDir location of hashtable output from HBase. + * @return a new HBase configuration with the GCS file system, project id and default FS set. + */ + public static Configuration createConfiguration(String projectId, String sourceHashDir) { + Configuration conf = HBaseConfiguration.create(); + conf.set("fs.AbstractFileSystem.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS"); + conf.set("fs.gs.project.id", projectId); + conf.set("fs.defaultFS", sourceHashDir); + conf.set("google.cloud.auth.service.account.enable", "true"); + return conf; + } +} diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/TableHashWrapper.java b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/TableHashWrapper.java new file mode 100644 index 0000000000..55200570ed --- /dev/null +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/TableHashWrapper.java @@ -0,0 +1,55 @@ +/* + * Copyright 2021 Google Inc. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.google.cloud.bigtable.beam.validation; + +import com.google.bigtable.repackaged.com.google.api.core.InternalApi; +import com.google.common.collect.ImmutableList; +import java.io.Closeable; +import java.io.IOException; +import java.io.Serializable; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hbase.client.Scan; +import org.apache.hadoop.hbase.io.ImmutableBytesWritable; + +/** + * Wraps HashTable.TableHash object and delegates the calls to it. This class exposes the minimal + * interface required from TableHash. This class is required for mocking purposes in unit tests. 
+ */ +@InternalApi +public interface TableHashWrapper extends Serializable { + + int getNumHashFiles(); + + ImmutableList getPartitions(); + + ImmutableBytesWritable getStartRow(); + + ImmutableBytesWritable getStopRow(); + + Scan getScan(); + + TableHashReader newReader(Configuration conf, ImmutableBytesWritable startRow); + + interface TableHashReader extends Closeable { + boolean next() throws IOException; + + ImmutableBytesWritable getCurrentKey(); + + ImmutableBytesWritable getCurrentHash(); + + void close() throws IOException; + } +} diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/TableHashWrapperFactory.java b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/TableHashWrapperFactory.java new file mode 100644 index 0000000000..a4e3544519 --- /dev/null +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/TableHashWrapperFactory.java @@ -0,0 +1,35 @@ +/* + * Copyright 2021 Google Inc. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.google.cloud.bigtable.beam.validation; + +import static com.google.cloud.bigtable.beam.validation.SyncTableUtils.createConfiguration; + +import com.google.bigtable.repackaged.com.google.api.core.InternalApi; +import java.io.IOException; +import java.io.Serializable; + +/** Factory to create a TableHashWrapper. 
*/ +@InternalApi +public class TableHashWrapperFactory implements Serializable { + + private static final long serialVersionUID = 265433454L; + + public TableHashWrapper getTableHash(String projectId, String sourceHashDir) throws IOException { + return TableHashWrapperImpl.create( + createConfiguration(projectId, sourceHashDir), sourceHashDir); + } +} diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/TableHashWrapperImpl.java b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/TableHashWrapperImpl.java new file mode 100644 index 0000000000..b04bd538a6 --- /dev/null +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/TableHashWrapperImpl.java @@ -0,0 +1,118 @@ +/* + * Copyright 2021 Google Inc. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package com.google.cloud.bigtable.beam.validation; + +import static com.google.cloud.bigtable.beam.validation.SyncTableUtils.immutableBytesToString; + +import com.google.common.base.Preconditions; +import com.google.common.collect.ImmutableList; +import java.io.IOException; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hbase.client.Scan; +import org.apache.hadoop.hbase.io.ImmutableBytesWritable; +import org.apache.hadoop.hbase.mapreduce.BigtableTableHashAccessor; +import org.apache.hadoop.hbase.mapreduce.HashTable.TableHash; +import org.apache.hadoop.hbase.mapreduce.HashTable.TableHash.Reader; + +class TableHashWrapperImpl implements TableHashWrapper { + + static TableHashWrapper create(Configuration conf, String hashTableOutputDir) throws IOException { + TableHash tableHash = TableHash.read(conf, new Path(hashTableOutputDir)); + + TableHashWrapper tableHashWrapper = new TableHashWrapperImpl(tableHash); + Preconditions.checkArgument( + tableHashWrapper.getNumHashFiles() == (tableHashWrapper.getPartitions().size() + 1), + "Corrupt hashtable output. %s hash files for %s partitions. 
Expected %s files.", + tableHashWrapper.getNumHashFiles(), + tableHashWrapper.getPartitions().size(), + tableHashWrapper.getPartitions().size() + 1); + return tableHashWrapper; + } + + private final TableHash hash; + + private TableHashWrapperImpl(TableHash hash) { + this.hash = hash; + } + + public int getNumHashFiles() { + return BigtableTableHashAccessor.getNumHashFiles(hash); + } + + public ImmutableList getPartitions() { + return BigtableTableHashAccessor.getPartitions(hash); + } + + public ImmutableBytesWritable getStartRow() { + return BigtableTableHashAccessor.getStartRow(hash); + } + + public ImmutableBytesWritable getStopRow() { + return BigtableTableHashAccessor.getStopRow(hash); + } + + public Scan getScan() { + try { + return BigtableTableHashAccessor.getScan(hash); + } catch (IOException e) { + throw new RuntimeException("Failed to init a scan from TableHash: ", e); + } + } + + public TableHashReader newReader(Configuration conf, ImmutableBytesWritable startRow) { + try { + return TableHashReaderImpl.create(hash.newReader(conf, startRow)); + } catch (IOException e) { + throw new RuntimeException( + "Failed to open reader at " + immutableBytesToString(startRow.copyBytes()), e); + } + } + + static class TableHashReaderImpl implements TableHashReader { + + private final Reader reader; + + static TableHashReaderImpl create(TableHash.Reader reader) { + Preconditions.checkNotNull(reader, "Reader can not be null."); + return new TableHashReaderImpl(reader); + } + + private TableHashReaderImpl(TableHash.Reader reader) { + this.reader = reader; + } + + @Override + public boolean next() throws IOException { + return reader.next(); + } + + @Override + public ImmutableBytesWritable getCurrentKey() { + return reader.getCurrentKey(); + } + + @Override + public ImmutableBytesWritable getCurrentHash() { + return reader.getCurrentHash(); + } + + @Override + public void close() throws IOException { + reader.close(); + } + } +} diff --git 
a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/org/apache/hadoop/hbase/mapreduce/BigtableTableHashAccessor.java b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/org/apache/hadoop/hbase/mapreduce/BigtableTableHashAccessor.java new file mode 100644 index 0000000000..a7db0add1c --- /dev/null +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/org/apache/hadoop/hbase/mapreduce/BigtableTableHashAccessor.java @@ -0,0 +1,79 @@ +/* + * Copyright 2021 Google Inc. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hbase.mapreduce; + +import com.google.bigtable.repackaged.com.google.api.core.InternalApi; +import com.google.common.collect.ImmutableList; +import java.io.IOException; +import org.apache.hadoop.hbase.client.Result; +import org.apache.hadoop.hbase.client.Scan; +import org.apache.hadoop.hbase.io.ImmutableBytesWritable; +import org.apache.hadoop.hbase.mapreduce.HashTable.ResultHasher; +import org.apache.hadoop.hbase.mapreduce.HashTable.TableHash; + +/** A helper class to access package private fields of HashTable.TableHash. */ +@InternalApi +public class BigtableTableHashAccessor { + + // Restrict object creation. This class should only be used to access state from TableHash. 
+ private BigtableTableHashAccessor() {} + + public static int getNumHashFiles(TableHash hash) { + return hash.numHashFiles; + } + + public static ImmutableList getPartitions(TableHash hash) { + return ImmutableList.copyOf(hash.partitions); + } + + public static ImmutableBytesWritable getStartRow(TableHash hash) { + return new ImmutableBytesWritable(hash.startRow); + } + + public static ImmutableBytesWritable getStopRow(TableHash hash) { + return new ImmutableBytesWritable(hash.stopRow); + } + + public static Scan getScan(TableHash hash) throws IOException { + return hash.initScan(); + } + + // Wrapper to access package private class ResultHasher. Delegates all the calls to underlying + // TableHash.ResultHasher, helps in mocking for unit tests. + public static class BigtableResultHasher { + private final ResultHasher hasher; + + public BigtableResultHasher() { + hasher = new ResultHasher(); + } + + public void startBatch(ImmutableBytesWritable batchStartKey) { + hasher.startBatch(batchStartKey); + } + + public void finishBatch() { + hasher.finishBatch(); + } + + public ImmutableBytesWritable getBatchHash() { + return hasher.getBatchHash(); + } + + public void hashResult(Result result) { + hasher.hashResult(result); + } + } +} diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/generate_test_data.txt b/bigtable-dataflow-parent/bigtable-beam-import/src/test/generate_test_data.txt new file mode 100644 index 0000000000..6e66d3e096 --- /dev/null +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/test/generate_test_data.txt @@ -0,0 +1,133 @@ +// Run from HBase shell. Run `hbase shell` from unix terminal on HBase master. 
+create 'test', 'cf', {SPLITS => ["1", "2", "3", "4", "5", "6", "7", "8", "9"]} +put 'test','1', 'cf:a', 'value1', 100 +put 'test','2', 'cf:a', 'value2', 100 +put 'test','3', 'cf:a', 'value3', 100 +put 'test','4', 'cf:a', 'value4', 100 +put 'test','5', 'cf:a', 'value5', 100 +put 'test','6', 'cf:a', 'value6', 100 +put 'test','7', 'cf:a', 'value7', 100 +put 'test','8', 'cf:a', 'value8', 100 +put 'test','9', 'cf:a', 'value9', 100 +put 'test','10', 'cf:a', 'value10', 100 +put 'test','11', 'cf:a', 'value11', 100 +put 'test','12', 'cf:a', 'value12', 100 +put 'test','13', 'cf:a', 'value13', 100 +put 'test','14', 'cf:a', 'value14', 100 +put 'test','15', 'cf:a', 'value15', 100 +put 'test','16', 'cf:a', 'value16', 100 +put 'test','17', 'cf:a', 'value17', 100 +put 'test','18', 'cf:a', 'value18', 100 +put 'test','19', 'cf:a', 'value19', 100 +put 'test','20', 'cf:a', 'value20', 100 +put 'test','21', 'cf:a', 'value21', 100 +put 'test','22', 'cf:a', 'value22', 100 +put 'test','23', 'cf:a', 'value23', 100 +put 'test','24', 'cf:a', 'value24', 100 +put 'test','25', 'cf:a', 'value25', 100 +put 'test','26', 'cf:a', 'value26', 100 +put 'test','27', 'cf:a', 'value27', 100 +put 'test','28', 'cf:a', 'value28', 100 +put 'test','29', 'cf:a', 'value29', 100 +put 'test','30', 'cf:a', 'value30', 100 +put 'test','31', 'cf:a', 'value31', 100 +put 'test','32', 'cf:a', 'value32', 100 +put 'test','33', 'cf:a', 'value33', 100 +put 'test','34', 'cf:a', 'value34', 100 +put 'test','35', 'cf:a', 'value35', 100 +put 'test','36', 'cf:a', 'value36', 100 +put 'test','37', 'cf:a', 'value37', 100 +put 'test','38', 'cf:a', 'value38', 100 +put 'test','39', 'cf:a', 'value39', 100 +put 'test','40', 'cf:a', 'value40', 100 +put 'test','41', 'cf:a', 'value41', 100 +put 'test','42', 'cf:a', 'value42', 100 +put 'test','43', 'cf:a', 'value43', 100 +put 'test','44', 'cf:a', 'value44', 100 +put 'test','45', 'cf:a', 'value45', 100 +put 'test','46', 'cf:a', 'value46', 100 +put 'test','47', 'cf:a', 'value47', 100 +put 
'test','48', 'cf:a', 'value48', 100 +put 'test','49', 'cf:a', 'value49', 100 +put 'test','50', 'cf:a', 'value50', 100 +put 'test','51', 'cf:a', 'value51', 100 +put 'test','52', 'cf:a', 'value52', 100 +put 'test','53', 'cf:a', 'value53', 100 +put 'test','54', 'cf:a', 'value54', 100 +put 'test','55', 'cf:a', 'value55', 100 +put 'test','56', 'cf:a', 'value56', 100 +put 'test','57', 'cf:a', 'value57', 100 +put 'test','58', 'cf:a', 'value58', 100 +put 'test','59', 'cf:a', 'value59', 100 +put 'test','60', 'cf:a', 'value60', 100 +put 'test','61', 'cf:a', 'value61', 100 +put 'test','62', 'cf:a', 'value62', 100 +put 'test','63', 'cf:a', 'value63', 100 +put 'test','64', 'cf:a', 'value64', 100 +put 'test','65', 'cf:a', 'value65', 100 +put 'test','66', 'cf:a', 'value66', 100 +put 'test','67', 'cf:a', 'value67', 100 +put 'test','68', 'cf:a', 'value68', 100 +put 'test','69', 'cf:a', 'value69', 100 +put 'test','70', 'cf:a', 'value70', 100 +put 'test','71', 'cf:a', 'value71', 100 +put 'test','72', 'cf:a', 'value72', 100 +put 'test','73', 'cf:a', 'value73', 100 +put 'test','74', 'cf:a', 'value74', 100 +put 'test','75', 'cf:a', 'value75', 100 +put 'test','76', 'cf:a', 'value76', 100 +put 'test','77', 'cf:a', 'value77', 100 +put 'test','78', 'cf:a', 'value78', 100 +put 'test','79', 'cf:a', 'value79', 100 +put 'test','80', 'cf:a', 'value80', 100 +put 'test','81', 'cf:a', 'value81', 100 +put 'test','82', 'cf:a', 'value82', 100 +put 'test','83', 'cf:a', 'value83', 100 +put 'test','84', 'cf:a', 'value84', 100 +put 'test','85', 'cf:a', 'value85', 100 +put 'test','86', 'cf:a', 'value86', 100 +put 'test','87', 'cf:a', 'value87', 100 +put 'test','88', 'cf:a', 'value88', 100 +put 'test','89', 'cf:a', 'value89', 100 +put 'test','90', 'cf:a', 'value90', 100 +put 'test','91', 'cf:a', 'value91', 100 +put 'test','92', 'cf:a', 'value92', 100 +put 'test','93', 'cf:a', 'value93', 100 +put 'test','94', 'cf:a', 'value94', 100 +put 'test','95', 'cf:a', 'value95', 100 +put 'test','96', 'cf:a', 'value96', 
100 +put 'test','97', 'cf:a', 'value97', 100 +put 'test','98', 'cf:a', 'value98', 100 +put 'test','99', 'cf:a', 'value99', 100 +put 'test','100', 'cf:a', 'value100', 100 +snapshot 'test', 'test-snapshot' +list_snapshots + + +////////////////////Run from Unix shell on HBase master node////////////////// +// Export the snapshot +hbase org.apache.hadoop.hbase.snapshot.ExportSnapshot -snapshot test-snapshot -copy-to /integration-test/data -mappers 16 + +// Create the hashes for the table. Run the command from unix shell on an HBase +// node. +hbase org.apache.hadoop.hbase.mapreduce.HashTable --batchsize=10 --numhashfiles=10 test /integration-test/hashtable + +// Export the data into GCS +hadoop fs -copyToLocal /integration-test /tmp/ +gsutil cp -r /tmp/integration-test gs:/// + +// GCS bucket should look like this: +$ gsutil ls gs:///integration-test/data +gs:///integration-test/data/ +gs:///integration-test/data/.hbase-snapshot/ +gs:///integration-test/data/archive/ +$ gsutil ls gs:///integration-test/hashtable +gs:///integration-test/hashtable/manifest +gs:///integration-test/hashtable/partitions +gs:///integration-test/hashtable/hashes/ + +// Run from HBase shell. Run `hbase shell` from unix terminal on HBase master. +// clean up the table +disable 'test' +drop 'test' +exit diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/data/.hbase-snapshot/test-snapshot/.snapshotinfo b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/data/.hbase-snapshot/test-snapshot/.snapshotinfo new file mode 100644 index 0000000000..03ac02e452 --- /dev/null +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/data/.hbase-snapshot/test-snapshot/.snapshotinfo @@ -0,0 +1,2 @@ + + test-snapshottestϹ���. 
(@��������� \ No newline at end of file diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/data/.hbase-snapshot/test-snapshot/data.manifest b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/data/.hbase-snapshot/test-snapshot/data.manifest new file mode 100644 index 0000000000..6439f06130 Binary files /dev/null and b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/data/.hbase-snapshot/test-snapshot/data.manifest differ diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/data/archive/data/default/test/01340515889e8ec5014bbdbfa4fd4689/cf/0ad53893d268478f9b2484cbb6016d9b b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/data/archive/data/default/test/01340515889e8ec5014bbdbfa4fd4689/cf/0ad53893d268478f9b2484cbb6016d9b new file mode 100644 index 0000000000..1b91b948d8 Binary files /dev/null and b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/data/archive/data/default/test/01340515889e8ec5014bbdbfa4fd4689/cf/0ad53893d268478f9b2484cbb6016d9b differ diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/data/archive/data/default/test/156b320f3ebe472a1ae56a2f6930a676/cf/9926df0da08b4f51a33517afb040f82d b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/data/archive/data/default/test/156b320f3ebe472a1ae56a2f6930a676/cf/9926df0da08b4f51a33517afb040f82d new file mode 100644 index 0000000000..951eb512ac Binary files /dev/null and b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/data/archive/data/default/test/156b320f3ebe472a1ae56a2f6930a676/cf/9926df0da08b4f51a33517afb040f82d differ diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/data/archive/data/default/test/313460ce1b714784d36c64bcd01f9e2c/cf/966e85699fdd4680a8c6fbf4b41b6e4b 
b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/data/archive/data/default/test/313460ce1b714784d36c64bcd01f9e2c/cf/966e85699fdd4680a8c6fbf4b41b6e4b new file mode 100644 index 0000000000..dc89f02ec2 Binary files /dev/null and b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/data/archive/data/default/test/313460ce1b714784d36c64bcd01f9e2c/cf/966e85699fdd4680a8c6fbf4b41b6e4b differ diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/data/archive/data/default/test/3bfc13b0a9bf8148a91788a8d2b60117/cf/bab07e8089634e629a4c111ea2b415fe b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/data/archive/data/default/test/3bfc13b0a9bf8148a91788a8d2b60117/cf/bab07e8089634e629a4c111ea2b415fe new file mode 100644 index 0000000000..c7fb208f72 Binary files /dev/null and b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/data/archive/data/default/test/3bfc13b0a9bf8148a91788a8d2b60117/cf/bab07e8089634e629a4c111ea2b415fe differ diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/data/archive/data/default/test/5bc31088b2daee7903f5b3d3a52f7ebf/cf/7fef5694213b4be0ad79f79c45200c2d b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/data/archive/data/default/test/5bc31088b2daee7903f5b3d3a52f7ebf/cf/7fef5694213b4be0ad79f79c45200c2d new file mode 100644 index 0000000000..7638f6eabb Binary files /dev/null and b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/data/archive/data/default/test/5bc31088b2daee7903f5b3d3a52f7ebf/cf/7fef5694213b4be0ad79f79c45200c2d differ diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/data/archive/data/default/test/7c4a9137853573c8d671264dc0b31f89/cf/f8d40658d79b4a7191f21bcf14ae289b 
b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/data/archive/data/default/test/7c4a9137853573c8d671264dc0b31f89/cf/f8d40658d79b4a7191f21bcf14ae289b new file mode 100644 index 0000000000..c6ba1f760b Binary files /dev/null and b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/data/archive/data/default/test/7c4a9137853573c8d671264dc0b31f89/cf/f8d40658d79b4a7191f21bcf14ae289b differ diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/data/archive/data/default/test/818d6b145a50cfc3bf8ee865486fdda3/cf/afe596ef5c61440983da2dcb54d581ab b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/data/archive/data/default/test/818d6b145a50cfc3bf8ee865486fdda3/cf/afe596ef5c61440983da2dcb54d581ab new file mode 100644 index 0000000000..5a757daec8 Binary files /dev/null and b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/data/archive/data/default/test/818d6b145a50cfc3bf8ee865486fdda3/cf/afe596ef5c61440983da2dcb54d581ab differ diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/data/archive/data/default/test/8c2101799fadc18613082a495d11e4ea/cf/2c766f1fc8eb460dbfa9a3803138c9b2 b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/data/archive/data/default/test/8c2101799fadc18613082a495d11e4ea/cf/2c766f1fc8eb460dbfa9a3803138c9b2 new file mode 100644 index 0000000000..d29619e3ec Binary files /dev/null and b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/data/archive/data/default/test/8c2101799fadc18613082a495d11e4ea/cf/2c766f1fc8eb460dbfa9a3803138c9b2 differ diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/data/archive/data/default/test/f1ef86b666a891d8c77f0eada4d1a15c/cf/e59edc08de6d441689288f04c7c0fe85 
b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/data/archive/data/default/test/f1ef86b666a891d8c77f0eada4d1a15c/cf/e59edc08de6d441689288f04c7c0fe85 new file mode 100644 index 0000000000..337b5f9280 Binary files /dev/null and b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/data/archive/data/default/test/f1ef86b666a891d8c77f0eada4d1a15c/cf/e59edc08de6d441689288f04c7c0fe85 differ diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/_SUCCESS b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/_SUCCESS new file mode 100644 index 0000000000..e69de29bb2 diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00000/data b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00000/data new file mode 100644 index 0000000000..26334294df Binary files /dev/null and b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00000/data differ diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00000/index b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00000/index new file mode 100644 index 0000000000..f7ac1fc941 Binary files /dev/null and b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00000/index differ diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00001/data b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00001/data new file mode 100644 index 0000000000..87b715673c Binary files /dev/null and b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00001/data differ diff --git 
a/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00001/index b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00001/index new file mode 100644 index 0000000000..4edcbd1ed5 Binary files /dev/null and b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00001/index differ diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00002/data b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00002/data new file mode 100644 index 0000000000..4b59b346f0 Binary files /dev/null and b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00002/data differ diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00002/index b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00002/index new file mode 100644 index 0000000000..4169ee8258 Binary files /dev/null and b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00002/index differ diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00003/data b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00003/data new file mode 100644 index 0000000000..a05197b51d Binary files /dev/null and b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00003/data differ diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00003/index b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00003/index new file mode 100644 index 0000000000..9228013bfa Binary files /dev/null and 
b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00003/index differ diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00004/data b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00004/data new file mode 100644 index 0000000000..6e29b085e7 Binary files /dev/null and b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00004/data differ diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00004/index b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00004/index new file mode 100644 index 0000000000..245c2ceb3f Binary files /dev/null and b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00004/index differ diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00005/data b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00005/data new file mode 100644 index 0000000000..40cbf30418 Binary files /dev/null and b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00005/data differ diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00005/index b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00005/index new file mode 100644 index 0000000000..dbbacaf8f0 Binary files /dev/null and b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00005/index differ diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00006/data 
b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00006/data new file mode 100644 index 0000000000..3f0e32269c Binary files /dev/null and b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00006/data differ diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00006/index b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00006/index new file mode 100644 index 0000000000..a0818358eb Binary files /dev/null and b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00006/index differ diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00007/data b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00007/data new file mode 100644 index 0000000000..effda57ece Binary files /dev/null and b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00007/data differ diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00007/index b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00007/index new file mode 100644 index 0000000000..a8eb1a1748 Binary files /dev/null and b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00007/index differ diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00008/data b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00008/data new file mode 100644 index 0000000000..011b956c5f Binary files /dev/null and b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00008/data differ 
diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00008/index b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00008/index new file mode 100644 index 0000000000..fada13a256 Binary files /dev/null and b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00008/index differ diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00009/data b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00009/data new file mode 100644 index 0000000000..f55fa79aca Binary files /dev/null and b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00009/data differ diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00009/index b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00009/index new file mode 100644 index 0000000000..8c8793cef8 Binary files /dev/null and b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00009/index differ diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/manifest b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/manifest new file mode 100644 index 0000000000..a95421d027 --- /dev/null +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/manifest @@ -0,0 +1,4 @@ +#Wed Dec 30 01:23:41 UTC 2020 +numHashFiles=10 +table=test +targetBatchSize=10 diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/partitions b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/partitions new file mode 100644 index 0000000000..1d447dd67a Binary files /dev/null 
and b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/partitions differ diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/hbasesnapshots/CleanupHBaseSnapshotRestoreFilesFnTest.java b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/hbasesnapshots/CleanupHBaseSnapshotRestoreFilesFnTest.java new file mode 100644 index 0000000000..0183f856f1 --- /dev/null +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/hbasesnapshots/CleanupHBaseSnapshotRestoreFilesFnTest.java @@ -0,0 +1,55 @@ +/* + * Copyright 2021 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package com.google.cloud.bigtable.beam.hbasesnapshots; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertThrows; + +import java.util.UUID; +import org.junit.Test; + +public class CleanupHBaseSnapshotRestoreFilesFnTest { + private static final String TEST_BUCKET_NAME = "test-bucket"; + private static final String TEST_SNAPSHOT_PATH = "gs://" + TEST_BUCKET_NAME + "/hbase-export"; + private static final String TEST_RESTORE_PATH = + HBaseSnapshotInputConfigBuilder.RESTORE_DIR + UUID.randomUUID(); + private static final String TEST_RESTORE_PREFIX = TEST_RESTORE_PATH.substring(1); + + @Test + public void testGetWorkingBucketName() { + assertEquals( + TEST_BUCKET_NAME, + CleanupHBaseSnapshotRestoreFilesFn.getWorkingBucketName(TEST_SNAPSHOT_PATH)); + + assertThrows( + IllegalArgumentException.class, + () -> { + CleanupHBaseSnapshotRestoreFilesFn.getWorkingBucketName(TEST_BUCKET_NAME); + }); + } + + @Test + public void testGetListPrefix() { + assertEquals( + TEST_RESTORE_PREFIX, CleanupHBaseSnapshotRestoreFilesFn.getListPrefix(TEST_RESTORE_PATH)); + + assertThrows( + IllegalArgumentException.class, + () -> { + CleanupHBaseSnapshotRestoreFilesFn.getListPrefix(TEST_RESTORE_PREFIX); + }); + } +} diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/hbasesnapshots/EndToEndIT.java b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/hbasesnapshots/EndToEndIT.java new file mode 100644 index 0000000000..1a681a2e05 --- /dev/null +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/hbasesnapshots/EndToEndIT.java @@ -0,0 +1,389 @@ +/* + * Copyright 2021 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.google.cloud.bigtable.beam.hbasesnapshots; + +import static com.google.common.base.Preconditions.checkNotNull; + +import com.google.api.services.storage.model.Objects; +import com.google.bigtable.repackaged.com.google.gson.Gson; +import com.google.cloud.bigtable.beam.hbasesnapshots.ImportJobFromHbaseSnapshot.ImportOptions; +import com.google.cloud.bigtable.beam.sequencefiles.HBaseResultToMutationFn; +import com.google.cloud.bigtable.beam.validation.HadoopHashTableSource.RangeHash; +import com.google.cloud.bigtable.beam.validation.SyncTableJob; +import com.google.cloud.bigtable.beam.validation.SyncTableJob.SyncTableOptions; +import com.google.cloud.bigtable.hbase.BigtableConfiguration; +import com.google.cloud.bigtable.hbase.BigtableOptionsFactory; +import java.io.BufferedReader; +import java.io.ByteArrayInputStream; +import java.io.File; +import java.io.IOException; +import java.io.InputStreamReader; +import java.nio.ByteBuffer; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.Map; +import java.util.UUID; +import java.util.stream.Collectors; +import java.util.stream.StreamSupport; +import org.apache.beam.runners.dataflow.DataflowRunner; +import org.apache.beam.runners.dataflow.options.DataflowPipelineOptions; +import org.apache.beam.sdk.PipelineResult; +import org.apache.beam.sdk.PipelineResult.State; +import org.apache.beam.sdk.extensions.gcp.options.GcpOptions; +import org.apache.beam.sdk.extensions.gcp.util.GcsUtil; +import org.apache.beam.sdk.extensions.gcp.util.gcsfs.GcsPath; 
+import org.apache.beam.sdk.metrics.MetricQueryResults; +import org.apache.beam.sdk.options.PipelineOptionsFactory; +import org.apache.beam.sdk.options.ValueProvider.StaticValueProvider; +import org.apache.hadoop.hbase.Cell; +import org.apache.hadoop.hbase.HColumnDescriptor; +import org.apache.hadoop.hbase.HConstants; +import org.apache.hadoop.hbase.HTableDescriptor; +import org.apache.hadoop.hbase.TableName; +import org.apache.hadoop.hbase.client.Connection; +import org.apache.hadoop.hbase.client.Delete; +import org.apache.hadoop.hbase.client.Get; +import org.apache.hadoop.hbase.client.Put; +import org.apache.hadoop.hbase.client.Table; +import org.apache.hadoop.hbase.snapshot.SnapshotTestingUtils; +import org.junit.After; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/* + * End-to-end integration test for the pipeline that imports HBase snapshot data into Cloud + * Bigtable and validates the imported data with SyncTable. + * Prepare test data with gsutil (https://cloud.google.com/storage/docs/quickstart-gsutil): + * gsutil -m cp -r PATH_TO_REPO/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test \ + * gs://YOUR_BUCKET/ + * + * Set up GCP credentials: https://cloud.google.com/docs/authentication + * Ensure your credentials have access to Bigtable and Dataflow + * + * Run with (replace the UPPER_CASE placeholders): + * mvn integration-test -PhbasesnapshotsIntegrationTest \ + * -Dgoogle.bigtable.project.id=PROJECT_ID \ + * -Dgoogle.bigtable.instance.id=INSTANCE_ID \ + * -Dgoogle.dataflow.stagingLocation=gs://YOUR_BUCKET/staging -Dregion=DATAFLOW_REGION \ + * -Dcloud.test.data.folder=gs://YOUR_BUCKET/integration-test/ + */ +public class EndToEndIT { + + private static final Logger LOG = LoggerFactory.getLogger(EndToEndIT.class); + private static final String TEST_SNAPSHOT_NAME = "test-snapshot"; + // Location of test data hosted on Google Cloud Storage, for on-cloud dataflow tests. 
+ private static final String CLOUD_TEST_DATA_FOLDER = "cloud.test.data.folder"; + private static final String DATAFLOW_REGION = "region"; + + // Column family name used in all test bigtables. + private static final String CF = "cf"; + + // Full path of the Cloud Storage folder where dataflow jars are uploaded to. + private static final String GOOGLE_DATAFLOW_STAGING_LOCATION = "google.dataflow.stagingLocation"; + + private Connection connection; + private String projectId; + private String instanceId; + private String tableId; + private String region; + + private GcsUtil gcsUtil; + private String dataflowStagingLocation; + private String workDir; + private byte[][] keySplits; + + // Snapshot data setup + private String hbaseSnapshotDir; + private String hashDir; + private String syncTableOutputDir; + + @Before + public void setup() throws Exception { + projectId = getTestProperty(BigtableOptionsFactory.PROJECT_ID_KEY); + instanceId = getTestProperty(BigtableOptionsFactory.INSTANCE_ID_KEY); + dataflowStagingLocation = getTestProperty(GOOGLE_DATAFLOW_STAGING_LOCATION); + region = getTestProperty(DATAFLOW_REGION); + String cloudTestDataFolder = getTestProperty(CLOUD_TEST_DATA_FOLDER); + if (!cloudTestDataFolder.endsWith(File.separator)) { + cloudTestDataFolder = cloudTestDataFolder + File.separator; + } + + hbaseSnapshotDir = cloudTestDataFolder + "data/"; + UUID test_uuid = UUID.randomUUID(); + hashDir = cloudTestDataFolder + "hashtable/"; + + syncTableOutputDir = dataflowStagingLocation; + if (!syncTableOutputDir.endsWith(File.separator)) { + syncTableOutputDir = syncTableOutputDir + File.separator; + } + syncTableOutputDir = syncTableOutputDir + "sync-table-output/" + test_uuid + "/"; + + // Cloud Storage config + GcpOptions gcpOptions = PipelineOptionsFactory.create().as(GcpOptions.class); + gcpOptions.setProject(projectId); + gcsUtil = new GcsUtil.GcsUtilFactory().create(gcpOptions); + + // Bigtable config + connection = BigtableConfiguration.connect(projectId, 
instanceId); + tableId = "test_" + UUID.randomUUID().toString(); + + LOG.info("Setting up integration tests"); + + String[] keys = new String[] {"1", "2", "3", "4", "5", "6", "7", "8", "9"}; + keySplits = new byte[keys.length][]; + for (int i = 0; i < keys.length; i++) { + keySplits[i] = keys[i].getBytes(); + } + + // Create table in Bigtable + TableName tableName = TableName.valueOf(tableId); + HTableDescriptor descriptor = new HTableDescriptor(tableName); + descriptor.addFamily(new HColumnDescriptor(CF)); + connection.getAdmin().createTable(descriptor, SnapshotTestingUtils.getSplitKeys()); + } + + private static String getTestProperty(String name) { + return checkNotNull(System.getProperty(name), "Required property missing: " + name); + } + + @After + public void teardown() throws IOException { + // syncTableOutputDir always ends with "/" (see setup), so append only the wildcard. + final List<GcsPath> paths = gcsUtil.expand(GcsPath.fromUri(syncTableOutputDir + "*")); + if (!paths.isEmpty()) { + final List<String> pathStrs = new ArrayList<>(); + + for (GcsPath path : paths) { + pathStrs.add(path.toString()); + } + // TODO: cleanup fails when tests time out. 
Add an orphan cleaner in the setup() + // https://github.com/googleapis/java-bigtable/blob/35588d89b9b243eb691a29d3aff16b9f5a08fbb8/google-cloud-bigtable/src/test/java/com/google/cloud/bigtable/test_helpers/env/AbstractTestEnv.java#L108-L119 + this.gcsUtil.remove(pathStrs); + } + + // delete the test table on the existing connection, then always release the connection + try { + connection.getAdmin().deleteTable(TableName.valueOf(tableId)); + } finally { + connection.close(); + } + } + + private SyncTableOptions createSyncTableOptions() { + DataflowPipelineOptions syncTableOpts = + PipelineOptionsFactory.as(DataflowPipelineOptions.class); + syncTableOpts.setRunner(DataflowRunner.class); + syncTableOpts.setGcpTempLocation(dataflowStagingLocation); + syncTableOpts.setNumWorkers(1); + syncTableOpts.setProject(projectId); + syncTableOpts.setRegion(region); + + SyncTableOptions syncOpts = syncTableOpts.as(SyncTableOptions.class); + // Setup Bigtable params + syncOpts.setBigtableProject(StaticValueProvider.of(projectId)); + syncOpts.setBigtableInstanceId(StaticValueProvider.of(instanceId)); + syncOpts.setBigtableTableId(StaticValueProvider.of(tableId)); + syncOpts.setBigtableAppProfileId(null); + + // Setup Hashes + syncOpts.setHashTableOutputDir(StaticValueProvider.of(hashDir)); + syncOpts.setOutputPrefix(StaticValueProvider.of(syncTableOutputDir)); + return syncOpts; + } + + private ImportOptions createImportOptions() { + DataflowPipelineOptions importPipelineOpts = + PipelineOptionsFactory.as(DataflowPipelineOptions.class); + importPipelineOpts.setRunner(DataflowRunner.class); + importPipelineOpts.setGcpTempLocation(dataflowStagingLocation); + importPipelineOpts.setNumWorkers(1); + importPipelineOpts.setProject(projectId); + importPipelineOpts.setRegion(region); + + ImportOptions importOpts = importPipelineOpts.as(ImportOptions.class); + + // setup Bigtable options + importOpts.setBigtableProject(StaticValueProvider.of(projectId)); + importOpts.setBigtableInstanceId(StaticValueProvider.of(instanceId)); + 
importOpts.setBigtableTableId(StaticValueProvider.of(tableId)); + + // setup HBase snapshot info + importOpts.setHbaseSnapshotSourceDir(hbaseSnapshotDir); + importOpts.setSnapshotName(TEST_SNAPSHOT_NAME); + return importOpts; + } + + private Map getCountMap(PipelineResult result) { + MetricQueryResults metrics = result.metrics().allMetrics(); + return StreamSupport.stream(metrics.getCounters().spliterator(), false) + .collect(Collectors.toMap((m) -> m.getName().getName(), (m) -> m.getAttempted())); + } + + /** + * Reads the output of SyncTable job and returns a list of mismatched RangeHashes. + * + * @throws IOException + */ + private List readMismatchesFromOutputFiles() throws IOException { + Gson gson = new Gson(); + // Find output files + List outputFiles = gcsUtil.expand(GcsPath.fromUri(syncTableOutputDir + "*")); + List rangeHashes = new ArrayList<>(); + + // Read each file line by line and create a RangeHash from it. + for (GcsPath outputFile : outputFiles) { + int size = (int) gcsUtil.fileSize(outputFile); + byte[] fileContents = new byte[size]; + gcsUtil.open(outputFile).read(ByteBuffer.wrap(fileContents)); + BufferedReader reader = + new BufferedReader(new InputStreamReader(new ByteArrayInputStream(fileContents))); + String serializedRangeHash; + while ((serializedRangeHash = reader.readLine()) != null) { + try { + rangeHashes.add(gson.fromJson(serializedRangeHash.trim(), RangeHash.class)); + } catch (Exception e) { + LOG.error("Failed to parse JSON: [" + serializedRangeHash + "]", e); + throw e; + } + } + } + return rangeHashes; + } + + // Asserts that all the rowKeys belong in mismatches. + // Throws AssertionException + private void validateRowInRangeHashes(List rowKeys, Iterable mismatches) { + for (byte[] mismatchedRowKey : rowKeys) { + Assert.assertTrue(containsRow(mismatchedRowKey, mismatches)); + } + } + + // Returns true if the rowKey belongs in one of the ranges contained in rangeHashes. 
+ private boolean containsRow(byte[] rowKey, Iterable<RangeHash> rangeHashes) { + for (RangeHash mismatchedRange : rangeHashes) { + // TODO: There may be a better Range.belongs() utility function somewhere? + // Empty start/end key means that there is no start/end key. + if ((mismatchedRange.startInclusive.equals(HConstants.EMPTY_BYTE_ARRAY) + || mismatchedRange.startInclusive.compareTo(rowKey) <= 0) + && (mismatchedRange.stopExclusive.equals(HConstants.EMPTY_BYTE_ARRAY) + || mismatchedRange.stopExclusive.compareTo(rowKey) > 0)) { + return true; + } + } + return false; + } + + @Test + public void testHBaseSnapshotImport() throws Exception { + + // Start import + ImportOptions importOpts = createImportOptions(); + + // run pipeline + State state = ImportJobFromHbaseSnapshot.buildPipeline(importOpts).run().waitUntilFinish(); + Assert.assertEquals(State.DONE, state); + + // check that the .restore dir used for temp files has been removed + Objects objects = + gcsUtil.listObjects( + GcsPath.fromUri(hbaseSnapshotDir).getBucket(), + CleanupHBaseSnapshotRestoreFilesFn.getListPrefix( + HBaseSnapshotInputConfigBuilder.RESTORE_DIR), + null); + Assert.assertNull(objects.getItems()); + + SyncTableOptions syncOpts = createSyncTableOptions(); + + PipelineResult result = SyncTableJob.buildPipeline(syncOpts).run(); + state = result.waitUntilFinish(); + Assert.assertEquals(State.DONE, state); + + // Read the output files and validate that there are no mismatches. + Assert.assertEquals(0, readMismatchesFromOutputFiles().size()); + + // Validate the counters (expected value first, so failure messages read correctly). + Map counters = getCountMap(result); + Assert.assertEquals((Long) 101L, counters.get("ranges_matched")); + Assert.assertEquals((Long) 0L, counters.get("ranges_not_matched")); + } + + /** + * Introduces multiple corruptions in imported table and validates that sync-table can detect + * them. 
+ */ + @Test + public void testHBaseSnapshotImportWithCorruptions() throws Exception { + // Import snapshot + ImportOptions importOpts = createImportOptions(); + State state = ImportJobFromHbaseSnapshot.buildPipeline(importOpts).run().waitUntilFinish(); + Assert.assertEquals(State.DONE, state); + + // Rows where corruptions will be added. + byte[] mismatchRowAtStart = "000".getBytes(); + byte[] mismatchRowInMiddle = "24".getBytes(); + byte[] mismatchRowDeleted = "64".getBytes(); + byte[] mismatchRowAtTheEnd = "999".getBytes(); + + // Introduce corruptions to the data in Bigtable. Delete data from Bigtable to simulate Bigtable + // missing data. Add data to Bigtable to simulate extra data in Bigtable. It is easier to update + // Bigtable than change the snapshots. + Table table = connection.getTable(TableName.valueOf(tableId)); + Cell cellInMiddle = table.get(new Get(mismatchRowInMiddle)).rawCells()[0]; + List puts = + Arrays.asList( + // Add a row at the start + new Put(mismatchRowAtStart) + .addColumn(CF.getBytes(), "random_col".getBytes(), 1L, "value000".getBytes()) + .addColumn(CF.getBytes(), "random_col".getBytes(), 2L, "value001".getBytes()), + // change a cell in middle + new Put(cellInMiddle.getRowArray()) + .addColumn( + cellInMiddle.getFamilyArray(), + cellInMiddle.getQualifierArray(), + cellInMiddle.getTimestamp(), + "corrupted_val".getBytes()), + // add a new row in the end + new Put(mismatchRowAtTheEnd) + .addColumn(CF.getBytes(), "random_col".getBytes(), 100L, "value999".getBytes())); + + table.put(puts); + // Delete a random row in the middle. We should see 4 ranges mismatch as table is split on + // 1,2...9. All the updates are happening on a different split. + table.delete(new Delete(mismatchRowDeleted)); + + // Run SyncTable job and expect 4 mismatches. 
+ SyncTableOptions syncOpts = createSyncTableOptions(); + PipelineResult result = SyncTableJob.buildPipeline(syncOpts).run(); + state = result.waitUntilFinish(); + Assert.assertEquals(State.DONE, state); + + List syncTableOutputMismatches = readMismatchesFromOutputFiles(); + Assert.assertEquals(4, syncTableOutputMismatches.size()); + validateRowInRangeHashes( + Arrays.asList( + mismatchRowAtStart, mismatchRowAtTheEnd, mismatchRowDeleted, mismatchRowInMiddle), + syncTableOutputMismatches); + + // Validate the counters (expected value first, so failure messages read correctly). + Map counters = getCountMap(result); + Assert.assertEquals((Long) 97L, counters.get("ranges_matched")); + Assert.assertEquals((Long) 4L, counters.get("ranges_not_matched")); + } +} diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/hbasesnapshots/HBaseSnapshotInputConfigBuilderTest.java b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/hbasesnapshots/HBaseSnapshotInputConfigBuilderTest.java new file mode 100644 index 0000000000..579a57c238 --- /dev/null +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/hbasesnapshots/HBaseSnapshotInputConfigBuilderTest.java @@ -0,0 +1,48 @@ +/* + * Copyright 2021 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package com.google.cloud.bigtable.beam.hbasesnapshots; + +import static org.junit.Assert.assertEquals; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hbase.mapreduce.TableSnapshotInputFormat; +import org.apache.hadoop.mapreduce.InputFormat; +import org.junit.Test; + +public class HBaseSnapshotInputConfigBuilderTest { + + private static final String TEST_PROJECT = "test_project"; + private static final String TEST_SNAPSHOT_DIR = "gs://test-bucket/hbase-export"; + private static final String TEST_SNAPSHOT_NAME = "test_snapshot"; + + @Test + public void testBuildingHBaseSnapshotInputConfigBuilder() { + Configuration conf = + new HBaseSnapshotInputConfigBuilder() + .setProjectId(TEST_PROJECT) + .setHbaseSnapshotSourceDir(TEST_SNAPSHOT_DIR) + .setSnapshotName(TEST_SNAPSHOT_NAME) + .createHBaseConfiguration(); + assertEquals( + "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS", conf.get("fs.AbstractFileSystem.gs.impl")); + assertEquals(TEST_PROJECT, conf.get("fs.gs.project.id")); + assertEquals(TEST_SNAPSHOT_DIR, conf.get("hbase.rootdir")); + assertEquals( + TableSnapshotInputFormat.class, + conf.getClass( + "mapreduce.job.inputformat.class", null, InputFormat.class)); + } +} diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/it/CloudBigtableBeamITTest.java b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/it/CloudBigtableBeamITTest.java index d2a095a5e3..fd9909f37f 100644 --- a/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/it/CloudBigtableBeamITTest.java +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/it/CloudBigtableBeamITTest.java @@ -102,13 +102,13 @@ public class CloudBigtableBeamITTest { private final Log LOG = LogFactory.getLog(getClass()); - private static final String STAGING_LOCATION_KEY = "dataflowStagingLocation"; - 
private static final String ZONE_ID_KEY = "dataflowZoneId"; + private static final String STAGING_LOCATION_KEY = "google.dataflow.stagingLocation"; + private static final String REGION_KEY = "region"; private static final String projectId = System.getProperty(PROJECT_ID_KEY); private static final String instanceId = System.getProperty(INSTANCE_ID_KEY); private static final String stagingLocation = System.getProperty(STAGING_LOCATION_KEY); - private static final String zoneId = System.getProperty(ZONE_ID_KEY); + private static final String region = System.getProperty(REGION_KEY); private static final String workerMachineType = System.getProperty("workerMachineType", "n1" + "-standard-8"); @@ -129,7 +129,7 @@ public class CloudBigtableBeamITTest { @BeforeClass public static void setUpConfiguration() { Preconditions.checkArgument(stagingLocation != null, "Set -D" + STAGING_LOCATION_KEY + "."); - Preconditions.checkArgument(zoneId != null, "Set -D" + ZONE_ID_KEY + "."); + Preconditions.checkArgument(region != null, "Set -D" + REGION_KEY + "."); Preconditions.checkArgument(projectId != null, "Set -D" + PROJECT_ID_KEY + "."); Preconditions.checkArgument(instanceId != null, "Set -D" + INSTANCE_ID_KEY + "."); } @@ -255,7 +255,7 @@ private static byte[] createRandomValue() { private DataflowPipelineOptions createOptions() { DataflowPipelineOptions options = PipelineOptionsFactory.as(DataflowPipelineOptions.class); options.setProject(projectId); - options.setZone(zoneId); + options.setRegion(region); options.setStagingLocation(stagingLocation + "/stage"); options.setTempLocation(stagingLocation + "/temp"); options.setRunner(DataflowRunner.class); diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/sequencefiles/EndToEndIT.java b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/sequencefiles/EndToEndIT.java index 8f5cd823c7..1958e04307 100644 --- 
a/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/sequencefiles/EndToEndIT.java +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/sequencefiles/EndToEndIT.java @@ -55,6 +55,7 @@ public class EndToEndIT { // Location of test data hosted on Google Cloud Storage, for on-cloud dataflow tests. private static final String CLOUD_TEST_DATA_FOLDER = "cloud.test.data.folder"; + private static final String DATAFLOW_REGION = "region"; // Column family name used in all test bigtables. private static final String CF = "column_family"; @@ -66,6 +67,7 @@ public class EndToEndIT { private String projectId; private String instanceId; private String tableId; + private String region; private GcsUtil gcsUtil; private String cloudTestDataFolder; @@ -76,7 +78,7 @@ public class EndToEndIT { public void setup() throws Exception { projectId = getTestProperty(BigtableOptionsFactory.PROJECT_ID_KEY); instanceId = getTestProperty(BigtableOptionsFactory.INSTANCE_ID_KEY); - + region = getTestProperty(DATAFLOW_REGION); dataflowStagingLocation = getTestProperty(GOOGLE_DATAFLOW_STAGING_LOCATION); cloudTestDataFolder = getTestProperty(CLOUD_TEST_DATA_FOLDER); @@ -152,6 +154,7 @@ public void testExportImport() throws Exception { pipelineOpts.setGcpTempLocation(dataflowStagingLocation); pipelineOpts.setNumWorkers(1); pipelineOpts.setProject(projectId); + pipelineOpts.setRegion(region); ExportOptions exportOpts = pipelineOpts.as(ExportOptions.class); exportOpts.setBigtableInstanceId(StaticValueProvider.of(instanceId)); @@ -172,6 +175,7 @@ public void testExportImport() throws Exception { PipelineOptionsFactory.as(DataflowPipelineOptions.class); createTablePipelineOpts.setRunner(DataflowRunner.class); createTablePipelineOpts.setProject(projectId); + createTablePipelineOpts.setRegion(region); CreateTableHelper.CreateTableOpts createOpts = createTablePipelineOpts.as(CreateTableHelper.CreateTableOpts.class); @@ -188,6 
+192,7 @@ public void testExportImport() throws Exception { importPipelineOpts.setGcpTempLocation(dataflowStagingLocation); importPipelineOpts.setNumWorkers(1); importPipelineOpts.setProject(projectId); + importPipelineOpts.setRegion(region); ImportJob.ImportOptions importOpts = importPipelineOpts.as(ImportJob.ImportOptions.class); importOpts.setBigtableProject(StaticValueProvider.of(projectId)); diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/validation/BufferedHadoopHashTableSourceTest.java b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/validation/BufferedHadoopHashTableSourceTest.java new file mode 100644 index 0000000000..96d5960423 --- /dev/null +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/validation/BufferedHadoopHashTableSourceTest.java @@ -0,0 +1,162 @@ +/* + * Copyright 2021 Google Inc. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package com.google.cloud.bigtable.beam.validation; + +import static org.junit.Assert.assertEquals; + +import com.google.cloud.bigtable.beam.validation.HadoopHashTableSource.RangeHash; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import org.apache.beam.sdk.options.ValueProvider.StaticValueProvider; +import org.apache.beam.sdk.testing.SourceTestUtils; +import org.apache.beam.sdk.values.KV; +import org.apache.hadoop.hbase.HConstants; +import org.apache.hadoop.hbase.client.Scan; +import org.apache.hadoop.hbase.io.ImmutableBytesWritable; +import org.apache.hadoop.hbase.util.Bytes; +import org.junit.Before; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.JUnit4; + +@RunWith(JUnit4.class) +public class BufferedHadoopHashTableSourceTest { + + private BufferedHadoopHashTableSource bufferedSource; + private FakeTableHashWrapper fakeTableHashWrapper; + + private static final String HASH_TABLE_OUTPUT_PATH_DIR = "gs://my-bucket/outputDir"; + private static final ImmutableBytesWritable START_ROW = + new ImmutableBytesWritable("AAAA".getBytes()); + private static final ImmutableBytesWritable STOP_ROW = + new ImmutableBytesWritable("ZZZZ".getBytes()); + private static final ImmutableBytesWritable POST_STOP_ROW = + new ImmutableBytesWritable("z".getBytes()); // Lowercase z is lexicographically > uppercase Z + private static final ImmutableBytesWritable EMPTY_ROW = + new ImmutableBytesWritable(HConstants.EMPTY_BYTE_ARRAY); + private static final ImmutableBytesWritable START_HASH = + new ImmutableBytesWritable("START-HASH".getBytes()); + private static final int BATCH_SIZE = 5; + + @Before + public void setUp() throws Exception { + fakeTableHashWrapper = + new FakeTableHashWrapper( + START_ROW, STOP_ROW, new ArrayList<>(), new ArrayList<>(), new Scan()); + bufferedSource = + new BufferedHadoopHashTableSource( + new HadoopHashTableSource( + StaticValueProvider.of("cbt-dev"), + 
StaticValueProvider.of(HASH_TABLE_OUTPUT_PATH_DIR), + START_ROW, + STOP_ROW, + new FakeTableHashWrapperFactory(fakeTableHashWrapper)), + BATCH_SIZE); + } + + protected static ImmutableBytesWritable getKey(int keyIndex) { + return new ImmutableBytesWritable(("KEY-" + keyIndex).getBytes()); + } + + protected static ImmutableBytesWritable getHash(int hashIndex) { + return new ImmutableBytesWritable(("HASH-" + hashIndex).getBytes()); + } + + /** + * Populates the fakeTableHashWrapper with {@code numEntries} entries starting with startKey. + * Returns a List of expected RangeHashes for this data, for numEntries=1, single RangeHash is + * returned (startRow, stopRow, START_HASH). + */ + protected List>> setupTestData( + ImmutableBytesWritable startRow, ImmutableBytesWritable stopRow, int numEntries) { + fakeTableHashWrapper.startRowInclusive = startRow; + fakeTableHashWrapper.stopRowExclusive = stopRow; + fakeTableHashWrapper.hashes.add(KV.of(startRow, START_HASH)); + for (int i = 0; i < numEntries - 1; i++) { + fakeTableHashWrapper.hashes.add(KV.of(getKey(i), getHash(i))); + } + + List>> out = new ArrayList<>(); + // Setup RangeHashes to be returned + List expectedRangeHashes = new ArrayList<>(); + ImmutableBytesWritable key = startRow; + ImmutableBytesWritable hash = START_HASH; + for (int i = 0; i < numEntries - 1; i++) { + expectedRangeHashes.add(RangeHash.of(key, getKey(i), hash)); + key = getKey(i); + hash = getHash(i); + if (expectedRangeHashes.size() % BATCH_SIZE == 0) { + out.add( + KV.of( + Bytes.toStringBinary(expectedRangeHashes.get(0).startInclusive.copyBytes()), + expectedRangeHashes)); + expectedRangeHashes = new ArrayList<>(); + } + } + // Process the last range + expectedRangeHashes.add(RangeHash.of(key, stopRow, hash)); + // Finalize the last batch + out.add( + KV.of( + Bytes.toStringBinary(expectedRangeHashes.get(0).startInclusive.copyBytes()), + expectedRangeHashes)); + + return out; + } + + @Test + public void testHashReaderEmpty() throws IOException 
{ + // The tableHashWrapper has no hashes, this should result in empty source. + assertEquals(Arrays.asList(), SourceTestUtils.readFromSource(bufferedSource, null)); + } + + @Test + public void testHashReaderPartialBuffer() throws IOException { + // Setup 4 entries in this hashtable datafile. + List>> expected = setupTestData(START_ROW, STOP_ROW, 4); + assertEquals(expected, SourceTestUtils.readFromSource(bufferedSource, null)); + } + + @Test + public void testHashReaderMultipleBatches() throws IOException { + // Setup 4 entries in this hashtable datafile. + List>> expected = setupTestData(START_ROW, STOP_ROW, 20); + assertEquals(expected, SourceTestUtils.readFromSource(bufferedSource, null)); + } + + @Test + public void testHashReaderMultipleBatchesWithPartialBatchAtEnd() throws IOException { + // Setup 4 entries in this hashtable datafile. + List>> expected = setupTestData(START_ROW, STOP_ROW, 23); + assertEquals(expected, SourceTestUtils.readFromSource(bufferedSource, null)); + } + + @Test + public void testSplitEqualsUnsplit() throws Exception { + fakeTableHashWrapper.partitions = Arrays.asList(getKey(4), getKey(9)); + SourceTestUtils.assertSourcesEqualReferenceSource( + bufferedSource, bufferedSource.split(0, null), null); + } + + @Test + public void testUnstartedReaderEqualsStarted() throws Exception { + setupTestData(START_ROW, STOP_ROW, 6); + SourceTestUtils.assertUnstartedReaderReadsSameAsItsSource( + bufferedSource.createReader(null), null); + } +} diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/validation/ComputeAndValidateHashFromBigtableDoFnTest.java b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/validation/ComputeAndValidateHashFromBigtableDoFnTest.java new file mode 100644 index 0000000000..a27288f7da --- /dev/null +++ 
b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/validation/ComputeAndValidateHashFromBigtableDoFnTest.java @@ -0,0 +1,469 @@ +/* + * Copyright 2021 Google Inc. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.google.cloud.bigtable.beam.validation; + +import static com.google.bigtable.repackaged.com.google.cloud.bigtable.admin.v2.models.GCRules.GCRULES; + +import com.google.bigtable.repackaged.com.google.cloud.bigtable.admin.v2.BigtableTableAdminClient; +import com.google.bigtable.repackaged.com.google.cloud.bigtable.admin.v2.BigtableTableAdminSettings; +import com.google.bigtable.repackaged.com.google.cloud.bigtable.admin.v2.models.CreateTableRequest; +import com.google.cloud.bigtable.beam.CloudBigtableTableConfiguration; +import com.google.cloud.bigtable.beam.validation.HadoopHashTableSource.RangeHash; +import com.google.cloud.bigtable.emulator.v2.BigtableEmulatorRule; +import com.google.cloud.bigtable.hbase.BigtableConfiguration; +import com.google.cloud.bigtable.hbase.BigtableOptionsFactory; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; +import java.util.stream.StreamSupport; +import org.apache.beam.sdk.PipelineResult; +import org.apache.beam.sdk.metrics.MetricQueryResults; +import org.apache.beam.sdk.options.ValueProvider.StaticValueProvider; +import 
org.apache.beam.sdk.testing.PAssert; +import org.apache.beam.sdk.testing.TestPipeline; +import org.apache.beam.sdk.transforms.Create; +import org.apache.beam.sdk.transforms.ParDo; +import org.apache.beam.sdk.values.KV; +import org.apache.beam.sdk.values.PCollection; +import org.apache.hadoop.hbase.HConstants; +import org.apache.hadoop.hbase.TableName; +import org.apache.hadoop.hbase.client.Connection; +import org.apache.hadoop.hbase.client.Delete; +import org.apache.hadoop.hbase.client.Put; +import org.apache.hadoop.hbase.client.Result; +import org.apache.hadoop.hbase.client.Scan; +import org.apache.hadoop.hbase.client.Table; +import org.apache.hadoop.hbase.io.ImmutableBytesWritable; +import org.apache.hadoop.hbase.mapreduce.BigtableTableHashAccessor.BigtableResultHasher; +import org.junit.After; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Rule; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.JUnit4; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +@RunWith(JUnit4.class) +public class ComputeAndValidateHashFromBigtableDoFnTest { + + private static final byte[] EMPTY_ROW_KEY = HConstants.EMPTY_BYTE_ARRAY; + protected final Logger LOG = LoggerFactory.getLogger(getClass()); + + public static final String FAKE_TABLE = "fake-table"; + private static final String ROW_KEY_PREFIX = "row-"; + private static final String VALUE_PREFIX = "value-"; + private static final byte[] EXTRA_VALUE = "add".getBytes(); + private static final byte[] CF = "cf".getBytes(); + private static final byte[] CF2 = "cf".getBytes(); + private static final byte[] COL = "col".getBytes(); + private static final long TS = 1000l; + private static final int FIRST_ROW_INDEX = 20; + private static final int LAST_ROW_INDEX = 31; + + @Rule public final BigtableEmulatorRule bigtableEmulator = BigtableEmulatorRule.create(); + + @Rule public final transient TestPipeline p = TestPipeline.create(); + + private 
ComputeAndValidateHashFromBigtableDoFn doFn; + + // Clients that will be connected to the emulator + private BigtableTableAdminClient tableAdminClient; + private Table table; + // Fake a TableHashWrapper. + private FakeTableHashWrapper fakeTableHashWrapper; + + private List hashes; + + @Before + public void setUp() throws IOException { + hashes = new ArrayList<>(); + // Initialize the clients to connect to the emulator + tableAdminClient = + BigtableTableAdminClient.create( + BigtableTableAdminSettings.newBuilderForEmulator(bigtableEmulator.getPort()) + .setProjectId("fake-project") + .setInstanceId("fake-instance") + .build()); + + CloudBigtableTableConfiguration config = + new CloudBigtableTableConfiguration.Builder() + .withProjectId("fake-project") + .withInstanceId("fake-instance") + .withTableId(FAKE_TABLE) + .withConfiguration( + BigtableOptionsFactory.BIGTABLE_EMULATOR_HOST_KEY, + "localhost:" + bigtableEmulator.getPort()) + .build(); + + Connection connection = BigtableConfiguration.connect(config.toHBaseConfig()); + table = connection.getTable(TableName.valueOf(FAKE_TABLE)); + fakeTableHashWrapper = new FakeTableHashWrapper(); + // Scan all the cells for the column, HBase scan fetches 1 cell/column by default + fakeTableHashWrapper.scan = new Scan().setMaxVersions(); + + FakeTableHashWrapperFactory fakeFactory = new FakeTableHashWrapperFactory(fakeTableHashWrapper); + + doFn = + new ComputeAndValidateHashFromBigtableDoFn( + config, + StaticValueProvider.of(FAKE_TABLE), + StaticValueProvider.of("proj"), + StaticValueProvider.of("hash"), + fakeFactory); + + // Create a test table that can be used in tests + tableAdminClient.createTable( + CreateTableRequest.of(FAKE_TABLE) + .addFamily(new String(CF), GCRULES.maxVersions(100)) + .addFamily(new String(CF2), GCRULES.maxVersions(100))); + + p.getCoderRegistry().registerCoderForClass(RangeHash.class, new RangeHashCoder()); + + // Fill CBT table with data. 
+ writeDataToTable(); + } + + @After + public void tearDown() { + // TODO should we delete the table for each test? + tableAdminClient.deleteTable(FAKE_TABLE); + } + + private byte[] getRowKey(int i) { + return (ROW_KEY_PREFIX + i).getBytes(); + } + + private byte[] getValue(int rowIndex, int cellIndex) { + return (VALUE_PREFIX + rowIndex + "-" + cellIndex).getBytes(); + } + + private void writeDataToTable() throws IOException { + List puts = new ArrayList<>(); + // Tests use the rows 21-30. Setup some extra data simulate the real world scenario where + // there will be other workitems working parallely on the table. + for (int i = 20; i < 32; i++) { + for (int j = 0; j < 2; j++) { + // Insert rows with 2 cells each + Put put = new Put(getRowKey(i)); + put.addColumn(CF, COL, TS + j, getValue(i, j)); + puts.add(put); + } + } + table.put(puts); + } + + /** Deletes the row range [startIndex, stopIndex) */ + private void deleteRange(int startIndex, int stopIndex) throws IOException { + for (int i = startIndex; i < stopIndex; i++) { + table.delete(new Delete(getRowKey(i))); + } + } + + // Creates a RangeHash for range [startRow, stopRow). + private RangeHash createHash(byte[] startRow, byte[] stopRow) throws IOException { + LOG.debug("Creating hash for rows " + startRow + " to " + stopRow); + BigtableResultHasher hasher = new BigtableResultHasher(); + hasher.startBatch(new ImmutableBytesWritable(startRow)); + + // Scan all the cells for a column. + Scan scan = new Scan().setMaxVersions().withStartRow(startRow).withStopRow(stopRow, false); + + // Read the rows from Bigtable and compute the expected hash. 
+ for (Result result : table.getScanner(scan)) { + LOG.debug("Adding result to hash: " + result); + hasher.hashResult(result); + } + hasher.finishBatch(); + return RangeHash.of( + new ImmutableBytesWritable(startRow), + new ImmutableBytesWritable(stopRow), + hasher.getBatchHash()); + } + + private void validateCounters( + PipelineResult result, Long expectedMatches, Long expectedMismatches) { + MetricQueryResults metrics = result.metrics().allMetrics(); + Map counters = + StreamSupport.stream(metrics.getCounters().spliterator(), false) + .collect(Collectors.toMap((m) -> m.getName().getName(), (m) -> m.getAttempted())); + Assert.assertEquals(expectedMatches, counters.get("ranges_matched")); + Assert.assertEquals(expectedMismatches, counters.get("ranges_not_matched")); + } + + ////////// Happy case tests for various setups////////////////////// + @Test + public void testHashMatchesForMultipleRange() throws Exception { + hashes.add(createHash(getRowKey(21), getRowKey(24))); + hashes.add(createHash(getRowKey(24), getRowKey(28))); + + PCollection>>> input = + p.apply(Create.of(KV.of(new String(getRowKey(21)), Arrays.asList(hashes)))); + + PCollection output = input.apply(ParDo.of(doFn)); + PAssert.that(output).empty(); + PipelineResult result = p.run(); + validateCounters(result, 2L, 0L); + } + + @Test + public void testHashMatchesForSingleRange() throws Exception { + hashes.add(createHash(getRowKey(21), getRowKey(24))); + + PCollection>>> input = + p.apply(Create.of(KV.of(new String(getRowKey(21)), Arrays.asList(hashes)))); + + PCollection output = input.apply(ParDo.of(doFn)); + PAssert.that(output).containsInAnyOrder(); + PipelineResult result = p.run(); + validateCounters(result, 1L, 0L); + } + + @Test + public void testHashMatchesForFullTableScanWithMultipleRange() throws Exception { + hashes.add(createHash(EMPTY_ROW_KEY, getRowKey(24))); + hashes.add(createHash(getRowKey(24), EMPTY_ROW_KEY)); + + PCollection>>> input = + p.apply(Create.of(KV.of(new 
String(EMPTY_ROW_KEY), Arrays.asList(hashes)))); + + PCollection output = input.apply(ParDo.of(doFn)); + PAssert.that(output).empty(); + PipelineResult result = p.run(); + validateCounters(result, 2L, 0L); + } + + @Test + public void testHashMatchesForMultipleSingleRowRange() throws Exception { + hashes.add(createHash(getRowKey(22), getRowKey(23))); + hashes.add(createHash(getRowKey(23), getRowKey(24))); + hashes.add(createHash(getRowKey(24), getRowKey(25))); + + PCollection>>> input = + p.apply(Create.of(KV.of(new String(getRowKey(22)), Arrays.asList(hashes)))); + + PCollection output = input.apply(ParDo.of(doFn)); + PAssert.that(output).empty(); + PipelineResult result = p.run(); + validateCounters(result, 3L, 0L); + } + + ///////////////// Test mismatches when Bigtable has extra rows //////////////////// + @Test + public void testAdditionalCellInMiddle() throws Exception { + hashes.add(createHash(getRowKey(21), getRowKey(24))); + hashes.add(createHash(getRowKey(24), getRowKey(27))); + hashes.add(createHash(getRowKey(27), getRowKey(30))); + + // Add an extra cell in the table + table.put(new Put(getRowKey(25)).addColumn(CF, COL, EXTRA_VALUE)); + + PCollection>>> input = + p.apply(Create.of(KV.of(new String(getRowKey(21)), Arrays.asList(hashes)))); + + PCollection output = input.apply(ParDo.of(doFn)); + PAssert.that(output).containsInAnyOrder(hashes.get(1)); + PipelineResult result = p.run(); + validateCounters(result, 2L, 1L); + } + + @Test + public void testAdditionalRowsAtEnds() throws Exception { + hashes.add(createHash(EMPTY_ROW_KEY, getRowKey(24))); + hashes.add(createHash(getRowKey(24), getRowKey(27))); + hashes.add(createHash(getRowKey(27), EMPTY_ROW_KEY)); + + // Add an extra row in the beginning + table.put(new Put(getRowKey(1)).addColumn(CF, COL, EXTRA_VALUE)); + + // Add an extra row at the end. 
+ table.put(new Put(getRowKey(5)).addColumn(CF, COL, EXTRA_VALUE)); + + PCollection>>> input = + p.apply(Create.of(KV.of(new String(EMPTY_ROW_KEY), Arrays.asList(hashes)))); + + PCollection output = input.apply(ParDo.of(doFn)); + PAssert.that(output).containsInAnyOrder(hashes.get(0), hashes.get(2)); + PipelineResult result = p.run(); + validateCounters(result, 1L, 2L); + } + + ///////////////////// Test different values /////////////////////////// + @Test + public void testDifferentValues() throws Exception { + hashes.add(createHash(EMPTY_ROW_KEY, getRowKey(21))); + hashes.add(createHash(getRowKey(21), getRowKey(23))); + hashes.add(createHash(getRowKey(23), getRowKey(25))); + hashes.add(createHash(getRowKey(25), getRowKey(27))); + hashes.add(createHash(getRowKey(27), EMPTY_ROW_KEY)); + + // Modify the CF + table.delete(new Delete(getRowKey(20)).addColumns(CF, COL, TS)); + table.put(new Put(getRowKey(1)).addColumn(CF2, COL, TS, getValue(20, 0))); + + // Modify the qualifier + table.delete(new Delete(getRowKey(22)).addColumns(CF, COL, TS)); + table.put(new Put(getRowKey(22)).addColumn(CF, "random-col".getBytes(), TS, getValue(22, 0))); + + // Modify the timestamp + table.delete(new Delete(getRowKey(24)).addColumns(CF, COL, TS)); + table.put(new Put(getRowKey(24)).addColumn(CF, COL, 1, getValue(24, 0))); + + // Modify the value + table.delete(new Delete(getRowKey(26)).addColumns(CF, COL, TS)); + table.put(new Put(getRowKey(26)).addColumn(CF, COL, getValue(26, 0))); + + PCollection>>> input = + p.apply(Create.of(KV.of(new String(EMPTY_ROW_KEY), Arrays.asList(hashes)))); + + PCollection output = input.apply(ParDo.of(doFn)); + PAssert.that(output) + .containsInAnyOrder(hashes.get(0), hashes.get(1), hashes.get(2), hashes.get(3)); + PipelineResult result = p.run(); + validateCounters(result, 1L, 4L); + } + + ////////////////// Tests with CBT missing data ////////////////////////////// + @Test + public void testMissingRows() throws Exception { + 
hashes.add(createHash(EMPTY_ROW_KEY, getRowKey(21))); + hashes.add(createHash(getRowKey(21), getRowKey(23))); + hashes.add(createHash(getRowKey(23), getRowKey(25))); + hashes.add(createHash(getRowKey(25), getRowKey(27))); + hashes.add(createHash(getRowKey(27), EMPTY_ROW_KEY)); + + // Delete a row at the beginning + table.delete(new Delete(getRowKey(FIRST_ROW_INDEX))); + + // Delete a row at the middle + table.delete(new Delete(getRowKey(24))); + + // Delete a row at the end + table.delete(new Delete(getRowKey(LAST_ROW_INDEX))); + + PCollection>>> input = + p.apply(Create.of(KV.of(new String(EMPTY_ROW_KEY), Arrays.asList(hashes)))); + + PCollection output = input.apply(ParDo.of(doFn)); + PAssert.that(output).containsInAnyOrder(hashes.get(0), hashes.get(2), hashes.get(4)); + PipelineResult result = p.run(); + validateCounters(result, 2L, 3L); + } + + @Test + public void testMissingRanges() throws Exception { + hashes.add(createHash(EMPTY_ROW_KEY, getRowKey(21))); + hashes.add(createHash(getRowKey(21), getRowKey(23))); + hashes.add(createHash(getRowKey(23), getRowKey(25))); + hashes.add(createHash(getRowKey(25), getRowKey(27))); + hashes.add(createHash(getRowKey(27), getRowKey(29))); + hashes.add(createHash(getRowKey(29), EMPTY_ROW_KEY)); + + // Delete a range at the beginning + deleteRange(FIRST_ROW_INDEX, 21); + + // Delete a range in middle + deleteRange(23, 25); + + // Delete row ranges at the end, bigtable scanner will finish with multiple row-ranges to + // process. 
+ deleteRange(27, LAST_ROW_INDEX + 1); + + PCollection>>> input = + p.apply(Create.of(KV.of(new String(EMPTY_ROW_KEY), Arrays.asList(hashes)))); + + PCollection output = input.apply(ParDo.of(doFn)); + PAssert.that(output) + .containsInAnyOrder(hashes.get(0), hashes.get(2), hashes.get(4), hashes.get(5)); + PipelineResult result = p.run(); + validateCounters(result, 2L, 4L); + } + + @Test + public void testCbtEmpty() throws Exception { + hashes.add(createHash(EMPTY_ROW_KEY, getRowKey(25))); + hashes.add(createHash(getRowKey(25), getRowKey(29))); + hashes.add(createHash(getRowKey(29), EMPTY_ROW_KEY)); + + // Delete all data from bigtable + deleteRange(FIRST_ROW_INDEX, LAST_ROW_INDEX); + + PCollection>>> input = + p.apply(Create.of(KV.of(new String(EMPTY_ROW_KEY), Arrays.asList(hashes)))); + + PCollection output = input.apply(ParDo.of(doFn)); + PAssert.that(output).containsInAnyOrder(hashes); + PipelineResult result = p.run(); + validateCounters(result, 0L, 3L); + } + + ////////////////////// Test that scan is used from TableHash.//////////////////////// + @Test + public void testScanFromTableHash() throws Exception { + hashes.add(createHash(getRowKey(21), getRowKey(24))); + hashes.add(createHash(getRowKey(24), getRowKey(27))); + hashes.add(createHash(getRowKey(27), getRowKey(30))); + + // Update the TableHashWrapper Scan to default. Scan from HashTable.TableHash determines the + // cells used to compute hash. CBT has to use the same cells for validation. 
+ fakeTableHashWrapper.scan = new Scan(); + + PCollection>>> input = + p.apply(Create.of(KV.of(new String(getRowKey(21)), Arrays.asList(hashes)))); + + PCollection output = input.apply(ParDo.of(doFn)); + PAssert.that(output).containsInAnyOrder(hashes); + PipelineResult result = p.run(); + validateCounters(result, 0L, 3L); + } + + ////////////////////// Combination of different cases ////////////////////////////////// + @Test + public void testMismatchesComprehensive() throws Exception { + hashes.add(createHash(EMPTY_ROW_KEY, getRowKey(21))); + hashes.add(createHash(getRowKey(21), getRowKey(23))); + hashes.add(createHash(getRowKey(23), getRowKey(25))); + hashes.add(createHash(getRowKey(25), getRowKey(27))); + hashes.add(createHash(getRowKey(27), getRowKey(29))); + hashes.add(createHash(getRowKey(29), EMPTY_ROW_KEY)); + + // Delete a range at the beginning from CBT + deleteRange(FIRST_ROW_INDEX, 21); + + // Delete a row in middle from CBT + table.delete(new Delete(getRowKey(23))); + + // Update a value in CBT + table.delete(new Delete(getRowKey(27)).addColumns(CF, COL, TS)); + table.put(new Put(getRowKey(27)).addColumn(CF, COL, getValue(27, 0))); + + // Add an extra row at the end. 
+ table.put(new Put(getRowKey(5)).addColumn(CF, COL, EXTRA_VALUE)); + + PCollection>>> input = + p.apply(Create.of(KV.of(new String(EMPTY_ROW_KEY), Arrays.asList(hashes)))); + + PCollection output = input.apply(ParDo.of(doFn)); + PAssert.that(output) + .containsInAnyOrder(hashes.get(0), hashes.get(2), hashes.get(4), hashes.get(5)); + PipelineResult result = p.run(); + validateCounters(result, 2L, 4L); + } +} diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/validation/FakeTableHashWrapper.java b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/validation/FakeTableHashWrapper.java new file mode 100644 index 0000000000..ee2b6814e2 --- /dev/null +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/validation/FakeTableHashWrapper.java @@ -0,0 +1,153 @@ +/* + * Copyright 2021 Google Inc. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package com.google.cloud.bigtable.beam.validation; + +import com.google.bigtable.repackaged.com.google.gson.Gson; +import com.google.common.collect.ImmutableList; +import java.io.IOException; +import java.io.ObjectInputStream; +import java.io.ObjectOutputStream; +import java.util.ArrayList; +import java.util.List; +import org.apache.beam.sdk.values.KV; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hbase.client.Scan; +import org.apache.hadoop.hbase.io.ImmutableBytesWritable; + +/** + * A fake for TableHashWrapper that allows us to mock the behavior of hbase's HashTable.TableHash + */ +public class FakeTableHashWrapper implements TableHashWrapper { + + // Sorted list of partition keys splitting the key range. + public List partitions; + // List of sorted by key. + public List> hashes; + public ImmutableBytesWritable startRowInclusive; + public ImmutableBytesWritable stopRowExclusive; + public Scan scan; + private static final long serialVersionUID = 34876543L; + + public FakeTableHashWrapper() { + this( + new ImmutableBytesWritable(), + new ImmutableBytesWritable(), + new ArrayList<>(), + new ArrayList<>(), + new Scan()); + } + + public FakeTableHashWrapper( + ImmutableBytesWritable startRowInclusive, + ImmutableBytesWritable stopRowExclusive, + List partitions, + List> hashes, + Scan scan) { + super(); + this.startRowInclusive = startRowInclusive; + this.stopRowExclusive = stopRowExclusive; + this.partitions = partitions; + this.hashes = hashes; + this.scan = scan; + } + + @Override + public int getNumHashFiles() { + return partitions.size() + 1; + } + + @Override + public ImmutableList getPartitions() { + return ImmutableList.copyOf(partitions); + } + + @Override + public ImmutableBytesWritable getStartRow() { + return startRowInclusive; + } + + @Override + public ImmutableBytesWritable getStopRow() { + return stopRowExclusive; + } + + @Override + public Scan getScan() { + return scan; + } + + @Override + public TableHashReader 
newReader(Configuration conf, ImmutableBytesWritable startRow) { + return new FakeTableHashReader(startRow); + } + + private void writeObject(ObjectOutputStream s) throws IOException { + Gson gson = new Gson(); + s.writeObject(gson.toJson(scan)); + s.writeObject(gson.toJson(startRowInclusive)); + s.writeObject(gson.toJson(stopRowExclusive)); + s.writeObject(gson.toJson(partitions)); + s.writeObject(gson.toJson(hashes)); + } + + private void readObject(ObjectInputStream s) throws IOException, ClassNotFoundException { + Gson gson = new Gson(); + scan = gson.fromJson((String) s.readObject(), Scan.class); + startRowInclusive = gson.fromJson((String) s.readObject(), ImmutableBytesWritable.class); + stopRowExclusive = gson.fromJson((String) s.readObject(), ImmutableBytesWritable.class); + partitions = gson.fromJson((String) s.readObject(), ArrayList.class); + hashes = gson.fromJson((String) s.readObject(), ArrayList.class); + } + + public class FakeTableHashReader implements TableHashReader { + private final ImmutableBytesWritable startRow; + // Copy of items to be read by this reader. + private final List> entriesToRead; + // First next() will make index = 0, and compare it with the size of entriesToRead. + private int index = -1; + + public FakeTableHashReader(ImmutableBytesWritable startRow) { + this.startRow = startRow; + entriesToRead = new ArrayList<>(); + for (KV hash : hashes) { + // Collect all the entries after startRow. 
+ if (hash.getKey().compareTo(startRow) >= 0) { + entriesToRead.add(hash); + } + } + } + + @Override + public boolean next() throws IOException { + return ++index < entriesToRead.size(); + } + + @Override + public ImmutableBytesWritable getCurrentKey() { + return entriesToRead.get(index).getKey(); + } + + @Override + public ImmutableBytesWritable getCurrentHash() { + return entriesToRead.get(index).getValue(); + } + + @Override + public void close() throws IOException { + // NOOP + } + } +} diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/validation/FakeTableHashWrapperFactory.java b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/validation/FakeTableHashWrapperFactory.java new file mode 100644 index 0000000000..2e65e3b855 --- /dev/null +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/validation/FakeTableHashWrapperFactory.java @@ -0,0 +1,32 @@ +/* + * Copyright 2021 Google Inc. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package com.google.cloud.bigtable.beam.validation; + +public class FakeTableHashWrapperFactory extends TableHashWrapperFactory { + + private static final long serialVersionUID = 269854624L; + + private final FakeTableHashWrapper fakeTableHashWrapper; + + public FakeTableHashWrapperFactory(FakeTableHashWrapper wrapper) { + this.fakeTableHashWrapper = wrapper; + } + + @Override + public TableHashWrapper getTableHash(String projectId, String sourceHashDir) { + return fakeTableHashWrapper; + } +} diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/validation/HadoopHashBasedReaderTest.java b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/validation/HadoopHashBasedReaderTest.java new file mode 100644 index 0000000000..fa88a56d14 --- /dev/null +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/validation/HadoopHashBasedReaderTest.java @@ -0,0 +1,179 @@ +/* + * Copyright 2021 Google Inc. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package com.google.cloud.bigtable.beam.validation; + +import static org.junit.Assert.assertEquals; + +import com.google.cloud.bigtable.beam.validation.HadoopHashTableSource.RangeHash; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import org.apache.beam.sdk.options.ValueProvider.StaticValueProvider; +import org.apache.beam.sdk.testing.SourceTestUtils; +import org.apache.beam.sdk.values.KV; +import org.apache.hadoop.hbase.HConstants; +import org.apache.hadoop.hbase.client.Scan; +import org.apache.hadoop.hbase.io.ImmutableBytesWritable; +import org.junit.Before; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.JUnit4; + +@RunWith(JUnit4.class) +public class HadoopHashBasedReaderTest { + + private HadoopHashTableSource hashTableSource; + private FakeTableHashWrapper fakeTableHashWrapper; + + private static final String HASH_TABLE_OUTPUT_PATH_DIR = "gs://my-bucket/outputDir"; + private static final ImmutableBytesWritable START_ROW = + new ImmutableBytesWritable("AAAA".getBytes()); + private static final ImmutableBytesWritable STOP_ROW = + new ImmutableBytesWritable("ZZZZ".getBytes()); + private static final ImmutableBytesWritable EMPTY_ROW = + new ImmutableBytesWritable(HConstants.EMPTY_BYTE_ARRAY); + private static final ImmutableBytesWritable START_HASH = + new ImmutableBytesWritable("START-HASH".getBytes()); + + @Before + public void setUp() throws Exception { + fakeTableHashWrapper = + new FakeTableHashWrapper( + START_ROW, STOP_ROW, new ArrayList<>(), new ArrayList<>(), new Scan()); + hashTableSource = + new HadoopHashTableSource( + StaticValueProvider.of("cbt-dev"), + StaticValueProvider.of(HASH_TABLE_OUTPUT_PATH_DIR), + START_ROW, + STOP_ROW, + new FakeTableHashWrapperFactory(fakeTableHashWrapper)); + } + + protected static ImmutableBytesWritable getKey(int keyIndex) { + return new ImmutableBytesWritable(("KEY-" + keyIndex).getBytes()); + } + + protected static 
ImmutableBytesWritable getHash(int hashIndex) { + return new ImmutableBytesWritable(("HASH-" + hashIndex).getBytes()); + } + + /** + * Populates the fakeTableHashWrapper with {@code numEntries} entries starting with startKey. + * Returns a List of expected RangeHashes for this data, for numEntries=1, single RangeHash is + * returned (startRow, stopRow, START_HASH). + */ + protected List setupTestData( + ImmutableBytesWritable startRow, ImmutableBytesWritable stopRow, int numEntries) { + fakeTableHashWrapper.startRowInclusive = startRow; + fakeTableHashWrapper.stopRowExclusive = stopRow; + fakeTableHashWrapper.hashes.add(KV.of(startRow, START_HASH)); + for (int i = 0; i < numEntries - 1; i++) { + fakeTableHashWrapper.hashes.add(KV.of(getKey(i), getHash(i))); + } + + // Setup RangeHashes to be returned + List expectedRangeHashes = new ArrayList<>(); + ImmutableBytesWritable key = startRow; + ImmutableBytesWritable hash = START_HASH; + for (int i = 0; i < numEntries - 1; i++) { + expectedRangeHashes.add(RangeHash.of(key, getKey(i), hash)); + key = getKey(i); + hash = getHash(i); + } + expectedRangeHashes.add(RangeHash.of(key, stopRow, hash)); + return expectedRangeHashes; + } + + /////////////////////////////// Test the end of HashTable Output ///////////////////////// + + @Test + public void testHashReaderEmpty() throws IOException { + // The tableHashWrapper has no hashes, this should result in empty source. + assertEquals(Arrays.asList(), SourceTestUtils.readFromSource(hashTableSource, null)); + } + + @Test + public void testHashReaderSingleHashBatch() throws IOException { + // Setup 1 entry in this hashtable datafile. The test is setup so that HashTable datafile has + // only 1 entry. + List expected = setupTestData(START_ROW, STOP_ROW, 1); + + assertEquals(expected, SourceTestUtils.readFromSource(hashTableSource, null)); + } + + @Test + public void testHashReaderMultipleHashBatch() throws IOException { + // Setup 4 entries in this hashtable datafile. 
+ List expected = setupTestData(START_ROW, STOP_ROW, 4); + assertEquals(expected, SourceTestUtils.readFromSource(hashTableSource, null)); + } + + //////////////////// Test the end of HashTable output when end of range is ""///////////////// + @Test + public void testHashReaderWithEmptyEndRow() throws IOException { + // Setup 4 entries in this hashtable datafile with no start or stop keys set. + List expected = setupTestData(EMPTY_ROW, EMPTY_ROW, 4); + hashTableSource.startRowInclusive = EMPTY_ROW; + hashTableSource.stopRowExclusive = EMPTY_ROW; + assertEquals(expected, SourceTestUtils.readFromSource(hashTableSource, null)); + } + + /////////////////////////////// Test reader.getCurrent() >= stopRow ///////////////////////// + + @Test + public void testHashReaderWorkItemEndedOnFirstBatch() throws IOException { + // Setup 1 entry in this hashtable datafile. This entry is outside of the workitem's row + fakeTableHashWrapper.hashes.add(KV.of(STOP_ROW, START_HASH)); + // Source will be empty as no hashes fall in its bounds. + assertEquals(new ArrayList(), SourceTestUtils.readFromSource(hashTableSource, null)); + } + + @Test + public void testHashReaderWorkItemEndedOnSecondEntry() throws IOException { + // Setup 1 entry in this hashtable datafile. The test is setup so that HashTable datafile has + // only 1 entry. + List expected = setupTestData(START_ROW, STOP_ROW, 1); + // Add a next entry at the stop row. Reader should stop and read just 1 entry. + fakeTableHashWrapper.hashes.add(KV.of(STOP_ROW, getHash(100))); + + assertEquals(expected, SourceTestUtils.readFromSource(hashTableSource, null)); + } + + @Test + public void testHashReaderWorkItemEndedAfterMultipleBatches() throws IOException { + // Setup 4 entries in this hashtable datafile. + List expected = setupTestData(START_ROW, STOP_ROW, 4); + // Add a next entry at the stop row. Reader should stop and read just 4 entry. 
+ fakeTableHashWrapper.hashes.add(KV.of(STOP_ROW, getHash(100))); + assertEquals(expected, SourceTestUtils.readFromSource(hashTableSource, null)); + } + + @Test + public void testSplitEqualsUnsplit() throws Exception { + setupTestData(START_ROW, STOP_ROW, 6); + fakeTableHashWrapper.partitions = Arrays.asList(getKey(2), getKey(4)); + SourceTestUtils.assertSourcesEqualReferenceSource( + hashTableSource, hashTableSource.split(1, null), null); + } + + @Test + public void testUnstartedReaderEqualsStarted() throws Exception { + setupTestData(START_ROW, STOP_ROW, 6); + SourceTestUtils.assertUnstartedReaderReadsSameAsItsSource( + hashTableSource.createReader(null), null); + } +} diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/validation/HadoopHashTableSourceTest.java b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/validation/HadoopHashTableSourceTest.java new file mode 100644 index 0000000000..a3aba3f756 --- /dev/null +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/validation/HadoopHashTableSourceTest.java @@ -0,0 +1,209 @@ +/* + * Copyright 2021 Google Inc. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package com.google.cloud.bigtable.beam.validation; + +import com.google.cloud.bigtable.beam.validation.HadoopHashTableSource.HashBasedReader; +import com.google.cloud.bigtable.beam.validation.HadoopHashTableSource.RangeHash; +import com.google.common.collect.ImmutableList; +import java.io.IOException; +import java.util.List; +import junit.framework.TestCase; +import org.apache.beam.sdk.io.BoundedSource; +import org.apache.beam.sdk.io.BoundedSource.BoundedReader; +import org.apache.beam.sdk.options.ValueProvider; +import org.apache.beam.sdk.options.ValueProvider.StaticValueProvider; +import org.apache.hadoop.hbase.HConstants; +import org.apache.hadoop.hbase.io.ImmutableBytesWritable; +import org.junit.Before; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.JUnit4; + +@RunWith(JUnit4.class) +public class HadoopHashTableSourceTest extends TestCase { + + HadoopHashTableSource source; + FakeTableHashWrapper fakeTableHashWrapper; + + private static final ValueProvider PROJECT_ID = StaticValueProvider.of("test-project"); + private static final ValueProvider HASH_TABLE_OUTPUT_PATH_DIR = + StaticValueProvider.of("gs://my-bucket/outputDir"); + private static final ImmutableBytesWritable START_ROW = + new ImmutableBytesWritable("a".getBytes()); + private static final ImmutableBytesWritable STOP_ROW = new ImmutableBytesWritable("z".getBytes()); + private static final ImmutableBytesWritable PARTITION1 = + new ImmutableBytesWritable("d".getBytes()); + private static final ImmutableBytesWritable PARTITION2 = + new ImmutableBytesWritable("g".getBytes()); + private static final ImmutableBytesWritable EMPTY_ROW_KEY = + new ImmutableBytesWritable(HConstants.EMPTY_BYTE_ARRAY); + + @Before + public void setUp() throws Exception { + super.setUp(); + fakeTableHashWrapper = new FakeTableHashWrapper(); + } + + private List> getSplitSources( + List partitions, + ImmutableBytesWritable startRow, + ImmutableBytesWritable stopRow) + throws IOException 
{ + fakeTableHashWrapper.startRowInclusive = startRow; + fakeTableHashWrapper.stopRowExclusive = stopRow; + fakeTableHashWrapper.partitions = partitions; + + source = + new HadoopHashTableSource( + PROJECT_ID, + HASH_TABLE_OUTPUT_PATH_DIR, + startRow, + stopRow, + new FakeTableHashWrapperFactory(fakeTableHashWrapper)); + return (List>) source.split(0, null); + } + + private void testSourceSplits( + List partitions, + ImmutableBytesWritable startRow, + ImmutableBytesWritable stopRow, + List> expectedSources) + throws IOException { + assertEquals(expectedSources, getSplitSources(partitions, startRow, stopRow)); + } + + @Test + public void testSplitZeroPartitions() throws IOException { + // Row range [a-z) with no splits. + List> expected = + ImmutableList.of( + new HadoopHashTableSource(PROJECT_ID, HASH_TABLE_OUTPUT_PATH_DIR, START_ROW, STOP_ROW)); + testSourceSplits(ImmutableList.of(), START_ROW, STOP_ROW, expected); + } + + @Test + public void testSplitOnePartition() throws IOException { + // Row range [a-z) with 1 splits. + List> expected = + ImmutableList.of( + new HadoopHashTableSource( + PROJECT_ID, HASH_TABLE_OUTPUT_PATH_DIR, START_ROW, PARTITION1), + new HadoopHashTableSource( + PROJECT_ID, HASH_TABLE_OUTPUT_PATH_DIR, PARTITION1, STOP_ROW)); + testSourceSplits(ImmutableList.of(PARTITION1), START_ROW, STOP_ROW, expected); + } + + @Test + public void testMultiplePartitons() throws IOException { + // Row range [a-z) with splits on {d,g}. The data files will be for {[a,d), [d,g), [g,z)}. 
+ List> expected = + ImmutableList.of( + new HadoopHashTableSource( + PROJECT_ID, HASH_TABLE_OUTPUT_PATH_DIR, START_ROW, PARTITION1), + new HadoopHashTableSource( + PROJECT_ID, HASH_TABLE_OUTPUT_PATH_DIR, PARTITION1, PARTITION2), + new HadoopHashTableSource( + PROJECT_ID, HASH_TABLE_OUTPUT_PATH_DIR, PARTITION2, STOP_ROW)); + testSourceSplits(ImmutableList.of(PARTITION1, PARTITION2), START_ROW, STOP_ROW, expected); + } + + @Test + public void testSplitEmptyStartRow() throws IOException { + // Row range [""-z) with splits on {d,g}. The data files will be for {["",d), [d,g), [g,z)}. + List> expected = + ImmutableList.of( + new HadoopHashTableSource( + PROJECT_ID, HASH_TABLE_OUTPUT_PATH_DIR, EMPTY_ROW_KEY, PARTITION1), + new HadoopHashTableSource( + PROJECT_ID, HASH_TABLE_OUTPUT_PATH_DIR, PARTITION1, PARTITION2), + new HadoopHashTableSource( + PROJECT_ID, HASH_TABLE_OUTPUT_PATH_DIR, PARTITION2, STOP_ROW)); + testSourceSplits(ImmutableList.of(PARTITION1, PARTITION2), EMPTY_ROW_KEY, STOP_ROW, expected); + } + + @Test + public void testSplitEmptyStopRow() throws IOException { + // Row range [a-"") with splits on {d,g}. The data files will be for {[a,d), [d,g), [g,"")}. + List> expected = + ImmutableList.of( + new HadoopHashTableSource( + PROJECT_ID, HASH_TABLE_OUTPUT_PATH_DIR, START_ROW, PARTITION1), + new HadoopHashTableSource( + PROJECT_ID, HASH_TABLE_OUTPUT_PATH_DIR, PARTITION1, PARTITION2), + new HadoopHashTableSource( + PROJECT_ID, HASH_TABLE_OUTPUT_PATH_DIR, PARTITION2, EMPTY_ROW_KEY)); + testSourceSplits(ImmutableList.of(PARTITION1, PARTITION2), START_ROW, EMPTY_ROW_KEY, expected); + } + + @Test + public void testSplitFullTableScan() throws IOException { + // Row range [""-"") with splits on {d,g}. The data files will be for {["",d), [d,g), [g,"")}. 
+ List> expected = + ImmutableList.of( + new HadoopHashTableSource( + PROJECT_ID, HASH_TABLE_OUTPUT_PATH_DIR, EMPTY_ROW_KEY, PARTITION1), + new HadoopHashTableSource( + PROJECT_ID, HASH_TABLE_OUTPUT_PATH_DIR, PARTITION1, PARTITION2), + new HadoopHashTableSource( + PROJECT_ID, HASH_TABLE_OUTPUT_PATH_DIR, PARTITION2, EMPTY_ROW_KEY)); + testSourceSplits( + ImmutableList.of(PARTITION1, PARTITION2), EMPTY_ROW_KEY, EMPTY_ROW_KEY, expected); + } + + @Test + public void testCreateReaderWithoutSplit() throws IOException { + source = + new HadoopHashTableSource( + PROJECT_ID, + HASH_TABLE_OUTPUT_PATH_DIR, + // When split is not called, start/stop are uninitialized. Start/stop are runtime params + // and are initialized in split/createReader. + null, + null, + new FakeTableHashWrapperFactory(fakeTableHashWrapper)); + // Setup boundaries on the TableHashWrapper to be used in Source. + fakeTableHashWrapper.startRowInclusive = START_ROW; + fakeTableHashWrapper.stopRowExclusive = STOP_ROW; + + // Create a new Reader + BoundedReader reader = source.createReader(null); + + // Validate that the reader was properly created. + assertEquals(HashBasedReader.class, reader.getClass()); + assertEquals(source, reader.getCurrentSource()); + HashBasedReader hashBasedReader = (HashBasedReader) reader; + assertEquals(START_ROW, hashBasedReader.startRowInclusive); + assertEquals(STOP_ROW, hashBasedReader.stopRowExclusive); + } + + @Test + public void testCreateReaderAfterSplit() throws IOException { + // Single partitions will return a 2 sources. + List> splitSources = + getSplitSources(ImmutableList.of(PARTITION1), START_ROW, STOP_ROW); + BoundedSource splitHashSource = splitSources.get(0); + + // Create a new Reader + BoundedReader reader = splitHashSource.createReader(null); + + // Validate that the reader was properly created. 
+ assertEquals(HashBasedReader.class, reader.getClass()); + assertEquals(splitHashSource, reader.getCurrentSource()); + HashBasedReader hashBasedReader = (HashBasedReader) reader; + assertEquals(START_ROW, hashBasedReader.startRowInclusive); + assertEquals(PARTITION1, hashBasedReader.stopRowExclusive); + } +} diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/validation/HashBasedSourceSerializationTest.java b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/validation/HashBasedSourceSerializationTest.java new file mode 100644 index 0000000000..f58becf3cb --- /dev/null +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/validation/HashBasedSourceSerializationTest.java @@ -0,0 +1,122 @@ +/* + * Copyright 2021 Google Inc. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package com.google.cloud.bigtable.beam.validation; + +import static com.google.common.truth.Truth.assertWithMessage; + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.io.ObjectInputStream; +import java.io.ObjectOutputStream; +import java.lang.reflect.Field; +import java.lang.reflect.Modifier; +import junit.framework.TestCase; +import org.apache.beam.sdk.options.ValueProvider.StaticValueProvider; +import org.apache.hadoop.hbase.io.ImmutableBytesWritable; +import org.junit.Before; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.JUnit4; + +@RunWith(JUnit4.class) +public class HashBasedSourceSerializationTest extends TestCase { + + public static final String SOURCE_HASH_DIR = "gs://my-bucket/outputDir"; + public static final String PROJECT_ID = "test-project"; + private static final ImmutableBytesWritable START_ROW = + new ImmutableBytesWritable("a".getBytes()); + private static final ImmutableBytesWritable STOP_ROW = new ImmutableBytesWritable("y".getBytes()); + + @Before + public void setUp() throws Exception { + super.setUp(); + } + + @Test + public void testSerializeWithValueProviders() throws IOException { + checkSerialization( + new HadoopHashTableSource( + StaticValueProvider.of(PROJECT_ID), StaticValueProvider.of(SOURCE_HASH_DIR))); + } + + @Test + public void testSerializeWithStartStop() throws IOException { + checkSerialization( + new HadoopHashTableSource( + StaticValueProvider.of(PROJECT_ID), + StaticValueProvider.of(SOURCE_HASH_DIR), + new ImmutableBytesWritable(START_ROW), + new ImmutableBytesWritable(STOP_ROW))); + } + + @Test + public void testBufferedSourceSerialize() { + checkSerialization( + new BufferedHadoopHashTableSource( + new HadoopHashTableSource( + StaticValueProvider.of(PROJECT_ID), StaticValueProvider.of(SOURCE_HASH_DIR)))); + } + + @Test + public void testBufferedSourceSerializeWithBatchSize() { + checkSerialization( + new 
+ BufferedHadoopHashTableSource( + new HadoopHashTableSource( + StaticValueProvider.of(PROJECT_ID), StaticValueProvider.of(SOURCE_HASH_DIR)), + 5)); + } + + private static void checkSerialization(Object source) { + try { + Object deserialized = serializeDeserialize(source); + checkClassDeclaresSerialVersionUid(source.getClass()); + assertEquals(source, deserialized); + } catch (IOException | ClassNotFoundException e) { + fail(e.toString()); + } + } + + private static void checkClassDeclaresSerialVersionUid(Class cls) { + String uid = "serialVersionUID"; + for (Field field : cls.getDeclaredFields()) { + if (uid.equals(field.getName())) { // compare by value; == only tests reference identity + int modifiers = field.getModifiers(); + assertWithMessage(field + " is not static").that(Modifier.isStatic(modifiers)).isTrue(); + assertWithMessage(field + " is not final").that(Modifier.isFinal(modifiers)).isTrue(); + assertWithMessage(field + " is not private").that(Modifier.isPrivate(modifiers)).isTrue(); + assertWithMessage(field + " must be long") + .that(field.getType().getSimpleName()) + .isEqualTo("long"); + return; + } + } + fail(cls + " does not declare serialVersionUID"); + } + + private static Object serializeDeserialize(Object obj) + throws IOException, ClassNotFoundException { + ByteArrayOutputStream bos = new ByteArrayOutputStream(); + try (ObjectOutputStream outStream = new ObjectOutputStream(bos)) { + outStream.writeObject(obj); + } + + ByteArrayInputStream bis = new ByteArrayInputStream(bos.toByteArray()); + try (ObjectInputStream inStream = new ObjectInputStream(bis)) { + return inStream.readObject(); + } + } +} diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/validation/RangeHashCoderTest.java b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/validation/RangeHashCoderTest.java new file mode 100644 index 0000000000..5f644e3b50 --- /dev/null +++
b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/validation/RangeHashCoderTest.java @@ -0,0 +1,51 @@ +/* + * Copyright 2021 Google Inc. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.google.cloud.bigtable.beam.validation; + +import com.google.cloud.bigtable.beam.validation.HadoopHashTableSource.RangeHash; +import org.apache.beam.sdk.coders.CoderException; +import org.apache.beam.sdk.testing.CoderProperties; +import org.apache.beam.sdk.util.CoderUtils; +import org.apache.beam.sdk.values.TypeDescriptor; +import org.apache.hadoop.hbase.HConstants; +import org.apache.hadoop.hbase.io.ImmutableBytesWritable; +import org.junit.Assert; +import org.junit.Test; + +public class RangeHashCoderTest { + private static final RangeHashCoder TEST_CODER = new RangeHashCoder(); + private static final ImmutableBytesWritable START = + new ImmutableBytesWritable("Start".getBytes()); + private static final ImmutableBytesWritable STOP = new ImmutableBytesWritable("Stop".getBytes()); + private static final ImmutableBytesWritable HASH = new ImmutableBytesWritable("hash".getBytes()); + private static final ImmutableBytesWritable EMPTY = + new ImmutableBytesWritable(HConstants.EMPTY_BYTE_ARRAY); + + @Test + public void encodeRangeHash() throws Exception { + CoderProperties.coderDecodeEncodeEqual(TEST_CODER, RangeHash.of(START, STOP, HASH)); + } + + @Test(expected = CoderException.class) + public void 
+ encodeNullThrowsCoderException() throws Exception { + CoderUtils.encodeToByteArray(TEST_CODER, null); + } + + @Test + public void testEncodedTypeDescriptor() throws Exception { + Assert.assertEquals(TEST_CODER.getEncodedTypeDescriptor(), TypeDescriptor.of(RangeHash.class)); + } +} diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/resources/README.md b/bigtable-dataflow-parent/bigtable-beam-import/src/test/resources/README.md new file mode 100644 index 0000000000..3d9b722bb9 --- /dev/null +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/test/resources/README.md @@ -0,0 +1,18 @@ +# Generating the test HBase snapshot for HBase snapshot import integration tests + +The file `generate_test_data.txt` is an HBase command line command sequence +used to generate the test HBase snapshot data. + +If you need to modify the test data used by `bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/hbasesnapshots/EndToEndIT.java`, +please make sure you have HBase installed and export `/bin` to your PATH.
+ +Then: + + $ hbase shell ./generate_test_data.txt + $ hbase org.apache.hadoop.hbase.snapshot.ExportSnapshot -Dmapreduce.framework.name=local -snapshot test-snapshot -copy-to file:////data + + $ cd + $ gsutil -m cp -r ./data/ gs:///integration-test/ + +After this, you should be able to run the integration test with your new data by specifying +`-Dcloud.test.data.folder=gs:///integration-test/` \ No newline at end of file diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/resources/log4j.properties b/bigtable-dataflow-parent/bigtable-beam-import/src/test/resources/log4j.properties index 7f9118c7bc..c609eb001a 100644 --- a/bigtable-dataflow-parent/bigtable-beam-import/src/test/resources/log4j.properties +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/test/resources/log4j.properties @@ -22,3 +22,7 @@ log4j.appender.stdout.layout=org.apache.log4j.PatternLayout log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n log4j.category.org.apache.beam.sdk.io.FileBasedSource=WARN log4j.category.com.google.cloud.bigtable.beam.sequencefiles.SequenceFileSource=WARN +# make hbase snapshot import integration tests output less verbose. +log4j.category.org.apache.hadoop=WARN +log4j.category.org.apache.beam.runners.dataflow.util.MonitoringUtil=WARN +log4j.category.org.apache.beam.runners.dataflow.util.MonitoringUtil.LoggingHandler=WARN \ No newline at end of file diff --git a/bigtable-hbase-1.x-parent/bigtable-hbase-1.x-mapreduce/pom.xml b/bigtable-hbase-1.x-parent/bigtable-hbase-1.x-mapreduce/pom.xml index ef0b866ec6..7cf1993350 100644 --- a/bigtable-hbase-1.x-parent/bigtable-hbase-1.x-mapreduce/pom.xml +++ b/bigtable-hbase-1.x-parent/bigtable-hbase-1.x-mapreduce/pom.xml @@ -39,6 +39,16 @@ limitations under the License.
provided + + + + org.apache.hadoop + hadoop-common + ${hadoop.version} + ${hadoop.scope} + + + ${project.groupId} diff --git a/pom.xml b/pom.xml index 215a7bc69e..9e6d60dc08 100644 --- a/pom.xml +++ b/pom.xml @@ -81,6 +81,7 @@ limitations under the License. 30.0-android 20.0 1.7 + 29.0-jre 1.29.0 @@ -165,7 +166,7 @@ limitations under the License. org.apache.maven.plugins maven-shade-plugin - 3.2.2 + 3.2.4 org.apache.maven.plugins @@ -175,7 +176,7 @@ limitations under the License. org.apache.maven.plugins maven-javadoc-plugin - 3.1.1 + 3.2.0 none