From 28547a28a916d413ac1e5c7712111926e8d56b9e Mon Sep 17 00:00:00 2001 From: shitanshu verma Date: Tue, 2 Feb 2021 13:15:00 -0500 Subject: [PATCH 1/8] feat: add a new pipeline to validate data imported into cloud bigtable from HBase. --- .../bigtable-beam-import/pom.xml | 18 + .../com/google/cloud/bigtable/beam/Main.java | 4 + .../cloud/bigtable/beam/TemplateUtils.java | 14 + .../BufferedHadoopHashTableSource.java | 211 ++++++++ ...omputeAndValidateHashFromBigtableDoFn.java | 232 +++++++++ .../validation/HadoopHashTableSource.java | 464 ++++++++++++++++++ .../beam/validation/RangeHashCoder.java | 105 ++++ .../beam/validation/SyncTableJob.java | 199 ++++++++ .../beam/validation/SyncTableUtils.java | 55 +++ .../beam/validation/TableHashWrapper.java | 55 +++ .../validation/TableHashWrapperFactory.java | 33 ++ .../beam/validation/TableHashWrapperImpl.java | 119 +++++ .../mapreduce/BigtableTableHashAccessor.java | 77 +++ .../test-snapshot/..snapshotinfo.crc | Bin 12 -> 0 bytes .../test-snapshot/.data.manifest.crc | Bin 20 -> 0 bytes .../test-snapshot/.snapshotinfo | 2 - .../cf/.b0f68aca966b48f1b171614e582b1cbb.crc | Bin 52 -> 0 bytes .../cf/.8aff180e3a244dcc807e4de8b6fce0a7.crc | Bin 52 -> 0 bytes .../cf/.c2945aa8dac34922913a1f60fedb6154.crc | Bin 52 -> 0 bytes .../cf/.cda93ca899f3475fb1c0f8989a8f0d18.crc | Bin 52 -> 0 bytes .../cf/.d8b49b374391407ba35d5e0db1c835c9.crc | Bin 52 -> 0 bytes .../cf/.32053565831341128b8d8f5567d48fdc.crc | Bin 52 -> 0 bytes .../cf/.36798a163ed046b193818e21dd7516b4.crc | Bin 52 -> 0 bytes .../cf/.65b9c6860f5f4de39d61d1674947b030.crc | Bin 52 -> 0 bytes .../cf/.b83044f76ba6474aa829e3bae7fd82d1.crc | Bin 52 -> 0 bytes .../src/test/generate_test_data.txt | 226 +++++---- .../test-snapshot/.snapshotinfo | 2 + .../test-snapshot/data.manifest | Bin 1090 -> 1090 bytes .../cf/0ad53893d268478f9b2484cbb6016d9b} | Bin 5264 -> 5264 bytes .../cf/9926df0da08b4f51a33517afb040f82d} | Bin 5264 -> 5264 bytes .../cf/966e85699fdd4680a8c6fbf4b41b6e4b} | Bin 5264 
-> 5264 bytes .../cf/bab07e8089634e629a4c111ea2b415fe} | Bin 5264 -> 5264 bytes .../cf/7fef5694213b4be0ad79f79c45200c2d} | Bin 5264 -> 5264 bytes .../cf/f8d40658d79b4a7191f21bcf14ae289b} | Bin 5264 -> 5264 bytes .../cf/afe596ef5c61440983da2dcb54d581ab} | Bin 5264 -> 5264 bytes .../cf/2c766f1fc8eb460dbfa9a3803138c9b2} | Bin 5264 -> 5264 bytes .../cf/e59edc08de6d441689288f04c7c0fe85} | Bin 5299 -> 5299 bytes .../hashtable/hashes/_SUCCESS | 0 .../hashtable/hashes/part-r-00000/data | Bin 0 -> 158 bytes .../hashtable/hashes/part-r-00000/index | Bin 0 -> 220 bytes .../hashtable/hashes/part-r-00001/data | Bin 0 -> 534 bytes .../hashtable/hashes/part-r-00001/index | Bin 0 -> 221 bytes .../hashtable/hashes/part-r-00002/data | Bin 0 -> 499 bytes .../hashtable/hashes/part-r-00002/index | Bin 0 -> 221 bytes .../hashtable/hashes/part-r-00003/data | Bin 0 -> 499 bytes .../hashtable/hashes/part-r-00003/index | Bin 0 -> 221 bytes .../hashtable/hashes/part-r-00004/data | Bin 0 -> 499 bytes .../hashtable/hashes/part-r-00004/index | Bin 0 -> 221 bytes .../hashtable/hashes/part-r-00005/data | Bin 0 -> 499 bytes .../hashtable/hashes/part-r-00005/index | Bin 0 -> 221 bytes .../hashtable/hashes/part-r-00006/data | Bin 0 -> 499 bytes .../hashtable/hashes/part-r-00006/index | Bin 0 -> 221 bytes .../hashtable/hashes/part-r-00007/data | Bin 0 -> 499 bytes .../hashtable/hashes/part-r-00007/index | Bin 0 -> 221 bytes .../hashtable/hashes/part-r-00008/data | Bin 0 -> 499 bytes .../hashtable/hashes/part-r-00008/index | Bin 0 -> 221 bytes .../hashtable/hashes/part-r-00009/data | Bin 0 -> 499 bytes .../hashtable/hashes/part-r-00009/index | Bin 0 -> 221 bytes .../test/integration-test/hashtable/manifest | 4 + .../integration-test/hashtable/partitions | Bin 0 -> 342 bytes .../beam/hbasesnapshots/EndToEndIT.java | 181 ++++++- .../BufferedHadoopHashTableSourceTest.java | 162 ++++++ ...teAndValidateHashFromBigtableDoFnTest.java | 444 +++++++++++++++++ .../beam/validation/FakeTableHashWrapper.java | 153 
++++++ .../FakeTableHashWrapperFactory.java | 32 ++ .../validation/HadoopHashBasedReaderTest.java | 181 +++++++ .../validation/HadoopHashTableSourceTest.java | 209 ++++++++ .../HashBasedSourceSerializationTest.java | 127 +++++ .../beam/validation/RangeHashCoderTest.java | 51 ++ 69 files changed, 3235 insertions(+), 125 deletions(-) create mode 100644 bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/BufferedHadoopHashTableSource.java create mode 100644 bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/ComputeAndValidateHashFromBigtableDoFn.java create mode 100644 bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/HadoopHashTableSource.java create mode 100644 bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/RangeHashCoder.java create mode 100644 bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/SyncTableJob.java create mode 100644 bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/SyncTableUtils.java create mode 100644 bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/TableHashWrapper.java create mode 100644 bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/TableHashWrapperFactory.java create mode 100644 bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/TableHashWrapperImpl.java create mode 100644 bigtable-dataflow-parent/bigtable-beam-import/src/main/java/org/apache/hadoop/hbase/mapreduce/BigtableTableHashAccessor.java delete mode 100644 bigtable-dataflow-parent/bigtable-beam-import/src/test/data/.hbase-snapshot/test-snapshot/..snapshotinfo.crc delete mode 100644 
bigtable-dataflow-parent/bigtable-beam-import/src/test/data/.hbase-snapshot/test-snapshot/.data.manifest.crc delete mode 100644 bigtable-dataflow-parent/bigtable-beam-import/src/test/data/.hbase-snapshot/test-snapshot/.snapshotinfo delete mode 100644 bigtable-dataflow-parent/bigtable-beam-import/src/test/data/archive/data/default/test/01ef4b8bb8d79f360bf182fedfb1c0e8/cf/.b0f68aca966b48f1b171614e582b1cbb.crc delete mode 100644 bigtable-dataflow-parent/bigtable-beam-import/src/test/data/archive/data/default/test/1a1358ba82be4a98feff54032986bbf2/cf/.8aff180e3a244dcc807e4de8b6fce0a7.crc delete mode 100644 bigtable-dataflow-parent/bigtable-beam-import/src/test/data/archive/data/default/test/1bf20ce0551df953331936d20dbd18fa/cf/.c2945aa8dac34922913a1f60fedb6154.crc delete mode 100644 bigtable-dataflow-parent/bigtable-beam-import/src/test/data/archive/data/default/test/2c25a1cedf575cd08267e0013e45872e/cf/.cda93ca899f3475fb1c0f8989a8f0d18.crc delete mode 100644 bigtable-dataflow-parent/bigtable-beam-import/src/test/data/archive/data/default/test/3264826a5972b18c5a59b2f612678316/cf/.d8b49b374391407ba35d5e0db1c835c9.crc delete mode 100644 bigtable-dataflow-parent/bigtable-beam-import/src/test/data/archive/data/default/test/3d397f3b97e7fd2358fb5c93060b3a60/cf/.32053565831341128b8d8f5567d48fdc.crc delete mode 100644 bigtable-dataflow-parent/bigtable-beam-import/src/test/data/archive/data/default/test/7466202f701dc0e3af8cc747c9a37ec8/cf/.36798a163ed046b193818e21dd7516b4.crc delete mode 100644 bigtable-dataflow-parent/bigtable-beam-import/src/test/data/archive/data/default/test/958c660f0e406404ffdfc81110e7eaf9/cf/.65b9c6860f5f4de39d61d1674947b030.crc delete mode 100644 bigtable-dataflow-parent/bigtable-beam-import/src/test/data/archive/data/default/test/dab1d611586e861818af77de74073d47/cf/.b83044f76ba6474aa829e3bae7fd82d1.crc create mode 100644 bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/data/.hbase-snapshot/test-snapshot/.snapshotinfo rename 
bigtable-dataflow-parent/bigtable-beam-import/src/test/{ => integration-test}/data/.hbase-snapshot/test-snapshot/data.manifest (55%) rename bigtable-dataflow-parent/bigtable-beam-import/src/test/{data/archive/data/default/test/3d397f3b97e7fd2358fb5c93060b3a60/cf/32053565831341128b8d8f5567d48fdc => integration-test/data/archive/data/default/test/01340515889e8ec5014bbdbfa4fd4689/cf/0ad53893d268478f9b2484cbb6016d9b} (86%) rename bigtable-dataflow-parent/bigtable-beam-import/src/test/{data/archive/data/default/test/1a1358ba82be4a98feff54032986bbf2/cf/8aff180e3a244dcc807e4de8b6fce0a7 => integration-test/data/archive/data/default/test/156b320f3ebe472a1ae56a2f6930a676/cf/9926df0da08b4f51a33517afb040f82d} (87%) rename bigtable-dataflow-parent/bigtable-beam-import/src/test/{data/archive/data/default/test/7466202f701dc0e3af8cc747c9a37ec8/cf/36798a163ed046b193818e21dd7516b4 => integration-test/data/archive/data/default/test/313460ce1b714784d36c64bcd01f9e2c/cf/966e85699fdd4680a8c6fbf4b41b6e4b} (87%) rename bigtable-dataflow-parent/bigtable-beam-import/src/test/{data/archive/data/default/test/1bf20ce0551df953331936d20dbd18fa/cf/c2945aa8dac34922913a1f60fedb6154 => integration-test/data/archive/data/default/test/3bfc13b0a9bf8148a91788a8d2b60117/cf/bab07e8089634e629a4c111ea2b415fe} (87%) rename bigtable-dataflow-parent/bigtable-beam-import/src/test/{data/archive/data/default/test/958c660f0e406404ffdfc81110e7eaf9/cf/65b9c6860f5f4de39d61d1674947b030 => integration-test/data/archive/data/default/test/5bc31088b2daee7903f5b3d3a52f7ebf/cf/7fef5694213b4be0ad79f79c45200c2d} (87%) rename bigtable-dataflow-parent/bigtable-beam-import/src/test/{data/archive/data/default/test/01ef4b8bb8d79f360bf182fedfb1c0e8/cf/b0f68aca966b48f1b171614e582b1cbb => integration-test/data/archive/data/default/test/7c4a9137853573c8d671264dc0b31f89/cf/f8d40658d79b4a7191f21bcf14ae289b} (87%) rename 
bigtable-dataflow-parent/bigtable-beam-import/src/test/{data/archive/data/default/test/2c25a1cedf575cd08267e0013e45872e/cf/cda93ca899f3475fb1c0f8989a8f0d18 => integration-test/data/archive/data/default/test/818d6b145a50cfc3bf8ee865486fdda3/cf/afe596ef5c61440983da2dcb54d581ab} (87%) rename bigtable-dataflow-parent/bigtable-beam-import/src/test/{data/archive/data/default/test/dab1d611586e861818af77de74073d47/cf/b83044f76ba6474aa829e3bae7fd82d1 => integration-test/data/archive/data/default/test/8c2101799fadc18613082a495d11e4ea/cf/2c766f1fc8eb460dbfa9a3803138c9b2} (87%) rename bigtable-dataflow-parent/bigtable-beam-import/src/test/{data/archive/data/default/test/3264826a5972b18c5a59b2f612678316/cf/d8b49b374391407ba35d5e0db1c835c9 => integration-test/data/archive/data/default/test/f1ef86b666a891d8c77f0eada4d1a15c/cf/e59edc08de6d441689288f04c7c0fe85} (86%) create mode 100644 bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/_SUCCESS create mode 100644 bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00000/data create mode 100644 bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00000/index create mode 100644 bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00001/data create mode 100644 bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00001/index create mode 100644 bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00002/data create mode 100644 bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00002/index create mode 100644 bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00003/data create mode 100644 bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00003/index create 
mode 100644 bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00004/data create mode 100644 bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00004/index create mode 100644 bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00005/data create mode 100644 bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00005/index create mode 100644 bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00006/data create mode 100644 bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00006/index create mode 100644 bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00007/data create mode 100644 bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00007/index create mode 100644 bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00008/data create mode 100644 bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00008/index create mode 100644 bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00009/data create mode 100644 bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00009/index create mode 100644 bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/manifest create mode 100644 bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/partitions create mode 100644 bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/validation/BufferedHadoopHashTableSourceTest.java create mode 100644 
bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/validation/ComputeAndValidateHashFromBigtableDoFnTest.java create mode 100644 bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/validation/FakeTableHashWrapper.java create mode 100644 bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/validation/FakeTableHashWrapperFactory.java create mode 100644 bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/validation/HadoopHashBasedReaderTest.java create mode 100644 bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/validation/HadoopHashTableSourceTest.java create mode 100644 bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/validation/HashBasedSourceSerializationTest.java create mode 100644 bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/validation/RangeHashCoderTest.java diff --git a/bigtable-dataflow-parent/bigtable-beam-import/pom.xml b/bigtable-dataflow-parent/bigtable-beam-import/pom.xml index 218dc06db8..8ee5ba861b 100644 --- a/bigtable-dataflow-parent/bigtable-beam-import/pom.xml +++ b/bigtable-dataflow-parent/bigtable-beam-import/pom.xml @@ -26,6 +26,7 @@ limitations under the License. com.google.cloud.bigtable.beam.Main + false @@ -217,6 +218,23 @@ limitations under the License. 
${hbase.version} test + + com.google.truth + truth + 1.0.1 + test + + + com.google.auto.service + auto-service-annotations + 1.0-rc7 + + + com.google.cloud + google-cloud-bigtable-emulator + 0.124.0 + test + diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/Main.java b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/Main.java index b346b90837..1f52f5125a 100644 --- a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/Main.java +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/Main.java @@ -21,6 +21,7 @@ import com.google.cloud.bigtable.beam.sequencefiles.CreateTableHelper; import com.google.cloud.bigtable.beam.sequencefiles.ExportJob; import com.google.cloud.bigtable.beam.sequencefiles.ImportJob; +import com.google.cloud.bigtable.beam.validation.SyncTableJob; import java.io.File; import java.net.URISyntaxException; import java.util.Arrays; @@ -53,6 +54,9 @@ public static void main(String[] args) throws Exception { case "create-table": CreateTableHelper.main(subArgs); break; + case "sync-table": + SyncTableJob.main(subArgs); + break; default: usage(); System.exit(1); diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/TemplateUtils.java b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/TemplateUtils.java index e64507317b..f839a50b23 100644 --- a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/TemplateUtils.java +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/TemplateUtils.java @@ -26,6 +26,7 @@ import com.google.bigtable.repackaged.com.google.cloud.bigtable.data.v2.models.Query; import com.google.cloud.bigtable.beam.sequencefiles.ExportJob.ExportOptions; import 
com.google.cloud.bigtable.beam.sequencefiles.ImportJob.ImportOptions; +import com.google.cloud.bigtable.beam.validation.SyncTableJob.SyncTableOptions; import com.google.cloud.bigtable.hbase.BigtableOptionsFactory; import com.google.cloud.bigtable.hbase.adapters.Adapters; import com.google.cloud.bigtable.hbase.adapters.read.DefaultReadHooks; @@ -72,6 +73,19 @@ public static CloudBigtableTableConfiguration BuildImportConfig(ImportOptions op return builder.build(); } + /** Builds CloudBigtableTableConfiguration from input runtime parameters for the sync-table job. */ + public static CloudBigtableTableConfiguration BuildSyncTableConfig(SyncTableOptions opts) { + CloudBigtableTableConfiguration.Builder builder = + new CloudBigtableTableConfiguration.Builder() + .withProjectId(opts.getBigtableProject()) + .withInstanceId(opts.getBigtableInstanceId()) + .withTableId(opts.getBigtableTableId()); + if (opts.getBigtableAppProfileId() != null) { + builder.withAppProfileId(opts.getBigtableAppProfileId()); + } + return builder.build(); + } + /** Provides a request that is constructed with some attributes. */ private static class RequestValueProvider implements ValueProvider, Serializable { diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/BufferedHadoopHashTableSource.java b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/BufferedHadoopHashTableSource.java new file mode 100644 index 0000000000..eb018832ce --- /dev/null +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/BufferedHadoopHashTableSource.java @@ -0,0 +1,211 @@ +/* + * Copyright 2020 Google Inc. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.google.cloud.bigtable.beam.validation; + +import static com.google.cloud.bigtable.beam.validation.SyncTableUtils.immutableBytesToString; + +import com.google.api.core.InternalApi; +import com.google.cloud.bigtable.beam.validation.HadoopHashTableSource.RangeHash; +import com.google.common.base.Objects; +import com.google.common.base.Preconditions; +import java.io.IOException; +import java.io.ObjectInputStream; +import java.io.ObjectOutputStream; +import java.util.ArrayList; +import java.util.List; +import org.apache.beam.sdk.coders.Coder; +import org.apache.beam.sdk.coders.KvCoder; +import org.apache.beam.sdk.coders.ListCoder; +import org.apache.beam.sdk.coders.StringUtf8Coder; +import org.apache.beam.sdk.io.BoundedSource; +import org.apache.beam.sdk.options.PipelineOptions; +import org.apache.beam.sdk.values.KV; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.hbase.util.Bytes; + +/** + * Buffers the RangeHashes generated by {@link HadoopHashTableSource}. This is an optimization that + * allows {@link ComputeAndValidateHashFromBigtableDoFn} to issue fewer ReadRow APIs with larger row + * ranges. + * + *

Hadoop HashTable output is sorted by row-key and contains a row-range and hash. Beam + * Pcollection do not guarantee any ordering. To fetch a batch of ranges in 1 ReadRows operation, + * this source buffers then and outputs a List guaranteeing the sorted order of ranges. + */ +@InternalApi +class BufferedHadoopHashTableSource extends BoundedSource>> { + + private static final long serialVersionUID = 39842743L; + + public static final Log LOG = LogFactory.getLog(BufferedHadoopHashTableSource.class); + private static final int DEFAULT_BATCH_SIZE = 50; + + // Max number of RangeHashes to buffer. + private int maxBufferSize; + private HadoopHashTableSource hashTableSource; + private Coder>> coder; + + public BufferedHadoopHashTableSource(HadoopHashTableSource source) { + this(source, DEFAULT_BATCH_SIZE); + } + + public BufferedHadoopHashTableSource(HadoopHashTableSource hashTableSource, int maxBufferSize) { + this.hashTableSource = hashTableSource; + this.coder = KvCoder.of(StringUtf8Coder.of(), ListCoder.of(RangeHashCoder.of())); + this.maxBufferSize = maxBufferSize; + } + + @Override + public List>>> split( + long desiredBundleSizeBytes, PipelineOptions options) throws IOException { + + List splitHashTableSources = + (List) hashTableSource.split(desiredBundleSizeBytes, options); + + List splitSources = + new ArrayList<>(splitHashTableSources.size()); + // Keep the splits same as HashTableSource. + for (HadoopHashTableSource splitHashTableSource : splitHashTableSources) { + // Add the last range for [lastPartition, stopRow). + splitSources.add(new BufferedHadoopHashTableSource(splitHashTableSource)); + } + return splitSources; + } + + @Override + public Coder>> getOutputCoder() { + return coder; + } + + @Override + public long getEstimatedSizeBytes(PipelineOptions options) throws Exception { + // HashTable data files don't expose a method to estimate size or lineCount. 
+ return 0; + } + + @Override + public BoundedReader createReader(PipelineOptions options) throws IOException { + return new BufferedHashBasedReader(this, hashTableSource.createReader(options)); + } + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + if (!(o instanceof BufferedHadoopHashTableSource)) { + return false; + } + BufferedHadoopHashTableSource that = (BufferedHadoopHashTableSource) o; + return maxBufferSize == that.maxBufferSize && Objects.equal(hashTableSource, that.hashTableSource); + } + + @Override + public int hashCode() { + return Objects.hashCode(maxBufferSize, hashTableSource); + } + + @Override + public String toString() { + return "BufferedHadoopHashTableSource [" + + immutableBytesToString(hashTableSource.startRowInclusive) + + ", " + + immutableBytesToString(hashTableSource.stopRowExclusive) + + "), maxBufferSize=" + + maxBufferSize; + } + + private void writeObject(ObjectOutputStream s) throws IOException { + s.writeObject(hashTableSource); + s.writeInt(maxBufferSize); + } + + private void readObject(ObjectInputStream s) throws IOException, ClassNotFoundException { + this.hashTableSource = (HadoopHashTableSource) s.readObject(); + this.coder = KvCoder.of(StringUtf8Coder.of(), ListCoder.of(RangeHashCoder.of())); + this.maxBufferSize = s.readInt(); + } + + private static class BufferedHashBasedReader extends BoundedReader>> { + + private BoundedReader hashReader; + private BufferedHadoopHashTableSource source; + + private List buffer; + + public BufferedHashBasedReader( + BufferedHadoopHashTableSource source, BoundedReader hashReader) { + this.source = source; + this.hashReader = hashReader; + this.buffer = new ArrayList<>(source.maxBufferSize); + } + + @Override + public boolean start() throws IOException { + if (!hashReader.start()) { + // HashReader does not have any hashes, return empty reader. + return false; + } + // Start returned true, consume the current RangeHash. 
+ buffer.add(hashReader.getCurrent()); + bufferRangeHashes(); + // Buffer is not empty, return true to consume the current buffer. + return true; + } + + // Reads from hashReader and buffers the RangeHashes. + // Returns true if any RangeHashes were read from hashReader. + private boolean bufferRangeHashes() throws IOException { + boolean readRangeHashes = false; + while (buffer.size() < source.maxBufferSize && hashReader.advance()) { + readRangeHashes = true; + buffer.add(hashReader.getCurrent()); + } + return readRangeHashes; + } + + @Override + public boolean advance() throws IOException { + return bufferRangeHashes(); + } + + @Override + public KV> getCurrent() { + // getCurrent only gets called when buffer is not empty. + Preconditions.checkArgument(!buffer.isEmpty(), "Can not get current on empty buffer."); + List hashes = buffer; + // Reset the buffer for next batch. + buffer = new ArrayList<>(source.maxBufferSize); + // GroupBy key is a string and not ImmutableBytesWritable because the WritableCoder is not + // deterministic. The outputted PCollection is grouped by the K and needs a deterministic + // coder. Having a String K leads to an unfortunate double encoding, ImmutableBytesWritable-> + // HEX string -> UTF8 encoded string. The number of batches are significantly smaller than + // data fetched from Bigtable and should not have meaningful impact on the job performance. 
+ return KV.of(Bytes.toStringBinary(hashes.get(0).startInclusive.copyBytes()), hashes); + } + + @Override + public void close() throws IOException { + hashReader.close(); + } + + @Override + public BoundedSource>> getCurrentSource() { + return source; + } + } +} diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/ComputeAndValidateHashFromBigtableDoFn.java b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/ComputeAndValidateHashFromBigtableDoFn.java new file mode 100644 index 0000000000..3801465f2f --- /dev/null +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/ComputeAndValidateHashFromBigtableDoFn.java @@ -0,0 +1,232 @@ +/* + * Copyright 2020 Google Inc. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package com.google.cloud.bigtable.beam.validation; + +import static com.google.cloud.bigtable.beam.validation.SyncTableUtils.immutableBytesToString; + +import com.google.api.core.InternalApi; +import com.google.cloud.bigtable.beam.AbstractCloudBigtableTableDoFn; +import com.google.cloud.bigtable.beam.CloudBigtableConfiguration; +import com.google.cloud.bigtable.beam.TemplateUtils; +import com.google.cloud.bigtable.beam.validation.HadoopHashTableSource.RangeHash; +import com.google.cloud.bigtable.beam.validation.SyncTableJob.SyncTableOptions; +import com.google.common.annotations.VisibleForTesting; +import java.io.IOException; +import java.util.Iterator; +import java.util.List; +import org.apache.beam.sdk.metrics.Counter; +import org.apache.beam.sdk.metrics.Metrics; +import org.apache.beam.sdk.options.ValueProvider; +import org.apache.beam.sdk.transforms.DoFn; +import org.apache.beam.sdk.values.KV; +import org.apache.hadoop.hbase.HConstants; +import org.apache.hadoop.hbase.TableName; +import org.apache.hadoop.hbase.client.Result; +import org.apache.hadoop.hbase.client.ResultScanner; +import org.apache.hadoop.hbase.client.Scan; +import org.apache.hadoop.hbase.client.Table; +import org.apache.hadoop.hbase.io.ImmutableBytesWritable; +import org.apache.hadoop.hbase.mapreduce.BigtableTableHashAccessor.BigtableResultHasher; + +/** + * A {@link DoFn} that takes a row range and hash from HBase and validates the hash from rows read + * from Cloud Bigtable. + */ +@InternalApi +class ComputeAndValidateHashFromBigtableDoFn + extends AbstractCloudBigtableTableDoFn>>, RangeHash> { + + private static final long serialVersionUID = 2349094L; + private final ValueProvider tableName; + private final ValueProvider projectId; + private final ValueProvider sourceHashDir; + + private final TableHashWrapperFactory tableHashWrapperFactory; + + // Counter for reporting matching and mismatching ranges. Names are similar to HBase sync-table + // job. 
+ private final Counter matches = Metrics.counter("cbt-dataflow-validate", "ranges_matched"); + private final Counter mismatches = Metrics.counter("cbt-dataflow-validate", "ranges_not_matched"); + + public ComputeAndValidateHashFromBigtableDoFn(SyncTableOptions options) { + super(TemplateUtils.BuildSyncTableConfig(options)); + this.tableName = options.getBigtableTableId(); + // Create a local copy of ValueProviders, PipelineOptions are not serializable. + projectId = options.getBigtableProject(); + sourceHashDir = options.getHashTableOutputDir(); + tableHashWrapperFactory = new TableHashWrapperFactory(); + } + + @VisibleForTesting + ComputeAndValidateHashFromBigtableDoFn( + CloudBigtableConfiguration config, + ValueProvider tableName, + ValueProvider projectId, + ValueProvider sourceHashDir, + TableHashWrapperFactory factory) { + super(config); + this.tableName = tableName; + this.tableHashWrapperFactory = factory; + this.sourceHashDir = projectId; + this.projectId = sourceHashDir; + } + + @ProcessElement + public void processElement(ProcessContext context) throws Exception { + // BufferedHadoopHashTableSource generates only 1 item per groupby key, but iterate just in + // case. + for (List rangeHashes : context.element().getValue()) { + if (rangeHashes.isEmpty()) { + // No rows ranges found, return; + return; + } + + ImmutableBytesWritable rangeStartInclusive = rangeHashes.get(0).startInclusive; + ImmutableBytesWritable rangeEndExclusive = + rangeHashes.get(rangeHashes.size() - 1).stopExclusive; + + BigtableResultHasher resultHasher = new BigtableResultHasher(); + resultHasher.startBatch(rangeStartInclusive); + + // Since all the row-ranges are sorted in HashTable's data files, 1 big scan can be used + // to read all the row ranges. Parallelism is achieved by splitting the HashTable's data + // files into smaller bundle of row-ranges in GroupBy. 
+ ResultScanner scanner = + createBigtableScan(rangeStartInclusive.copyBytes(), rangeEndExclusive.copyBytes()); + + Iterator rangeHashIterator = rangeHashes.iterator(); + long numRows = 0; + + RangeHash currentRangeHash = rangeHashIterator.next(); + + // Process each row and validate hashes + for (Result result : scanner) { + numRows++; + if (numRows % 10_000 == 0) { + // Heartbeat in logs in case a large scan gets hung. + DOFN_LOG.debug("Processed " + numRows + " rows "); + } + + ImmutableBytesWritable rowKey = new ImmutableBytesWritable(result.getRow()); + + // Check if the rowKey belongs to current range, if not keep iterating through the + // rangeHashes until rowKey's range is found. + while (!isWithinUpperBound(currentRangeHash.stopExclusive, rowKey)) { + validateBatchHash(context, resultHasher, currentRangeHash); + if (!rangeHashIterator.hasNext()) { + // THIS SHOULD NEVER HAPPEN. Bigtable is being scanned till the last + // RangeHash.endKeyExclusive(), so bigtable's result should not outlast the + // rangeHashes. + throw new IllegalStateException( + "Buffer reached to end while scan is still active at row :" + + immutableBytesToString(result.getRow()) + + ". Affected Range: [" + + immutableBytesToString(rangeStartInclusive) + + ", " + + immutableBytesToString(rangeEndExclusive) + + ")."); + } + currentRangeHash = rangeHashIterator.next(); + } + + // Always Hash the current row. + resultHasher.hashResult(result); + } + + // Bigtable scan is finished at this point and rangeHashes may contain additional row ranges. + // Last range will always be unverified as the range end is exclusive and + // currentRow > rangeEndExclusive will never by true. Verify the last range. + validateBatchHash(context, resultHasher, currentRangeHash); + + // If there are remaining ranges in the rangeHashes they all need to reported as mismatched as + // there is nothing in Cloud Bigtable for those row ranges. 
+ // for (int i = bufferIndex; i < rangeHashes.size(); i++) { + while (rangeHashIterator.hasNext()) { + currentRangeHash = rangeHashIterator.next(); + reportMismatch(context, currentRangeHash); + } + + DOFN_LOG.debug( + "Finishing context by outputting " + + rangeHashes.size() + + " keys in range [" + + ((!rangeHashes.isEmpty()) + ? immutableBytesToString(rangeStartInclusive) + + ", " + + immutableBytesToString(rangeEndExclusive) + + ")." + : ", ).")); + } + } + + private ResultScanner createBigtableScan(byte[] startKeyInclusive, byte[] stopKeyExclusive) + throws IOException { + Table table = getConnection().getTable(TableName.valueOf(tableName.get())); + // Get the scan from TableHash, HashTable can be run to hash a small part of data (selected + // column families, timestamp range, maxVersions etc), this scan allows us to fetch the same + // data from Cloud Bigtable to match. + TableHashWrapper tableHash = + tableHashWrapperFactory.getTableHash(projectId.get(), sourceHashDir.get()); + Scan scan = tableHash.getScan(); + // Set the workitem boundaries on the scan. + if (startKeyInclusive.length > 0) { + scan.withStartRow(startKeyInclusive, true); + } + if (stopKeyExclusive.length > 0) { + scan.withStopRow(stopKeyExclusive, false); + } + + return table.getScanner(scan); + } + + /** + * Determines if row >= stopExclusive for a row range (start, stopExclusive). Empty stopExclusive + * represents a range with no upper bound. + * + * @param stopExclusive + * @param row + * @return + */ + private boolean isWithinUpperBound( + ImmutableBytesWritable stopExclusive, ImmutableBytesWritable row) { + return stopExclusive.equals(HConstants.EMPTY_END_ROW) || row.compareTo(stopExclusive) < 0; + } + + private void validateBatchHash( + ProcessContext context, BigtableResultHasher resultHasher, RangeHash currentRangeHash) { + // The batch is always started, so its safe to finish the batch. If there were no rows, we will + // get a hash for empty batch. 
+ resultHasher.finishBatch(); + if (!resultHasher.getBatchHash().equals(currentRangeHash.hash)) { + reportMismatch(context, currentRangeHash); + } else { + matches.inc(); + } + // Start a new batch + resultHasher.startBatch(new ImmutableBytesWritable(currentRangeHash.stopExclusive)); + } + + private void reportMismatch(ProcessContext context, RangeHash currentRangeHash) { + mismatches.inc(); + DOFN_LOG.info( + "MISMATCH ON RANGE [" + + immutableBytesToString(currentRangeHash.startInclusive) + + ", " + + immutableBytesToString(currentRangeHash.stopExclusive) + + ")."); + context.output(currentRangeHash); + } +} diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/HadoopHashTableSource.java b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/HadoopHashTableSource.java new file mode 100644 index 0000000000..20b693963a --- /dev/null +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/HadoopHashTableSource.java @@ -0,0 +1,464 @@ +/* + * Copyright 2020 Google Inc. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package com.google.cloud.bigtable.beam.validation; + +import static com.google.cloud.bigtable.beam.validation.SyncTableUtils.createConfiguration; +import static com.google.cloud.bigtable.beam.validation.SyncTableUtils.immutableBytesToString; + +import autovalue.shaded.com.google$.common.annotations.$VisibleForTesting; +import com.google.api.core.InternalApi; +import com.google.cloud.bigtable.beam.validation.HadoopHashTableSource.RangeHash; +import com.google.cloud.bigtable.beam.validation.TableHashWrapper.TableHashReader; +import com.google.common.annotations.VisibleForTesting; +import com.google.common.base.Objects; +import com.google.common.base.Preconditions; +import com.google.common.collect.ImmutableList; +import java.io.IOException; +import java.io.ObjectInputStream; +import java.io.ObjectOutputStream; +import java.io.Serializable; +import java.util.ArrayList; +import java.util.List; +import org.apache.beam.sdk.coders.Coder; +import org.apache.beam.sdk.coders.DefaultCoder; +import org.apache.beam.sdk.io.BoundedSource; +import org.apache.beam.sdk.options.PipelineOptions; +import org.apache.beam.sdk.options.ValueProvider; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.hbase.HConstants; +import org.apache.hadoop.hbase.io.ImmutableBytesWritable; + +/** + * A beam source to read output of Hadoop HashTable job. The source creates 1 workitem per HashTable + * data file and emits a row-range/hash pair. + */ +@InternalApi +class HadoopHashTableSource extends BoundedSource implements Serializable { + + private static final long serialVersionUID = 2383724L; + + /** + * A simple POJO encapsulating a row range and the corresponding hash generated by HashTable job. 
+ */ + @DefaultCoder(RangeHashCoder.class) + public static class RangeHash { + + public final ImmutableBytesWritable startInclusive; + public final ImmutableBytesWritable stopExclusive; + public final ImmutableBytesWritable hash; + + private RangeHash( + ImmutableBytesWritable startInclusive, + ImmutableBytesWritable stopExclusive, + ImmutableBytesWritable hash) { + this.startInclusive = startInclusive; + this.stopExclusive = stopExclusive; + this.hash = hash; + } + + static RangeHash of( + ImmutableBytesWritable startInclusive, + ImmutableBytesWritable stopExclusive, + ImmutableBytesWritable hash) { + Preconditions.checkNotNull(startInclusive); + Preconditions.checkNotNull(stopExclusive); + Preconditions.checkNotNull(hash); + return new RangeHash(startInclusive, stopExclusive, hash); + } + + @Override + public String toString() { + return String.format( + "RangeHash{ range = [ %s, %s), hash: %s }", + immutableBytesToString(startInclusive), + immutableBytesToString(stopExclusive), + immutableBytesToString(hash)); + } + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + if (!(o instanceof RangeHash)) { + return false; + } + RangeHash rangeHash = (RangeHash) o; + return Objects.equal(startInclusive, rangeHash.startInclusive) + && Objects.equal(stopExclusive, rangeHash.stopExclusive) + && Objects.equal(hash, rangeHash.hash); + } + + @Override + public int hashCode() { + return Objects.hashCode(startInclusive, stopExclusive, hash); + } + } + + public static final Log LOG = LogFactory.getLog(HadoopHashTableSource.class); + + private ValueProvider projectId; + + // Path to the output of HashTable job. Usually in GCS. + private ValueProvider sourceHashDir; + + // Coder to encode/decode the RangeHash + private RangeHashCoder coder; + + // Row range owned by this source. 
+ @VisibleForTesting ImmutableBytesWritable startRowInclusive; + + @VisibleForTesting ImmutableBytesWritable stopRowExclusive; + + private TableHashWrapperFactory tableHashWrapperFactory; + + public HadoopHashTableSource() { + this.coder = new RangeHashCoder(); + } + + /** + * Creates a HadoopHashTableSource that reads HashTable data from hashTableOutputDir in GCS bucket + * in project $(projectId). + */ + public HadoopHashTableSource( + ValueProvider projectId, ValueProvider sourceHashDir) { + this(projectId, sourceHashDir, /*startRowInclusive*/ null, /*stopRowExclusive*/ null); + } + + /** + * Constructor to initialize a HadoopHashTableSource for a given row-range. Used for creating + * split sources. + */ + @$VisibleForTesting + HadoopHashTableSource( + ValueProvider projectId, + ValueProvider sourceHashDir, + ImmutableBytesWritable startRowInclusive, + ImmutableBytesWritable stopRowExclusive) { + this( + projectId, + sourceHashDir, + startRowInclusive, + stopRowExclusive, + new TableHashWrapperFactory()); + } + + @VisibleForTesting + HadoopHashTableSource( + ValueProvider projectId, + ValueProvider hadoopHashTableOutputDir, + ImmutableBytesWritable startRowInclusive, + ImmutableBytesWritable stopRowExclusive, + TableHashWrapperFactory tableHashWrapperFactory) { + this.coder = new RangeHashCoder(); + this.projectId = projectId; + this.sourceHashDir = hadoopHashTableOutputDir; + // startRow and stopRow will be null when the template is initialized. startRow and stopRow are + // read from the hashTableOutputDir, which is only available at pipeline runtime. + this.startRowInclusive = startRowInclusive; + this.stopRowExclusive = stopRowExclusive; + this.tableHashWrapperFactory = tableHashWrapperFactory; + } + + @Override + public List> split( + long desiredBundleSizeBytes, PipelineOptions options) throws IOException { + // This method relies on the partitioning done by HBase-HashTable job. There is a possibility + // of stragglers. 
SyncTable handles it by using a group by and further splitting workitems. + TableHashWrapper hash = + tableHashWrapperFactory.getTableHash(projectId.get(), sourceHashDir.get()); + + ImmutableList partitions = hash.getPartitions(); + int numPartitions = partitions.size(); + + List splitSources = new ArrayList<>(numPartitions + 1); + if (numPartitions == 0) { + // There are 0 partitions and 1 hashfile, return single source with full key range. + splitSources.add( + new HadoopHashTableSource( + projectId, + sourceHashDir, + new ImmutableBytesWritable(hash.getStartRow()), + new ImmutableBytesWritable(hash.getStopRow()), + tableHashWrapperFactory)); + return splitSources; + } + + // Use the HashTable start key. The value is HConstants.EMPTY_START_ROW for full table scan. + ImmutableBytesWritable startRow = new ImmutableBytesWritable(hash.getStartRow()); + ImmutableBytesWritable stopRow = new ImmutableBytesWritable(hash.getStopRow()); + + // The output of HashTable is organized as partition file and a set of datafiles. + // Partition file contains a list of partitions, these partitions split the key-range of a table + // into roughly equal row-ranges and hashes for these row-ranges are stored in a single + // datafile. + // + // There are always numPartitions +1 data files. Datafile(i) covers hashes for [partition{i-1}, + // partition{i}). + // So a partition file containing entries [b,f] for a table with row range [a,z] will have 3 + // data files containing hashes. 
+ // file0 will contain [a(startRow), b), file1 will contain [b,f), and file3 will contain + // [f,z(stopRow)) + for (int i = 0; i < numPartitions; i++) { + LOG.debug( + "Adding: [" + + immutableBytesToString(startRow.get()) + + ", " + + immutableBytesToString(partitions.get(i).get()) + + "]"); + splitSources.add( + new HadoopHashTableSource( + projectId, sourceHashDir, startRow, partitions.get(i), tableHashWrapperFactory)); + startRow = partitions.get(i); + } + // Add the last range for [lastPartition, stopRow). + LOG.debug( + "Adding: [" + + immutableBytesToString(startRow.get()) + + ", " + + immutableBytesToString(stopRow.get()) + + "]"); + // Add the last range for [lastPartition, stopRow). + splitSources.add( + new HadoopHashTableSource( + projectId, + sourceHashDir, + partitions.get(numPartitions - 1), + new ImmutableBytesWritable(stopRow), + tableHashWrapperFactory)); + LOG.info("Returning " + splitSources.size() + " sources from " + numPartitions + " partitions"); + return splitSources; + } + + @Override + public Coder getOutputCoder() { + return coder; + } + + @Override + public long getEstimatedSizeBytes(PipelineOptions options) throws Exception { + // HashTable data files don't expose a method to estimate size or lineCount. + return 0; + } + + @Override + public BoundedReader createReader(PipelineOptions options) throws IOException { + TableHashWrapper hash = + tableHashWrapperFactory.getTableHash(projectId.get(), sourceHashDir.get()); + + // The row range for an un-split source is determined from the output of HashTable job. + // HashTableOutputDir is a runtime parameter and hence not available at construction time, so + // populate the start and stop here. 
+ if (startRowInclusive == null || stopRowExclusive == null) { + startRowInclusive = hash.getStartRow(); + stopRowExclusive = hash.getStopRow(); + } + + return new HashBasedReader( + this, + new ImmutableBytesWritable(startRowInclusive), + new ImmutableBytesWritable(stopRowExclusive), + hash.newReader( + createConfiguration(this.projectId.get(), this.sourceHashDir.get()), + new ImmutableBytesWritable(startRowInclusive))); + } + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + if (!(o instanceof HadoopHashTableSource)) { + return false; + } + HadoopHashTableSource that = (HadoopHashTableSource) o; + return Objects.equal(projectId, that.projectId) + && Objects.equal(sourceHashDir, that.sourceHashDir) + && Objects.equal(startRowInclusive, that.startRowInclusive) + && Objects.equal(stopRowExclusive, that.stopRowExclusive); + } + + @Override + public int hashCode() { + return Objects.hashCode(projectId, sourceHashDir, coder, startRowInclusive, stopRowExclusive); + } + + @Override + public String toString() { + return "HadoopHashTableSource [" + + immutableBytesToString(startRowInclusive) + + ", " + + immutableBytesToString(stopRowExclusive) + + ')'; + } + + private void writeObject(ObjectOutputStream s) throws IOException { + // s.defaultWriteObject(); + s.writeObject(projectId); + s.writeObject(sourceHashDir); + s.writeObject(tableHashWrapperFactory); + // Start and Stop can be null, write a boolean to indicate if start/stop is expected. 
+ if (startRowInclusive == null) { + s.writeBoolean(false); + } else { + s.writeBoolean(true); + s.writeObject(startRowInclusive.copyBytes()); + } + + if (stopRowExclusive == null) { + s.writeBoolean(false); + } else { + s.writeBoolean(true); + s.writeObject(stopRowExclusive.copyBytes()); + } + } + + private void readObject(ObjectInputStream s) throws IOException, ClassNotFoundException { + // s.defaultReadObject(); + this.projectId = (ValueProvider) s.readObject(); + this.sourceHashDir = (ValueProvider) s.readObject(); + this.tableHashWrapperFactory = (TableHashWrapperFactory) s.readObject(); + // start/stop can be null, they are preceded by a boolean indicating their presence. + if (s.readBoolean() == true) { + this.startRowInclusive = new ImmutableBytesWritable((byte[]) s.readObject()); + } + if (s.readBoolean() == true) { + this.stopRowExclusive = new ImmutableBytesWritable((byte[]) s.readObject()); + } + } + + @VisibleForTesting + static class HashBasedReader extends BoundedReader { + + final HadoopHashTableSource source; + final TableHashReader reader; + + final ImmutableBytesWritable startRowInclusive; + final ImmutableBytesWritable stopRowExclusive; + + long numKeys = 0; + // Flag indicating that this workitem is finished. + boolean isDone = false; + ImmutableBytesWritable currentRangeStartKey; + // Hash for the current range. + ImmutableBytesWritable currentHash; + RangeHash currentRangeHash; + + public HashBasedReader( + HadoopHashTableSource source, + ImmutableBytesWritable startRowInclusive, + ImmutableBytesWritable stopRowExclusive, + TableHashReader reader) { + this.reader = reader; + this.source = source; + this.startRowInclusive = startRowInclusive; + this.stopRowExclusive = stopRowExclusive; + } + + @Override + public boolean start() throws IOException { + // NO CHECKED EXCEPTIONS HERE. 
+ LOG.debug( + "Starting a new reader at key range [" + + immutableBytesToString(startRowInclusive) + + " ," + + immutableBytesToString(stopRowExclusive) + + ")."); + numKeys = 0; + + if (readNextKey()) { + // Dataflow calls start, followed by getCurrent. HashBased reader needs to read on TableHash + // twice to return a RangeHash since it specifies both range-start and range-end. + advance(); + return true; + } + + isDone = true; + return false; + } + + @Override + public boolean advance() throws IOException { + if (isDone) { + LOG.debug("Ending workitem at key " + immutableBytesToString(currentRangeStartKey) + " ."); + return false; + } + + ImmutableBytesWritable startKey = this.currentRangeStartKey; + ImmutableBytesWritable hash = this.currentHash; + + if (!readNextKey()) { + this.currentRangeHash = RangeHash.of(startKey, stopRowExclusive, hash); + // return true since we have lastBatchStartKey to emit. Set isDone=true to prevent reading + // from a potentially exhausted reader. + isDone = true; + } else { + this.currentRangeHash = RangeHash.of(startKey, reader.getCurrentKey(), hash); + } + + return true; + } + + // Returns true if a key can be read for this workitem. + private boolean readNextKey() throws IOException { + if (reader.next()) { + numKeys++; + this.currentRangeStartKey = reader.getCurrentKey(); + if ( // StopRow is not set, everything is in bounds. + (stopRowExclusive.equals(HConstants.EMPTY_END_ROW) + || currentRangeStartKey.compareTo(stopRowExclusive) < 0)) { // currentKey < stopKey + // There is a key to read and the key is within the bounds of this workitem. Return true. + this.currentHash = reader.getCurrentHash(); + return true; + } else { + // There is a key to read but its outside of the bounds of this workitem. + this.currentHash = null; + return false; + } + } + + // Nothing left to read for this workitem. 
+ currentRangeStartKey = null; + currentHash = null; + return false; + } + + @Override + public RangeHash getCurrent() { + return currentRangeHash; + } + + @Override + public void close() throws IOException { + LOG.info( + "Finishing a reader for key range [" + + immutableBytesToString(startRowInclusive) + + " ," + + immutableBytesToString(stopRowExclusive) + + ") after reading " + + numKeys + + " keys. Ending at " + + immutableBytesToString(currentRangeStartKey)); + reader.close(); + } + + @Override + public BoundedSource getCurrentSource() { + return source; + } + } +} diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/RangeHashCoder.java b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/RangeHashCoder.java new file mode 100644 index 0000000000..6799d63872 --- /dev/null +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/RangeHashCoder.java @@ -0,0 +1,105 @@ +/* + * Copyright 2020 Google Inc. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package com.google.cloud.bigtable.beam.validation; + +import com.google.cloud.bigtable.beam.validation.HadoopHashTableSource.RangeHash; +import java.io.DataInputStream; +import java.io.DataOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.InvalidObjectException; +import java.io.OutputStream; +import java.util.Collections; +import java.util.List; +import org.apache.beam.sdk.coders.Coder; +import org.apache.beam.sdk.coders.CoderException; +import org.apache.hadoop.hbase.io.ImmutableBytesWritable; + +/** Coder used by beam to encode/decode @{@link RangeHash} objects. */ +public class RangeHashCoder extends Coder { + + public static Coder of() { + return new RangeHashCoder(); + } + + @Override + public void encode(RangeHash value, OutputStream outStream) throws IOException { + if (value == null) { + throw new CoderException("Can not encode null objects."); + } + DataOutputStream dataOutputStream = new DataOutputStream(outStream); + // RangeHash fields can never be null. + value.startInclusive.write(dataOutputStream); + value.stopExclusive.write(dataOutputStream); + value.hash.write(dataOutputStream); + } + + @Override + public RangeHash decode(InputStream inStream) throws IOException { + DataInputStream dataInputStream = new DataInputStream(inStream); + + ImmutableBytesWritable startInclusive = new ImmutableBytesWritable(); + startInclusive.readFields(dataInputStream); + + ImmutableBytesWritable stopExclusive = new ImmutableBytesWritable(); + stopExclusive.readFields(dataInputStream); + + ImmutableBytesWritable hash = new ImmutableBytesWritable(); + hash.readFields(dataInputStream); + + return RangeHash.of(startInclusive, stopExclusive, hash); + } + + @Override + public List> getCoderArguments() { + return Collections.emptyList(); + } + + @Override + public void verifyDeterministic() throws NonDeterministicException { + // This is a deterministic coder as it writes the byte[] in order. + } + + /** + * !!! DO NOT DELETE !!! 
+ * + *

See readObjectNoData method in: + * https://docs.oracle.com/javase/7/docs/platform/serialization/spec/input.html#6053. + * + *

Disable backwards compatibility with previous versions that were serialized. + * + * @throws InvalidObjectException + */ + @SuppressWarnings("unused") + private void readObjectNoData() throws InvalidObjectException { + throw new InvalidObjectException("Hash data required"); + } + + @Override + protected Object clone() throws CloneNotSupportedException { + return super.clone(); + } + + @Override + public boolean equals(Object other) { + return other instanceof RangeHashCoder; + } + + @Override + public int hashCode() { + return RangeHashCoder.class.hashCode(); + } +} diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/SyncTableJob.java b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/SyncTableJob.java new file mode 100644 index 0000000000..a664ea2602 --- /dev/null +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/SyncTableJob.java @@ -0,0 +1,199 @@ +/* + * Copyright 2020 Google Inc. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package com.google.cloud.bigtable.beam.validation; + +import com.google.bigtable.repackaged.com.google.api.core.InternalExtensionOnly; +import com.google.cloud.bigtable.beam.sequencefiles.Utils; +import com.google.cloud.bigtable.beam.validation.HadoopHashTableSource.RangeHash; +import com.google.common.annotations.VisibleForTesting; +import com.google.gson.Gson; +import java.util.List; +import org.apache.beam.sdk.Pipeline; +import org.apache.beam.sdk.PipelineResult; +import org.apache.beam.sdk.extensions.gcp.options.GcpOptions; +import org.apache.beam.sdk.io.Read; +import org.apache.beam.sdk.io.TextIO; +import org.apache.beam.sdk.metrics.MetricQueryResults; +import org.apache.beam.sdk.metrics.MetricResult; +import org.apache.beam.sdk.options.Default; +import org.apache.beam.sdk.options.Description; +import org.apache.beam.sdk.options.PipelineOptionsFactory; +import org.apache.beam.sdk.options.ValueProvider; +import org.apache.beam.sdk.transforms.GroupByKey; +import org.apache.beam.sdk.transforms.MapElements; +import org.apache.beam.sdk.transforms.ParDo; +import org.apache.beam.sdk.transforms.SimpleFunction; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; + +/** + * A job that takes HBase HashTable output and compares the hashes from Cloud Bigtable table. + * + *

Execute the following command to run the job directly: + * + *

+ *   mvn compile exec:java \
+ *      -DmainClass=com.google.cloud.bigtable.beam.validation.SyncTableJob \
+ *      -Dexec.args="--runner=DataflowRunner \
+ *            --project=$PROJECT \
+ *            --bigtableInstanceId=$INSTANCE \
+ *            --bigtableTableId=$TABLE \
+ *            --sourceHashDir=$SOURCE_HASH_DIR \
+ *            --outputPrefix=$OUtPUT_PREFIX \
+ *            --stagingLocation=$STAGING_LOC \
+ *            --tempLocation=$TMP_LOC \
+ *            --region=$REGION \
+ *            --workerZone=$WORKER_ZONE"
+ * 
+ * + *

Execute the following command to create the Dataflow template: + * + *

+ * mvn compile exec:java \
+ *   -DmainClass=com.google.cloud.bigtable.beam.validation.SyncTableJob \
+ *   -Dexec.args="--runner=DataflowRunner \
+ *                --project=$PROJECT \
+ *                --stagingLocation=gs://$STAGING_PATH \
+ *                --templateLocation=gs://$TEMPLATE_PATH \
+ *                --wait=false"
+ * 
+ * + *

There are a few ways to run the pipeline using the template. See Dataflow doc for details: + * https://cloud.google.com/dataflow/docs/templates/executing-templates. Optionally, you can upload + * a metadata file that contains information about the runtime parameters that can be used for + * parameter validation purpose and more. A sample metadata file can be found at + * "src/main/resources/SyncTableJob_metadata". + * + *

An example using gcloud command line: + * + *

+ * gcloud beta dataflow jobs run $JOB_NAME \
+ *   --gcs-location gs://$TEMPLATE_PATH \
+ *   --parameters bigtableProject=$PROJECT,bigtableInstanceId=$INSTANCE,bigtableTableId=$TABLE,sourceHashDir=gs://$SOURCE_HASH_DIR,outputPrefix=$OUTPUT_PREFIX
+ * 
+ */ +@InternalExtensionOnly +public class SyncTableJob { + + private static final Log LOG = LogFactory.getLog(SyncTableJob.class); + + public interface SyncTableOptions extends GcpOptions { + + @Description("This Bigtable App Profile id.") + ValueProvider getBigtableAppProfileId(); + + @SuppressWarnings("unused") + void setBigtableAppProfileId(ValueProvider appProfileId); + + @Description("The project that contains the table to export. Defaults to --project.") + @Default.InstanceFactory(Utils.DefaultBigtableProjectFactory.class) + ValueProvider getBigtableProject(); + + @SuppressWarnings("unused") + void setBigtableProject(ValueProvider projectId); + + @Description("The Bigtable instance id that contains the table to export.") + ValueProvider getBigtableInstanceId(); + + @SuppressWarnings("unused") + void setBigtableInstanceId(ValueProvider instanceId); + + @Description("The Bigtable table id to export.") + ValueProvider getBigtableTableId(); + + @SuppressWarnings("unused") + void setBigtableTableId(ValueProvider tableId); + + @Description("HBase HashTable job output dir.") + ValueProvider getHashTableOutputDir(); + + @SuppressWarnings("unused") + // Rename it to sourceHashDir as in HBase sync table job. + void setHashTableOutputDir(ValueProvider hashTableOutputDir); + + @Description("File pattern for files containing mismatched row ranges.") + ValueProvider getOutputPrefix(); + + @SuppressWarnings("unused") + void setOutputPrefix(ValueProvider outputPrefix); + + // When creating a template, this flag must be set to false. 
+ @Description("Wait for pipeline to finish.") + @Default.Boolean(true) + boolean getWait(); + + @SuppressWarnings("unused") + void setWait(boolean wait); + } + + public static void main(String[] args) { + PipelineOptionsFactory.register(SyncTableOptions.class); + + SyncTableOptions opts = + PipelineOptionsFactory.fromArgs(args).withValidation().as(SyncTableOptions.class); + + LOG.info("===> Building Pipeline"); + Pipeline pipeline = buildPipeline(opts); + + LOG.info("===> Running Pipeline"); + PipelineResult result = pipeline.run(); + + if (opts.getWait()) { + Utils.waitForPipelineToFinish(result); + } + + // Log all the counters for number of matches and number of mismatches. + MetricQueryResults metrics = result.metrics().allMetrics(); + for (MetricResult counter : metrics.getCounters()) { + LOG.warn(counter.getName() + ":" + counter.getAttempted()); + } + } + + @VisibleForTesting + public static Pipeline buildPipeline(SyncTableOptions opts) { + Pipeline pipeline = Pipeline.create(Utils.tweakOptions(opts)); + pipeline + .apply( + "Read HBase HashTable output", + Read.from( + new BufferedHadoopHashTableSource( + new HadoopHashTableSource( + opts.getBigtableProject(), opts.getHashTableOutputDir())))) + .apply( + "group by and create granular workitems", GroupByKey.>create()) + .apply("validate hash", ParDo.of(new ComputeAndValidateHashFromBigtableDoFn(opts))) + .apply("Serialize the ranges", MapElements.via(new RangeHashToString())) + .apply("Write to file", TextIO.write().to(opts.getOutputPrefix()).withSuffix(".txt")); + return pipeline; + } + + static class RangeHashToString extends SimpleFunction { + // TODO maybe explore a sequenceFile sink for RangeHash. Hadoop jobs using this output may be + // easier to write for sequence file. + + // GSON is not serializable, keep it transient. Member variable to avoid creating a Gson object + // per apply call. 
+ private transient Gson gson = null; + + @Override + public String apply(RangeHash input) { + if (gson == null) { + gson = new Gson(); + } + return gson.toJson(input); + } + } +} diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/SyncTableUtils.java b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/SyncTableUtils.java new file mode 100644 index 0000000000..2f0c5cc4cc --- /dev/null +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/SyncTableUtils.java @@ -0,0 +1,55 @@ +/* + * Copyright 2020 Google Inc. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.google.cloud.bigtable.beam.validation; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hbase.HBaseConfiguration; +import org.apache.hadoop.hbase.io.ImmutableBytesWritable; +import org.apache.hadoop.hbase.util.Bytes; + +/** Utility class for SyncTable job. 
*/ +public class SyncTableUtils { + + private SyncTableUtils() {} + + public static String immutableBytesToString(ImmutableBytesWritable bytes) { + if (bytes == null) { + return ""; + } + return immutableBytesToString(bytes.get()); + } + + public static String immutableBytesToString(byte[] bytes) { + return Bytes.toStringBinary(bytes); + } + + /** + * Creates a HBase configuration for reading HashTable output from GCS bucket located in + * projectId. + * + * @param projectId project containing the GCS bucket holding hashtable output. + * @param sourceHashDir location of hashtable output from HBase. + * @return + */ + public static Configuration createConfiguration(String projectId, String sourceHashDir) { + Configuration conf = HBaseConfiguration.create(); + conf.set("fs.AbstractFileSystem.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS"); + conf.set("fs.gs.project.id", projectId); + conf.set("fs.defaultFS", sourceHashDir); + conf.set("google.cloud.auth.service.account.enable", "true"); + return conf; + } +} diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/TableHashWrapper.java b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/TableHashWrapper.java new file mode 100644 index 0000000000..2f75c5722a --- /dev/null +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/TableHashWrapper.java @@ -0,0 +1,55 @@ +/* + * Copyright 2020 Google Inc. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.google.cloud.bigtable.beam.validation; + +import com.google.api.core.InternalApi; +import com.google.common.collect.ImmutableList; +import java.io.Closeable; +import java.io.IOException; +import java.io.Serializable; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hbase.client.Scan; +import org.apache.hadoop.hbase.io.ImmutableBytesWritable; + +/** + * Wraps HashTable.TableHash object and delegates the calls to it. This class exposes the minimal + * interface required from TableHash. This class is required for mocking purposes in unit tests. 
+ */ +@InternalApi +public interface TableHashWrapper extends Serializable { + + int getNumHashFiles(); + + ImmutableList getPartitions(); + + ImmutableBytesWritable getStartRow(); + + ImmutableBytesWritable getStopRow(); + + Scan getScan(); + + TableHashReader newReader(Configuration conf, ImmutableBytesWritable startRow); + + interface TableHashReader extends Closeable { + boolean next() throws IOException; + + ImmutableBytesWritable getCurrentKey(); + + ImmutableBytesWritable getCurrentHash(); + + void close() throws IOException; + } +} diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/TableHashWrapperFactory.java b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/TableHashWrapperFactory.java new file mode 100644 index 0000000000..262aadc7c5 --- /dev/null +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/TableHashWrapperFactory.java @@ -0,0 +1,33 @@ +/* + * Copyright 2020 Google Inc. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.google.cloud.bigtable.beam.validation; + +import static com.google.cloud.bigtable.beam.validation.SyncTableUtils.createConfiguration; + +import java.io.IOException; +import java.io.Serializable; + +/** Factory to create a TableHashWrapper. 
*/ +public class TableHashWrapperFactory implements Serializable { + + private static final long serialVersionUID = 265433454L; + + public TableHashWrapper getTableHash(String projectId, String sourceHashDir) throws IOException { + return TableHashWrapperImpl.create( + createConfiguration(projectId, sourceHashDir), sourceHashDir); + } +} diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/TableHashWrapperImpl.java b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/TableHashWrapperImpl.java new file mode 100644 index 0000000000..71a0f6ddaa --- /dev/null +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/TableHashWrapperImpl.java @@ -0,0 +1,119 @@ +/* + * Copyright 2020 Google Inc. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package com.google.cloud.bigtable.beam.validation; + +import static com.google.cloud.bigtable.beam.validation.SyncTableUtils.immutableBytesToString; + +import com.google.common.base.Preconditions; +import com.google.common.collect.ImmutableList; +import java.io.IOException; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hbase.client.Scan; +import org.apache.hadoop.hbase.io.ImmutableBytesWritable; +import org.apache.hadoop.hbase.mapreduce.BigtableTableHashAccessor; +import org.apache.hadoop.hbase.mapreduce.HashTable.TableHash; +import org.apache.hadoop.hbase.mapreduce.HashTable.TableHash.Reader; +
+/**
+ * TableHashWrapper backed by a real HBase TableHash. Every accessor delegates to
+ * BigtableTableHashAccessor, which can reach the package private state of TableHash.
+ */
+class TableHashWrapperImpl implements TableHashWrapper {
+
+  /**
+   * Reads the HashTable output stored under {@code hashTableOutputDir} and wraps it.
+   *
+   * @param conf configuration used to read the output.
+   * @param hashTableOutputDir directory containing the HashTable output.
+   * @return a wrapper around the TableHash that was read.
+   * @throws IOException if the TableHash can not be read.
+   * @throws IllegalArgumentException if the number of hash files is not exactly one more than the
+   *     number of partitions.
+   */
+  static TableHashWrapper create(Configuration conf, String hashTableOutputDir) throws IOException {
+    TableHash tableHash = TableHash.read(conf, new Path(hashTableOutputDir));
+
+    TableHashWrapper tableHashWrapper = new TableHashWrapperImpl(tableHash);
+    // Look both counts up once instead of once per use.
+    int numHashFiles = tableHashWrapper.getNumHashFiles();
+    int numPartitions = tableHashWrapper.getPartitions().size();
+    // Template overload: the message is only built when the check fails.
+    Preconditions.checkArgument(
+        numHashFiles == numPartitions + 1,
+        "Corrupt hashtable output. %s hash files for %s partitions. Expected %s files.",
+        numHashFiles,
+        numPartitions,
+        numPartitions + 1);
+    return tableHashWrapper;
+  }
+
+  // NOTE(review): TableHashWrapper extends Serializable but this field is not transient; confirm
+  // whether TableHash is serializable or whether this class is ever actually serialized.
+  private final TableHash hash;
+
+  private TableHashWrapperImpl(TableHash hash) {
+    this.hash = hash;
+  }
+
+  @Override
+  public int getNumHashFiles() {
+    return BigtableTableHashAccessor.getNumHashFiles(hash);
+  }
+
+  @Override
+  public ImmutableList<ImmutableBytesWritable> getPartitions() {
+    return BigtableTableHashAccessor.getPartitions(hash);
+  }
+
+  @Override
+  public ImmutableBytesWritable getStartRow() {
+    return BigtableTableHashAccessor.getStartRow(hash);
+  }
+
+  @Override
+  public ImmutableBytesWritable getStopRow() {
+    return BigtableTableHashAccessor.getStopRow(hash);
+  }
+
+  /**
+   * Returns the scan initialized from the wrapped TableHash.
+   *
+   * @throws RuntimeException wrapping the IOException raised while initializing the scan.
+   */
+  @Override
+  public Scan getScan() {
+    try {
+      return BigtableTableHashAccessor.getScan(hash);
+    } catch (IOException e) {
+      throw new RuntimeException("Failed to init a scan from TableHash: ", e);
+    }
+  }
+
+  /**
+   * Opens a reader on the wrapped TableHash for {@code startRow}.
+   *
+   * @throws RuntimeException wrapping the IOException raised while opening the reader.
+   */
+  @Override
+  public TableHashReader newReader(Configuration conf, ImmutableBytesWritable startRow) {
+    try {
+      return TableHashReaderImpl.create(hash.newReader(conf, startRow));
+    } catch (IOException e) {
+      throw new RuntimeException(
+          "Failed to open reader at " + immutableBytesToString(startRow.copyBytes()), e);
+    }
+  }
+
+  /** TableHashReader that forwards every call to a TableHash.Reader. */
+  static class TableHashReaderImpl implements TableHashReader {
+
+    private final Reader reader;
+
+    /**
+     * Wraps {@code reader}.
+     *
+     * @throws NullPointerException if {@code reader} is null.
+     */
+    static TableHashReaderImpl create(TableHash.Reader reader) {
+      Preconditions.checkNotNull(reader, "Reader can not be null.");
+      return new TableHashReaderImpl(reader);
+    }
+
+    private TableHashReaderImpl(TableHash.Reader reader) {
+      this.reader = reader;
+    }
+
+    @Override
+    public boolean next() throws IOException {
+      return reader.next();
+    }
+
+    @Override
+    public ImmutableBytesWritable getCurrentKey() {
+      return reader.getCurrentKey();
+    }
+
+    @Override
+    public ImmutableBytesWritable getCurrentHash() {
+      return reader.getCurrentHash();
+    }
+
+    @Override
+    public void close() throws IOException {
+      reader.close();
+    }
+  }
+}
diff --git
a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/org/apache/hadoop/hbase/mapreduce/BigtableTableHashAccessor.java b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/org/apache/hadoop/hbase/mapreduce/BigtableTableHashAccessor.java new file mode 100644 index 0000000000..a5312d6c52 --- /dev/null +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/org/apache/hadoop/hbase/mapreduce/BigtableTableHashAccessor.java @@ -0,0 +1,77 @@ +/* + * Copyright 2020 Google Inc. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hbase.mapreduce; + +import com.google.common.collect.ImmutableList; +import java.io.IOException; +import org.apache.hadoop.hbase.client.Result; +import org.apache.hadoop.hbase.client.Scan; +import org.apache.hadoop.hbase.io.ImmutableBytesWritable; +import org.apache.hadoop.hbase.mapreduce.HashTable.ResultHasher; +import org.apache.hadoop.hbase.mapreduce.HashTable.TableHash; +
+/** A helper class to access package private fields of HashTable.TableHash. */
+public class BigtableTableHashAccessor {
+
+  // Restrict object creation. This class should only be used to access state from TableHash.
+  private BigtableTableHashAccessor() {}
+
+  /** Returns the {@code numHashFiles} field of {@code hash}. */
+  public static int getNumHashFiles(TableHash hash) {
+    return hash.numHashFiles;
+  }
+
+  /** Returns the {@code partitions} of {@code hash} as an immutable list. */
+  public static ImmutableList<ImmutableBytesWritable> getPartitions(TableHash hash) {
+    return ImmutableList.copyOf(hash.partitions);
+  }
+
+  /** Returns a new writable constructed from the {@code startRow} field of {@code hash}. */
+  public static ImmutableBytesWritable getStartRow(TableHash hash) {
+    return new ImmutableBytesWritable(hash.startRow);
+  }
+
+  /** Returns a new writable constructed from the {@code stopRow} field of {@code hash}. */
+  public static ImmutableBytesWritable getStopRow(TableHash hash) {
+    return new ImmutableBytesWritable(hash.stopRow);
+  }
+
+  /**
+   * Returns the result of {@code hash.initScan()}.
+   *
+   * @throws IOException propagated from {@code initScan()}.
+   */
+  public static Scan getScan(TableHash hash) throws IOException {
+    return hash.initScan();
+  }
+
+  // Wrapper to access package private class ResultHasher. Delegates all the calls to underlying
+  // TableHash.ResultHasher, helps in mocking for unit tests.
+  public static class BigtableResultHasher {
+    private final ResultHasher hasher;
+
+    /** Creates a wrapper around a new ResultHasher. */
+    public BigtableResultHasher() {
+      hasher = new ResultHasher();
+    }
+
+    /** Delegates to {@code ResultHasher.startBatch}. */
+    public void startBatch(ImmutableBytesWritable batchStartKey) {
+      hasher.startBatch(batchStartKey);
+    }
+
+    /** Delegates to {@code ResultHasher.finishBatch}. */
+    public void finishBatch() {
+      hasher.finishBatch();
+    }
+
+    /** Delegates to {@code ResultHasher.getBatchHash}. */
+    public ImmutableBytesWritable getBatchHash() {
+      return hasher.getBatchHash();
+    }
+
+    /** Delegates to {@code ResultHasher.hashResult}. */
+    public void hashResult(Result result) {
+      hasher.hashResult(result);
+    }
+  }
+}
diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/data/.hbase-snapshot/test-snapshot/..snapshotinfo.crc b/bigtable-dataflow-parent/bigtable-beam-import/src/test/data/.hbase-snapshot/test-snapshot/..snapshotinfo.crc deleted file mode 100644 index 8fe4533a0159f76b5bb3a1968ac5d1fa7fc45a58..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 12 TcmYc;N@ieSU}AWGHv9?z6(s~B diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/data/.hbase-snapshot/test-snapshot/.data.manifest.crc b/bigtable-dataflow-parent/bigtable-beam-import/src/test/data/.hbase-snapshot/test-snapshot/.data.manifest.crc deleted file mode 100644 index
1467a17f1f9924f6a69bd2963d5e21ff088ca3f6..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 20 bcmYc;N@ieSU}8|8vgZ5;2Nu4voskOwJbDJ( diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/data/.hbase-snapshot/test-snapshot/.snapshotinfo b/bigtable-dataflow-parent/bigtable-beam-import/src/test/data/.hbase-snapshot/test-snapshot/.snapshotinfo deleted file mode 100644 index 83e482aac0..0000000000 --- a/bigtable-dataflow-parent/bigtable-beam-import/src/test/data/.hbase-snapshot/test-snapshot/.snapshotinfo +++ /dev/null @@ -1,2 +0,0 @@ - - test-snapshottest�����. ( \ No newline at end of file diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/data/archive/data/default/test/01ef4b8bb8d79f360bf182fedfb1c0e8/cf/.b0f68aca966b48f1b171614e582b1cbb.crc b/bigtable-dataflow-parent/bigtable-beam-import/src/test/data/archive/data/default/test/01ef4b8bb8d79f360bf182fedfb1c0e8/cf/.b0f68aca966b48f1b171614e582b1cbb.crc deleted file mode 100644 index ea5b25e7785f94c7a36b646dc7c947d4cc4bce43..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 52 lcmYc;N@ieSU}A`pign*xAbgs=z3KU;RizcU@Jr?pH2^LR7rFod diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/data/archive/data/default/test/1a1358ba82be4a98feff54032986bbf2/cf/.8aff180e3a244dcc807e4de8b6fce0a7.crc b/bigtable-dataflow-parent/bigtable-beam-import/src/test/data/archive/data/default/test/1a1358ba82be4a98feff54032986bbf2/cf/.8aff180e3a244dcc807e4de8b6fce0a7.crc deleted file mode 100644 index 51cacdd03b5469b099265d607b492728ca48fb07..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 52 lcmYc;N@ieSU}ESn7HMqPocxn_X+h|wRizcU@Jr?pH2^J{7oz|G diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/data/archive/data/default/test/1bf20ce0551df953331936d20dbd18fa/cf/.c2945aa8dac34922913a1f60fedb6154.crc 
b/bigtable-dataflow-parent/bigtable-beam-import/src/test/data/archive/data/default/test/1bf20ce0551df953331936d20dbd18fa/cf/.c2945aa8dac34922913a1f60fedb6154.crc deleted file mode 100644 index 2c4de3ac0ea20bd17cca5a5cfe7b3f696c12e5c3..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 52 lcmYc;N@ieSU}9*<{16|;p~b$p^6arqt4b?y;g`%IY5+o67%%_; diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/data/archive/data/default/test/2c25a1cedf575cd08267e0013e45872e/cf/.cda93ca899f3475fb1c0f8989a8f0d18.crc b/bigtable-dataflow-parent/bigtable-beam-import/src/test/data/archive/data/default/test/2c25a1cedf575cd08267e0013e45872e/cf/.cda93ca899f3475fb1c0f8989a8f0d18.crc deleted file mode 100644 index 931ebfb54555d336879fa44ef956de26ba9c2a4e..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 52 mcmYc;N@ieSU}9MD$S>Yy!SuIZFSEYew5qfM7kMA=N`FDt4b?y;j9U&TL27c7ajlr diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/data/archive/data/default/test/3d397f3b97e7fd2358fb5c93060b3a60/cf/.32053565831341128b8d8f5567d48fdc.crc b/bigtable-dataflow-parent/bigtable-beam-import/src/test/data/archive/data/default/test/3d397f3b97e7fd2358fb5c93060b3a60/cf/.32053565831341128b8d8f5567d48fdc.crc deleted file mode 100644 index 80317a1515597ecbac0015cf7edba1283ce6824b..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 52 mcmYc;N@ieSU}9J$_&t8|r`=f(wzKMOT2)$s3%_IzQ3C*MFc~EP diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/data/archive/data/default/test/7466202f701dc0e3af8cc747c9a37ec8/cf/.36798a163ed046b193818e21dd7516b4.crc b/bigtable-dataflow-parent/bigtable-beam-import/src/test/data/archive/data/default/test/7466202f701dc0e3af8cc747c9a37ec8/cf/.36798a163ed046b193818e21dd7516b4.crc deleted file mode 100644 index 00a9d7720d3d3867ea0cd0a153d5265158a9b5ff..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 52 
mcmYc;N@ieSU}89xHtAGe`OGB@e!ed@ttzd+gfot4b?y;g`%IY5*h+7hwPZ diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/data/archive/data/default/test/dab1d611586e861818af77de74073d47/cf/.b83044f76ba6474aa829e3bae7fd82d1.crc b/bigtable-dataflow-parent/bigtable-beam-import/src/test/data/archive/data/default/test/dab1d611586e861818af77de74073d47/cf/.b83044f76ba6474aa829e3bae7fd82d1.crc deleted file mode 100644 index ca57c97e2deddae20f6c82712db07fd2e35620d7..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 52 lcmYc;N@ieSU}Esk=Ghmve-7`}t(p;=R+U!Z!Y`Ra)BrhK7wrH5 diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/generate_test_data.txt b/bigtable-dataflow-parent/bigtable-beam-import/src/test/generate_test_data.txt index 7f8f8fc2db..921caf2d6d 100644 --- a/bigtable-dataflow-parent/bigtable-beam-import/src/test/generate_test_data.txt +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/test/generate_test_data.txt @@ -1,107 +1,133 @@ +// Run from HBase shell. Run `hbase shell` from unix terminal on HBase master. 
create 'test', 'cf', {SPLITS => ["1", "2", "3", "4", "5", "6", "7", "8", "9"]} -put 'test','1', 'cf:a', 'value1' -put 'test','2', 'cf:a', 'value2' -put 'test','3', 'cf:a', 'value3' -put 'test','4', 'cf:a', 'value4' -put 'test','5', 'cf:a', 'value5' -put 'test','6', 'cf:a', 'value6' -put 'test','7', 'cf:a', 'value7' -put 'test','8', 'cf:a', 'value8' -put 'test','9', 'cf:a', 'value9' -put 'test','10', 'cf:a', 'value10' -put 'test','11', 'cf:a', 'value11' -put 'test','12', 'cf:a', 'value12' -put 'test','13', 'cf:a', 'value13' -put 'test','14', 'cf:a', 'value14' -put 'test','15', 'cf:a', 'value15' -put 'test','16', 'cf:a', 'value16' -put 'test','17', 'cf:a', 'value17' -put 'test','18', 'cf:a', 'value18' -put 'test','19', 'cf:a', 'value19' -put 'test','20', 'cf:a', 'value20' -put 'test','21', 'cf:a', 'value21' -put 'test','22', 'cf:a', 'value22' -put 'test','23', 'cf:a', 'value23' -put 'test','24', 'cf:a', 'value24' -put 'test','25', 'cf:a', 'value25' -put 'test','26', 'cf:a', 'value26' -put 'test','27', 'cf:a', 'value27' -put 'test','28', 'cf:a', 'value28' -put 'test','29', 'cf:a', 'value29' -put 'test','30', 'cf:a', 'value30' -put 'test','31', 'cf:a', 'value31' -put 'test','32', 'cf:a', 'value32' -put 'test','33', 'cf:a', 'value33' -put 'test','34', 'cf:a', 'value34' -put 'test','35', 'cf:a', 'value35' -put 'test','36', 'cf:a', 'value36' -put 'test','37', 'cf:a', 'value37' -put 'test','38', 'cf:a', 'value38' -put 'test','39', 'cf:a', 'value39' -put 'test','40', 'cf:a', 'value40' -put 'test','41', 'cf:a', 'value41' -put 'test','42', 'cf:a', 'value42' -put 'test','43', 'cf:a', 'value43' -put 'test','44', 'cf:a', 'value44' -put 'test','45', 'cf:a', 'value45' -put 'test','46', 'cf:a', 'value46' -put 'test','47', 'cf:a', 'value47' -put 'test','48', 'cf:a', 'value48' -put 'test','49', 'cf:a', 'value49' -put 'test','50', 'cf:a', 'value50' -put 'test','51', 'cf:a', 'value51' -put 'test','52', 'cf:a', 'value52' -put 'test','53', 'cf:a', 'value53' -put 'test','54', 'cf:a', 
'value54' -put 'test','55', 'cf:a', 'value55' -put 'test','56', 'cf:a', 'value56' -put 'test','57', 'cf:a', 'value57' -put 'test','58', 'cf:a', 'value58' -put 'test','59', 'cf:a', 'value59' -put 'test','60', 'cf:a', 'value60' -put 'test','61', 'cf:a', 'value61' -put 'test','62', 'cf:a', 'value62' -put 'test','63', 'cf:a', 'value63' -put 'test','64', 'cf:a', 'value64' -put 'test','65', 'cf:a', 'value65' -put 'test','66', 'cf:a', 'value66' -put 'test','67', 'cf:a', 'value67' -put 'test','68', 'cf:a', 'value68' -put 'test','69', 'cf:a', 'value69' -put 'test','70', 'cf:a', 'value70' -put 'test','71', 'cf:a', 'value71' -put 'test','72', 'cf:a', 'value72' -put 'test','73', 'cf:a', 'value73' -put 'test','74', 'cf:a', 'value74' -put 'test','75', 'cf:a', 'value75' -put 'test','76', 'cf:a', 'value76' -put 'test','77', 'cf:a', 'value77' -put 'test','78', 'cf:a', 'value78' -put 'test','79', 'cf:a', 'value79' -put 'test','80', 'cf:a', 'value80' -put 'test','81', 'cf:a', 'value81' -put 'test','82', 'cf:a', 'value82' -put 'test','83', 'cf:a', 'value83' -put 'test','84', 'cf:a', 'value84' -put 'test','85', 'cf:a', 'value85' -put 'test','86', 'cf:a', 'value86' -put 'test','87', 'cf:a', 'value87' -put 'test','88', 'cf:a', 'value88' -put 'test','89', 'cf:a', 'value89' -put 'test','90', 'cf:a', 'value90' -put 'test','91', 'cf:a', 'value91' -put 'test','92', 'cf:a', 'value92' -put 'test','93', 'cf:a', 'value93' -put 'test','94', 'cf:a', 'value94' -put 'test','95', 'cf:a', 'value95' -put 'test','96', 'cf:a', 'value96' -put 'test','97', 'cf:a', 'value97' -put 'test','98', 'cf:a', 'value98' -put 'test','99', 'cf:a', 'value99' -put 'test','100', 'cf:a', 'value100' +put 'test','1', 'cf:a', 'value1', 100 +put 'test','2', 'cf:a', 'value2', 100 +put 'test','3', 'cf:a', 'value3', 100 +put 'test','4', 'cf:a', 'value4', 100 +put 'test','5', 'cf:a', 'value5', 100 +put 'test','6', 'cf:a', 'value6', 100 +put 'test','7', 'cf:a', 'value7', 100 +put 'test','8', 'cf:a', 'value8', 100 +put 'test','9', 
'cf:a', 'value9', 100 +put 'test','10', 'cf:a', 'value10', 100 +put 'test','11', 'cf:a', 'value11', 100 +put 'test','12', 'cf:a', 'value12', 100 +put 'test','13', 'cf:a', 'value13', 100 +put 'test','14', 'cf:a', 'value14', 100 +put 'test','15', 'cf:a', 'value15', 100 +put 'test','16', 'cf:a', 'value16', 100 +put 'test','17', 'cf:a', 'value17', 100 +put 'test','18', 'cf:a', 'value18', 100 +put 'test','19', 'cf:a', 'value19', 100 +put 'test','20', 'cf:a', 'value20', 100 +put 'test','21', 'cf:a', 'value21', 100 +put 'test','22', 'cf:a', 'value22', 100 +put 'test','23', 'cf:a', 'value23', 100 +put 'test','24', 'cf:a', 'value24', 100 +put 'test','25', 'cf:a', 'value25', 100 +put 'test','26', 'cf:a', 'value26', 100 +put 'test','27', 'cf:a', 'value27', 100 +put 'test','28', 'cf:a', 'value28', 100 +put 'test','29', 'cf:a', 'value29', 100 +put 'test','30', 'cf:a', 'value30', 100 +put 'test','31', 'cf:a', 'value31', 100 +put 'test','32', 'cf:a', 'value32', 100 +put 'test','33', 'cf:a', 'value33', 100 +put 'test','34', 'cf:a', 'value34', 100 +put 'test','35', 'cf:a', 'value35', 100 +put 'test','36', 'cf:a', 'value36', 100 +put 'test','37', 'cf:a', 'value37', 100 +put 'test','38', 'cf:a', 'value38', 100 +put 'test','39', 'cf:a', 'value39', 100 +put 'test','40', 'cf:a', 'value40', 100 +put 'test','41', 'cf:a', 'value41', 100 +put 'test','42', 'cf:a', 'value42', 100 +put 'test','43', 'cf:a', 'value43', 100 +put 'test','44', 'cf:a', 'value44', 100 +put 'test','45', 'cf:a', 'value45', 100 +put 'test','46', 'cf:a', 'value46', 100 +put 'test','47', 'cf:a', 'value47', 100 +put 'test','48', 'cf:a', 'value48', 100 +put 'test','49', 'cf:a', 'value49', 100 +put 'test','50', 'cf:a', 'value50', 100 +put 'test','51', 'cf:a', 'value51', 100 +put 'test','52', 'cf:a', 'value52', 100 +put 'test','53', 'cf:a', 'value53', 100 +put 'test','54', 'cf:a', 'value54', 100 +put 'test','55', 'cf:a', 'value55', 100 +put 'test','56', 'cf:a', 'value56', 100 +put 'test','57', 'cf:a', 'value57', 100 +put 
'test','58', 'cf:a', 'value58', 100 +put 'test','59', 'cf:a', 'value59', 100 +put 'test','60', 'cf:a', 'value60', 100 +put 'test','61', 'cf:a', 'value61', 100 +put 'test','62', 'cf:a', 'value62', 100 +put 'test','63', 'cf:a', 'value63', 100 +put 'test','64', 'cf:a', 'value64', 100 +put 'test','65', 'cf:a', 'value65', 100 +put 'test','66', 'cf:a', 'value66', 100 +put 'test','67', 'cf:a', 'value67', 100 +put 'test','68', 'cf:a', 'value68', 100 +put 'test','69', 'cf:a', 'value69', 100 +put 'test','70', 'cf:a', 'value70', 100 +put 'test','71', 'cf:a', 'value71', 100 +put 'test','72', 'cf:a', 'value72', 100 +put 'test','73', 'cf:a', 'value73', 100 +put 'test','74', 'cf:a', 'value74', 100 +put 'test','75', 'cf:a', 'value75', 100 +put 'test','76', 'cf:a', 'value76', 100 +put 'test','77', 'cf:a', 'value77', 100 +put 'test','78', 'cf:a', 'value78', 100 +put 'test','79', 'cf:a', 'value79', 100 +put 'test','80', 'cf:a', 'value80', 100 +put 'test','81', 'cf:a', 'value81', 100 +put 'test','82', 'cf:a', 'value82', 100 +put 'test','83', 'cf:a', 'value83', 100 +put 'test','84', 'cf:a', 'value84', 100 +put 'test','85', 'cf:a', 'value85', 100 +put 'test','86', 'cf:a', 'value86', 100 +put 'test','87', 'cf:a', 'value87', 100 +put 'test','88', 'cf:a', 'value88', 100 +put 'test','89', 'cf:a', 'value89', 100 +put 'test','90', 'cf:a', 'value90', 100 +put 'test','91', 'cf:a', 'value91', 100 +put 'test','92', 'cf:a', 'value92', 100 +put 'test','93', 'cf:a', 'value93', 100 +put 'test','94', 'cf:a', 'value94', 100 +put 'test','95', 'cf:a', 'value95', 100 +put 'test','96', 'cf:a', 'value96', 100 +put 'test','97', 'cf:a', 'value97', 100 +put 'test','98', 'cf:a', 'value98', 100 +put 'test','99', 'cf:a', 'value99', 100 +put 'test','100', 'cf:a', 'value100', 100 snapshot 'test', 'test-snapshot' list_snapshots + +////////////////////Run from Unix shell on HBase master node////////////////// +// Export the snapshot +hbase org.apache.hadoop.hbase.snapshot.ExportSnapshot -snapshot test-snapshot 
-copy-to /integration-test/data -mappers 16 + +// Create the hashes for the table. Run the command from unix shell on an HBase +// node. +hbase org.apache.hadoop.hbase.mapreduce.HashTable --batchsize=100 --numhashfiles=10 test /integration-test/hashtable + +// Export the data into GCS +hadoop fs -copyToLocal /integration-test /tmp/ +gsutil cp -r /tmp/integration-test gs:/// + +// GCS bucket should look like this: +$ gsutil ls gs:///integration-test/data +gs:///integration-test/data/ +gs:///integration-test/data/.hbase-snapshot/ +gs:///integration-test/data/archive/ +$ gsutil ls gs:///integration-test/hashtable +gs:///integration-test/hashtable/manifest +gs:///integration-test/hashtable/partitions +gs:///integration-test/hashtable/hashes/ + +// Run from HBase shell. Run `hbase shell` from unix terminal on HBase master. +// clean up the table disable 'test' drop 'test' exit diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/data/.hbase-snapshot/test-snapshot/.snapshotinfo b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/data/.hbase-snapshot/test-snapshot/.snapshotinfo new file mode 100644 index 0000000000..03ac02e452 --- /dev/null +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/data/.hbase-snapshot/test-snapshot/.snapshotinfo @@ -0,0 +1,2 @@ + + test-snapshottestϹ���. 
(@��������� \ No newline at end of file diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/data/.hbase-snapshot/test-snapshot/data.manifest b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/data/.hbase-snapshot/test-snapshot/data.manifest similarity index 55% rename from bigtable-dataflow-parent/bigtable-beam-import/src/test/data/.hbase-snapshot/test-snapshot/data.manifest rename to bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/data/.hbase-snapshot/test-snapshot/data.manifest index 180516dc03055633111dc6316e9e50e08c8196ba..6439f06130b1e7f80e67897eedf5a1e582d1545c 100644 GIT binary patch delta 467 zcmZ9IF-n9n5QT9?Slg`@OIPq0S(r&ClbJ)<=m|1O<_O{eZ0rPY;0Xk8u&wM#Z0xi& zq9|_nnveIrx0!F|Pe*%)=Zn?+cKdes`tUNF4!+al{RBtL08m5w zA})XZj%TN9)xgc66%Zp&SYk|=iyIKQd=r0Gt6^2VC*{#o8Hqq8wh~&kL@f!H9jlu` zD07#grQRB`K#|IBPDW_lX@~fjP3s|^qG)V^g|hg#hlHvb19ID4emw{)2DuA}a-_~d o3spqyC5)Ej{r^`N`_pQy=&gdZ&W#9JG>AA*%@ut3E}zdn0Vgezc>n+a delta 499 zcmZXQK}wuK5QUjB!bn0ElYb%EP8NpnLswUIb=56IPf*p>eV%|<$N>U^l1&y~A{P+6 zhOLVvGW+-VzV{w>54)eAH)q%TFT3AQKmI)4TyB<|`|Yx}FvVt~I*@7^4lNB)QInAZ zd}s=@82zJh?6>2LIq0I4NH`sV$KsRP0w%N yzIm~_U?Lio-Iyv(Fa&TVrRoXOY&`>vR+#$$ diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/data/archive/data/default/test/3d397f3b97e7fd2358fb5c93060b3a60/cf/32053565831341128b8d8f5567d48fdc b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/data/archive/data/default/test/01340515889e8ec5014bbdbfa4fd4689/cf/0ad53893d268478f9b2484cbb6016d9b similarity index 86% rename from bigtable-dataflow-parent/bigtable-beam-import/src/test/data/archive/data/default/test/3d397f3b97e7fd2358fb5c93060b3a60/cf/32053565831341128b8d8f5567d48fdc rename to bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/data/archive/data/default/test/01340515889e8ec5014bbdbfa4fd4689/cf/0ad53893d268478f9b2484cbb6016d9b index 5320c6c58dbe6e391c4994185de1655b0e874b09..1b91b948d8df70a6137e860707da34bc8dfb702e 
100644 GIT binary patch delta 175 zcmbQBIYD!R3Cl(s9f64!GC*2)<-`bC5WRO|C73=xaTb_ delta 175 zcmbQBIYD!R35(ZD%OevlWPr5Q+=&sgAbQcnN-(`_;w&(|a^h|{XqFuiT!UogFE zvK%9rv2U_HgmG|k0;8-T(1@cfWr;bZsYaH3m93g#llvI+I6)S%#5C*LO=e}<2ex+d qai%#uAYldu#xjtm$@$D_KpC6r$v2thff|6uqzTP=wwaNoUjP7^+C!%R diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/data/archive/data/default/test/1a1358ba82be4a98feff54032986bbf2/cf/8aff180e3a244dcc807e4de8b6fce0a7 b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/data/archive/data/default/test/156b320f3ebe472a1ae56a2f6930a676/cf/9926df0da08b4f51a33517afb040f82d similarity index 87% rename from bigtable-dataflow-parent/bigtable-beam-import/src/test/data/archive/data/default/test/1a1358ba82be4a98feff54032986bbf2/cf/8aff180e3a244dcc807e4de8b6fce0a7 rename to bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/data/archive/data/default/test/156b320f3ebe472a1ae56a2f6930a676/cf/9926df0da08b4f51a33517afb040f82d index cbd9f539b3fa44c3bda39f35407a12a5071cbdcf..951eb512ac0e59374112230c0c7a0e2bbd38c54b 100644 GIT binary patch delta 175 zcmbQBIYD!R3Cl(so$C`VWPr5M@`(|$AbQ8dN-%w5;w&(IW8!Ww{e0pbF#TiVUl47~ zK3R?t%#fID4`FCdPGFQ31WH-5lqKerrdnF^9k{Z6>f}DgJWh~BEPLZ$=T2s2+6T6F s@^Pj)JRo5P2F5awrpfutX)GH{jEyGWWR?eN02*^%s)l(pBTK&k0N)!u{{R30 delta 175 zcmbQBIYD!R35(ZDEAELFGC~a>v)VC-*Vtae^#jVS1e$I+>MeAK2Q- q$C>8vfP@(s7|TGKCg(G!0cC8|C*Ne22WkKs^TkzX?`B4pegOb!dP2Sc diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/data/archive/data/default/test/7466202f701dc0e3af8cc747c9a37ec8/cf/36798a163ed046b193818e21dd7516b4 b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/data/archive/data/default/test/313460ce1b714784d36c64bcd01f9e2c/cf/966e85699fdd4680a8c6fbf4b41b6e4b similarity index 87% rename from bigtable-dataflow-parent/bigtable-beam-import/src/test/data/archive/data/default/test/7466202f701dc0e3af8cc747c9a37ec8/cf/36798a163ed046b193818e21dd7516b4 rename to 
bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/data/archive/data/default/test/313460ce1b714784d36c64bcd01f9e2c/cf/966e85699fdd4680a8c6fbf4b41b6e4b index ee586c252e4a4f151be3c850154faf5f1b348647..dc89f02ec2300b7003ed249fab5a308281318d4f 100644 GIT binary patch delta 175 zcmbQBIYD!R3Cl(s9qox0GC*1{WnzRZh%TL238vd8&H~djChi8)t0&$8(~u5sT*bF3riTO#8sr tPCm{whX*9gz`$4r(lj}rIgMpw`C_-pH<{&u8i2-BYgEN-O!f+_p4`Wn#|g5CMQQ)OO_N!f_JOUP se4J?x4@j7Sfw2svX>vYu8q3Dg;{M4undN~RfW}yb$aZaJWa$?G00+N2mjD0& delta 175 zcmbQBIYD!R35(ZD%Qq7(WPr4F?8FFJ5S=iw5=D-Bb!F1td zIYuy}c(Of&Q8qb&QC1LWL?ugEVoqtQnI&J6@WgAA`xx^$K^C#Reyf)|nU!fD*xJd* pndb0-gc%qZ%RrhY=QF1PWo&XM-(;2tY5*GZM|#!C&5SJl0s!2eLka)@ diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/data/archive/data/default/test/958c660f0e406404ffdfc81110e7eaf9/cf/65b9c6860f5f4de39d61d1674947b030 b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/data/archive/data/default/test/5bc31088b2daee7903f5b3d3a52f7ebf/cf/7fef5694213b4be0ad79f79c45200c2d similarity index 87% rename from bigtable-dataflow-parent/bigtable-beam-import/src/test/data/archive/data/default/test/958c660f0e406404ffdfc81110e7eaf9/cf/65b9c6860f5f4de39d61d1674947b030 rename to bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/data/archive/data/default/test/5bc31088b2daee7903f5b3d3a52f7ebf/cf/7fef5694213b4be0ad79f79c45200c2d index e8d9789f5e9c26ecc1eed6ed9e7061b6e0a9d461..7638f6eabba8ff4ffcb7c5752dbc5991db41cb51 100644 GIT binary patch delta 175 zcmbQBIYD!R3Cl(sojnsRWPr5cw}}z5K-!3XVkMXspEwIlYfRh?rfnzQ0n`2y|AOh{ z$#RTfM%iS02%~*+0;8-T(1>X)Wr;bZsTP)eFBmflC-*Vtae^#jIbi;QXEH0(KCrcu sk2B5T0SPlOFqVNdP0nXdW7$|9aAxvNW_h3npfMRw_&GN-vh)i80AUlbqllvI+I6)S%oc{53^<-A2ePC-R qA7`4w0}^InU@QY^nw-y^29&WenS7I39;g9mOe6b~vzr-N`UL<08A4nD diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/data/archive/data/default/test/01ef4b8bb8d79f360bf182fedfb1c0e8/cf/b0f68aca966b48f1b171614e582b1cbb 
b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/data/archive/data/default/test/7c4a9137853573c8d671264dc0b31f89/cf/f8d40658d79b4a7191f21bcf14ae289b similarity index 87% rename from bigtable-dataflow-parent/bigtable-beam-import/src/test/data/archive/data/default/test/01ef4b8bb8d79f360bf182fedfb1c0e8/cf/b0f68aca966b48f1b171614e582b1cbb rename to bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/data/archive/data/default/test/7c4a9137853573c8d671264dc0b31f89/cf/f8d40658d79b4a7191f21bcf14ae289b index dc8da56c10141d296961f5567f3c5cd6649d72be..c6ba1f760bb956a3846658b41a1353c2e14946b8 100644 GIT binary patch delta 175 zcmbQBIYD!R3Cl(sommqtWPr3`+{6f35S=%%5=_@loCT(PChi8)vnSpG)2k=`1=Bkw z%Q1o(CnwuO7&j&-Fvu4LXkJDHVfAK2Q- r$C>8vfP@(s7|TGKCg(G!v1}|6P?~&`SsthXXpFpTp6zBvmVN;M#L7Pm delta 175 zcmbQBIYD!R35(ZD%Wo4cWPr4F&BO><5Zy4b5==KwoCT&^C+-H*9TV?>>F$Yt!F2Cr zIYuy}f3iJ<5bZFr5=^^HoCT)cC+-H*UK8(tX}^ho!F1qc zIYuxeWU@Vk5k5JAQC1LWL=;O|VoqtQsU_bQ1}WXieT;dWAd6TY+dJz{W@Xw3ws!Jy pra3$yVFm`qGLWXp`OIlR85_UJH<{&u8i2;k{Ze~;Gb2mC0079FKhgjI diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/data/archive/data/default/test/dab1d611586e861818af77de74073d47/cf/b83044f76ba6474aa829e3bae7fd82d1 b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/data/archive/data/default/test/8c2101799fadc18613082a495d11e4ea/cf/2c766f1fc8eb460dbfa9a3803138c9b2 similarity index 87% rename from bigtable-dataflow-parent/bigtable-beam-import/src/test/data/archive/data/default/test/dab1d611586e861818af77de74073d47/cf/b83044f76ba6474aa829e3bae7fd82d1 rename to bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/data/archive/data/default/test/8c2101799fadc18613082a495d11e4ea/cf/2c766f1fc8eb460dbfa9a3803138c9b2 index c119dd13ef4179dcde442da0461e0f1b50569ba1..d29619e3ecdd76e1442d92962be623e16ad62f76 100644 GIT binary patch delta 175 zcmbQBIYD!R3Cl(s9hZp~GC*42cw&Ssh>n<838phA&H~eQ6L*8@$rJB@>5UWrg6X4^ 
z4M*L+dOUx-vHL>LTZ{{^~avx(JC&(g}=c_6|PG)7=2ex+d rai%#uAYldu#xjtm$@$D_EE~(!O()-EmIrD88q-jlQ?!|prC$I57YaTS delta 175 zcmbQBIYD!R35(ZD%bOD|WPr5Qi-{4kAo}&hN-+I?;w&)zY2t1${cYkMF#U7lUoib= zvK%9r@qe;Cm|@L4Ie}4D(CekOGD}%vPHC!%B_IDzxz5RbjCq_Oi&)}X^G{A@W!eX} scJgtiIXoa?1_s75kfzD`%xORw8^_5vndN~RfX2*LwQk(Z$kHzW0AsR4od5s; diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/data/archive/data/default/test/3264826a5972b18c5a59b2f612678316/cf/d8b49b374391407ba35d5e0db1c835c9 b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/data/archive/data/default/test/f1ef86b666a891d8c77f0eada4d1a15c/cf/e59edc08de6d441689288f04c7c0fe85 similarity index 86% rename from bigtable-dataflow-parent/bigtable-beam-import/src/test/data/archive/data/default/test/3264826a5972b18c5a59b2f612678316/cf/d8b49b374391407ba35d5e0db1c835c9 rename to bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/data/archive/data/default/test/f1ef86b666a891d8c77f0eada4d1a15c/cf/e59edc08de6d441689288f04c7c0fe85 index d640fc8498e06935ecaf06a2714dda361af9ed7e..337b5f9280f2074041f28b211fc1c08e356fce63 100644 GIT binary patch delta 232 zcmdn2xmk093Cl(s?W+?lWPr5J>xmI^K-$V|Vikzi`8#orERfdapSTxHt4_QJrp+e) z2h;A86&S&c_{olpvcW)=1uSKWIi;zF<~$4x41z$+&cI}7!IYeq$iTo@22{`lRbau3 zq`(qM!4#+hOTI73e;!QU$e70oav=*3^S23;HJJ7R-J^YV@++n}JRk+xwV^BSo#G3wO~m{ delta 232 zcmdn2xmk0935(ZD%Z(E)WPr34|HKG6AZ>kVVikzC5}7zh7DP)-+zX~vCf)QtY)$-y*rzKvkFVU00840OZNZ( diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/_SUCCESS b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/_SUCCESS new file mode 100644 index 0000000000..e69de29bb2 diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00000/data b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00000/data new file mode 100644 index 
0000000000000000000000000000000000000000..26334294df07a48144f34b9993aed0ed2746d4b3 GIT binary patch literal 158 zcmWG`4P-OSFG|--EJ#ewNY%?oOv%qL(91|lEKb$S%-8eG%`GiSOv*`hsw_z@4ll|C zb4b_000wj7j(*ryrSmUs)4D@e=eth>1tox(1tQBJa7DJ~Wk%57gft3fyN=QfmYGh#m2|ot1I2ky>!VJw;3=C}y^#HjI BOz!{y literal 0 HcmV?d00001 diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00001/data b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00001/data new file mode 100644 index 0000000000000000000000000000000000000000..87b715673c072c3c938847bb34e3d87458df53d8 GIT binary patch literal 534 zcmWG`4P-OSFG|--EJ#ewNY%?oOv%qL(91|lEKb$S%-8eG%`GiSOv*`hsw_z@4ll|C zb4b_000wa``wmpw%clv)cX_ji8}9%LN&+z}5HlJA34zX!>vx*6xT?>7S8U^U`idJ+ zObUqEfSAe90Hi|R|8-=3z%)%i)l(f-K{A0rF=-%X2V!PJ1F(|S!B;A_voBv~r1#_6 z+00Yxfl>%Vz}B!hD@B`#ZEJaAC37UKTyPeO3L_v-;QG5o4Z#yUKP|mkA+yh`!5Bpa zh$kT4ar4Ptw)W;r$G3g_p!@X`iV72uipNiOxSzdY`9osu9hV;;6;7b2Fa@bFw)xV- z-pJcN!LrTrNYPp!6cuJ570TKy@1y2~8c$M}n0%A5Sqw#mIY>p^whJj4b2B>?9%g>J z?3X8wqQU~Cg8i4h<&O&W%f0ul{_R(H=S5Lr2~uIMwWR#jw+D^qOAow>+x$TU0Kb`( AeE4KI$cwJ5&y`B{+xfgjQDFvB!DpCZC%c*T!I76TclZ|_ zx{IR19Hip)BeT64@eji}y1%?QyN5L!MTG@Og;mIjt0{RmgKwr!9< literal 0 HcmV?d00001 diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00003/data b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00003/data new file mode 100644 index 0000000000000000000000000000000000000000..a05197b51daa805ee91c57d9363da0daa6ee2189 GIT binary patch literal 499 zcmWG`4P-OSFG|--EJ#ewNY%?oOv%qL(91|lEKb$S%-8eG%`GiSOv*`hsw_z@4ll|C zb4b_000!YIlFphgZyz19_}0q2tDG4qC<(-@K+I?iBm|q|K7DzxTTm~EqhLSz4nelQDF#DvC#E~+4@Uc)?L_pRj6rCLo$jA zBan(3mpGB12h6`Y?`*K+^eVAMQ2};G80Uf_lg-P12R^H?o*wXS28s$3kQuGk8*}Vp zw0T!Hh`i$e>nVYv!W5)p<(lk6NljRdz8k6crXA6@l-PB79GIIsOt%IN$N(YdeYxOOOiA UzLlcW7`+Vd-SaYd^8KO?03}C*lmGw# literal 0 HcmV?d00001 diff --git 
a/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00003/index b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00003/index new file mode 100644 index 0000000000000000000000000000000000000000..9228013bfa781e1fdfdd8b1ea55ca72769c171fd GIT binary patch literal 221 zcmWG`4P-OSFG|--EJ#ewNY%?oOv%qL(91|lEKb$S%-8eG%`GiSOv*`hsw_z@4ll|C za}+Uk0M+{B=cU6`FfwXkRg;{bTTql*T&(AknwD6aQ{tSTlA6o_1QXR=R!%MBYIAzd zwBGg9^lkrv09}lcvtmv%D+2=?0~;@pm5`8N!p6X0#b62)e+)E(lYx^5B+k%m#lX0EaF{_qQVfQ;$;1Dt@b^Ehc~#~ZL7ccG7UwA z5lF@PIpPs{>m`EuGS&)x4L)vxqQV%Y!o%ZCz}A;v+!vTU2$hX$+>W9G?9#tsFihi8C}?F)*|- G)B^xx2Tsob literal 0 HcmV?d00001 diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00005/data b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00005/data new file mode 100644 index 0000000000000000000000000000000000000000..40cbf30418cb09231e837d21346d0da221394d1e GIT binary patch literal 499 zcmWG`4P-OSFG|--EJ#ewNY%?oOv%qL(91|lEKb$S%-8eG%`GiSOv*`hsw_z@4ll|C zb4b_000#c<6^B>+J-t@zllQ_*|L{bhpd=8p0x_d0kPt9`@~nK@C+6IzOR5*;e%rwf z6q5pCHXvp)H2|rY8yq_K(N@<5{N{W41d?5^pr|kesc`yezDDPxYLK$b6oba+-@c%z zFaoJy$>@63(EhFOvyNBcucgH+P*fO$R8(F+Eb(7*qRITSMQ5DdV%DLkFafDJF1RS> z;vJ2&BUbFWL1r_4qNo7-#V#O-d08H}Tu6LUfqTR$FBBDKATwf01gA!%W%azhC2zyW z7<>>#g*iw?yVly}^^E?n`d2(%RIp!I0!4)dNX0q7Sw`qH1`1IES zC?*BOY(UIpW&k7wuB({*IH;wXT>jw4a)qgWx{F;< zR2YF&yf@Ku;=R~ZAKN6nVQ)v%2^1B^AQkJL^vYlQ&6^stL-<3%BI(5_Doj8su9|$3 zx~Sz8^IbkUfGsQe8Hx&1kct=6E~%>Kri&pvb7 z+=Z{IP*j+M%-E1wHpA%eN$pOx+$U_NJHMl-umGvZ;O}1X)*vDMV4$U0NT|yM6cv^r X6>A>--*;l_Io3Xpx%CUuAFToa5vh*G literal 0 HcmV?d00001 diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00006/index b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00006/index new file mode 100644 index 
0000000000000000000000000000000000000000..a0818358eb72b6aaf321eae16c134e76fa900a8b GIT binary patch literal 221 zcmWG`4P-OSFG|--EJ#ewNY%?oOv%qL(91|lEKb$S%-8eG%`GiSOv*`hsw_z@4ll|C za}+Uk0M+{B=cU6`FfwXkRg;{bTTql*T&(AknwD6aQ{tSTlA6o_1PAY$73Xd}f9Arq z4QE#F5@`Mp1n6RnoE3ADSs57E7}$7$tb~LF6J`bmTLufD_+y|MoD7^iAaRCfD+Y!( GhI#;(6jO5m literal 0 HcmV?d00001 diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00007/data b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00007/data new file mode 100644 index 0000000000000000000000000000000000000000..effda57ece6b2d945b6af33fe25b9138a781b8ff GIT binary patch literal 499 zcmWG`4P-OSFG|--EJ#ewNY%?oOv%qL(91|lEKb$S%-8eG%`GiSOv*`hsw_z@4ll|C zb4b_000x`?XIOSxf7JhZ)=)uYSso)$P!fn)ftb-8NC=#8F?;8+d+sBn_XZajx7S|; zib(-68xS*@8-P@(UMz9=p(pWbl0!+%l`Y4=qo^a+P*i}!V=uErT$bWr)iVh;Kjl_kXhl(B0Wu?;>&UK@xj(+%uG<d!N$%H1$OV({llum5v(# literal 0 HcmV?d00001 diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00007/index b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00007/index new file mode 100644 index 0000000000000000000000000000000000000000..a8eb1a1748bad7bffacfaf7ce6d21a7d8118bb97 GIT binary patch literal 221 zcmWG`4P-OSFG|--EJ#ewNY%?oOv%qL(91|lEKb$S%-8eG%`GiSOv*`hsw_z@4ll|C za}+Uk0M+{B=cU6`FfwXkRg;{bTTql*T&(AknwD6aQ{tSTlA6o_1ka>5FS_F$<+18j zR@#wQkM#co0lF9?XT_XkRt5$(1~y(GDTr{upQmCj%!BNSvYBih-ex Gp&kGrG*QI> literal 0 HcmV?d00001 diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00008/data b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00008/data new file mode 100644 index 0000000000000000000000000000000000000000..011b956c5f23f9af42f3247c61757e30323ee168 GIT binary patch literal 499 
zcmWG`4P-OSFG|--EJ#ewNY%?oOv%qL(91|lEKb$S%-8eG%`GiSOv*`hsw_z@4ll|C zb4b_000zyiT?W5-c@Oa&Fge0;;M6^!pd=8p0x_cnkPs;7%i0>8C({t>``3KKTH6+& zm=qAR0Wp(>0Z7I9QsSkc$0UGBt+JPK)*}Tu|%E;EigF8AwI!+&OzD98b)XTcNl@ zH_?0{iW%l06>sjWoO>dkHDTM5M^jnmi^-s<0EgI$FDYN{2`qBovQpXY8e80D6cv^r XGi(f7l_%Wzwdwbf+_}rSk7)t`Qs0j{ literal 0 HcmV?d00001 diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00008/index b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00008/index new file mode 100644 index 0000000000000000000000000000000000000000..fada13a25649e97d10371af604bb0f353ab3ef6a GIT binary patch literal 221 zcmWG`4P-OSFG|--EJ#ewNY%?oOv%qL(91|lEKb$S%-8eG%`GiSOv*`hsw_z@4ll|C za}+Uk0M+{B=cU6`FfwXkRg;{bTTql*T&(AknwD6aQ{tSTlA6o_1S@gD*z~Kh z^ysO-QcC}U09}lcvtmv%D+2=?0~;@pm5`9I1*p@W!3rq;7|7ye;N$^`Gc;QB^_1R|xsRXMqNp$gshE+(oobgM_=kCi;GP%X&M!kz zVFXeUlK!MH#`*PJ93W literal 0 HcmV?d00001 diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/manifest b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/manifest new file mode 100644 index 0000000000..a95421d027 --- /dev/null +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/manifest @@ -0,0 +1,4 @@ +#Wed Dec 30 01:23:41 UTC 2020 +numHashFiles=10 +table=test +targetBatchSize=10 diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/partitions b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/partitions new file mode 100644 index 0000000000000000000000000000000000000000..1d447dd67a92849e26ffb1864ce2bc5135dfebf9 GIT binary patch literal 342 zcmWG`4P-OSFG|--EJ#ewNY%?oOv%qL(91|lEKb$S%-8eG%`GiSOv*`hsw_z@4ll|C za}+Uk0M+`H=H$RsFfwRiRg;{bTTql*T&(AknwD6aQ{tSTlA6o_1mV{-/bigtable-dataflow-parent/bigtable-beam-import/src/test/data/ \ - * gs:///integration-test/ + * gsutil -m cp -r 
/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test \ + * gs:/// * * Setup GCP credential: https://cloud.google.com/docs/authentication * Ensure your credential have access to Bigtable and Dataflow @@ -87,6 +104,8 @@ public class EndToEndIT { // Snapshot data setup private String hbaseSnapshotDir; + private String hashDir; + private String syncTableOutputDir; @Before public void setup() throws Exception { @@ -101,6 +120,13 @@ public void setup() throws Exception { hbaseSnapshotDir = cloudTestDataFolder + "data/"; UUID test_uuid = UUID.randomUUID(); + hashDir = cloudTestDataFolder + "hashtable/"; + + syncTableOutputDir = dataflowStagingLocation; + if (!syncTableOutputDir.endsWith(File.separator)) { + syncTableOutputDir = syncTableOutputDir + File.separator; + } + syncTableOutputDir = syncTableOutputDir + "sync-table-output/" + test_uuid + "/"; // Cloud Storage config GcpOptions gcpOptions = PipelineOptionsFactory.create().as(GcpOptions.class); @@ -118,6 +144,12 @@ public void setup() throws Exception { for (int i = 0; i < keys.length; i++) { keySplits[i] = keys[i].getBytes(); } + + // Create table in Bigtable + TableName tableName = TableName.valueOf(tableId); + HTableDescriptor descriptor = new HTableDescriptor(tableName); + descriptor.addFamily(new HColumnDescriptor(CF)); + connection.getAdmin().createTable(descriptor, SnapshotTestingUtils.getSplitKeys()); } private static String getTestProperty(String name) { @@ -126,6 +158,17 @@ private static String getTestProperty(String name) { @After public void teardown() throws IOException { + final List paths = gcsUtil.expand(GcsPath.fromUri(syncTableOutputDir + "/*")); + + if (!paths.isEmpty()) { + final List pathStrs = new ArrayList<>(); + + for (GcsPath path : paths) { + pathStrs.add(path.toString()); + } + this.gcsUtil.remove(pathStrs); + } + connection.close(); // delete test table @@ -134,18 +177,28 @@ public void teardown() throws IOException { .deleteTable(TableName.valueOf(tableId)); } - 
@Test - public void testHBaseSnapshotImport() throws Exception { - - // Crete table - TableName tableName = TableName.valueOf(tableId); - HTableDescriptor descriptor = new HTableDescriptor(tableName); + private SyncTableOptions createSyncTableOptions() { + DataflowPipelineOptions syncTableOpts = + PipelineOptionsFactory.as(DataflowPipelineOptions.class); + syncTableOpts.setRunner(DataflowRunner.class); + syncTableOpts.setGcpTempLocation(dataflowStagingLocation); + syncTableOpts.setNumWorkers(1); + syncTableOpts.setProject(projectId); - descriptor.addFamily(new HColumnDescriptor(CF)); + SyncTableOptions syncOpts = syncTableOpts.as(SyncTableOptions.class); + // Setup Bigtable params + syncOpts.setBigtableProject(StaticValueProvider.of(projectId)); + syncOpts.setBigtableInstanceId(StaticValueProvider.of(instanceId)); + syncOpts.setBigtableTableId(StaticValueProvider.of(tableId)); + syncOpts.setBigtableAppProfileId(null); - connection.getAdmin().createTable(descriptor, SnapshotTestingUtils.getSplitKeys()); + // Setup Hashes + syncOpts.setHashTableOutputDir(StaticValueProvider.of(hashDir)); + syncOpts.setOutputPrefix(StaticValueProvider.of(syncTableOutputDir)); + return syncOpts; + } - // Start import + private ImportOptions createImportOptions() { DataflowPipelineOptions importPipelineOpts = PipelineOptionsFactory.as(DataflowPipelineOptions.class); importPipelineOpts.setRunner(DataflowRunner.class); @@ -154,10 +207,9 @@ public void testHBaseSnapshotImport() throws Exception { importPipelineOpts.setProject(projectId); importPipelineOpts.setRegion(region); - ImportJobFromHbaseSnapshot.ImportOptions importOpts = - importPipelineOpts.as(ImportJobFromHbaseSnapshot.ImportOptions.class); + ImportOptions importOpts = importPipelineOpts.as(ImportOptions.class); - // setup GCP and bigtable + // setup Bigtable options importOpts.setBigtableProject(StaticValueProvider.of(projectId)); importOpts.setBigtableInstanceId(StaticValueProvider.of(instanceId)); 
importOpts.setBigtableTableId(StaticValueProvider.of(tableId)); @@ -165,17 +217,25 @@ public void testHBaseSnapshotImport() throws Exception { // setup HBase snapshot info importOpts.setHbaseSnapshotSourceDir(hbaseSnapshotDir); importOpts.setSnapshotName(TEST_SNAPSHOT_NAME); + return importOpts; + } + + private Map getCountMap(PipelineResult result) { + MetricQueryResults metrics = result.metrics().allMetrics(); + return StreamSupport.stream(metrics.getCounters().spliterator(), false) + .collect(Collectors.toMap((m) -> m.getName().getName(), (m) -> m.getAttempted())); + } + + @Test + public void testHBaseSnapshotImport() throws Exception { + + // Start import + ImportOptions importOpts = createImportOptions(); // run pipeline State state = ImportJobFromHbaseSnapshot.buildPipeline(importOpts).run().waitUntilFinish(); Assert.assertEquals(State.DONE, state); - // check data in bigtable - BigtableTableUtils destTable = new BigtableTableUtils(connection, tableId, CF); - Assert.assertEquals( - 100 /* There are 100 rows in test snapshot*/, - destTable.readAllCellsFromTable().toArray().length); - // check that the .restore dir used for temp files has been removed Objects objects = gcsUtil.listObjects( @@ -185,6 +245,81 @@ public void testHBaseSnapshotImport() throws Exception { null); Assert.assertNull(objects.getItems()); - // TODO(vermas2012): Add more validations after this. + SyncTableOptions syncOpts = createSyncTableOptions(); + + PipelineResult result = SyncTableJob.buildPipeline(syncOpts).run(); + state = result.waitUntilFinish(); + Assert.assertEquals(State.DONE, state); + + List outputs = gcsUtil.expand(GcsPath.fromUri(syncTableOutputDir + "*")); + // FileSink will write an empty file when there are no mismatches + Assert.assertEquals(1, outputs.size()); + // TODO read the actual files and validate the ranges instead of size check + Assert.assertEquals(0, gcsUtil.fileSize(outputs.get(0))); + + // Validate the counters. 
+ Map counters = getCountMap(result); + Assert.assertEquals(counters.size(), 1); + Assert.assertEquals(counters.get("ranges_matched"), (Long) 101L); + } + + /** + * Introduces multiple corruptions in imported table and validates that sync-table can detect + * them. + */ + @Test + public void testHBaseSnapshotImportWithCorruptions() throws Exception { + // Import snapshot + ImportOptions importOpts = createImportOptions(); + State state = ImportJobFromHbaseSnapshot.buildPipeline(importOpts).run().waitUntilFinish(); + Assert.assertEquals(State.DONE, state); + + // Introduce corruptions to the data in Bigtable. Delete data from Bigtable to simulate Bigtable + // missing data. Add data to Bigtable to simulate extra data in Bigtable. It is easier to update + // Bigtable than change the snapshots. + Table table = connection.getTable(TableName.valueOf(tableId)); + Cell cellInMiddle = table.get(new Get("24".getBytes())).rawCells()[0]; + List puts = + Arrays.asList( + // Add a row at the start + new Put(Bytes.toBytes("000")) + .addColumn(CF.getBytes(), "random_col".getBytes(), 1L, "value000".getBytes()) + .addColumn(CF.getBytes(), "random_col".getBytes(), 2L, "value001".getBytes()), + // change a cell in middle + new Put(cellInMiddle.getRowArray()) + .addColumn( + cellInMiddle.getFamilyArray(), + cellInMiddle.getQualifierArray(), + cellInMiddle.getTimestamp(), + "corrupted_val".getBytes()), + // add a new row in the end + new Put(Bytes.toBytes("9999")) + .addColumn(CF.getBytes(), "random_col".getBytes(), 100L, "value999".getBytes())); + + table.put(puts); + // Delete a random row in the middle. We should see 4 ranges mismatch as table is split on + // 1,2...9. We are splitting on 31, delete in 60s. + table.delete(new Delete("64".getBytes())); + + // Run SyncTable job and expect 4 mismatches. 
+ SyncTableOptions syncOpts = createSyncTableOptions(); + PipelineResult result = SyncTableJob.buildPipeline(syncOpts).run(); + state = result.waitUntilFinish(); + Assert.assertEquals(State.DONE, state); + + List outputs = gcsUtil.expand(GcsPath.fromUri(syncTableOutputDir + "*")); + + System.out.println("OUTPUTS: " + outputs); + // FileSink will shard the outputs and will created >1 files. + Assert.assertTrue(outputs.size() > 1); + // TODO read the files and validate that the ranges are there instead of size check. + Assert.assertTrue((gcsUtil.fileSize(outputs.get(0)) + gcsUtil.fileSize(outputs.get(1))) > 0); + + // gcsUtil.getObject(outputs.get(0)); + + Map counters = getCountMap(result); + Assert.assertEquals(counters.size(), 2); + Assert.assertEquals(counters.get("ranges_matched"), (Long) 97L); + Assert.assertEquals(counters.get("ranges_not_matched"), (Long) 4L); } } diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/validation/BufferedHadoopHashTableSourceTest.java b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/validation/BufferedHadoopHashTableSourceTest.java new file mode 100644 index 0000000000..ee574a9c2d --- /dev/null +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/validation/BufferedHadoopHashTableSourceTest.java @@ -0,0 +1,162 @@ +/* + * Copyright 2020 Google Inc. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.google.cloud.bigtable.beam.validation; + +import static org.junit.Assert.assertEquals; + +import com.google.cloud.bigtable.beam.validation.HadoopHashTableSource.RangeHash; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import org.apache.beam.sdk.options.ValueProvider.StaticValueProvider; +import org.apache.beam.sdk.testing.SourceTestUtils; +import org.apache.beam.sdk.values.KV; +import org.apache.hadoop.hbase.HConstants; +import org.apache.hadoop.hbase.client.Scan; +import org.apache.hadoop.hbase.io.ImmutableBytesWritable; +import org.apache.hadoop.hbase.util.Bytes; +import org.junit.Before; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.JUnit4; + +@RunWith(JUnit4.class) +public class BufferedHadoopHashTableSourceTest { + + private BufferedHadoopHashTableSource bufferedSource; + private FakeTableHashWrapper fakeTableHashWrapper; + + private static final String HASH_TABLE_OUTPUT_PATH_DIR = "gs://my-bucket/outputDir"; + private static final ImmutableBytesWritable START_ROW = + new ImmutableBytesWritable("AAAA".getBytes()); + private static final ImmutableBytesWritable STOP_ROW = + new ImmutableBytesWritable("ZZZZ".getBytes()); + private static final ImmutableBytesWritable POST_STOP_ROW = + new ImmutableBytesWritable("z".getBytes()); // Lowercase z is lexicographically > uppercase Z + private static final ImmutableBytesWritable EMPTY_ROW = + new ImmutableBytesWritable(HConstants.EMPTY_BYTE_ARRAY); + private static final ImmutableBytesWritable START_HASH = + new ImmutableBytesWritable("START-HASH".getBytes()); + private static final int BATCH_SIZE = 5; + + @Before + public void setUp() throws Exception { + fakeTableHashWrapper = + new FakeTableHashWrapper( + START_ROW, STOP_ROW, new ArrayList<>(), new ArrayList<>(), new Scan()); + bufferedSource = + 
new BufferedHadoopHashTableSource( + new HadoopHashTableSource( + StaticValueProvider.of("cbt-dev"), + StaticValueProvider.of(HASH_TABLE_OUTPUT_PATH_DIR), + START_ROW, + STOP_ROW, + new FakeTableHashWrapperFactory(fakeTableHashWrapper)), + BATCH_SIZE); + } + + protected static ImmutableBytesWritable getKey(int keyIndex) { + return new ImmutableBytesWritable(("KEY-" + keyIndex).getBytes()); + } + + protected static ImmutableBytesWritable getHash(int hashIndex) { + return new ImmutableBytesWritable(("HASH-" + hashIndex).getBytes()); + } + + /** + * Populates the fakeTableHashWrapper with {@code numEntries} entries starting with startKey. + * Returns a List of expected RangeHashes for this data, for numEntries=1, single RangeHash is + * returned (startRow, stopRow, START_HASH). + */ + protected List>> setupTestData( + ImmutableBytesWritable startRow, ImmutableBytesWritable stopRow, int numEntries) { + fakeTableHashWrapper.startRowInclusive = startRow; + fakeTableHashWrapper.stopRowExclusive = stopRow; + fakeTableHashWrapper.hashes.add(KV.of(startRow, START_HASH)); + for (int i = 0; i < numEntries - 1; i++) { + fakeTableHashWrapper.hashes.add(KV.of(getKey(i), getHash(i))); + } + + List>> out = new ArrayList<>(); + // Setup RangeHashes to be returned + List expectedRangeHashes = new ArrayList<>(); + ImmutableBytesWritable key = startRow; + ImmutableBytesWritable hash = START_HASH; + for (int i = 0; i < numEntries - 1; i++) { + expectedRangeHashes.add(RangeHash.of(key, getKey(i), hash)); + key = getKey(i); + hash = getHash(i); + if (expectedRangeHashes.size() % BATCH_SIZE == 0) { + out.add( + KV.of( + Bytes.toStringBinary(expectedRangeHashes.get(0).startInclusive.copyBytes()), + expectedRangeHashes)); + expectedRangeHashes = new ArrayList<>(); + } + } + // Process the last range + expectedRangeHashes.add(RangeHash.of(key, stopRow, hash)); + // Finalize the last batch + out.add( + KV.of( + Bytes.toStringBinary(expectedRangeHashes.get(0).startInclusive.copyBytes()), + 
expectedRangeHashes)); + + return out; + } + + @Test + public void testHashReaderEmpty() throws IOException { + // The tableHashWrapper has no hashes, this should result in empty source. + assertEquals(Arrays.asList(), SourceTestUtils.readFromSource(bufferedSource, null)); + } + + @Test + public void testHashReaderPartialBuffer() throws IOException { + // Setup 4 entries in this hashtable datafile. + List>> expected = setupTestData(START_ROW, STOP_ROW, 4); + assertEquals(expected, SourceTestUtils.readFromSource(bufferedSource, null)); + } + + @Test + public void testHashReaderMultipleBatches() throws IOException { + // Setup 4 entries in this hashtable datafile. + List>> expected = setupTestData(START_ROW, STOP_ROW, 20); + assertEquals(expected, SourceTestUtils.readFromSource(bufferedSource, null)); + } + + @Test + public void testHashReaderMultipleBatchesWithPartialBatchAtEnd() throws IOException { + // Setup 4 entries in this hashtable datafile. + List>> expected = setupTestData(START_ROW, STOP_ROW, 23); + assertEquals(expected, SourceTestUtils.readFromSource(bufferedSource, null)); + } + + @Test + public void testSplitEqualsUnsplit() throws Exception { + fakeTableHashWrapper.partitions = Arrays.asList(getKey(4), getKey(9)); + SourceTestUtils.assertSourcesEqualReferenceSource( + bufferedSource, bufferedSource.split(0, null), null); + } + + @Test + public void testUnstartedReaderEqualsStarted() throws Exception { + setupTestData(START_ROW, STOP_ROW, 6); + SourceTestUtils.assertUnstartedReaderReadsSameAsItsSource( + bufferedSource.createReader(null), null); + } +} diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/validation/ComputeAndValidateHashFromBigtableDoFnTest.java b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/validation/ComputeAndValidateHashFromBigtableDoFnTest.java new file mode 100644 index 0000000000..ed725d8d37 --- /dev/null +++ 
b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/validation/ComputeAndValidateHashFromBigtableDoFnTest.java @@ -0,0 +1,444 @@ +/* + * Copyright 2020 Google Inc. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.google.cloud.bigtable.beam.validation; + +import static com.google.bigtable.repackaged.com.google.cloud.bigtable.admin.v2.models.GCRules.GCRULES; + +import com.google.bigtable.repackaged.com.google.cloud.bigtable.admin.v2.BigtableTableAdminClient; +import com.google.bigtable.repackaged.com.google.cloud.bigtable.admin.v2.BigtableTableAdminSettings; +import com.google.bigtable.repackaged.com.google.cloud.bigtable.admin.v2.models.CreateTableRequest; +import com.google.cloud.bigtable.beam.CloudBigtableTableConfiguration; +import com.google.cloud.bigtable.beam.validation.HadoopHashTableSource.RangeHash; +import com.google.cloud.bigtable.emulator.v2.BigtableEmulatorRule; +import com.google.cloud.bigtable.hbase.BigtableConfiguration; +import com.google.cloud.bigtable.hbase.BigtableOptionsFactory; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import org.apache.beam.sdk.options.ValueProvider.StaticValueProvider; +import org.apache.beam.sdk.testing.PAssert; +import org.apache.beam.sdk.testing.TestPipeline; +import org.apache.beam.sdk.transforms.Create; +import org.apache.beam.sdk.transforms.ParDo; +import 
org.apache.beam.sdk.values.KV; +import org.apache.beam.sdk.values.PCollection; +import org.apache.hadoop.hbase.HConstants; +import org.apache.hadoop.hbase.TableName; +import org.apache.hadoop.hbase.client.Connection; +import org.apache.hadoop.hbase.client.Delete; +import org.apache.hadoop.hbase.client.Put; +import org.apache.hadoop.hbase.client.Result; +import org.apache.hadoop.hbase.client.Scan; +import org.apache.hadoop.hbase.client.Table; +import org.apache.hadoop.hbase.io.ImmutableBytesWritable; +import org.apache.hadoop.hbase.mapreduce.BigtableTableHashAccessor.BigtableResultHasher; +import org.junit.After; +import org.junit.Before; +import org.junit.Rule; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.JUnit4; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +@RunWith(JUnit4.class) +public class ComputeAndValidateHashFromBigtableDoFnTest { + + private static final byte[] EMPTY_ROW_KEY = HConstants.EMPTY_BYTE_ARRAY; + protected final Logger LOG = LoggerFactory.getLogger(getClass()); + + public static final String FAKE_TABLE = "fake-table"; + private static final String ROW_KEY_PREFIX = "row-"; + private static final String VALUE_PREFIX = "value-"; + private static final byte[] EXTRA_VALUE = "add".getBytes(); + private static final byte[] CF = "cf".getBytes(); + private static final byte[] CF2 = "cf".getBytes(); + private static final byte[] COL = "col".getBytes(); + private static final long TS = 1000l; + private static final int FIRST_ROW_INDEX = 20; + private static final int LAST_ROW_INDEX = 31; + + @Rule + public final BigtableEmulatorRule bigtableEmulator = BigtableEmulatorRule.create(); + + @Rule + public final transient TestPipeline p = TestPipeline.create(); + + private ComputeAndValidateHashFromBigtableDoFn doFn; + + // Clients that will be connected to the emulator + private BigtableTableAdminClient tableAdminClient; + private Table table; + // Fake a TableHashWrapper. 
+ private FakeTableHashWrapper fakeTableHashWrapper; + + private List hashes; + + @Before + public void setUp() throws IOException { + hashes = new ArrayList<>(); + // Initialize the clients to connect to the emulator + tableAdminClient = + BigtableTableAdminClient.create( + BigtableTableAdminSettings.newBuilderForEmulator(bigtableEmulator.getPort()) + .setProjectId("fake-project") + .setInstanceId("fake-instance") + .build()); + + CloudBigtableTableConfiguration config = + new CloudBigtableTableConfiguration.Builder() + .withProjectId("fake-project") + .withInstanceId("fake-instance") + .withTableId(FAKE_TABLE) + .withConfiguration( + BigtableOptionsFactory.BIGTABLE_EMULATOR_HOST_KEY, + "localhost:" + bigtableEmulator.getPort()) + .build(); + + Connection connection = BigtableConfiguration.connect(config.toHBaseConfig()); + table = connection.getTable(TableName.valueOf(FAKE_TABLE)); + fakeTableHashWrapper = new FakeTableHashWrapper(); + // Scan all the cells for the column, HBase scan fetches 1 cell/column by default + fakeTableHashWrapper.scan = new Scan().setMaxVersions(); + + FakeTableHashWrapperFactory fakeFactory = new FakeTableHashWrapperFactory(fakeTableHashWrapper); + + doFn = + new ComputeAndValidateHashFromBigtableDoFn( + config, + StaticValueProvider.of(FAKE_TABLE), + StaticValueProvider.of("proj"), + StaticValueProvider.of("hash"), + fakeFactory); + + // Create a test table that can be used in tests + tableAdminClient.createTable( + CreateTableRequest.of(FAKE_TABLE) + .addFamily(new String(CF), GCRULES.maxVersions(100)) + .addFamily(new String(CF2), GCRULES.maxVersions(100))); + + p.getCoderRegistry().registerCoderForClass(RangeHash.class, new RangeHashCoder()); + + // Fill CBT table with data. + writeDataToTable(); + } + + @After + public void tearDown() { + // TODO should we delete the table for each test? 
+ tableAdminClient.deleteTable(FAKE_TABLE); + } + + private byte[] getRowKey(int i) { + return (ROW_KEY_PREFIX + i).getBytes(); + } + + private byte[] getValue(int rowIndex, int cellIndex) { + return (VALUE_PREFIX + rowIndex + "-" + cellIndex).getBytes(); + } + + private void writeDataToTable() throws IOException { + List puts = new ArrayList<>(); + // Tests use the rows 21-30. Setup some extra data simulate the real world scenario where + // there will be other workitems working parallely on the table. + for (int i = 20; i < 32; i++) { + for (int j = 0; j < 2; j++) { + // Insert rows with 2 cells each + Put put = new Put(getRowKey(i)); + put.addColumn(CF, COL, TS + j, getValue(i, j)); + puts.add(put); + } + } + table.put(puts); + } + + /** + * Deletes the row range [startIndex, stopIndex) + */ + private void deleteRange(int startIndex, int stopIndex) throws IOException { + for (int i = startIndex; i < stopIndex; i++) { + table.delete(new Delete(getRowKey(i))); + } + } + + // Creates a RangeHash for range [startRow, stopRow). + private RangeHash createHash(byte[] startRow, byte[] stopRow) throws IOException { + LOG.debug("Creating hash for rows " + startRow + " to " + stopRow); + BigtableResultHasher hasher = new BigtableResultHasher(); + hasher.startBatch(new ImmutableBytesWritable(startRow)); + + // Scan all the cells for a column. + Scan scan = new Scan().setMaxVersions().withStartRow(startRow).withStopRow(stopRow, false); + + // Read the rows from Bigtable and compute the expected hash. 
+ for (Result result : table.getScanner(scan)) { + LOG.debug("Adding result to hash: " + result); + hasher.hashResult(result); + } + hasher.finishBatch(); + return RangeHash.of( + new ImmutableBytesWritable(startRow), + new ImmutableBytesWritable(stopRow), + hasher.getBatchHash()); + } + + ////////// Happy case tests for various setups////////////////////// + @Test + public void testHashMatchesForMultipleRange() throws Exception { + hashes.add(createHash(getRowKey(21), getRowKey(24))); + hashes.add(createHash(getRowKey(24), getRowKey(28))); + + PCollection>>> input = + p.apply(Create.of(KV.of(new String(getRowKey(21)), Arrays.asList(hashes)))); + + PCollection output = input.apply(ParDo.of(doFn)); + PAssert.that(output).empty(); + p.run(); + } + + @Test + public void testHashMatchesForSingleRange() throws Exception { + hashes.add(createHash(getRowKey(21), getRowKey(24))); + + PCollection>>> input = + p.apply(Create.of(KV.of(new String(getRowKey(21)), Arrays.asList(hashes)))); + + PCollection output = input.apply(ParDo.of(doFn)); + PAssert.that(output).containsInAnyOrder(); + p.run(); + } + + @Test + public void testHashMatchesForFullTableScanWithMultipleRange() throws Exception { + hashes.add(createHash(EMPTY_ROW_KEY, getRowKey(24))); + hashes.add(createHash(getRowKey(24), EMPTY_ROW_KEY)); + + PCollection>>> input = + p.apply(Create.of(KV.of(new String(EMPTY_ROW_KEY), Arrays.asList(hashes)))); + + PCollection output = input.apply(ParDo.of(doFn)); + PAssert.that(output).empty(); + p.run(); + } + + @Test + public void testHashMatchesForMultipleSingleRowRange() throws Exception { + hashes.add(createHash(getRowKey(22), getRowKey(23))); + hashes.add(createHash(getRowKey(23), getRowKey(24))); + hashes.add(createHash(getRowKey(24), getRowKey(25))); + + PCollection>>> input = + p.apply(Create.of(KV.of(new String(getRowKey(22)), Arrays.asList(hashes)))); + + PCollection output = input.apply(ParDo.of(doFn)); + PAssert.that(output).empty(); + p.run(); + } + + 
///////////////// Test mismatches when Bigtable has extra rows //////////////////// + @Test + public void testAdditionalCellInMiddle() throws Exception { + hashes.add(createHash(getRowKey(21), getRowKey(24))); + hashes.add(createHash(getRowKey(24), getRowKey(27))); + hashes.add(createHash(getRowKey(27), getRowKey(30))); + + // Add an extra cell in the table + table.put(new Put(getRowKey(25)).addColumn(CF, COL, EXTRA_VALUE)); + + PCollection>>> input = + p.apply(Create.of(KV.of(new String(getRowKey(21)), Arrays.asList(hashes)))); + + PCollection output = input.apply(ParDo.of(doFn)); + PAssert.that(output).containsInAnyOrder(hashes.get(1)); + p.run(); + } + + @Test + public void testAdditionalRowsAtEnds() throws Exception { + hashes.add(createHash(EMPTY_ROW_KEY, getRowKey(24))); + hashes.add(createHash(getRowKey(24), getRowKey(27))); + hashes.add(createHash(getRowKey(27), EMPTY_ROW_KEY)); + + // Add an extra row in the beginning + table.put(new Put(getRowKey(1)).addColumn(CF, COL, EXTRA_VALUE)); + + // Add an extra row at the end. 
+ table.put(new Put(getRowKey(5)).addColumn(CF, COL, EXTRA_VALUE)); + + PCollection>>> input = + p.apply(Create.of(KV.of(new String(EMPTY_ROW_KEY), Arrays.asList(hashes)))); + + PCollection output = input.apply(ParDo.of(doFn)); + PAssert.that(output).containsInAnyOrder(hashes.get(0), hashes.get(2)); + p.run(); + } + + ///////////////////// Test different values /////////////////////////// + @Test + public void testDifferentValues() throws Exception { + hashes.add(createHash(EMPTY_ROW_KEY, getRowKey(21))); + hashes.add(createHash(getRowKey(21), getRowKey(23))); + hashes.add(createHash(getRowKey(23), getRowKey(25))); + hashes.add(createHash(getRowKey(25), getRowKey(27))); + hashes.add(createHash(getRowKey(27), EMPTY_ROW_KEY)); + + // Modify the CF. NOTE(review): the put targets getRowKey(1), not row 20 - confirm intended. + table.delete(new Delete(getRowKey(20)).addColumns(CF, COL, TS)); + table.put(new Put(getRowKey(1)).addColumn(CF2, COL, TS, getValue(20, 0))); + + // Modify the qualifier + table.delete(new Delete(getRowKey(22)).addColumns(CF, COL, TS)); + table.put(new Put(getRowKey(22)).addColumn(CF, "random-col".getBytes(), TS, getValue(22, 0))); + + // Modify the timestamp + table.delete(new Delete(getRowKey(24)).addColumns(CF, COL, TS)); + table.put(new Put(getRowKey(24)).addColumn(CF, COL, 1, getValue(24, 0))); + + // Modify the value. NOTE(review): this re-puts getValue(26, 0) without a timestamp - confirm. 
+ table.delete(new Delete(getRowKey(26)).addColumns(CF, COL, TS)); + table.put(new Put(getRowKey(26)).addColumn(CF, COL, getValue(26, 0))); + + PCollection>>> input = + p.apply(Create.of(KV.of(new String(EMPTY_ROW_KEY), Arrays.asList(hashes)))); + + PCollection output = input.apply(ParDo.of(doFn)); + PAssert.that(output) + .containsInAnyOrder(hashes.get(0), hashes.get(1), hashes.get(2), hashes.get(3)); + p.run(); + } + + ////////////////// Tests with CBT missing data ////////////////////////////// + @Test + public void testMissingRows() throws Exception { + hashes.add(createHash(EMPTY_ROW_KEY, getRowKey(21))); + hashes.add(createHash(getRowKey(21), getRowKey(23))); + hashes.add(createHash(getRowKey(23), 
getRowKey(25))); + hashes.add(createHash(getRowKey(25), getRowKey(27))); + hashes.add(createHash(getRowKey(27), EMPTY_ROW_KEY)); + + // Delete a row at the beginning + table.delete(new Delete(getRowKey(FIRST_ROW_INDEX))); + + // Delete a row at the middle + table.delete(new Delete(getRowKey(24))); + + // Delete a row at the end + table.delete(new Delete(getRowKey(LAST_ROW_INDEX))); + + PCollection>>> input = + p.apply(Create.of(KV.of(new String(EMPTY_ROW_KEY), Arrays.asList(hashes)))); + + PCollection output = input.apply(ParDo.of(doFn)); + PAssert.that(output).containsInAnyOrder(hashes.get(0), hashes.get(2), hashes.get(4)); + p.run(); + } + + @Test + public void testMissingRanges() throws Exception { + hashes.add(createHash(EMPTY_ROW_KEY, getRowKey(21))); + hashes.add(createHash(getRowKey(21), getRowKey(23))); + hashes.add(createHash(getRowKey(23), getRowKey(25))); + hashes.add(createHash(getRowKey(25), getRowKey(27))); + hashes.add(createHash(getRowKey(27), getRowKey(29))); + hashes.add(createHash(getRowKey(29), EMPTY_ROW_KEY)); + + // Delete a range at the beginning + deleteRange(FIRST_ROW_INDEX, 21); + + // Delete a range in middle + deleteRange(23, 25); + + // Delete row ranges at the end, bigtable scanner will finish with multiple row-ranges to process. 
+ deleteRange(27, LAST_ROW_INDEX + 1); + + PCollection>>> input = + p.apply(Create.of(KV.of(new String(EMPTY_ROW_KEY), Arrays.asList(hashes)))); + + PCollection output = input.apply(ParDo.of(doFn)); + PAssert.that(output) + .containsInAnyOrder(hashes.get(0), hashes.get(2), hashes.get(4), hashes.get(5)); + p.run(); + } + + @Test + public void testCbtEmpty() throws Exception { + hashes.add(createHash(EMPTY_ROW_KEY, getRowKey(25))); + hashes.add(createHash(getRowKey(25), getRowKey(29))); + hashes.add(createHash(getRowKey(29), EMPTY_ROW_KEY)); + + // Delete all data from bigtable; deleteRange's stop index is exclusive, hence the + 1. + deleteRange(FIRST_ROW_INDEX, LAST_ROW_INDEX + 1); + + PCollection>>> input = + p.apply(Create.of(KV.of(new String(EMPTY_ROW_KEY), Arrays.asList(hashes)))); + + PCollection output = input.apply(ParDo.of(doFn)); + PAssert.that(output).containsInAnyOrder(hashes); + p.run(); + } + + ////////////////////// Test that scan is used from TableHash.//////////////////////// + @Test + public void testScanFromTableHash() throws Exception { + hashes.add(createHash(getRowKey(21), getRowKey(24))); + hashes.add(createHash(getRowKey(24), getRowKey(27))); + hashes.add(createHash(getRowKey(27), getRowKey(30))); + + // Update the TableHashWrapper Scan to default. Scan from HashTable.TableHash determines the + // cells used to compute hash. CBT has to use the same cells for validation. 
+ fakeTableHashWrapper.scan = new Scan(); + + PCollection>>> input = + p.apply(Create.of(KV.of(new String(getRowKey(21)), Arrays.asList(hashes)))); + + PCollection output = input.apply(ParDo.of(doFn)); + PAssert.that(output).containsInAnyOrder(hashes); + p.run(); + } + + ////////////////////// Combination of different cases ////////////////////////////////// + @Test + public void testMismatchesComprehensive() throws Exception { + hashes.add(createHash(EMPTY_ROW_KEY, getRowKey(21))); + hashes.add(createHash(getRowKey(21), getRowKey(23))); + hashes.add(createHash(getRowKey(23), getRowKey(25))); + hashes.add(createHash(getRowKey(25), getRowKey(27))); + hashes.add(createHash(getRowKey(27), getRowKey(29))); + hashes.add(createHash(getRowKey(29), EMPTY_ROW_KEY)); + + // Delete a range at the beginning from CBT + deleteRange(FIRST_ROW_INDEX, 21); + + // Delete a row in middle from CBT + table.delete(new Delete(getRowKey(23))); + + // Update a value in CBT + table.delete(new Delete(getRowKey(27)).addColumns(CF, COL, TS)); + table.put(new Put(getRowKey(27)).addColumn(CF, COL, getValue(27, 0))); + + // Add an extra row at the end. 
+ table.put(new Put(getRowKey(5)).addColumn(CF, COL, EXTRA_VALUE)); + + PCollection>>> input = + p.apply(Create.of(KV.of(new String(EMPTY_ROW_KEY), Arrays.asList(hashes)))); + + PCollection output = input.apply(ParDo.of(doFn)); + PAssert.that(output) + .containsInAnyOrder(hashes.get(0), hashes.get(2), hashes.get(4), hashes.get(5)); + p.run(); + } +} diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/validation/FakeTableHashWrapper.java b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/validation/FakeTableHashWrapper.java new file mode 100644 index 0000000000..6e3e5f004d --- /dev/null +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/validation/FakeTableHashWrapper.java @@ -0,0 +1,153 @@ +/* + * Copyright 2020 Google Inc. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package com.google.cloud.bigtable.beam.validation; + +import com.google.common.collect.ImmutableList; +import com.google.gson.Gson; +import java.io.IOException; +import java.io.ObjectInputStream; +import java.io.ObjectOutputStream; +import java.util.ArrayList; +import java.util.List; +import org.apache.beam.sdk.values.KV; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hbase.client.Scan; +import org.apache.hadoop.hbase.io.ImmutableBytesWritable; + +/** + * A fake for TableHashWrapper that allows us to mock the behavior of hbase's HashTable.TableHash. + */ +public class FakeTableHashWrapper implements TableHashWrapper { + + // Sorted list of partition keys splitting the key range. + public List partitions; + // List of (row key, hash) entries, sorted by key. + public List> hashes; + public ImmutableBytesWritable startRowInclusive; + public ImmutableBytesWritable stopRowExclusive; + public Scan scan; + private static final long serialVersionUID = 34876543L; + + public FakeTableHashWrapper() { + this( + new ImmutableBytesWritable(), + new ImmutableBytesWritable(), + new ArrayList<>(), + new ArrayList<>(), + new Scan()); + } + + public FakeTableHashWrapper( + ImmutableBytesWritable startRowInclusive, + ImmutableBytesWritable stopRowExclusive, + List partitions, + List> hashes, + Scan scan) { + super(); + this.startRowInclusive = startRowInclusive; + this.stopRowExclusive = stopRowExclusive; + this.partitions = partitions; + this.hashes = hashes; + this.scan = scan; + } + + @Override + public int getNumHashFiles() { + return partitions.size() + 1; + } + + @Override + public ImmutableList getPartitions() { + return ImmutableList.copyOf(partitions); + } + + @Override + public ImmutableBytesWritable getStartRow() { + return startRowInclusive; + } + + @Override + public ImmutableBytesWritable getStopRow() { + return stopRowExclusive; + } + + @Override + public Scan getScan() { + return scan; + } + + @Override + public TableHashReader newReader(Configuration conf, 
ImmutableBytesWritable startRow) { + return new FakeTableHashReader(startRow); + } + + private void writeObject(ObjectOutputStream s) throws IOException { + Gson gson = new Gson(); + s.writeObject(gson.toJson(scan)); + s.writeObject(gson.toJson(startRowInclusive)); + s.writeObject(gson.toJson(stopRowExclusive)); + s.writeObject(gson.toJson(partitions)); + s.writeObject(gson.toJson(hashes)); + } + + private void readObject(ObjectInputStream s) throws IOException, ClassNotFoundException { + Gson gson = new Gson(); // NOTE(review): lists use raw ArrayList.class; verify element types + scan = gson.fromJson((String) s.readObject(), Scan.class); + startRowInclusive = gson.fromJson((String) s.readObject(), ImmutableBytesWritable.class); + stopRowExclusive = gson.fromJson((String) s.readObject(), ImmutableBytesWritable.class); + partitions = gson.fromJson((String) s.readObject(), ArrayList.class); + hashes = gson.fromJson((String) s.readObject(), ArrayList.class); + } + + public class FakeTableHashReader implements TableHashReader { + private final ImmutableBytesWritable startRow; + // Copy of items to be read by this reader. + private final List> entriesToRead; + // First next() will make index = 0, and compare it with the size of entriesToRead. + private int index = -1; + + public FakeTableHashReader(ImmutableBytesWritable startRow) { + this.startRow = startRow; + entriesToRead = new ArrayList<>(); + for (KV hash : hashes) { + // Collect all the entries at or after startRow. 
+ if (hash.getKey().compareTo(startRow) >= 0) { + entriesToRead.add(hash); + } + } + } + + @Override + public boolean next() throws IOException { + return ++index < entriesToRead.size(); + } + + @Override + public ImmutableBytesWritable getCurrentKey() { + return entriesToRead.get(index).getKey(); + } + + @Override + public ImmutableBytesWritable getCurrentHash() { + return entriesToRead.get(index).getValue(); + } + + @Override + public void close() throws IOException { + // NOOP + } + } +} diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/validation/FakeTableHashWrapperFactory.java b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/validation/FakeTableHashWrapperFactory.java new file mode 100644 index 0000000000..9a3acc19a9 --- /dev/null +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/validation/FakeTableHashWrapperFactory.java @@ -0,0 +1,32 @@ +/* + * Copyright 2020 Google Inc. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package com.google.cloud.bigtable.beam.validation; + +public class FakeTableHashWrapperFactory extends TableHashWrapperFactory { + + private static final long serialVersionUID = 269854624L; + + private final FakeTableHashWrapper fakeTableHashWrapper; + + public FakeTableHashWrapperFactory(FakeTableHashWrapper wrapper) { + this.fakeTableHashWrapper = wrapper; + } + + @Override + public TableHashWrapper getTableHash(String projectId, String sourceHashDir) { + return fakeTableHashWrapper; // Both arguments are ignored; always the injected fake. + } +} diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/validation/HadoopHashBasedReaderTest.java b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/validation/HadoopHashBasedReaderTest.java new file mode 100644 index 0000000000..20abf02d06 --- /dev/null +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/validation/HadoopHashBasedReaderTest.java @@ -0,0 +1,181 @@ +/* + * Copyright 2020 Google Inc. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package com.google.cloud.bigtable.beam.validation; + +import static org.junit.Assert.assertEquals; + +import com.google.cloud.bigtable.beam.validation.HadoopHashTableSource.RangeHash; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import org.apache.beam.sdk.options.ValueProvider.StaticValueProvider; +import org.apache.beam.sdk.testing.SourceTestUtils; +import org.apache.beam.sdk.values.KV; +import org.apache.hadoop.hbase.HConstants; +import org.apache.hadoop.hbase.client.Scan; +import org.apache.hadoop.hbase.io.ImmutableBytesWritable; +import org.junit.Before; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.JUnit4; + +@RunWith(JUnit4.class) +public class HadoopHashBasedReaderTest { + + private HadoopHashTableSource hashTableSource; + private FakeTableHashWrapper fakeTableHashWrapper; + + private static final String HASH_TABLE_OUTPUT_PATH_DIR = "gs://my-bucket/outputDir"; + private static final ImmutableBytesWritable START_ROW = + new ImmutableBytesWritable("AAAA".getBytes()); + private static final ImmutableBytesWritable STOP_ROW = + new ImmutableBytesWritable("ZZZZ".getBytes()); + private static final ImmutableBytesWritable POST_STOP_ROW = + new ImmutableBytesWritable("z".getBytes()); // Lowercase z is lexicographically > uppercase Z + private static final ImmutableBytesWritable EMPTY_ROW = + new ImmutableBytesWritable(HConstants.EMPTY_BYTE_ARRAY); + private static final ImmutableBytesWritable START_HASH = + new ImmutableBytesWritable("START-HASH".getBytes()); + + @Before + public void setUp() throws Exception { + fakeTableHashWrapper = + new FakeTableHashWrapper( + START_ROW, STOP_ROW, new ArrayList<>(), new ArrayList<>(), new Scan()); + hashTableSource = + new HadoopHashTableSource( + StaticValueProvider.of("cbt-dev"), + StaticValueProvider.of(HASH_TABLE_OUTPUT_PATH_DIR), + START_ROW, + STOP_ROW, + new FakeTableHashWrapperFactory(fakeTableHashWrapper)); + } + 
+ protected static ImmutableBytesWritable getKey(int keyIndex) { + return new ImmutableBytesWritable(("KEY-" + keyIndex).getBytes()); + } + + protected static ImmutableBytesWritable getHash(int hashIndex) { + return new ImmutableBytesWritable(("HASH-" + hashIndex).getBytes()); + } + + /** + * Populates the fakeTableHashWrapper with {@code numEntries} entries starting with startRow. + * Returns a List of expected RangeHashes for this data; for numEntries=1, single RangeHash is + * returned (startRow, stopRow, START_HASH). + */ + protected List setupTestData( + ImmutableBytesWritable startRow, ImmutableBytesWritable stopRow, int numEntries) { + fakeTableHashWrapper.startRowInclusive = startRow; + fakeTableHashWrapper.stopRowExclusive = stopRow; + fakeTableHashWrapper.hashes.add(KV.of(startRow, START_HASH)); + for (int i = 0; i < numEntries - 1; i++) { + fakeTableHashWrapper.hashes.add(KV.of(getKey(i), getHash(i))); + } + + // Setup RangeHashes to be returned + List expectedRangeHashes = new ArrayList<>(); + ImmutableBytesWritable key = startRow; + ImmutableBytesWritable hash = START_HASH; + for (int i = 0; i < numEntries - 1; i++) { + expectedRangeHashes.add(RangeHash.of(key, getKey(i), hash)); + key = getKey(i); + hash = getHash(i); + } + expectedRangeHashes.add(RangeHash.of(key, stopRow, hash)); + return expectedRangeHashes; + } + + /////////////////////////////// Test the end of HashTable Output ///////////////////////// + + @Test + public void testHashReaderEmpty() throws IOException { + // The tableHashWrapper has no hashes, this should result in empty source. + assertEquals(Arrays.asList(), SourceTestUtils.readFromSource(hashTableSource, null)); + } + + @Test + public void testHashReaderSingleHashBatch() throws IOException { + // Setup 1 entry in this hashtable datafile. The test is setup so that HashTable datafile has + // only 1 entry. 
+ List expected = setupTestData(START_ROW, STOP_ROW, 1); + + assertEquals(expected, SourceTestUtils.readFromSource(hashTableSource, null)); + } + + @Test + public void testHashReaderMultipleHashBatch() throws IOException { + // Setup 4 entries in this hashtable datafile. + List expected = setupTestData(START_ROW, STOP_ROW, 4); + assertEquals(expected, SourceTestUtils.readFromSource(hashTableSource, null)); + } + + //////////////////// Test the end of HashTable output when end of range is ""///////////////// + @Test + public void testHashReaderWithEmptyEndRow() throws IOException { + // Setup 4 entries in this hashtable datafile with no start or stop keys set. + List expected = setupTestData(EMPTY_ROW, EMPTY_ROW, 4); + hashTableSource.startRowInclusive = EMPTY_ROW; + hashTableSource.stopRowExclusive = EMPTY_ROW; + assertEquals(expected, SourceTestUtils.readFromSource(hashTableSource, null)); + } + + /////////////////////////////// Test reader.getCurrent() >= stopRow ///////////////////////// + + @Test + public void testHashReaderWorkItemEndedOnFirstBatch() throws IOException { + // Setup 1 entry in this hashtable datafile. This entry is outside of the workitem's row range. + fakeTableHashWrapper.hashes.add(KV.of(POST_STOP_ROW, START_HASH)); + // Source will be empty as no hashes fall in its bounds. + assertEquals(new ArrayList(), SourceTestUtils.readFromSource(hashTableSource, null)); + } + + @Test + public void testHashReaderWorkItemEndedOnSecondEntry() throws IOException { + // Setup 1 entry in this hashtable datafile. The test is setup so that HashTable datafile has + // only 1 entry. + List expected = setupTestData(START_ROW, STOP_ROW, 1); + // Add a next entry at the stop row. Reader should stop and read just 1 entry. 
+ fakeTableHashWrapper.hashes.add(KV.of(STOP_ROW, getHash(100))); + + assertEquals(expected, SourceTestUtils.readFromSource(hashTableSource, null)); + } + + @Test + public void testHashReaderWorkItemEndedAfterMultipleBatches() throws IOException { + // Setup 4 entries in this hashtable datafile. + List expected = setupTestData(START_ROW, STOP_ROW, 4); + // Add a next entry after the stop row. Reader should stop and read just 4 entries. + fakeTableHashWrapper.hashes.add(KV.of(POST_STOP_ROW, getHash(100))); + assertEquals(expected, SourceTestUtils.readFromSource(hashTableSource, null)); + } + + @Test + public void testSplitEqualsUnsplit() throws Exception { + setupTestData(START_ROW, STOP_ROW, 6); + fakeTableHashWrapper.partitions = Arrays.asList(getKey(2), getKey(4)); + SourceTestUtils.assertSourcesEqualReferenceSource( + hashTableSource, hashTableSource.split(1, null), null); + } + + @Test + public void testUnstartedReaderEqualsStarted() throws Exception { + setupTestData(START_ROW, STOP_ROW, 6); + SourceTestUtils.assertUnstartedReaderReadsSameAsItsSource( + hashTableSource.createReader(null), null); + } +} diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/validation/HadoopHashTableSourceTest.java b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/validation/HadoopHashTableSourceTest.java new file mode 100644 index 0000000000..bc79f4300b --- /dev/null +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/validation/HadoopHashTableSourceTest.java @@ -0,0 +1,209 @@ +/* + * Copyright 2020 Google Inc. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.google.cloud.bigtable.beam.validation; + +import com.google.cloud.bigtable.beam.validation.HadoopHashTableSource.HashBasedReader; +import com.google.cloud.bigtable.beam.validation.HadoopHashTableSource.RangeHash; +import com.google.common.collect.ImmutableList; +import java.io.IOException; +import java.util.List; +import junit.framework.TestCase; +import org.apache.beam.sdk.io.BoundedSource; +import org.apache.beam.sdk.io.BoundedSource.BoundedReader; +import org.apache.beam.sdk.options.ValueProvider; +import org.apache.beam.sdk.options.ValueProvider.StaticValueProvider; +import org.apache.hadoop.hbase.HConstants; +import org.apache.hadoop.hbase.io.ImmutableBytesWritable; +import org.junit.Before; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.JUnit4; + +@RunWith(JUnit4.class) +public class HadoopHashTableSourceTest extends TestCase { + + HadoopHashTableSource source; + FakeTableHashWrapper fakeTableHashWrapper; + + private static final ValueProvider PROJECT_ID = StaticValueProvider.of("test-project"); + private static final ValueProvider HASH_TABLE_OUTPUT_PATH_DIR = + StaticValueProvider.of("gs://my-bucket/outputDir"); + private static final ImmutableBytesWritable START_ROW = + new ImmutableBytesWritable("a".getBytes()); + private static final ImmutableBytesWritable STOP_ROW = new ImmutableBytesWritable("z".getBytes()); + private static final ImmutableBytesWritable PARTITION1 = + new ImmutableBytesWritable("d".getBytes()); + private static final ImmutableBytesWritable PARTITION2 = + new 
ImmutableBytesWritable("g".getBytes()); + private static final ImmutableBytesWritable EMPTY_ROW_KEY = + new ImmutableBytesWritable(HConstants.EMPTY_BYTE_ARRAY); + + @Before + public void setUp() throws Exception { + super.setUp(); + fakeTableHashWrapper = new FakeTableHashWrapper(); + } + + private List> getSplitSources( + List partitions, + ImmutableBytesWritable startRow, + ImmutableBytesWritable stopRow) + throws IOException { + fakeTableHashWrapper.startRowInclusive = startRow; + fakeTableHashWrapper.stopRowExclusive = stopRow; + fakeTableHashWrapper.partitions = partitions; + + source = + new HadoopHashTableSource( + PROJECT_ID, + HASH_TABLE_OUTPUT_PATH_DIR, + startRow, + stopRow, + new FakeTableHashWrapperFactory(fakeTableHashWrapper)); + return (List>) source.split(0, null); + } + + private void testSourceSplits( + List partitions, + ImmutableBytesWritable startRow, + ImmutableBytesWritable stopRow, + List> expectedSources) + throws IOException { + assertEquals(expectedSources, getSplitSources(partitions, startRow, stopRow)); + } + + @Test + public void testSplitZeroPartitions() throws IOException { + // Row range [a-z) with no splits. + List> expected = + ImmutableList.of( + new HadoopHashTableSource(PROJECT_ID, HASH_TABLE_OUTPUT_PATH_DIR, START_ROW, STOP_ROW)); + testSourceSplits(ImmutableList.of(), START_ROW, STOP_ROW, expected); + } + + @Test + public void testSplitOnePartition() throws IOException { + // Row range [a-z) with 1 split. + List> expected = + ImmutableList.of( + new HadoopHashTableSource( + PROJECT_ID, HASH_TABLE_OUTPUT_PATH_DIR, START_ROW, PARTITION1), + new HadoopHashTableSource( + PROJECT_ID, HASH_TABLE_OUTPUT_PATH_DIR, PARTITION1, STOP_ROW)); + testSourceSplits(ImmutableList.of(PARTITION1), START_ROW, STOP_ROW, expected); + } + + @Test + public void testMultiplePartitons() throws IOException { + // Row range [a-z) with splits on {d,g}. The data files will be for {[a,d), [d,g), [g,z)}. 
+ List> expected = + ImmutableList.of( + new HadoopHashTableSource( + PROJECT_ID, HASH_TABLE_OUTPUT_PATH_DIR, START_ROW, PARTITION1), + new HadoopHashTableSource( + PROJECT_ID, HASH_TABLE_OUTPUT_PATH_DIR, PARTITION1, PARTITION2), + new HadoopHashTableSource( + PROJECT_ID, HASH_TABLE_OUTPUT_PATH_DIR, PARTITION2, STOP_ROW)); + testSourceSplits(ImmutableList.of(PARTITION1, PARTITION2), START_ROW, STOP_ROW, expected); + } + + @Test + public void testSplitEmptyStartRow() throws IOException { + // Row range [""-z) with splits on {d,g}. The data files will be for {["",d), [d,g), [g,z)}. + List> expected = + ImmutableList.of( + new HadoopHashTableSource( + PROJECT_ID, HASH_TABLE_OUTPUT_PATH_DIR, EMPTY_ROW_KEY, PARTITION1), + new HadoopHashTableSource( + PROJECT_ID, HASH_TABLE_OUTPUT_PATH_DIR, PARTITION1, PARTITION2), + new HadoopHashTableSource( + PROJECT_ID, HASH_TABLE_OUTPUT_PATH_DIR, PARTITION2, STOP_ROW)); + testSourceSplits(ImmutableList.of(PARTITION1, PARTITION2), EMPTY_ROW_KEY, STOP_ROW, expected); + } + + @Test + public void testSplitEmptyStopRow() throws IOException { + // Row range [a-"") with splits on {d,g}. The data files will be for {[a,d), [d,g), [g,"")}. + List> expected = + ImmutableList.of( + new HadoopHashTableSource( + PROJECT_ID, HASH_TABLE_OUTPUT_PATH_DIR, START_ROW, PARTITION1), + new HadoopHashTableSource( + PROJECT_ID, HASH_TABLE_OUTPUT_PATH_DIR, PARTITION1, PARTITION2), + new HadoopHashTableSource( + PROJECT_ID, HASH_TABLE_OUTPUT_PATH_DIR, PARTITION2, EMPTY_ROW_KEY)); + testSourceSplits(ImmutableList.of(PARTITION1, PARTITION2), START_ROW, EMPTY_ROW_KEY, expected); + } + + @Test + public void testSplitFullTableScan() throws IOException { + // Row range [""-"") with splits on {d,g}. The data files will be for {["",d), [d,g), [g,"")}. 
+ List> expected = + ImmutableList.of( + new HadoopHashTableSource( + PROJECT_ID, HASH_TABLE_OUTPUT_PATH_DIR, EMPTY_ROW_KEY, PARTITION1), + new HadoopHashTableSource( + PROJECT_ID, HASH_TABLE_OUTPUT_PATH_DIR, PARTITION1, PARTITION2), + new HadoopHashTableSource( + PROJECT_ID, HASH_TABLE_OUTPUT_PATH_DIR, PARTITION2, EMPTY_ROW_KEY)); + testSourceSplits( + ImmutableList.of(PARTITION1, PARTITION2), EMPTY_ROW_KEY, EMPTY_ROW_KEY, expected); + } + + @Test + public void testCreateReaderWithoutSplit() throws IOException { + source = + new HadoopHashTableSource( + PROJECT_ID, + HASH_TABLE_OUTPUT_PATH_DIR, + // When split is not called, start/stop are uninitialized. Start/stop are runtime params + // and are initialized in split/createReader. + null, + null, + new FakeTableHashWrapperFactory(fakeTableHashWrapper)); + // Setup boundaries on the TableHashWrapper to be used in Source. + fakeTableHashWrapper.startRowInclusive = START_ROW; + fakeTableHashWrapper.stopRowExclusive = STOP_ROW; + + // Create a new Reader + BoundedReader reader = source.createReader(null); + + // Validate that the reader was properly created. + assertEquals(HashBasedReader.class, reader.getClass()); + assertEquals(source, reader.getCurrentSource()); + HashBasedReader hashBasedReader = (HashBasedReader) reader; + assertEquals(START_ROW, hashBasedReader.startRowInclusive); + assertEquals(STOP_ROW, hashBasedReader.stopRowExclusive); + } + + @Test + public void testCreateReaderAfterSplit() throws IOException { + // A single partition will return 2 sources. + List> splitSources = + getSplitSources(ImmutableList.of(PARTITION1), START_ROW, STOP_ROW); + BoundedSource splitHashSource = splitSources.get(0); + + // Create a new Reader + BoundedReader reader = splitHashSource.createReader(null); + + // Validate that the reader was properly created. 
+ assertEquals(HashBasedReader.class, reader.getClass()); + assertEquals(splitHashSource, reader.getCurrentSource()); + HashBasedReader hashBasedReader = (HashBasedReader) reader; + assertEquals(START_ROW, hashBasedReader.startRowInclusive); + assertEquals(PARTITION1, hashBasedReader.stopRowExclusive); + } +} diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/validation/HashBasedSourceSerializationTest.java b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/validation/HashBasedSourceSerializationTest.java new file mode 100644 index 0000000000..216f5d219e --- /dev/null +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/validation/HashBasedSourceSerializationTest.java @@ -0,0 +1,127 @@ +/* + * Copyright 2020 Google Inc. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package com.google.cloud.bigtable.beam.validation; + +import static com.google.common.truth.Truth.assertWithMessage; + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.io.ObjectInputStream; +import java.io.ObjectOutputStream; +import java.lang.reflect.Field; +import java.lang.reflect.Modifier; +import junit.framework.TestCase; +import org.apache.beam.sdk.options.ValueProvider.StaticValueProvider; +import org.apache.hadoop.hbase.io.ImmutableBytesWritable; +import org.junit.Before; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.JUnit4; + +@RunWith(JUnit4.class) +public class HashBasedSourceSerializationTest extends TestCase { + + public static final String SOURCE_HASH_DIR = "gs://my-bucket/outputDir"; + public static final String PROJECT_ID = "test-project"; + private static final ImmutableBytesWritable START_ROW = + new ImmutableBytesWritable("a".getBytes()); + private static final ImmutableBytesWritable STOP_ROW = new ImmutableBytesWritable("y".getBytes()); + + @Before + public void setUp() throws Exception { + super.setUp(); + } + + @Test + public void testSerializeDefaultConstructor() throws IOException { + checkSerialization(new HadoopHashTableSource()); + } + + @Test + public void testSerializeWithValueProviders() throws IOException { + checkSerialization( + new HadoopHashTableSource( + StaticValueProvider.of(PROJECT_ID), StaticValueProvider.of(SOURCE_HASH_DIR))); + } + + @Test + public void testSerializeWithStartStop() throws IOException { + checkSerialization( + new HadoopHashTableSource( + StaticValueProvider.of(PROJECT_ID), + StaticValueProvider.of(SOURCE_HASH_DIR), + new ImmutableBytesWritable(START_ROW), + new ImmutableBytesWritable(STOP_ROW))); + } + + @Test + public void testBufferedSourceSerialize() { + checkSerialization( + new BufferedHadoopHashTableSource( + new HadoopHashTableSource( + StaticValueProvider.of(PROJECT_ID), 
StaticValueProvider.of(SOURCE_HASH_DIR)))); + } + + @Test + public void testBufferedSourceSerializeWithBatchSize() { + checkSerialization( + new BufferedHadoopHashTableSource( + new HadoopHashTableSource( + StaticValueProvider.of(PROJECT_ID), StaticValueProvider.of(SOURCE_HASH_DIR)), + 5)); + } + + private static void checkSerialization(Object source) { + try { + Object deserialized = serializeDeserialize(source); + checkClassDeclaresSerialVersionUid(source.getClass()); + assertEquals(source, deserialized); + } catch (IOException | ClassNotFoundException e) { + fail(e.toString()); + } + } + + private static void checkClassDeclaresSerialVersionUid(Class cls) { + String uid = "serialVersionUID"; + for (Field field : cls.getDeclaredFields()) { + if (uid.equals(field.getName())) { // equals(), not ==: compare contents, not references + int modifiers = field.getModifiers(); + assertWithMessage(field + " is not static").that(Modifier.isStatic(modifiers)).isTrue(); + assertWithMessage(field + " is not final").that(Modifier.isFinal(modifiers)).isTrue(); + assertWithMessage(field + " is not private").that(Modifier.isPrivate(modifiers)).isTrue(); + assertWithMessage(field + " must be long") + .that(field.getType().getSimpleName()) + .isEqualTo("long"); + return; + } + } + fail(cls + " does not declare serialVersionUID"); + } + + private static Object serializeDeserialize(Object obj) + throws IOException, ClassNotFoundException { + ByteArrayOutputStream bos = new ByteArrayOutputStream(); + try (ObjectOutputStream outStream = new ObjectOutputStream(bos)) { + outStream.writeObject(obj); + } + + ByteArrayInputStream bis = new ByteArrayInputStream(bos.toByteArray()); + try (ObjectInputStream inStream = new ObjectInputStream(bis)) { + return inStream.readObject(); + } + } +} diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/validation/RangeHashCoderTest.java 
b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/validation/RangeHashCoderTest.java 
new file mode 100644 index 0000000000..bad5cd8fff --- /dev/null +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/validation/RangeHashCoderTest.java @@ -0,0 +1,51 @@ +/* + * Copyright 2020 Google Inc. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.google.cloud.bigtable.beam.validation; + +import com.google.cloud.bigtable.beam.validation.HadoopHashTableSource.RangeHash; +import org.apache.beam.sdk.coders.CoderException; +import org.apache.beam.sdk.testing.CoderProperties; +import org.apache.beam.sdk.util.CoderUtils; +import org.apache.beam.sdk.values.TypeDescriptor; +import org.apache.hadoop.hbase.HConstants; +import org.apache.hadoop.hbase.io.ImmutableBytesWritable; +import org.junit.Assert; +import org.junit.Test; + +public class RangeHashCoderTest { + private static final RangeHashCoder TEST_CODER = new RangeHashCoder(); + private static final ImmutableBytesWritable START = + new ImmutableBytesWritable("Start".getBytes()); + private static final ImmutableBytesWritable STOP = new ImmutableBytesWritable("Stop".getBytes()); + private static final ImmutableBytesWritable HASH = new ImmutableBytesWritable("hash".getBytes()); + private static final ImmutableBytesWritable EMPTY = + new ImmutableBytesWritable(HConstants.EMPTY_BYTE_ARRAY); + + @Test + public void encodeRangeHash() throws Exception { + CoderProperties.coderDecodeEncodeEqual(TEST_CODER, RangeHash.of(START, STOP, 
HASH)); + } + + @Test(expected = CoderException.class) + public void encodeNullThrowsCoderException() throws Exception { + CoderUtils.encodeToByteArray(TEST_CODER, null); + } + + @Test + public void testEncodedTypeDescriptor() throws Exception { + Assert.assertEquals(TEST_CODER.getEncodedTypeDescriptor(), TypeDescriptor.of(RangeHash.class)); + } +} From 40ea4b0c1b8038e4b7e0045353a9021fe6838005 Mon Sep 17 00:00:00 2001 From: shitanshu verma Date: Thu, 4 Feb 2021 16:36:55 -0500 Subject: [PATCH 2/8] Fix lint error. --- .../validation/BufferedHadoopHashTableSource.java | 3 ++- .../ComputeAndValidateHashFromBigtableDoFnTest.java | 13 +++++-------- 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/BufferedHadoopHashTableSource.java b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/BufferedHadoopHashTableSource.java index eb018832ce..3cf415be5d 100644 --- a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/BufferedHadoopHashTableSource.java +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/BufferedHadoopHashTableSource.java @@ -111,7 +111,8 @@ public boolean equals(Object o) { return false; } BufferedHadoopHashTableSource that = (BufferedHadoopHashTableSource) o; - return maxBufferSize == that.maxBufferSize && Objects.equal(hashTableSource, that.hashTableSource); + return maxBufferSize == that.maxBufferSize + && Objects.equal(hashTableSource, that.hashTableSource); } @Override diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/validation/ComputeAndValidateHashFromBigtableDoFnTest.java b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/validation/ComputeAndValidateHashFromBigtableDoFnTest.java index 
ed725d8d37..d1fa56ba44 100644 --- a/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/validation/ComputeAndValidateHashFromBigtableDoFnTest.java +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/validation/ComputeAndValidateHashFromBigtableDoFnTest.java @@ -72,11 +72,9 @@ public class ComputeAndValidateHashFromBigtableDoFnTest { private static final int FIRST_ROW_INDEX = 20; private static final int LAST_ROW_INDEX = 31; - @Rule - public final BigtableEmulatorRule bigtableEmulator = BigtableEmulatorRule.create(); + @Rule public final BigtableEmulatorRule bigtableEmulator = BigtableEmulatorRule.create(); - @Rule - public final transient TestPipeline p = TestPipeline.create(); + @Rule public final transient TestPipeline p = TestPipeline.create(); private ComputeAndValidateHashFromBigtableDoFn doFn; @@ -166,9 +164,7 @@ private void writeDataToTable() throws IOException { table.put(puts); } - /** - * Deletes the row range [startIndex, stopIndex) - */ + /** Deletes the row range [startIndex, stopIndex) */ private void deleteRange(int startIndex, int stopIndex) throws IOException { for (int i = startIndex; i < stopIndex; i++) { table.delete(new Delete(getRowKey(i))); @@ -362,7 +358,8 @@ public void testMissingRanges() throws Exception { // Delete a range in middle deleteRange(23, 25); - // Delete row ranges at the end, bigtable scanner will finish with multiple row-ranges to process. + // Delete row ranges at the end, bigtable scanner will finish with multiple row-ranges to + // process. deleteRange(27, LAST_ROW_INDEX + 1); PCollection>>> input = From b91459be62a7c170bd2b182387ee95c85b6bc3a5 Mon Sep 17 00:00:00 2001 From: shitanshu verma Date: Fri, 5 Feb 2021 13:08:03 -0500 Subject: [PATCH 3/8] Fixing maven dependency. 
--- .../bigtable-beam-import/pom.xml | 16 ++++++++++------ .../src/test/generate_test_data.txt | 2 +- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/bigtable-dataflow-parent/bigtable-beam-import/pom.xml b/bigtable-dataflow-parent/bigtable-beam-import/pom.xml index 8ee5ba861b..ffc95bdf03 100644 --- a/bigtable-dataflow-parent/bigtable-beam-import/pom.xml +++ b/bigtable-dataflow-parent/bigtable-beam-import/pom.xml @@ -77,6 +77,16 @@ limitations under the License. + + com.google.api + api-common + 1.10.0 + + + com.google.code.gson + gson + 2.2.4 + org.apache.beam @@ -224,11 +234,6 @@ limitations under the License. 1.0.1 test - - com.google.auto.service - auto-service-annotations - 1.0-rc7 - com.google.cloud google-cloud-bigtable-emulator @@ -360,7 +365,6 @@ limitations under the License. - com.google.auto.value:auto-value commons-codec:commons-codec com.squareup.okhttp:okhttp org.apache.beam:beam-sdks-java-io-hadoop-common diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/generate_test_data.txt b/bigtable-dataflow-parent/bigtable-beam-import/src/test/generate_test_data.txt index 921caf2d6d..6e66d3e096 100644 --- a/bigtable-dataflow-parent/bigtable-beam-import/src/test/generate_test_data.txt +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/test/generate_test_data.txt @@ -110,7 +110,7 @@ hbase org.apache.hadoop.hbase.snapshot.ExportSnapshot -snapshot test-snapshot -c // Create the hashes for the table. Run the command from unix shell on an HBase // node. 
-hbase org.apache.hadoop.hbase.mapreduce.HashTable --batchsize=100 --numhashfiles=10 test /integration-test/hashtable +hbase org.apache.hadoop.hbase.mapreduce.HashTable --batchsize=10 --numhashfiles=10 test /integration-test/hashtable // Export the data into GCS hadoop fs -copyToLocal /integration-test /tmp/ From e7de6b662758ce1ebfb1dc1025b50120dae7569e Mon Sep 17 00:00:00 2001 From: shitanshu verma Date: Thu, 11 Feb 2021 13:30:01 -0500 Subject: [PATCH 4/8] Incorporating review feedback. --- .../bigtable-beam-import/pom.xml | 11 --- .../BufferedHadoopHashTableSource.java | 46 +++------- ...omputeAndValidateHashFromBigtableDoFn.java | 36 +++----- .../validation/HadoopHashTableSource.java | 91 +++++++++---------- .../beam/validation/RangeHashCoder.java | 2 +- .../beam/validation/SyncTableJob.java | 14 +-- .../beam/validation/SyncTableUtils.java | 4 +- .../beam/validation/TableHashWrapper.java | 4 +- .../validation/TableHashWrapperFactory.java | 2 +- .../beam/validation/TableHashWrapperImpl.java | 11 +-- .../mapreduce/BigtableTableHashAccessor.java | 4 +- .../beam/hbasesnapshots/EndToEndIT.java | 4 +- .../BufferedHadoopHashTableSourceTest.java | 2 +- ...teAndValidateHashFromBigtableDoFnTest.java | 81 ++++++++++++++--- .../beam/validation/FakeTableHashWrapper.java | 2 +- .../FakeTableHashWrapperFactory.java | 2 +- .../validation/HadoopHashBasedReaderTest.java | 10 +- .../validation/HadoopHashTableSourceTest.java | 2 +- .../HashBasedSourceSerializationTest.java | 2 +- .../beam/validation/RangeHashCoderTest.java | 2 +- 20 files changed, 170 insertions(+), 162 deletions(-) diff --git a/bigtable-dataflow-parent/bigtable-beam-import/pom.xml b/bigtable-dataflow-parent/bigtable-beam-import/pom.xml index ffc95bdf03..93a94e106d 100644 --- a/bigtable-dataflow-parent/bigtable-beam-import/pom.xml +++ b/bigtable-dataflow-parent/bigtable-beam-import/pom.xml @@ -77,17 +77,6 @@ limitations under the License. 
- - com.google.api - api-common - 1.10.0 - - - com.google.code.gson - gson - 2.2.4 - - org.apache.beam beam-sdks-java-core diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/BufferedHadoopHashTableSource.java b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/BufferedHadoopHashTableSource.java index 3cf415be5d..a616441655 100644 --- a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/BufferedHadoopHashTableSource.java +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/BufferedHadoopHashTableSource.java @@ -1,5 +1,5 @@ /* - * Copyright 2020 Google Inc. All Rights Reserved. + * Copyright 2021 Google Inc. All Rights Reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,13 +17,10 @@ import static com.google.cloud.bigtable.beam.validation.SyncTableUtils.immutableBytesToString; -import com.google.api.core.InternalApi; import com.google.cloud.bigtable.beam.validation.HadoopHashTableSource.RangeHash; import com.google.common.base.Objects; import com.google.common.base.Preconditions; import java.io.IOException; -import java.io.ObjectInputStream; -import java.io.ObjectOutputStream; import java.util.ArrayList; import java.util.List; import org.apache.beam.sdk.coders.Coder; @@ -33,8 +30,6 @@ import org.apache.beam.sdk.io.BoundedSource; import org.apache.beam.sdk.options.PipelineOptions; import org.apache.beam.sdk.values.KV; -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; import org.apache.hadoop.hbase.util.Bytes; /** @@ -45,19 +40,20 @@ *

Hadoop HashTable output is sorted by row-key and contains a row-range and hash. Beam * PCollections do not guarantee any ordering. To fetch a batch of ranges in 1 ReadRows operation, * this source buffers them and outputs a List guaranteeing the sorted order of ranges. + * + *

Emits a batch of sorted RangeHashes keyed by the start key of the first range. */ -@InternalApi class BufferedHadoopHashTableSource extends BoundedSource>> { private static final long serialVersionUID = 39842743L; - public static final Log LOG = LogFactory.getLog(BufferedHadoopHashTableSource.class); private static final int DEFAULT_BATCH_SIZE = 50; + private static final Coder>> CODER = + KvCoder.of(StringUtf8Coder.of(), ListCoder.of(RangeHashCoder.of()));; // Max number of RangeHashes to buffer. - private int maxBufferSize; - private HadoopHashTableSource hashTableSource; - private Coder>> coder; + private final int maxBufferSize; + private final HadoopHashTableSource hashTableSource; public BufferedHadoopHashTableSource(HadoopHashTableSource source) { this(source, DEFAULT_BATCH_SIZE); @@ -65,7 +61,6 @@ public BufferedHadoopHashTableSource(HadoopHashTableSource source) { public BufferedHadoopHashTableSource(HadoopHashTableSource hashTableSource, int maxBufferSize) { this.hashTableSource = hashTableSource; - this.coder = KvCoder.of(StringUtf8Coder.of(), ListCoder.of(RangeHashCoder.of())); this.maxBufferSize = maxBufferSize; } @@ -88,13 +83,13 @@ public List>>> split( @Override public Coder>> getOutputCoder() { - return coder; + return CODER; } @Override public long getEstimatedSizeBytes(PipelineOptions options) throws Exception { // HashTable data files don't expose a method to estimate size or lineCount. 
- return 0; + return hashTableSource.getEstimatedSizeBytes(options); } @Override @@ -130,21 +125,10 @@ public String toString() { + maxBufferSize; } - private void writeObject(ObjectOutputStream s) throws IOException { - s.writeObject(hashTableSource); - s.writeInt(maxBufferSize); - } - - private void readObject(ObjectInputStream s) throws IOException, ClassNotFoundException { - this.hashTableSource = (HadoopHashTableSource) s.readObject(); - this.coder = KvCoder.of(StringUtf8Coder.of(), ListCoder.of(RangeHashCoder.of())); - this.maxBufferSize = s.readInt(); - } - private static class BufferedHashBasedReader extends BoundedReader>> { - private BoundedReader hashReader; - private BufferedHadoopHashTableSource source; + private final BoundedReader hashReader; + private final BufferedHadoopHashTableSource source; private List buffer; @@ -181,6 +165,9 @@ private boolean bufferRangeHashes() throws IOException { @Override public boolean advance() throws IOException { + // Reset the buffer for next batch. + buffer = new ArrayList<>(source.maxBufferSize); + return bufferRangeHashes(); } @@ -188,15 +175,12 @@ public boolean advance() throws IOException { public KV> getCurrent() { // getCurrent only gets called when buffer is not empty. Preconditions.checkArgument(!buffer.isEmpty(), "Can not get current on empty buffer."); - List hashes = buffer; - // Reset the buffer for next batch. - buffer = new ArrayList<>(source.maxBufferSize); // GroupBy key is a string and not ImmutableBytesWritable because the WritableCoder is not // deterministic. The outputted PCollection is grouped by the K and needs a deterministic // coder. Having a String K leads to an unfortunate double encoding, ImmutableBytesWritable-> // HEX string -> UTF8 encoded string. The number of batches are significantly smaller than // data fetched from Bigtable and should not have meaningful impact on the job performance. 
- return KV.of(Bytes.toStringBinary(hashes.get(0).startInclusive.copyBytes()), hashes); + return KV.of(Bytes.toStringBinary(buffer.get(0).startInclusive.copyBytes()), buffer); } @Override diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/ComputeAndValidateHashFromBigtableDoFn.java b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/ComputeAndValidateHashFromBigtableDoFn.java index 3801465f2f..62984e8ce2 100644 --- a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/ComputeAndValidateHashFromBigtableDoFn.java +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/ComputeAndValidateHashFromBigtableDoFn.java @@ -1,5 +1,5 @@ /* - * Copyright 2020 Google Inc. All Rights Reserved. + * Copyright 2021 Google Inc. All Rights Reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,7 +17,7 @@ import static com.google.cloud.bigtable.beam.validation.SyncTableUtils.immutableBytesToString; -import com.google.api.core.InternalApi; +import com.google.bigtable.repackaged.com.google.common.base.Preconditions; import com.google.cloud.bigtable.beam.AbstractCloudBigtableTableDoFn; import com.google.cloud.bigtable.beam.CloudBigtableConfiguration; import com.google.cloud.bigtable.beam.TemplateUtils; @@ -45,7 +45,6 @@ * A {@link DoFn} that takes a row range and hash from HBase and validates the hash from rows read * from Cloud Bigtable. 
*/ -@InternalApi class ComputeAndValidateHashFromBigtableDoFn extends AbstractCloudBigtableTableDoFn>>, RangeHash> { @@ -91,7 +90,7 @@ public void processElement(ProcessContext context) throws Exception { for (List rangeHashes : context.element().getValue()) { if (rangeHashes.isEmpty()) { // No rows ranges found, return; - return; + continue; } ImmutableBytesWritable rangeStartInclusive = rangeHashes.get(0).startInclusive; @@ -126,19 +125,16 @@ public void processElement(ProcessContext context) throws Exception { // rangeHashes until rowKey's range is found. while (!isWithinUpperBound(currentRangeHash.stopExclusive, rowKey)) { validateBatchHash(context, resultHasher, currentRangeHash); - if (!rangeHashIterator.hasNext()) { - // THIS SHOULD NEVER HAPPEN. Bigtable is being scanned till the last - // RangeHash.endKeyExclusive(), so bigtable's result should not outlast the - // rangeHashes. - throw new IllegalStateException( - "Buffer reached to end while scan is still active at row :" - + immutableBytesToString(result.getRow()) - + ". Affected Range: [" - + immutableBytesToString(rangeStartInclusive) - + ", " - + immutableBytesToString(rangeEndExclusive) - + ")."); - } + // THIS SHOULD NEVER HAPPEN. Bigtable is being scanned till the last + // RangeHash.endKeyExclusive(), so bigtable's result should not outlast the + // rangeHashes. + Preconditions.checkState( + rangeHashIterator.hasNext(), + "Buffer reached to end while scan is still active at row : %s. " + + "Affected Range: [%s, %s)." + + immutableBytesToString(result.getRow()) + + immutableBytesToString(rangeStartInclusive) + + immutableBytesToString(rangeEndExclusive)); currentRangeHash = rangeHashIterator.next(); } @@ -195,10 +191,6 @@ private ResultScanner createBigtableScan(byte[] startKeyInclusive, byte[] stopKe /** * Determines if row >= stopExclusive for a row range (start, stopExclusive). Empty stopExclusive * represents a range with no upper bound. 
- * - * @param stopExclusive - * @param row - * @return */ private boolean isWithinUpperBound( ImmutableBytesWritable stopExclusive, ImmutableBytesWritable row) { @@ -216,7 +208,7 @@ private void validateBatchHash( matches.inc(); } // Start a new batch - resultHasher.startBatch(new ImmutableBytesWritable(currentRangeHash.stopExclusive)); + resultHasher.startBatch(currentRangeHash.stopExclusive); } private void reportMismatch(ProcessContext context, RangeHash currentRangeHash) { diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/HadoopHashTableSource.java b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/HadoopHashTableSource.java index 20b693963a..59095c8b54 100644 --- a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/HadoopHashTableSource.java +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/HadoopHashTableSource.java @@ -18,11 +18,9 @@ import static com.google.cloud.bigtable.beam.validation.SyncTableUtils.createConfiguration; import static com.google.cloud.bigtable.beam.validation.SyncTableUtils.immutableBytesToString; -import autovalue.shaded.com.google$.common.annotations.$VisibleForTesting; -import com.google.api.core.InternalApi; +import com.google.bigtable.repackaged.com.google.common.annotations.VisibleForTesting; import com.google.cloud.bigtable.beam.validation.HadoopHashTableSource.RangeHash; import com.google.cloud.bigtable.beam.validation.TableHashWrapper.TableHashReader; -import com.google.common.annotations.VisibleForTesting; import com.google.common.base.Objects; import com.google.common.base.Preconditions; import com.google.common.collect.ImmutableList; @@ -32,6 +30,7 @@ import java.io.Serializable; import java.util.ArrayList; import java.util.List; +import javax.annotation.Nullable; import 
org.apache.beam.sdk.coders.Coder; import org.apache.beam.sdk.coders.DefaultCoder; import org.apache.beam.sdk.io.BoundedSource; @@ -46,7 +45,6 @@ * A beam source to read output of Hadoop HashTable job. The source creates 1 workitem per HashTable * data file and emits a row-range/hash pair. */ -@InternalApi class HadoopHashTableSource extends BoundedSource implements Serializable { private static final long serialVersionUID = 2383724L; @@ -120,9 +118,9 @@ public int hashCode() { private RangeHashCoder coder; // Row range owned by this source. - @VisibleForTesting ImmutableBytesWritable startRowInclusive; + @VisibleForTesting @Nullable ImmutableBytesWritable startRowInclusive; - @VisibleForTesting ImmutableBytesWritable stopRowExclusive; + @VisibleForTesting @Nullable ImmutableBytesWritable stopRowExclusive; private TableHashWrapperFactory tableHashWrapperFactory; @@ -143,12 +141,12 @@ public HadoopHashTableSource( * Constructor to initialize a HadoopHashTableSource for a given row-range. Used for creating * split sources. 
*/ - @$VisibleForTesting + @VisibleForTesting HadoopHashTableSource( ValueProvider projectId, ValueProvider sourceHashDir, - ImmutableBytesWritable startRowInclusive, - ImmutableBytesWritable stopRowExclusive) { + @Nullable ImmutableBytesWritable startRowInclusive, + @Nullable ImmutableBytesWritable stopRowExclusive) { this( projectId, sourceHashDir, @@ -161,8 +159,8 @@ public HadoopHashTableSource( HadoopHashTableSource( ValueProvider projectId, ValueProvider hadoopHashTableOutputDir, - ImmutableBytesWritable startRowInclusive, - ImmutableBytesWritable stopRowExclusive, + @Nullable ImmutableBytesWritable startRowInclusive, + @Nullable ImmutableBytesWritable stopRowExclusive, TableHashWrapperFactory tableHashWrapperFactory) { this.coder = new RangeHashCoder(); this.projectId = projectId; @@ -192,15 +190,15 @@ public List> split( new HadoopHashTableSource( projectId, sourceHashDir, - new ImmutableBytesWritable(hash.getStartRow()), - new ImmutableBytesWritable(hash.getStopRow()), + hash.getStartRow(), + hash.getStopRow(), tableHashWrapperFactory)); return splitSources; } // Use the HashTable start key. The value is HConstants.EMPTY_START_ROW for full table scan. - ImmutableBytesWritable startRow = new ImmutableBytesWritable(hash.getStartRow()); - ImmutableBytesWritable stopRow = new ImmutableBytesWritable(hash.getStopRow()); + ImmutableBytesWritable startRow = hash.getStartRow(); + ImmutableBytesWritable stopRow = hash.getStopRow(); // The output of HashTable is organized as partition file and a set of datafiles. 
// Partition file contains a list of partitions, these partitions split the key-range of a table @@ -238,7 +236,7 @@ public List> split( projectId, sourceHashDir, partitions.get(numPartitions - 1), - new ImmutableBytesWritable(stopRow), + stopRow, tableHashWrapperFactory)); LOG.info("Returning " + splitSources.size() + " sources from " + numPartitions + " partitions"); return splitSources; @@ -270,11 +268,11 @@ public BoundedReader createReader(PipelineOptions options) throws IOException { return new HashBasedReader( this, - new ImmutableBytesWritable(startRowInclusive), - new ImmutableBytesWritable(stopRowExclusive), + startRowInclusive, + stopRowExclusive, hash.newReader( createConfiguration(this.projectId.get(), this.sourceHashDir.get()), - new ImmutableBytesWritable(startRowInclusive))); + startRowInclusive)); } @Override @@ -307,7 +305,6 @@ public String toString() { } private void writeObject(ObjectOutputStream s) throws IOException { - // s.defaultWriteObject(); s.writeObject(projectId); s.writeObject(sourceHashDir); s.writeObject(tableHashWrapperFactory); @@ -328,50 +325,48 @@ private void writeObject(ObjectOutputStream s) throws IOException { } private void readObject(ObjectInputStream s) throws IOException, ClassNotFoundException { - // s.defaultReadObject(); - this.projectId = (ValueProvider) s.readObject(); - this.sourceHashDir = (ValueProvider) s.readObject(); - this.tableHashWrapperFactory = (TableHashWrapperFactory) s.readObject(); + projectId = (ValueProvider) s.readObject(); + sourceHashDir = (ValueProvider) s.readObject(); + tableHashWrapperFactory = (TableHashWrapperFactory) s.readObject(); // start/stop can be null, they are preceded by a boolean indicating their presence. 
if (s.readBoolean() == true) { - this.startRowInclusive = new ImmutableBytesWritable((byte[]) s.readObject()); + startRowInclusive = new ImmutableBytesWritable((byte[]) s.readObject()); } if (s.readBoolean() == true) { - this.stopRowExclusive = new ImmutableBytesWritable((byte[]) s.readObject()); + stopRowExclusive = new ImmutableBytesWritable((byte[]) s.readObject()); } } @VisibleForTesting static class HashBasedReader extends BoundedReader { - final HadoopHashTableSource source; - final TableHashReader reader; + private final HadoopHashTableSource source; + private final TableHashReader reader; - final ImmutableBytesWritable startRowInclusive; - final ImmutableBytesWritable stopRowExclusive; + @VisibleForTesting final ImmutableBytesWritable startRowInclusive; + @VisibleForTesting final ImmutableBytesWritable stopRowExclusive; - long numKeys = 0; + private long numKeys = 0; // Flag indicating that this workitem is finished. - boolean isDone = false; - ImmutableBytesWritable currentRangeStartKey; + private boolean isDone = false; + private ImmutableBytesWritable currentRangeStartKey; // Hash for the current range. - ImmutableBytesWritable currentHash; - RangeHash currentRangeHash; + private ImmutableBytesWritable currentHash; + private RangeHash currentRangeHash; public HashBasedReader( HadoopHashTableSource source, ImmutableBytesWritable startRowInclusive, ImmutableBytesWritable stopRowExclusive, TableHashReader reader) { - this.reader = reader; this.source = source; this.startRowInclusive = startRowInclusive; this.stopRowExclusive = stopRowExclusive; + this.reader = reader; } @Override public boolean start() throws IOException { - // NO CHECKED EXCEPTIONS HERE. 
LOG.debug( "Starting a new reader at key range [" + immutableBytesToString(startRowInclusive) @@ -401,14 +396,9 @@ public boolean advance() throws IOException { ImmutableBytesWritable startKey = this.currentRangeStartKey; ImmutableBytesWritable hash = this.currentHash; - if (!readNextKey()) { - this.currentRangeHash = RangeHash.of(startKey, stopRowExclusive, hash); - // return true since we have lastBatchStartKey to emit. Set isDone=true to prevent reading - // from a potentially exhausted reader. - isDone = true; - } else { - this.currentRangeHash = RangeHash.of(startKey, reader.getCurrentKey(), hash); - } + // if there is nothing to read, we are done. readNextKey advances the currentRangeStartKey. + isDone = !readNextKey(); + currentRangeHash = RangeHash.of(startKey, currentRangeStartKey, hash); return true; } @@ -417,22 +407,23 @@ public boolean advance() throws IOException { private boolean readNextKey() throws IOException { if (reader.next()) { numKeys++; - this.currentRangeStartKey = reader.getCurrentKey(); + currentRangeStartKey = reader.getCurrentKey(); if ( // StopRow is not set, everything is in bounds. (stopRowExclusive.equals(HConstants.EMPTY_END_ROW) || currentRangeStartKey.compareTo(stopRowExclusive) < 0)) { // currentKey < stopKey // There is a key to read and the key is within the bounds of this workitem. Return true. - this.currentHash = reader.getCurrentHash(); + currentHash = reader.getCurrentHash(); return true; } else { // There is a key to read but its outside of the bounds of this workitem. - this.currentHash = null; + currentHash = null; return false; } } - // Nothing left to read for this workitem. - currentRangeStartKey = null; + // Nothing left to read for this workitem. Next range would have started from + // stopRowExclusive. 
+ currentRangeStartKey = stopRowExclusive; currentHash = null; return false; } diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/RangeHashCoder.java b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/RangeHashCoder.java index 6799d63872..d6341a08f2 100644 --- a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/RangeHashCoder.java +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/RangeHashCoder.java @@ -1,5 +1,5 @@ /* - * Copyright 2020 Google Inc. All Rights Reserved. + * Copyright 2021 Google Inc. All Rights Reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/SyncTableJob.java b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/SyncTableJob.java index a664ea2602..56b38fc3cb 100644 --- a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/SyncTableJob.java +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/SyncTableJob.java @@ -1,5 +1,5 @@ /* - * Copyright 2020 Google Inc. All Rights Reserved. + * Copyright 2021 Google Inc. All Rights Reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -16,10 +16,10 @@ package com.google.cloud.bigtable.beam.validation; import com.google.bigtable.repackaged.com.google.api.core.InternalExtensionOnly; +import com.google.bigtable.repackaged.com.google.gson.Gson; import com.google.cloud.bigtable.beam.sequencefiles.Utils; import com.google.cloud.bigtable.beam.validation.HadoopHashTableSource.RangeHash; import com.google.common.annotations.VisibleForTesting; -import com.google.gson.Gson; import java.util.List; import org.apache.beam.sdk.Pipeline; import org.apache.beam.sdk.PipelineResult; @@ -183,17 +183,11 @@ public static Pipeline buildPipeline(SyncTableOptions opts) { static class RangeHashToString extends SimpleFunction { // TODO maybe explore a sequenceFile sink for RangeHash. Hadoop jobs using this output may be // easier to write for sequence file. - - // GSON is not serializable, keep it transient. Member variable to avoid creating a Gson object - // per apply call. - private transient Gson gson = null; + private static final Gson GSON = new Gson(); @Override public String apply(RangeHash input) { - if (gson == null) { - gson = new Gson(); - } - return gson.toJson(input); + return GSON.toJson(input); } } } diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/SyncTableUtils.java b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/SyncTableUtils.java index 2f0c5cc4cc..cc92bea6a4 100644 --- a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/SyncTableUtils.java +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/SyncTableUtils.java @@ -1,5 +1,5 @@ /* - * Copyright 2020 Google Inc. All Rights Reserved. + * Copyright 2021 Google Inc. All Rights Reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -15,12 +15,14 @@ */ package com.google.cloud.bigtable.beam.validation; +import com.google.bigtable.repackaged.com.google.api.core.InternalApi; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hbase.HBaseConfiguration; import org.apache.hadoop.hbase.io.ImmutableBytesWritable; import org.apache.hadoop.hbase.util.Bytes; /** Utility class for SyncTable job. */ +@InternalApi public class SyncTableUtils { private SyncTableUtils() {} diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/TableHashWrapper.java b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/TableHashWrapper.java index 2f75c5722a..55200570ed 100644 --- a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/TableHashWrapper.java +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/TableHashWrapper.java @@ -1,5 +1,5 @@ /* - * Copyright 2020 Google Inc. All Rights Reserved. + * Copyright 2021 Google Inc. All Rights Reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -15,7 +15,7 @@ */ package com.google.cloud.bigtable.beam.validation; -import com.google.api.core.InternalApi; +import com.google.bigtable.repackaged.com.google.api.core.InternalApi; import com.google.common.collect.ImmutableList; import java.io.Closeable; import java.io.IOException; diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/TableHashWrapperFactory.java b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/TableHashWrapperFactory.java index 262aadc7c5..67776299a4 100644 --- a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/TableHashWrapperFactory.java +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/TableHashWrapperFactory.java @@ -1,5 +1,5 @@ /* - * Copyright 2020 Google Inc. All Rights Reserved. + * Copyright 2021 Google Inc. All Rights Reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/TableHashWrapperImpl.java b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/TableHashWrapperImpl.java index 71a0f6ddaa..b04bd538a6 100644 --- a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/TableHashWrapperImpl.java +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/TableHashWrapperImpl.java @@ -1,5 +1,5 @@ /* - * Copyright 2020 Google Inc. All Rights Reserved. + * Copyright 2021 Google Inc. All Rights Reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -36,11 +36,10 @@ static TableHashWrapper create(Configuration conf, String hashTableOutputDir) th TableHashWrapper tableHashWrapper = new TableHashWrapperImpl(tableHash); Preconditions.checkArgument( tableHashWrapper.getNumHashFiles() == (tableHashWrapper.getPartitions().size() + 1), - String.format( - "Corrupt hashtable output. %d hash files for %d partitions. Expected %d files.", - tableHashWrapper.getNumHashFiles(), - tableHashWrapper.getPartitions().size(), - tableHashWrapper.getPartitions().size() + 1)); + "Corrupt hashtable output. %d hash files for %d partitions. Expected %d files.", + tableHashWrapper.getNumHashFiles(), + tableHashWrapper.getPartitions().size(), + tableHashWrapper.getPartitions().size() + 1); return tableHashWrapper; } diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/org/apache/hadoop/hbase/mapreduce/BigtableTableHashAccessor.java b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/org/apache/hadoop/hbase/mapreduce/BigtableTableHashAccessor.java index a5312d6c52..a7db0add1c 100644 --- a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/org/apache/hadoop/hbase/mapreduce/BigtableTableHashAccessor.java +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/org/apache/hadoop/hbase/mapreduce/BigtableTableHashAccessor.java @@ -1,5 +1,5 @@ /* - * Copyright 2020 Google Inc. All Rights Reserved. + * Copyright 2021 Google Inc. All Rights Reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,6 +15,7 @@ */ package org.apache.hadoop.hbase.mapreduce; +import com.google.bigtable.repackaged.com.google.api.core.InternalApi; import com.google.common.collect.ImmutableList; import java.io.IOException; import org.apache.hadoop.hbase.client.Result; @@ -24,6 +25,7 @@ import org.apache.hadoop.hbase.mapreduce.HashTable.TableHash; /** A helper class to access package private fields of HashTable.TableHash. 
*/ +@InternalApi public class BigtableTableHashAccessor { // Restrict object creation. This class should only be used to access state from TableHash. diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/hbasesnapshots/EndToEndIT.java b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/hbasesnapshots/EndToEndIT.java index e7f777f9bc..3c8e26cbfa 100644 --- a/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/hbasesnapshots/EndToEndIT.java +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/hbasesnapshots/EndToEndIT.java @@ -166,6 +166,8 @@ public void teardown() throws IOException { for (GcsPath path : paths) { pathStrs.add(path.toString()); } + // TODO: cleanup fails when tests time out. Add an orphan cleaner in setup(). See: + // https://github.com/googleapis/java-bigtable/blob/35588d89b9b243eb691a29d3aff16b9f5a08fbb8/google-cloud-bigtable/src/test/java/com/google/cloud/bigtable/test_helpers/env/AbstractTestEnv.java#L108-L119 this.gcsUtil.remove(pathStrs); } @@ -309,7 +311,7 @@ public void testHBaseSnapshotImportWithCorruptions() throws Exception { List outputs = gcsUtil.expand(GcsPath.fromUri(syncTableOutputDir + "*")); - System.out.println("OUTPUTS: " + outputs); + LOG.warn("OUTPUTS: " + outputs); // FileSink will shard the outputs and will created >1 files. Assert.assertTrue(outputs.size() > 1); // TODO read the files and validate that the ranges are there instead of size check.
diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/validation/BufferedHadoopHashTableSourceTest.java b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/validation/BufferedHadoopHashTableSourceTest.java index ee574a9c2d..96d5960423 100644 --- a/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/validation/BufferedHadoopHashTableSourceTest.java +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/validation/BufferedHadoopHashTableSourceTest.java @@ -1,5 +1,5 @@ /* - * Copyright 2020 Google Inc. All Rights Reserved. + * Copyright 2021 Google Inc. All Rights Reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/validation/ComputeAndValidateHashFromBigtableDoFnTest.java b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/validation/ComputeAndValidateHashFromBigtableDoFnTest.java index d1fa56ba44..2e9b6fd8ed 100644 --- a/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/validation/ComputeAndValidateHashFromBigtableDoFnTest.java +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/validation/ComputeAndValidateHashFromBigtableDoFnTest.java @@ -1,5 +1,5 @@ /* - * Copyright 2020 Google Inc. All Rights Reserved. + * Copyright 2021 Google Inc. All Rights Reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -29,6 +29,11 @@ import java.util.ArrayList; import java.util.Arrays; import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; +import java.util.stream.StreamSupport; +import org.apache.beam.sdk.PipelineResult; +import org.apache.beam.sdk.metrics.MetricQueryResults; import org.apache.beam.sdk.options.ValueProvider.StaticValueProvider; import org.apache.beam.sdk.testing.PAssert; import org.apache.beam.sdk.testing.TestPipeline; @@ -47,6 +52,7 @@ import org.apache.hadoop.hbase.io.ImmutableBytesWritable; import org.apache.hadoop.hbase.mapreduce.BigtableTableHashAccessor.BigtableResultHasher; import org.junit.After; +import org.junit.Assert; import org.junit.Before; import org.junit.Rule; import org.junit.Test; @@ -192,6 +198,20 @@ private RangeHash createHash(byte[] startRow, byte[] stopRow) throws IOException hasher.getBatchHash()); } + private void validateCounters( + PipelineResult result, Long expectedMatches, Long expectedMismatches) { + MetricQueryResults metrics = result.metrics().allMetrics(); + Map counters = + StreamSupport.stream(metrics.getCounters().spliterator(), false) + .collect(Collectors.toMap((m) -> m.getName().getName(), (m) -> m.getAttempted())); + if (expectedMatches > 0) { + Assert.assertEquals(expectedMatches, counters.get("ranges_matched")); + } + if (expectedMismatches > 0) { + Assert.assertEquals(expectedMismatches, counters.get("ranges_not_matched")); + } + } + ////////// Happy case tests for various setups////////////////////// @Test public void testHashMatchesForMultipleRange() throws Exception { @@ -203,7 +223,8 @@ public void testHashMatchesForMultipleRange() throws Exception { PCollection output = input.apply(ParDo.of(doFn)); PAssert.that(output).empty(); - p.run(); + PipelineResult result = p.run(); + validateCounters(result, 2L, 0L); } @Test @@ -215,7 +236,8 @@ public void testHashMatchesForSingleRange() throws Exception { PCollection output = input.apply(ParDo.of(doFn)); 
PAssert.that(output).containsInAnyOrder(); - p.run(); + PipelineResult result = p.run(); + validateCounters(result, 1L, 0L); } @Test @@ -228,7 +250,8 @@ public void testHashMatchesForFullTableScanWithMultipleRange() throws Exception PCollection output = input.apply(ParDo.of(doFn)); PAssert.that(output).empty(); - p.run(); + PipelineResult result = p.run(); + validateCounters(result, 2L, 0L); } @Test @@ -242,7 +265,31 @@ public void testHashMatchesForMultipleSingleRowRange() throws Exception { PCollection output = input.apply(ParDo.of(doFn)); PAssert.that(output).empty(); - p.run(); + PipelineResult result = p.run(); + validateCounters(result, 3L, 0L); + } + + ///////////////// Test mismatches with multiple ranges per Key in KV<> //////////////////// + @Test + public void testHashMisMatchesForMultipleRangeAcrossKV() throws Exception { + hashes.add(createHash(getRowKey(21), getRowKey(24))); + hashes.add(createHash(getRowKey(24), getRowKey(28))); + + // Corrupt both the ranges + table.delete(new Delete(getRowKey(21)).addColumns(CF, COL, TS)); + table.put(new Put(getRowKey(24)).addColumn(CF2, COL, TS, getValue(20, 0))); + + PCollection>>> input = + p.apply( + Create.of( + KV.of( + new String(getRowKey(21)), + Arrays.asList(Arrays.asList(hashes.get(0)), Arrays.asList(hashes.get(1)))))); + + PCollection output = input.apply(ParDo.of(doFn)); + PAssert.that(output).containsInAnyOrder(hashes); + PipelineResult result = p.run(); + validateCounters(result, 0L, 2L); } ///////////////// Test mismatches when Bigtable has extra rows //////////////////// @@ -260,7 +307,8 @@ public void testAdditionalCellInMiddle() throws Exception { PCollection output = input.apply(ParDo.of(doFn)); PAssert.that(output).containsInAnyOrder(hashes.get(1)); - p.run(); + PipelineResult result = p.run(); + validateCounters(result, 2L, 1L); } @Test @@ -280,7 +328,8 @@ public void testAdditionalRowsAtEnds() throws Exception { PCollection output = input.apply(ParDo.of(doFn)); 
PAssert.that(output).containsInAnyOrder(hashes.get(0), hashes.get(2)); - p.run(); + PipelineResult result = p.run(); + validateCounters(result, 1L, 2L); } ///////////////////// Test different values /////////////////////////// @@ -314,7 +363,8 @@ public void testDifferentValues() throws Exception { PCollection output = input.apply(ParDo.of(doFn)); PAssert.that(output) .containsInAnyOrder(hashes.get(0), hashes.get(1), hashes.get(2), hashes.get(3)); - p.run(); + PipelineResult result = p.run(); + validateCounters(result, 1L, 4L); } ////////////////// Tests with CBT missing data ////////////////////////////// @@ -340,7 +390,8 @@ public void testMissingRows() throws Exception { PCollection output = input.apply(ParDo.of(doFn)); PAssert.that(output).containsInAnyOrder(hashes.get(0), hashes.get(2), hashes.get(4)); - p.run(); + PipelineResult result = p.run(); + validateCounters(result, 2L, 3L); } @Test @@ -368,7 +419,8 @@ public void testMissingRanges() throws Exception { PCollection output = input.apply(ParDo.of(doFn)); PAssert.that(output) .containsInAnyOrder(hashes.get(0), hashes.get(2), hashes.get(4), hashes.get(5)); - p.run(); + PipelineResult result = p.run(); + validateCounters(result, 2L, 4L); } @Test @@ -385,7 +437,8 @@ public void testCbtEmpty() throws Exception { PCollection output = input.apply(ParDo.of(doFn)); PAssert.that(output).containsInAnyOrder(hashes); - p.run(); + PipelineResult result = p.run(); + validateCounters(result, 0L, 3L); } ////////////////////// Test that scan is used from TableHash.//////////////////////// @@ -404,7 +457,8 @@ public void testScanFromTableHash() throws Exception { PCollection output = input.apply(ParDo.of(doFn)); PAssert.that(output).containsInAnyOrder(hashes); - p.run(); + PipelineResult result = p.run(); + validateCounters(result, 0L, 3L); } ////////////////////// Combination of different cases ////////////////////////////////// @@ -436,6 +490,7 @@ public void testMismatchesComprehensive() throws Exception { PCollection 
output = input.apply(ParDo.of(doFn)); PAssert.that(output) .containsInAnyOrder(hashes.get(0), hashes.get(2), hashes.get(4), hashes.get(5)); - p.run(); + PipelineResult result = p.run(); + validateCounters(result, 2L, 4L); } } diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/validation/FakeTableHashWrapper.java b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/validation/FakeTableHashWrapper.java index 6e3e5f004d..04cce0b1cd 100644 --- a/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/validation/FakeTableHashWrapper.java +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/validation/FakeTableHashWrapper.java @@ -1,5 +1,5 @@ /* - * Copyright 2020 Google Inc. All Rights Reserved. + * Copyright 2021 Google Inc. All Rights Reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/validation/FakeTableHashWrapperFactory.java b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/validation/FakeTableHashWrapperFactory.java index 9a3acc19a9..2e65e3b855 100644 --- a/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/validation/FakeTableHashWrapperFactory.java +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/validation/FakeTableHashWrapperFactory.java @@ -1,5 +1,5 @@ /* - * Copyright 2020 Google Inc. All Rights Reserved. + * Copyright 2021 Google Inc. All Rights Reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/validation/HadoopHashBasedReaderTest.java b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/validation/HadoopHashBasedReaderTest.java index 20abf02d06..fa88a56d14 100644 --- a/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/validation/HadoopHashBasedReaderTest.java +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/validation/HadoopHashBasedReaderTest.java @@ -1,5 +1,5 @@ /* - * Copyright 2020 Google Inc. All Rights Reserved. + * Copyright 2021 Google Inc. All Rights Reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -44,8 +44,6 @@ public class HadoopHashBasedReaderTest { new ImmutableBytesWritable("AAAA".getBytes()); private static final ImmutableBytesWritable STOP_ROW = new ImmutableBytesWritable("ZZZZ".getBytes()); - private static final ImmutableBytesWritable POST_STOP_ROW = - new ImmutableBytesWritable("z".getBytes()); // Lowercase z is lexicographically > uppercase Z private static final ImmutableBytesWritable EMPTY_ROW = new ImmutableBytesWritable(HConstants.EMPTY_BYTE_ARRAY); private static final ImmutableBytesWritable START_HASH = @@ -139,7 +137,7 @@ public void testHashReaderWithEmptyEndRow() throws IOException { @Test public void testHashReaderWorkItemEndedOnFirstBatch() throws IOException { // Setup 1 entry in this hashtable datafile. This entry is outside of the workitem's row - fakeTableHashWrapper.hashes.add(KV.of(POST_STOP_ROW, START_HASH)); + fakeTableHashWrapper.hashes.add(KV.of(STOP_ROW, START_HASH)); // Source will be empty as no hashes fall in its bounds. 
assertEquals(new ArrayList(), SourceTestUtils.readFromSource(hashTableSource, null)); } @@ -159,8 +157,8 @@ public void testHashReaderWorkItemEndedOnSecondEntry() throws IOException { public void testHashReaderWorkItemEndedAfterMultipleBatches() throws IOException { // Setup 4 entries in this hashtable datafile. List expected = setupTestData(START_ROW, STOP_ROW, 4); - // Add a next entry after the stop row. Reader should stop and read just 4 entry. - fakeTableHashWrapper.hashes.add(KV.of(POST_STOP_ROW, getHash(100))); + // Add a next entry at the stop row. The reader should stop there and read just the 4 entries. + fakeTableHashWrapper.hashes.add(KV.of(STOP_ROW, getHash(100))); assertEquals(expected, SourceTestUtils.readFromSource(hashTableSource, null)); } diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/validation/HadoopHashTableSourceTest.java b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/validation/HadoopHashTableSourceTest.java index bc79f4300b..a3aba3f756 100644 --- a/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/validation/HadoopHashTableSourceTest.java +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/validation/HadoopHashTableSourceTest.java @@ -1,5 +1,5 @@ /* - * Copyright 2020 Google Inc. All Rights Reserved. + * Copyright 2021 Google Inc. All Rights Reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License.
diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/validation/HashBasedSourceSerializationTest.java b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/validation/HashBasedSourceSerializationTest.java index 216f5d219e..8c7f6cc8c4 100644 --- a/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/validation/HashBasedSourceSerializationTest.java +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/validation/HashBasedSourceSerializationTest.java @@ -1,5 +1,5 @@ /* - * Copyright 2020 Google Inc. All Rights Reserved. + * Copyright 2021 Google Inc. All Rights Reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/validation/RangeHashCoderTest.java b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/validation/RangeHashCoderTest.java index bad5cd8fff..5f644e3b50 100644 --- a/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/validation/RangeHashCoderTest.java +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/validation/RangeHashCoderTest.java @@ -1,5 +1,5 @@ /* - * Copyright 2020 Google Inc. All Rights Reserved. + * Copyright 2021 Google Inc. All Rights Reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. From 5093d87bb9d67e5617dcc0e7859dbe2d9dfad67b Mon Sep 17 00:00:00 2001 From: shitanshu verma Date: Thu, 11 Feb 2021 14:08:34 -0500 Subject: [PATCH 5/8] Fixing maven build issues. 
--- bigtable-dataflow-parent/bigtable-beam-import/pom.xml | 6 ++++++ .../bigtable/beam/validation/FakeTableHashWrapper.java | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/bigtable-dataflow-parent/bigtable-beam-import/pom.xml b/bigtable-dataflow-parent/bigtable-beam-import/pom.xml index 93a94e106d..b1d909f25a 100644 --- a/bigtable-dataflow-parent/bigtable-beam-import/pom.xml +++ b/bigtable-dataflow-parent/bigtable-beam-import/pom.xml @@ -229,6 +229,11 @@ limitations under the License. 0.124.0 test + + com.google.code.findbugs + jsr305 + ${jsr305.version} + @@ -354,6 +359,7 @@ limitations under the License. + com.google.auto.value:auto-value commons-codec:commons-codec com.squareup.okhttp:okhttp org.apache.beam:beam-sdks-java-io-hadoop-common diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/validation/FakeTableHashWrapper.java b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/validation/FakeTableHashWrapper.java index 04cce0b1cd..ee2b6814e2 100644 --- a/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/validation/FakeTableHashWrapper.java +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/validation/FakeTableHashWrapper.java @@ -15,8 +15,8 @@ */ package com.google.cloud.bigtable.beam.validation; +import com.google.bigtable.repackaged.com.google.gson.Gson; import com.google.common.collect.ImmutableList; -import com.google.gson.Gson; import java.io.IOException; import java.io.ObjectInputStream; import java.io.ObjectOutputStream; From 4073bf63124a87b47e443769292992ebdb3cc4af Mon Sep 17 00:00:00 2001 From: shitanshu verma Date: Fri, 12 Feb 2021 10:50:25 -0500 Subject: [PATCH 6/8] Adding validation of mismatches in integration tests. 
--- .../bigtable-beam-import/pom.xml | 1 - .../validation/HadoopHashTableSource.java | 4 +- .../beam/hbasesnapshots/EndToEndIT.java | 111 ++++++++++++++---- 3 files changed, 89 insertions(+), 27 deletions(-) diff --git a/bigtable-dataflow-parent/bigtable-beam-import/pom.xml b/bigtable-dataflow-parent/bigtable-beam-import/pom.xml index b1d909f25a..778083f0b9 100644 --- a/bigtable-dataflow-parent/bigtable-beam-import/pom.xml +++ b/bigtable-dataflow-parent/bigtable-beam-import/pom.xml @@ -26,7 +26,6 @@ limitations under the License. com.google.cloud.bigtable.beam.Main - false diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/HadoopHashTableSource.java b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/HadoopHashTableSource.java index 59095c8b54..138ba3f860 100644 --- a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/HadoopHashTableSource.java +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/HadoopHashTableSource.java @@ -18,6 +18,7 @@ import static com.google.cloud.bigtable.beam.validation.SyncTableUtils.createConfiguration; import static com.google.cloud.bigtable.beam.validation.SyncTableUtils.immutableBytesToString; +import com.google.bigtable.repackaged.com.google.api.core.InternalApi; import com.google.bigtable.repackaged.com.google.common.annotations.VisibleForTesting; import com.google.cloud.bigtable.beam.validation.HadoopHashTableSource.RangeHash; import com.google.cloud.bigtable.beam.validation.TableHashWrapper.TableHashReader; @@ -45,7 +46,8 @@ * A beam source to read output of Hadoop HashTable job. The source creates 1 workitem per HashTable * data file and emits a row-range/hash pair. 
*/ -class HadoopHashTableSource extends BoundedSource implements Serializable { +@InternalApi +public class HadoopHashTableSource extends BoundedSource implements Serializable { private static final long serialVersionUID = 2383724L; diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/hbasesnapshots/EndToEndIT.java b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/hbasesnapshots/EndToEndIT.java index 3c8e26cbfa..0320dd1a61 100644 --- a/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/hbasesnapshots/EndToEndIT.java +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/hbasesnapshots/EndToEndIT.java @@ -18,13 +18,20 @@ import static com.google.common.base.Preconditions.checkNotNull; import com.google.api.services.storage.model.Objects; +import com.google.bigtable.repackaged.com.google.gson.Gson; import com.google.cloud.bigtable.beam.hbasesnapshots.ImportJobFromHbaseSnapshot.ImportOptions; +import com.google.cloud.bigtable.beam.sequencefiles.HBaseResultToMutationFn; +import com.google.cloud.bigtable.beam.validation.HadoopHashTableSource.RangeHash; import com.google.cloud.bigtable.beam.validation.SyncTableJob; import com.google.cloud.bigtable.beam.validation.SyncTableJob.SyncTableOptions; import com.google.cloud.bigtable.hbase.BigtableConfiguration; import com.google.cloud.bigtable.hbase.BigtableOptionsFactory; +import java.io.BufferedReader; +import java.io.ByteArrayInputStream; import java.io.File; import java.io.IOException; +import java.io.InputStreamReader; +import java.nio.ByteBuffer; import java.util.ArrayList; import java.util.Arrays; import java.util.List; @@ -42,10 +49,9 @@ import org.apache.beam.sdk.metrics.MetricQueryResults; import org.apache.beam.sdk.options.PipelineOptionsFactory; import org.apache.beam.sdk.options.ValueProvider.StaticValueProvider; -import 
org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; import org.apache.hadoop.hbase.Cell; import org.apache.hadoop.hbase.HColumnDescriptor; +import org.apache.hadoop.hbase.HConstants; import org.apache.hadoop.hbase.HTableDescriptor; import org.apache.hadoop.hbase.TableName; import org.apache.hadoop.hbase.client.Connection; @@ -54,11 +60,12 @@ import org.apache.hadoop.hbase.client.Put; import org.apache.hadoop.hbase.client.Table; import org.apache.hadoop.hbase.snapshot.SnapshotTestingUtils; -import org.apache.hadoop.hbase.util.Bytes; import org.junit.After; import org.junit.Assert; import org.junit.Before; import org.junit.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; /* * End to end integration test for pipeline that import HBase snapshot data into Cloud Bigtable and @@ -79,7 +86,7 @@ */ public class EndToEndIT { - private final Log LOG = LogFactory.getLog(getClass()); + private static Logger LOG = LoggerFactory.getLogger(HBaseResultToMutationFn.class); private static final String TEST_SNAPSHOT_NAME = "test-snapshot"; // Location of test data hosted on Google Cloud Storage, for on-cloud dataflow tests. private static final String CLOUD_TEST_DATA_FOLDER = "cloud.test.data.folder"; @@ -228,6 +235,60 @@ private Map getCountMap(PipelineResult result) { .collect(Collectors.toMap((m) -> m.getName().getName(), (m) -> m.getAttempted())); } + /** + * Reads the output of SyncTable job and returns a list of mismatched RangeHashes. + * + * @throws IOException + */ + private List readMismatchesFromOutputFiles() throws IOException { + Gson gson = new Gson(); + // Find output files + List outputFiles = gcsUtil.expand(GcsPath.fromUri(syncTableOutputDir + "*")); + List rangeHashes = new ArrayList<>(); + + // Read each file line by line and create a RangeHash from it. 
+ for (GcsPath outputFile : outputFiles) { + int size = (int) gcsUtil.fileSize(outputFile); + byte[] fileContents = new byte[size]; + gcsUtil.open(outputFile).read(ByteBuffer.wrap(fileContents)); + BufferedReader reader = + new BufferedReader(new InputStreamReader(new ByteArrayInputStream(fileContents))); + String serializedRangeHash; + while ((serializedRangeHash = reader.readLine()) != null) { + try { + rangeHashes.add(gson.fromJson(serializedRangeHash.trim(), RangeHash.class)); + } catch (Exception e) { + LOG.error("Failed to parse JSON: [" + serializedRangeHash + "]", e); + throw e; + } + } + } + return rangeHashes; + } + + // Asserts that all the rowKeys belong in mismatches. + // Throws AssertionException + private void validateRowInRangeHashes(List rowKeys, Iterable mismatches) { + for (byte[] mismatchedRowKey : rowKeys) { + Assert.assertTrue(containsRow(mismatchedRowKey, mismatches)); + } + } + + // Returns true if the rowKey belongs in one of the ranges contained in rangeHashes. + private boolean containsRow(byte[] rowKey, Iterable rangeHashes) { + for (RangeHash mismatchedRange : rangeHashes) { + // TODO: There maybe a better Range.belongs() utility function somewhere? + // Empty start/end key means that there is no start/end key. 
+ if ((mismatchedRange.startInclusive.equals(HConstants.EMPTY_BYTE_ARRAY) + || mismatchedRange.startInclusive.compareTo(rowKey) <= 0) + && (mismatchedRange.stopExclusive.equals(HConstants.EMPTY_BYTE_ARRAY) + || mismatchedRange.stopExclusive.compareTo(rowKey) > 0)) { + return true; + } + } + return false; + } + @Test public void testHBaseSnapshotImport() throws Exception { @@ -253,16 +314,13 @@ public void testHBaseSnapshotImport() throws Exception { state = result.waitUntilFinish(); Assert.assertEquals(State.DONE, state); - List outputs = gcsUtil.expand(GcsPath.fromUri(syncTableOutputDir + "*")); - // FileSink will write an empty file when there are no mismatches - Assert.assertEquals(1, outputs.size()); - // TODO read the actual files and validate the ranges instead of size check - Assert.assertEquals(0, gcsUtil.fileSize(outputs.get(0))); + // Read the output files and validate that there are no mismatches. + Assert.assertEquals(0, readMismatchesFromOutputFiles().size()); // Validate the counters. Map counters = getCountMap(result); - Assert.assertEquals(counters.size(), 1); Assert.assertEquals(counters.get("ranges_matched"), (Long) 101L); + Assert.assertNull(counters.get("ranges_not_matched")); } /** @@ -276,15 +334,21 @@ public void testHBaseSnapshotImportWithCorruptions() throws Exception { State state = ImportJobFromHbaseSnapshot.buildPipeline(importOpts).run().waitUntilFinish(); Assert.assertEquals(State.DONE, state); + // Rows where corruptions will be added. + byte[] mismatchRowAtStart = "000".getBytes(); + byte[] mismatchRowInMiddle = "24".getBytes(); + byte[] mismatchRowDeleted = "64".getBytes(); + byte[] mismatchRowAtTheEnd = "999".getBytes(); + // Introduce corruptions to the data in Bigtable. Delete data from Bigtable to simulate Bigtable // missing data. Add data to Bigtable to simulate extra data in Bigtable. It is easier to update // Bigtable than change the snapshots. 
Table table = connection.getTable(TableName.valueOf(tableId)); - Cell cellInMiddle = table.get(new Get("24".getBytes())).rawCells()[0]; + Cell cellInMiddle = table.get(new Get(mismatchRowInMiddle)).rawCells()[0]; List puts = Arrays.asList( // Add a row at the start - new Put(Bytes.toBytes("000")) + new Put(mismatchRowAtStart) .addColumn(CF.getBytes(), "random_col".getBytes(), 1L, "value000".getBytes()) .addColumn(CF.getBytes(), "random_col".getBytes(), 2L, "value001".getBytes()), // change a cell in middle @@ -295,13 +359,13 @@ public void testHBaseSnapshotImportWithCorruptions() throws Exception { cellInMiddle.getTimestamp(), "corrupted_val".getBytes()), // add a new row in the end - new Put(Bytes.toBytes("9999")) + new Put(mismatchRowAtTheEnd) .addColumn(CF.getBytes(), "random_col".getBytes(), 100L, "value999".getBytes())); table.put(puts); // Delete a random row in the middle. We should see 4 ranges mismatch as table is split on - // 1,2...9. We are splitting on 31, delete in 60s. - table.delete(new Delete("64".getBytes())); + // 1,2...9. All the updates are happening on a different split. + table.delete(new Delete(mismatchRowDeleted)); // Run SyncTable job and expect 4 mismatches. SyncTableOptions syncOpts = createSyncTableOptions(); @@ -309,18 +373,15 @@ public void testHBaseSnapshotImportWithCorruptions() throws Exception { state = result.waitUntilFinish(); Assert.assertEquals(State.DONE, state); - List outputs = gcsUtil.expand(GcsPath.fromUri(syncTableOutputDir + "*")); - - LOG.warn("OUTPUTS: " + outputs); - // FileSink will shard the outputs and will created >1 files. - Assert.assertTrue(outputs.size() > 1); - // TODO read the files and validate that the ranges are there instead of size check. 
- Assert.assertTrue((gcsUtil.fileSize(outputs.get(0)) + gcsUtil.fileSize(outputs.get(1))) > 0); - - // gcsUtil.getObject(outputs.get(0)); + List syncTableOutputMismatches = readMismatchesFromOutputFiles(); + Assert.assertEquals(4, syncTableOutputMismatches.size()); + validateRowInRangeHashes( + Arrays.asList( + mismatchRowAtStart, mismatchRowAtTheEnd, mismatchRowDeleted, mismatchRowInMiddle), + syncTableOutputMismatches); + // Assert that the output collection is the right one. Map counters = getCountMap(result); - Assert.assertEquals(counters.size(), 2); Assert.assertEquals(counters.get("ranges_matched"), (Long) 97L); Assert.assertEquals(counters.get("ranges_not_matched"), (Long) 4L); } From 443e2d16a36e16ad0b0241b620f289725baf2f4d Mon Sep 17 00:00:00 2001 From: shitanshu verma Date: Fri, 12 Feb 2021 12:02:09 -0500 Subject: [PATCH 7/8] Incorporating code review feedback. --- .../BufferedHadoopHashTableSource.java | 7 +- ...omputeAndValidateHashFromBigtableDoFn.java | 151 +++++++++--------- .../validation/HadoopHashTableSource.java | 71 ++++---- ...teAndValidateHashFromBigtableDoFnTest.java | 23 --- .../HashBasedSourceSerializationTest.java | 5 - 5 files changed, 104 insertions(+), 153 deletions(-) diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/BufferedHadoopHashTableSource.java b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/BufferedHadoopHashTableSource.java index a616441655..e62b3c8215 100644 --- a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/BufferedHadoopHashTableSource.java +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/BufferedHadoopHashTableSource.java @@ -68,6 +68,7 @@ public BufferedHadoopHashTableSource(HadoopHashTableSource hashTableSource, int public List>>> split( long desiredBundleSizeBytes, PipelineOptions 
options) throws IOException { + @SuppressWarnings("unchecked") List splitHashTableSources = (List) hashTableSource.split(desiredBundleSizeBytes, options); @@ -93,7 +94,8 @@ public long getEstimatedSizeBytes(PipelineOptions options) throws Exception { } @Override - public BoundedReader createReader(PipelineOptions options) throws IOException { + public BoundedReader>> createReader(PipelineOptions options) + throws IOException { return new BufferedHashBasedReader(this, hashTableSource.createReader(options)); } @@ -174,7 +176,8 @@ public boolean advance() throws IOException { @Override public KV> getCurrent() { // getCurrent only gets called when buffer is not empty. - Preconditions.checkArgument(!buffer.isEmpty(), "Can not get current on empty buffer."); + Preconditions.checkState( + !buffer.isEmpty(), "getCurrent() should only be called when start/advance return true."); // GroupBy key is a string and not ImmutableBytesWritable because the WritableCoder is not // deterministic. The outputted PCollection is grouped by the K and needs a deterministic // coder. 
Having a String K leads to an unfortunate double encoding, ImmutableBytesWritable-> diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/ComputeAndValidateHashFromBigtableDoFn.java b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/ComputeAndValidateHashFromBigtableDoFn.java index 62984e8ce2..a19eb9d218 100644 --- a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/ComputeAndValidateHashFromBigtableDoFn.java +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/ComputeAndValidateHashFromBigtableDoFn.java @@ -18,6 +18,7 @@ import static com.google.cloud.bigtable.beam.validation.SyncTableUtils.immutableBytesToString; import com.google.bigtable.repackaged.com.google.common.base.Preconditions; +import com.google.bigtable.repackaged.com.google.common.collect.Lists; import com.google.cloud.bigtable.beam.AbstractCloudBigtableTableDoFn; import com.google.cloud.bigtable.beam.CloudBigtableConfiguration; import com.google.cloud.bigtable.beam.TemplateUtils; @@ -85,87 +86,81 @@ public ComputeAndValidateHashFromBigtableDoFn(SyncTableOptions options) { @ProcessElement public void processElement(ProcessContext context) throws Exception { - // BufferedHadoopHashTableSource generates only 1 item per groupby key, but iterate just in - // case. - for (List rangeHashes : context.element().getValue()) { - if (rangeHashes.isEmpty()) { - // No rows ranges found, return; - continue; + List> wrapperdRangeHashes = Lists.newArrayList(context.element().getValue()); + // BufferedHadoopHashTableSource generates only 1 item per groupby key, key is startKey for the + // Sorted ranges. 
+ Preconditions.checkState( + wrapperdRangeHashes.size() == 1, "Can not have multiple entries for a key"); + List rangeHashes = wrapperdRangeHashes.get(0); + Preconditions.checkState(!rangeHashes.isEmpty(), "Can not have empty ranges in DO_FN"); + + ImmutableBytesWritable rangeStartInclusive = rangeHashes.get(0).startInclusive; + ImmutableBytesWritable rangeEndExclusive = + rangeHashes.get(rangeHashes.size() - 1).stopExclusive; + + BigtableResultHasher resultHasher = new BigtableResultHasher(); + resultHasher.startBatch(rangeStartInclusive); + + // Since all the row-ranges are sorted in HashTable's data files, 1 big scan can be used + // to read all the row ranges. Parallelism is achieved by splitting the HashTable's data + // files into smaller bundle of row-ranges in GroupBy. + ResultScanner scanner = + createBigtableScan(rangeStartInclusive.copyBytes(), rangeEndExclusive.copyBytes()); + + Iterator rangeHashIterator = rangeHashes.iterator(); + long numRows = 0; + + RangeHash currentRangeHash = rangeHashIterator.next(); + + // Process each row and validate hashes + for (Result result : scanner) { + numRows++; + if (numRows % 10_000 == 0) { + // Heartbeat in logs in case a large scan gets hung. + DOFN_LOG.debug("Processed " + numRows + " rows "); + } - ImmutableBytesWritable rangeStartInclusive = rangeHashes.get(0).startInclusive; - ImmutableBytesWritable rangeEndExclusive = - rangeHashes.get(rangeHashes.size() - 1).stopExclusive; - - BigtableResultHasher resultHasher = new BigtableResultHasher(); - resultHasher.startBatch(rangeStartInclusive); - - // Since all the row-ranges are sorted in HashTable's data files, 1 big scan can be used - // to read all the row ranges. Parallelism is achieved by splitting the HashTable's data - // files into smaller bundle of row-ranges in GroupBy.
- ResultScanner scanner = - createBigtableScan(rangeStartInclusive.copyBytes(), rangeEndExclusive.copyBytes()); - - Iterator rangeHashIterator = rangeHashes.iterator(); - long numRows = 0; - - RangeHash currentRangeHash = rangeHashIterator.next(); - - // Process each row and validate hashes - for (Result result : scanner) { - numRows++; - if (numRows % 10_000 == 0) { - // Heartbeat in logs in case a large scan gets hung. - DOFN_LOG.debug("Processed " + numRows + " rows "); - } - - ImmutableBytesWritable rowKey = new ImmutableBytesWritable(result.getRow()); - - // Check if the rowKey belongs to current range, if not keep iterating through the - // rangeHashes until rowKey's range is found. - while (!isWithinUpperBound(currentRangeHash.stopExclusive, rowKey)) { - validateBatchHash(context, resultHasher, currentRangeHash); - // THIS SHOULD NEVER HAPPEN. Bigtable is being scanned till the last - // RangeHash.endKeyExclusive(), so bigtable's result should not outlast the - // rangeHashes. - Preconditions.checkState( - rangeHashIterator.hasNext(), - "Buffer reached to end while scan is still active at row : %s. " - + "Affected Range: [%s, %s)." - + immutableBytesToString(result.getRow()) - + immutableBytesToString(rangeStartInclusive) - + immutableBytesToString(rangeEndExclusive)); - currentRangeHash = rangeHashIterator.next(); - } - - // Always Hash the current row. - resultHasher.hashResult(result); + ImmutableBytesWritable rowKey = new ImmutableBytesWritable(result.getRow()); + + // Check if the rowKey belongs to current range, if not keep iterating through the + // rangeHashes until rowKey's range is found. + while (!isWithinUpperBound(currentRangeHash.stopExclusive, rowKey)) { + validateBatchHash(context, resultHasher, currentRangeHash); + // THIS SHOULD NEVER HAPPEN. Bigtable is being scanned till the last + // RangeHash.endKeyExclusive(), so bigtable's result should not outlast the + // rangeHashes. 
+ Preconditions.checkState( + rangeHashIterator.hasNext(), + "Buffer reached to end while scan is still active at row : %s. " + + "Affected Range: [%s, %s).", + immutableBytesToString(result.getRow()), + immutableBytesToString(rangeStartInclusive), + immutableBytesToString(rangeEndExclusive)); + currentRangeHash = rangeHashIterator.next(); } - // Bigtable scan is finished at this point and rangeHashes may contain additional row ranges. - // Last range will always be unverified as the range end is exclusive and - // currentRow > rangeEndExclusive will never by true. Verify the last range. - validateBatchHash(context, resultHasher, currentRangeHash); + // Always Hash the current row. + resultHasher.hashResult(result); + } - // If there are remaining ranges in the rangeHashes they all need to reported as mismatched as - // there is nothing in Cloud Bigtable for those row ranges. - // for (int i = bufferIndex; i < rangeHashes.size(); i++) { - while (rangeHashIterator.hasNext()) { - currentRangeHash = rangeHashIterator.next(); - reportMismatch(context, currentRangeHash); - } + // Bigtable scan is finished at this point and rangeHashes may contain additional row ranges. + // Last range will always be unverified as the range end is exclusive and + // currentRow > rangeEndExclusive will never be true. Verify the last range. + validateBatchHash(context, resultHasher, currentRangeHash); - DOFN_LOG.debug( - "Finishing context by outputting " - + rangeHashes.size() - + " keys in range [" - + ((!rangeHashes.isEmpty()) - ? immutableBytesToString(rangeStartInclusive) - + ", " - + immutableBytesToString(rangeEndExclusive) - + ")." - : ", ).")); + // If there are remaining ranges in the rangeHashes they must all be reported as mismatched as + // there is nothing in Cloud Bigtable for those row ranges.
+ // for (int i = bufferIndex; i < rangeHashes.size(); i++) { + while (rangeHashIterator.hasNext()) { + currentRangeHash = rangeHashIterator.next(); + reportMismatch(context, currentRangeHash); } + + DOFN_LOG.debug( + "Finishing context by outputting {} keys in range [{}, {}).", + rangeHashes.size(), + immutableBytesToString(rangeStartInclusive), + immutableBytesToString(rangeEndExclusive)); } private ResultScanner createBigtableScan(byte[] startKeyInclusive, byte[] stopKeyExclusive) @@ -214,11 +209,9 @@ private void validateBatchHash( private void reportMismatch(ProcessContext context, RangeHash currentRangeHash) { mismatches.inc(); DOFN_LOG.info( - "MISMATCH ON RANGE [" - + immutableBytesToString(currentRangeHash.startInclusive) - + ", " - + immutableBytesToString(currentRangeHash.stopExclusive) - + ")."); + "MISMATCH ON RANGE [{}, {}).", + immutableBytesToString(currentRangeHash.startInclusive), + immutableBytesToString(currentRangeHash.stopExclusive)); context.output(currentRangeHash); } } diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/HadoopHashTableSource.java b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/HadoopHashTableSource.java index 138ba3f860..f6ecf21e24 100644 --- a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/HadoopHashTableSource.java +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/HadoopHashTableSource.java @@ -1,5 +1,5 @@ /* - * Copyright 2020 Google Inc. All Rights Reserved. + * Copyright 2021 Google Inc. All Rights Reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -15,7 +15,6 @@ */ package com.google.cloud.bigtable.beam.validation; -import static com.google.cloud.bigtable.beam.validation.SyncTableUtils.createConfiguration; import static com.google.cloud.bigtable.beam.validation.SyncTableUtils.immutableBytesToString; import com.google.bigtable.repackaged.com.google.api.core.InternalApi; @@ -51,8 +50,11 @@ public class HadoopHashTableSource extends BoundedSource implements S private static final long serialVersionUID = 2383724L; + private static final Coder CODER = RangeHashCoder.of(); + /** * A simple POJO encapsulating a row range and the corresponding hash generated by HashTable job. + * TODO Evaluate if we can use AutoValue for this class. */ @DefaultCoder(RangeHashCoder.class) public static class RangeHash { @@ -111,24 +113,18 @@ public int hashCode() { public static final Log LOG = LogFactory.getLog(HadoopHashTableSource.class); - private ValueProvider projectId; + private final ValueProvider projectId; // Path to the output of HashTable job. Usually in GCS. - private ValueProvider sourceHashDir; - - // Coder to encode/decode the RangeHash - private RangeHashCoder coder; + private final ValueProvider sourceHashDir; // Row range owned by this source. - @VisibleForTesting @Nullable ImmutableBytesWritable startRowInclusive; + // The Start and Stop row are serialized in a custom way. 
+ @VisibleForTesting @Nullable transient ImmutableBytesWritable startRowInclusive; - @VisibleForTesting @Nullable ImmutableBytesWritable stopRowExclusive; + @VisibleForTesting @Nullable transient ImmutableBytesWritable stopRowExclusive; - private TableHashWrapperFactory tableHashWrapperFactory; - - public HadoopHashTableSource() { - this.coder = new RangeHashCoder(); - } + private final TableHashWrapperFactory tableHashWrapperFactory; /** * Creates a HadoopHashTableSource that reads HashTable data from hashTableOutputDir in GCS bucket @@ -164,7 +160,6 @@ public HadoopHashTableSource( @Nullable ImmutableBytesWritable startRowInclusive, @Nullable ImmutableBytesWritable stopRowExclusive, TableHashWrapperFactory tableHashWrapperFactory) { - this.coder = new RangeHashCoder(); this.projectId = projectId; this.sourceHashDir = hadoopHashTableOutputDir; // startRow and stopRow will be null when the template is initialized. startRow and stopRow are @@ -199,7 +194,7 @@ public List> split( } // Use the HashTable start key. The value is HConstants.EMPTY_START_ROW for full table scan. - ImmutableBytesWritable startRow = hash.getStartRow(); + ImmutableBytesWritable nextStartRow = hash.getStartRow(); ImmutableBytesWritable stopRow = hash.getStopRow(); // The output of HashTable is organized as partition file and a set of datafiles. @@ -211,42 +206,39 @@ public List> split( // partition{i}). // So a partition file containing entries [b,f] for a table with row range [a,z] will have 3 // data files containing hashes. - // file0 will contain [a(startRow), b), file1 will contain [b,f), and file3 will contain + // file0 will contain [a(nextStartRow), b), file1 will contain [b,f), and file3 will contain // [f,z(stopRow)) for (int i = 0; i < numPartitions; i++) { + // TODO make a utility function that generates [start, end) format from start/end. 
LOG.debug( "Adding: [" - + immutableBytesToString(startRow.get()) + + immutableBytesToString(nextStartRow.get()) + ", " + immutableBytesToString(partitions.get(i).get()) - + "]"); + + ")"); splitSources.add( new HadoopHashTableSource( - projectId, sourceHashDir, startRow, partitions.get(i), tableHashWrapperFactory)); - startRow = partitions.get(i); + projectId, sourceHashDir, nextStartRow, partitions.get(i), tableHashWrapperFactory)); + nextStartRow = partitions.get(i); } // Add the last range for [lastPartition, stopRow). LOG.debug( "Adding: [" - + immutableBytesToString(startRow.get()) + + immutableBytesToString(nextStartRow.get()) + ", " + immutableBytesToString(stopRow.get()) - + "]"); + + ")"); // Add the last range for [lastPartition, stopRow). splitSources.add( new HadoopHashTableSource( - projectId, - sourceHashDir, - partitions.get(numPartitions - 1), - stopRow, - tableHashWrapperFactory)); + projectId, sourceHashDir, nextStartRow, stopRow, tableHashWrapperFactory)); LOG.info("Returning " + splitSources.size() + " sources from " + numPartitions + " partitions"); return splitSources; } @Override public Coder getOutputCoder() { - return coder; + return CODER; } @Override @@ -256,7 +248,7 @@ public long getEstimatedSizeBytes(PipelineOptions options) throws Exception { } @Override - public BoundedReader createReader(PipelineOptions options) throws IOException { + public BoundedReader createReader(PipelineOptions options) throws IOException { TableHashWrapper hash = tableHashWrapperFactory.getTableHash(projectId.get(), sourceHashDir.get()); @@ -273,7 +265,7 @@ public BoundedReader createReader(PipelineOptions options) throws IOException { startRowInclusive, stopRowExclusive, hash.newReader( - createConfiguration(this.projectId.get(), this.sourceHashDir.get()), + SyncTableUtils.createConfiguration(this.projectId.get(), this.sourceHashDir.get()), startRowInclusive)); } @@ -294,7 +286,7 @@ public boolean equals(Object o) { @Override public int hashCode() { - 
return Objects.hashCode(projectId, sourceHashDir, coder, startRowInclusive, stopRowExclusive); + return Objects.hashCode(projectId, sourceHashDir, startRowInclusive, stopRowExclusive); } @Override @@ -307,9 +299,7 @@ public String toString() { } private void writeObject(ObjectOutputStream s) throws IOException { - s.writeObject(projectId); - s.writeObject(sourceHashDir); - s.writeObject(tableHashWrapperFactory); + s.defaultWriteObject(); // Start and Stop can be null, write a boolean to indicate if start/stop is expected. if (startRowInclusive == null) { s.writeBoolean(false); @@ -327,9 +317,7 @@ private void writeObject(ObjectOutputStream s) throws IOException { } private void readObject(ObjectInputStream s) throws IOException, ClassNotFoundException { - projectId = (ValueProvider) s.readObject(); - sourceHashDir = (ValueProvider) s.readObject(); - tableHashWrapperFactory = (TableHashWrapperFactory) s.readObject(); + s.defaultReadObject(); // start/stop can be null, they are preceded by a boolean indicating their presence. if (s.readBoolean() == true) { startRowInclusive = new ImmutableBytesWritable((byte[]) s.readObject()); @@ -348,7 +336,6 @@ static class HashBasedReader extends BoundedReader { @VisibleForTesting final ImmutableBytesWritable startRowInclusive; @VisibleForTesting final ImmutableBytesWritable stopRowExclusive; - private long numKeys = 0; // Flag indicating that this workitem is finished. private boolean isDone = false; private ImmutableBytesWritable currentRangeStartKey; @@ -375,7 +362,6 @@ public boolean start() throws IOException { + " ," + immutableBytesToString(stopRowExclusive) + ")."); - numKeys = 0; if (readNextKey()) { // Dataflow calls start, followed by getCurrent. HashBased reader needs to read on TableHash @@ -408,7 +394,6 @@ public boolean advance() throws IOException { // Returns true if a key can be read for this workitem. 
private boolean readNextKey() throws IOException { if (reader.next()) { - numKeys++; currentRangeStartKey = reader.getCurrentKey(); if ( // StopRow is not set, everything is in bounds. (stopRowExclusive.equals(HConstants.EMPTY_END_ROW) @@ -442,9 +427,7 @@ public void close() throws IOException { + immutableBytesToString(startRowInclusive) + " ," + immutableBytesToString(stopRowExclusive) - + ") after reading " - + numKeys - + " keys. Ending at " + + "). Ending at " + immutableBytesToString(currentRangeStartKey)); reader.close(); } diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/validation/ComputeAndValidateHashFromBigtableDoFnTest.java b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/validation/ComputeAndValidateHashFromBigtableDoFnTest.java index 2e9b6fd8ed..8c608b74db 100644 --- a/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/validation/ComputeAndValidateHashFromBigtableDoFnTest.java +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/validation/ComputeAndValidateHashFromBigtableDoFnTest.java @@ -269,29 +269,6 @@ public void testHashMatchesForMultipleSingleRowRange() throws Exception { validateCounters(result, 3L, 0L); } - ///////////////// Test mismatches with multiple ranges per Key in KV<> //////////////////// - @Test - public void testHashMisMatchesForMultipleRangeAcrossKV() throws Exception { - hashes.add(createHash(getRowKey(21), getRowKey(24))); - hashes.add(createHash(getRowKey(24), getRowKey(28))); - - // Corrupt both the ranges - table.delete(new Delete(getRowKey(21)).addColumns(CF, COL, TS)); - table.put(new Put(getRowKey(24)).addColumn(CF2, COL, TS, getValue(20, 0))); - - PCollection>>> input = - p.apply( - Create.of( - KV.of( - new String(getRowKey(21)), - Arrays.asList(Arrays.asList(hashes.get(0)), Arrays.asList(hashes.get(1)))))); - - PCollection output = 
input.apply(ParDo.of(doFn)); - PAssert.that(output).containsInAnyOrder(hashes); - PipelineResult result = p.run(); - validateCounters(result, 0L, 2L); - } - ///////////////// Test mismatches when Bigtable has extra rows //////////////////// @Test public void testAdditionalCellInMiddle() throws Exception { diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/validation/HashBasedSourceSerializationTest.java b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/validation/HashBasedSourceSerializationTest.java index 8c7f6cc8c4..f58becf3cb 100644 --- a/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/validation/HashBasedSourceSerializationTest.java +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/validation/HashBasedSourceSerializationTest.java @@ -46,11 +46,6 @@ public void setUp() throws Exception { super.setUp(); } - @Test - public void testSerializeDefaultConstructor() throws IOException { - checkSerialization(new HadoopHashTableSource()); - } - @Test public void testSerializeWithValueProviders() throws IOException { checkSerialization( From 0c3fd6786d0cfe366c64c115152516da34336d74 Mon Sep 17 00:00:00 2001 From: shitanshu verma Date: Fri, 12 Feb 2021 14:16:05 -0500 Subject: [PATCH 8/8] Incorporating code review feedback. 
--- .../beam/validation/ComputeAndValidateHashFromBigtableDoFn.java | 2 +- .../cloud/bigtable/beam/validation/TableHashWrapperFactory.java | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/ComputeAndValidateHashFromBigtableDoFn.java b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/ComputeAndValidateHashFromBigtableDoFn.java index a19eb9d218..a75833b022 100644 --- a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/ComputeAndValidateHashFromBigtableDoFn.java +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/ComputeAndValidateHashFromBigtableDoFn.java @@ -187,7 +187,7 @@ private ResultScanner createBigtableScan(byte[] startKeyInclusive, byte[] stopKe * Determines if row >= stopExclusive for a row range (start, stopExclusive). Empty stopExclusive * represents a range with no upper bound. 
*/ - private boolean isWithinUpperBound( + private static boolean isWithinUpperBound( ImmutableBytesWritable stopExclusive, ImmutableBytesWritable row) { return stopExclusive.equals(HConstants.EMPTY_END_ROW) || row.compareTo(stopExclusive) < 0; } diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/TableHashWrapperFactory.java b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/TableHashWrapperFactory.java index 67776299a4..a4e3544519 100644 --- a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/TableHashWrapperFactory.java +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/TableHashWrapperFactory.java @@ -18,10 +18,12 @@ import static com.google.cloud.bigtable.beam.validation.SyncTableUtils.createConfiguration; +import com.google.bigtable.repackaged.com.google.api.core.InternalApi; import java.io.IOException; import java.io.Serializable; /** Factory to create a TableHashWrapper. */ +@InternalApi public class TableHashWrapperFactory implements Serializable { private static final long serialVersionUID = 265433454L;