, Mutation> {
private static Logger logger = LoggerFactory.getLogger(HBaseResultToMutationFn.class);
private static final long serialVersionUID = 1L;
diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/sequencefiles/Utils.java b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/sequencefiles/Utils.java
index 62bad8d92b..7098a239d8 100644
--- a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/sequencefiles/Utils.java
+++ b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/sequencefiles/Utils.java
@@ -15,6 +15,7 @@
*/
package com.google.cloud.bigtable.beam.sequencefiles;
+import com.google.bigtable.repackaged.com.google.api.core.InternalApi;
import org.apache.beam.runners.dataflow.DataflowRunner;
import org.apache.beam.runners.dataflow.options.DataflowPipelineDebugOptions;
import org.apache.beam.runners.dataflow.options.DataflowPipelineOptions;
@@ -29,7 +30,8 @@
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
-class Utils {
+@InternalApi
+public class Utils {
private static final Log LOG = LogFactory.getLog(Utils.class);
/**
@@ -98,7 +100,7 @@ public ResourceId apply(String input) {
*
* @param result
*/
- static void waitForPipelineToFinish(PipelineResult result) {
+ public static void waitForPipelineToFinish(PipelineResult result) {
try {
// Check to see if we are creating a template.
// This should throw {@link UnsupportedOperationException} when creating a template.
diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/BufferedHadoopHashTableSource.java b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/BufferedHadoopHashTableSource.java
new file mode 100644
index 0000000000..e62b3c8215
--- /dev/null
+++ b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/BufferedHadoopHashTableSource.java
@@ -0,0 +1,199 @@
+/*
+ * Copyright 2021 Google Inc. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package com.google.cloud.bigtable.beam.validation;
+
+import static com.google.cloud.bigtable.beam.validation.SyncTableUtils.immutableBytesToString;
+
+import com.google.cloud.bigtable.beam.validation.HadoopHashTableSource.RangeHash;
+import com.google.common.base.Objects;
+import com.google.common.base.Preconditions;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import org.apache.beam.sdk.coders.Coder;
+import org.apache.beam.sdk.coders.KvCoder;
+import org.apache.beam.sdk.coders.ListCoder;
+import org.apache.beam.sdk.coders.StringUtf8Coder;
+import org.apache.beam.sdk.io.BoundedSource;
+import org.apache.beam.sdk.options.PipelineOptions;
+import org.apache.beam.sdk.values.KV;
+import org.apache.hadoop.hbase.util.Bytes;
+
+/**
+ * Buffers the RangeHashes generated by {@link HadoopHashTableSource}. This is an optimization that
+ * allows {@link ComputeAndValidateHashFromBigtableDoFn} to issue fewer ReadRow APIs with larger row
+ * ranges.
+ *
+ * Hadoop HashTable output is sorted by row-key and contains a row-range and hash. Beam
+ * PCollections do not guarantee any ordering. To fetch a batch of ranges in 1 ReadRows operation,
+ * this source buffers them and outputs a List guaranteeing the sorted order of ranges.
+ *
+ * Emits a batch of sorted RangeHashes keyed by the start key of the first range.
+ */
+class BufferedHadoopHashTableSource extends BoundedSource>> {
+
+ private static final long serialVersionUID = 39842743L;
+
+ private static final int DEFAULT_BATCH_SIZE = 50;
+ // Coder for the emitted batches: a UTF-8 string key and the list of RangeHashes in the batch.
+ private static final Coder<KV<String, List<RangeHash>>> CODER =
+ KvCoder.of(StringUtf8Coder.of(), ListCoder.of(RangeHashCoder.of()));
+
+ // Max number of RangeHashes to buffer.
+ private final int maxBufferSize;
+ private final HadoopHashTableSource hashTableSource;
+
+ public BufferedHadoopHashTableSource(HadoopHashTableSource source) {
+ this(source, DEFAULT_BATCH_SIZE);
+ }
+
+ /**
+ * Creates a buffered source wrapping {@code hashTableSource}.
+ *
+ * @param hashTableSource source whose RangeHashes are grouped into batches
+ * @param maxBufferSize maximum number of RangeHashes per emitted batch; must be positive
+ * @throws IllegalArgumentException if {@code maxBufferSize} is not positive
+ */
+ public BufferedHadoopHashTableSource(HadoopHashTableSource hashTableSource, int maxBufferSize) {
+ // With a size of 0 the reader emits the first RangeHash and then never reads again, silently
+ // dropping the rest; a negative size only fails later when the reader allocates its buffer.
+ // Reject both at construction time instead.
+ Preconditions.checkArgument(
+ maxBufferSize > 0, "maxBufferSize must be positive, got %s", maxBufferSize);
+ this.hashTableSource = hashTableSource;
+ this.maxBufferSize = maxBufferSize;
+ }
+
+ /**
+ * Splits this source along the same boundaries as the wrapped {@link HadoopHashTableSource}:
+ * every split of the wrapped source is wrapped in its own buffered source that keeps this
+ * source's {@code maxBufferSize}.
+ *
+ * @param desiredBundleSizeBytes forwarded unchanged to the wrapped source
+ * @param options forwarded unchanged to the wrapped source
+ * @return one buffered source per split returned by the wrapped source
+ * @throws IOException if the wrapped source fails to split
+ */
+ @Override
+ public List<? extends BoundedSource<KV<String, List<RangeHash>>>> split(
+ long desiredBundleSizeBytes, PipelineOptions options) throws IOException {
+
+ @SuppressWarnings("unchecked")
+ List<HadoopHashTableSource> splitHashTableSources =
+ (List<HadoopHashTableSource>) hashTableSource.split(desiredBundleSizeBytes, options);
+
+ List<BufferedHadoopHashTableSource> splitSources =
+ new ArrayList<>(splitHashTableSources.size());
+ // Keep the splits same as HashTableSource.
+ for (HadoopHashTableSource splitHashTableSource : splitHashTableSources) {
+ // Pass maxBufferSize along so a non-default buffer size survives splitting instead of
+ // silently reverting to DEFAULT_BATCH_SIZE.
+ splitSources.add(new BufferedHadoopHashTableSource(splitHashTableSource, maxBufferSize));
+ }
+ return splitSources;
+ }
+
+ @Override
+ public Coder>> getOutputCoder() {
+ return CODER;
+ }
+
+ @Override
+ public long getEstimatedSizeBytes(PipelineOptions options) throws Exception {
+ // This source has no estimate of its own; report whatever the wrapped HadoopHashTableSource
+ // reports.
+ return hashTableSource.getEstimatedSizeBytes(options);
+ }
+
+ @Override
+ public BoundedReader>> createReader(PipelineOptions options)
+ throws IOException {
+ return new BufferedHashBasedReader(this, hashTableSource.createReader(options));
+ }
+
+ @Override
+ public boolean equals(Object o) {
+ if (this == o) {
+ return true;
+ }
+ if (!(o instanceof BufferedHadoopHashTableSource)) {
+ return false;
+ }
+ BufferedHadoopHashTableSource that = (BufferedHadoopHashTableSource) o;
+ return maxBufferSize == that.maxBufferSize
+ && Objects.equal(hashTableSource, that.hashTableSource);
+ }
+
+ @Override
+ public int hashCode() {
+ return Objects.hashCode(maxBufferSize, hashTableSource);
+ }
+
+ @Override
+ public String toString() {
+ return "BufferedHadoopHashTableSource ["
+ + immutableBytesToString(hashTableSource.startRowInclusive)
+ + ", "
+ + immutableBytesToString(hashTableSource.stopRowExclusive)
+ + "), maxBufferSize="
+ + maxBufferSize;
+ }
+
+ private static class BufferedHashBasedReader extends BoundedReader>> {
+
+ private final BoundedReader hashReader;
+ private final BufferedHadoopHashTableSource source;
+
+ private List buffer;
+
+ public BufferedHashBasedReader(
+ BufferedHadoopHashTableSource source, BoundedReader hashReader) {
+ this.source = source;
+ this.hashReader = hashReader;
+ this.buffer = new ArrayList<>(source.maxBufferSize);
+ }
+
+ @Override
+ public boolean start() throws IOException {
+ if (!hashReader.start()) {
+ // HashReader does not have any hashes, return empty reader.
+ return false;
+ }
+ // Start returned true, consume the current RangeHash.
+ buffer.add(hashReader.getCurrent());
+ bufferRangeHashes();
+ // Buffer is not empty, return true to consume the current buffer.
+ return true;
+ }
+
+ /**
+ * Tops up the buffer from hashReader until it holds source.maxBufferSize entries or hashReader
+ * has nothing more to return.
+ *
+ * @return true if this call added at least one RangeHash to the buffer
+ */
+ private boolean bufferRangeHashes() throws IOException {
+ final int sizeBefore = buffer.size();
+ while (true) {
+ // Check capacity first so hashReader is not advanced past an entry that cannot be buffered.
+ if (buffer.size() >= source.maxBufferSize || !hashReader.advance()) {
+ break;
+ }
+ buffer.add(hashReader.getCurrent());
+ }
+ return buffer.size() > sizeBefore;
+ }
+
+ @Override
+ public boolean advance() throws IOException {
+ // Reset the buffer for next batch.
+ buffer = new ArrayList<>(source.maxBufferSize);
+
+ return bufferRangeHashes();
+ }
+
+ @Override
+ public KV> getCurrent() {
+ // getCurrent only gets called when buffer is not empty.
+ Preconditions.checkState(
+ !buffer.isEmpty(), "getCurrent() should only be called when start/advance return true.");
+ // GroupBy key is a string and not ImmutableBytesWritable because the WritableCoder is not
+ // deterministic. The outputted PCollection is grouped by the K and needs a deterministic
+ // coder. Having a String K leads to an unfortunate double encoding, ImmutableBytesWritable->
+ // HEX string -> UTF8 encoded string. The number of batches are significantly smaller than
+ // data fetched from Bigtable and should not have meaningful impact on the job performance.
+ return KV.of(Bytes.toStringBinary(buffer.get(0).startInclusive.copyBytes()), buffer);
+ }
+
+ @Override
+ public void close() throws IOException {
+ hashReader.close();
+ }
+
+ @Override
+ public BoundedSource>> getCurrentSource() {
+ return source;
+ }
+ }
+}
diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/ComputeAndValidateHashFromBigtableDoFn.java b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/ComputeAndValidateHashFromBigtableDoFn.java
new file mode 100644
index 0000000000..82e24b55ef
--- /dev/null
+++ b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/ComputeAndValidateHashFromBigtableDoFn.java
@@ -0,0 +1,222 @@
+/*
+ * Copyright 2021 Google Inc. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package com.google.cloud.bigtable.beam.validation;
+
+import static com.google.cloud.bigtable.beam.validation.SyncTableUtils.immutableBytesToString;
+
+import com.google.bigtable.repackaged.com.google.common.base.Preconditions;
+import com.google.bigtable.repackaged.com.google.common.collect.Lists;
+import com.google.cloud.bigtable.beam.AbstractCloudBigtableTableDoFn;
+import com.google.cloud.bigtable.beam.CloudBigtableConfiguration;
+import com.google.cloud.bigtable.beam.TemplateUtils;
+import com.google.cloud.bigtable.beam.validation.HadoopHashTableSource.RangeHash;
+import com.google.cloud.bigtable.beam.validation.SyncTableJob.SyncTableOptions;
+import com.google.common.annotations.VisibleForTesting;
+import java.io.IOException;
+import java.util.Iterator;
+import java.util.List;
+import org.apache.beam.sdk.metrics.Counter;
+import org.apache.beam.sdk.metrics.Metrics;
+import org.apache.beam.sdk.options.ValueProvider;
+import org.apache.beam.sdk.transforms.DoFn;
+import org.apache.beam.sdk.values.KV;
+import org.apache.hadoop.hbase.HConstants;
+import org.apache.hadoop.hbase.TableName;
+import org.apache.hadoop.hbase.client.Result;
+import org.apache.hadoop.hbase.client.ResultScanner;
+import org.apache.hadoop.hbase.client.Scan;
+import org.apache.hadoop.hbase.client.Table;
+import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
+import org.apache.hadoop.hbase.mapreduce.BigtableTableHashAccessor.BigtableResultHasher;
+
+/**
+ * A {@link DoFn} that takes a row range and hash from HBase and validates the hash from rows read
+ * from Cloud Bigtable.
+ */
+class ComputeAndValidateHashFromBigtableDoFn
+ extends AbstractCloudBigtableTableDoFn>>, RangeHash> {
+
+ private static final long serialVersionUID = 2349094L;
+ private final ValueProvider tableName;
+ private final ValueProvider projectId;
+ private final ValueProvider sourceHashDir;
+
+ private final TableHashWrapperFactory tableHashWrapperFactory;
+
+ // Counter for reporting matching and mismatching ranges. Names are similar to HBase sync-table
+ // job.
+ private final Counter matches = Metrics.counter("cbt-dataflow-validate", "ranges_matched");
+ private final Counter mismatches = Metrics.counter("cbt-dataflow-validate", "ranges_not_matched");
+
+ public ComputeAndValidateHashFromBigtableDoFn(SyncTableOptions options) {
+ super(TemplateUtils.BuildSyncTableConfig(options));
+ this.tableName = options.getBigtableTableId();
+ // Create a local copy of ValueProviders, PipelineOptions are not serializable.
+ projectId = options.getBigtableProject();
+ sourceHashDir = options.getHashTableOutputDir();
+ tableHashWrapperFactory = new TableHashWrapperFactory();
+ }
+
+ /**
+ * Test-only constructor that takes the value providers and the {@link TableHashWrapperFactory}
+ * directly instead of deriving them from SyncTableOptions.
+ *
+ * @param config Cloud Bigtable configuration passed to the parent DoFn
+ * @param tableName provider of the Cloud Bigtable table id to scan
+ * @param projectId provider of the project id handed to the TableHashWrapperFactory
+ * @param sourceHashDir provider of the HashTable output directory
+ * @param factory factory used to obtain the TableHashWrapper
+ */
+ @VisibleForTesting
+ ComputeAndValidateHashFromBigtableDoFn(
+ CloudBigtableConfiguration config,
+ ValueProvider<String> tableName,
+ ValueProvider<String> projectId,
+ ValueProvider<String> sourceHashDir,
+ TableHashWrapperFactory factory) {
+ super(config);
+ this.tableName = tableName;
+ // Each provider goes to the field of the same name. They are later passed positionally to
+ // getTableHash(projectId, sourceHashDir), so crossing them would swap those arguments.
+ this.projectId = projectId;
+ this.sourceHashDir = sourceHashDir;
+ this.tableHashWrapperFactory = factory;
+ }
+
+ /**
+ * Validates one batch of sorted RangeHashes against Cloud Bigtable.
+ *
+ * <p>Issues a single scan from the start of the first range to the stop of the last range,
+ * hashes the returned rows range by range and compares each result with the hash stored in the
+ * corresponding RangeHash. A range whose hash differs is counted as a mismatch and emitted to
+ * the output. Ranges that are never reached because the scan ran out of rows are reported as
+ * mismatches without being hashed.
+ *
+ * @param context process context; the element's value must hold exactly one non-empty list of
+ * RangeHashes
+ * @throws IllegalStateException if the element holds a number of lists other than one, the list
+ * is empty, or the scan returns a row beyond the last buffered range
+ */
+ @ProcessElement
+ public void processElement(ProcessContext context) throws Exception {
+ List<List<RangeHash>> wrappedRangeHashes = Lists.newArrayList(context.element().getValue());
+ // BufferedHadoopHashTableSource generates only 1 item per groupby key, key is startKey for the
+ // Sorted ranges.
+ Preconditions.checkState(
+ wrappedRangeHashes.size() == 1, "Can not have multiple entries for a key");
+ List<RangeHash> rangeHashes = wrappedRangeHashes.get(0);
+ Preconditions.checkState(!rangeHashes.isEmpty(), "Can not have empty ranges in DO_FN");
+
+ // If a metric is not logged, it is absent from all the metrics (as opposed to being
+ // 0). By logging a 0 value for the metrics we guarantee that they show up on Dataflow UIs.
+ mismatches.inc(0);
+ matches.inc(0);
+
+ ImmutableBytesWritable rangeStartInclusive = rangeHashes.get(0).startInclusive;
+ ImmutableBytesWritable rangeEndExclusive =
+ rangeHashes.get(rangeHashes.size() - 1).stopExclusive;
+
+ BigtableResultHasher resultHasher = new BigtableResultHasher();
+ resultHasher.startBatch(rangeStartInclusive);
+
+ Iterator<RangeHash> rangeHashIterator = rangeHashes.iterator();
+ long numRows = 0;
+
+ RangeHash currentRangeHash = rangeHashIterator.next();
+
+ // Since all the row-ranges are sorted in HashTable's data files, 1 big scan can be used
+ // to read all the row ranges. Parallelism is achieved by splitting the HashTable's data
+ // files into smaller bundle of row-ranges in GroupBy.
+ // try-with-resources closes the scanner even when a validation check below throws.
+ try (ResultScanner scanner =
+ createBigtableScan(rangeStartInclusive.copyBytes(), rangeEndExclusive.copyBytes())) {
+ // Process each row and validate hashes
+ for (Result result : scanner) {
+ numRows++;
+ if (numRows % 10_000 == 0) {
+ // Heartbeat in logs in case a large scan gets hung.
+ DOFN_LOG.debug("Processed " + numRows + " rows ");
+ }
+
+ ImmutableBytesWritable rowKey = new ImmutableBytesWritable(result.getRow());
+
+ // Check if the rowKey belongs to current range, if not keep iterating through the
+ // rangeHashes until rowKey's range is found.
+ while (!isWithinUpperBound(currentRangeHash.stopExclusive, rowKey)) {
+ validateBatchHash(context, resultHasher, currentRangeHash);
+ // THIS SHOULD NEVER HAPPEN. Bigtable is being scanned till the last
+ // RangeHash.endKeyExclusive(), so bigtable's result should not outlast the
+ // rangeHashes.
+ // The row and range are passed as template arguments so the %s placeholders are
+ // actually substituted in the failure message.
+ Preconditions.checkState(
+ rangeHashIterator.hasNext(),
+ "Buffer reached to end while scan is still active at row : %s. "
+ + "Affected Range: [%s, %s).",
+ immutableBytesToString(result.getRow()),
+ immutableBytesToString(rangeStartInclusive),
+ immutableBytesToString(rangeEndExclusive));
+ currentRangeHash = rangeHashIterator.next();
+ }
+
+ // Always Hash the current row.
+ resultHasher.hashResult(result);
+ }
+ }
+
+ // Bigtable scan is finished at this point and rangeHashes may contain additional row ranges.
+ // Last range will always be unverified as the range end is exclusive and
+ // currentRow > rangeEndExclusive will never be true. Verify the last range.
+ validateBatchHash(context, resultHasher, currentRangeHash);
+
+ // If there are remaining ranges in the rangeHashes they all need to be reported as mismatched
+ // as there is nothing in Cloud Bigtable for those row ranges.
+ while (rangeHashIterator.hasNext()) {
+ currentRangeHash = rangeHashIterator.next();
+ reportMismatch(context, currentRangeHash);
+ }
+
+ DOFN_LOG.debug(
+ "Finishing context by outputting {} keys in range [{}, {}).",
+ rangeHashes.size(),
+ immutableBytesToString(rangeStartInclusive),
+ immutableBytesToString(rangeEndExclusive));
+ }
+
+ /**
+ * Opens a scanner on the configured table, using the Scan provided by the TableHash and
+ * narrowing it to the given row range. An empty start or stop key leaves the corresponding
+ * bound of that Scan unchanged. This method does not close the returned scanner.
+ *
+ * @param startKeyInclusive first row key to scan; empty means no start bound is applied here
+ * @param stopKeyExclusive row key at which to stop; empty means no stop bound is applied here
+ * @return scanner over the requested rows
+ * @throws IOException declared for failures while obtaining the table or opening the scanner
+ */
+ private ResultScanner createBigtableScan(byte[] startKeyInclusive, byte[] stopKeyExclusive)
+ throws IOException {
+ // NOTE(review): this Table is never closed in this method; confirm that is acceptable for the
+ // connection implementation in use.
+ Table table = getConnection().getTable(TableName.valueOf(tableName.get()));
+ // Get the scan from TableHash, HashTable can be run to hash a small part of data (selected
+ // column families, timestamp range, maxVersions etc), this scan allows us to fetch the same
+ // data from Cloud Bigtable to match.
+ TableHashWrapper tableHash =
+ tableHashWrapperFactory.getTableHash(projectId.get(), sourceHashDir.get());
+ Scan scan = tableHash.getScan();
+ // Set the workitem boundaries on the scan.
+ if (startKeyInclusive.length > 0) {
+ scan.withStartRow(startKeyInclusive, true);
+ }
+ if (stopKeyExclusive.length > 0) {
+ scan.withStopRow(stopKeyExclusive, false);
+ }
+
+ return table.getScanner(scan);
+ }
+
+ /**
+ * Determines if row is strictly below stopExclusive, i.e. still inside a row range that ends at
+ * stopExclusive. A stopExclusive equal to HConstants.EMPTY_END_ROW represents a range with no
+ * upper bound, so every row is within it.
+ *
+ * @return true if the range is unbounded above or row sorts before stopExclusive; false once
+ * row is at or past stopExclusive
+ */
+ private static boolean isWithinUpperBound(
+ ImmutableBytesWritable stopExclusive, ImmutableBytesWritable row) {
+ return stopExclusive.equals(HConstants.EMPTY_END_ROW) || row.compareTo(stopExclusive) < 0;
+ }
+
+ /**
+ * Finishes the batch currently being hashed, compares its hash with the one recorded in
+ * currentRangeHash, records a match or reports a mismatch, and then starts the next batch at
+ * currentRangeHash.stopExclusive.
+ */
+ private void validateBatchHash(
+ ProcessContext context, BigtableResultHasher resultHasher, RangeHash currentRangeHash) {
+ // A batch is always open when this is called, so finishing is safe; a batch that saw no rows
+ // yields the hash of an empty batch.
+ resultHasher.finishBatch();
+ boolean hashesAgree = resultHasher.getBatchHash().equals(currentRangeHash.hash);
+ if (hashesAgree) {
+ matches.inc();
+ } else {
+ reportMismatch(context, currentRangeHash);
+ }
+ // Begin hashing the range that follows this one.
+ resultHasher.startBatch(currentRangeHash.stopExclusive);
+ }
+
+ private void reportMismatch(ProcessContext context, RangeHash currentRangeHash) {
+ mismatches.inc();
+ DOFN_LOG.info(
+ "MISMATCH ON RANGE [{}, {}).",
+ immutableBytesToString(currentRangeHash.startInclusive),
+ immutableBytesToString(currentRangeHash.stopExclusive));
+ context.output(currentRangeHash);
+ }
+}
diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/HadoopHashTableSource.java b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/HadoopHashTableSource.java
new file mode 100644
index 0000000000..f6ecf21e24
--- /dev/null
+++ b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/HadoopHashTableSource.java
@@ -0,0 +1,440 @@
+/*
+ * Copyright 2021 Google Inc. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package com.google.cloud.bigtable.beam.validation;
+
+import static com.google.cloud.bigtable.beam.validation.SyncTableUtils.immutableBytesToString;
+
+import com.google.bigtable.repackaged.com.google.api.core.InternalApi;
+import com.google.bigtable.repackaged.com.google.common.annotations.VisibleForTesting;
+import com.google.cloud.bigtable.beam.validation.HadoopHashTableSource.RangeHash;
+import com.google.cloud.bigtable.beam.validation.TableHashWrapper.TableHashReader;
+import com.google.common.base.Objects;
+import com.google.common.base.Preconditions;
+import com.google.common.collect.ImmutableList;
+import java.io.IOException;
+import java.io.ObjectInputStream;
+import java.io.ObjectOutputStream;
+import java.io.Serializable;
+import java.util.ArrayList;
+import java.util.List;
+import javax.annotation.Nullable;
+import org.apache.beam.sdk.coders.Coder;
+import org.apache.beam.sdk.coders.DefaultCoder;
+import org.apache.beam.sdk.io.BoundedSource;
+import org.apache.beam.sdk.options.PipelineOptions;
+import org.apache.beam.sdk.options.ValueProvider;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.hbase.HConstants;
+import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
+
+/**
+ * A beam source to read output of Hadoop HashTable job. The source creates 1 workitem per HashTable
+ * data file and emits a row-range/hash pair.
+ */
+@InternalApi
+public class HadoopHashTableSource extends BoundedSource implements Serializable {
+
+ private static final long serialVersionUID = 2383724L;
+
+ private static final Coder CODER = RangeHashCoder.of();
+
+ /**
+ * A simple POJO encapsulating a row range and the corresponding hash generated by HashTable job.
+ * TODO Evaluate if we can use AutoValue for this class.
+ */
+ @DefaultCoder(RangeHashCoder.class)
+ public static class RangeHash {
+
+ public final ImmutableBytesWritable startInclusive;
+ public final ImmutableBytesWritable stopExclusive;
+ public final ImmutableBytesWritable hash;
+
+ private RangeHash(
+ ImmutableBytesWritable startInclusive,
+ ImmutableBytesWritable stopExclusive,
+ ImmutableBytesWritable hash) {
+ this.startInclusive = startInclusive;
+ this.stopExclusive = stopExclusive;
+ this.hash = hash;
+ }
+
+ /**
+ * Builds a RangeHash for the given range and hash.
+ *
+ * @throws NullPointerException if any argument is null
+ */
+ static RangeHash of(
+ ImmutableBytesWritable startInclusive,
+ ImmutableBytesWritable stopExclusive,
+ ImmutableBytesWritable hash) {
+ // Arguments are evaluated left to right, so the null checks run in declaration order.
+ return new RangeHash(
+ Preconditions.checkNotNull(startInclusive),
+ Preconditions.checkNotNull(stopExclusive),
+ Preconditions.checkNotNull(hash));
+ }
+
+ @Override
+ public String toString() {
+ return String.format(
+ "RangeHash{ range = [ %s, %s), hash: %s }",
+ immutableBytesToString(startInclusive),
+ immutableBytesToString(stopExclusive),
+ immutableBytesToString(hash));
+ }
+
+ @Override
+ public boolean equals(Object o) {
+ if (this == o) {
+ return true;
+ }
+ if (!(o instanceof RangeHash)) {
+ return false;
+ }
+ RangeHash rangeHash = (RangeHash) o;
+ return Objects.equal(startInclusive, rangeHash.startInclusive)
+ && Objects.equal(stopExclusive, rangeHash.stopExclusive)
+ && Objects.equal(hash, rangeHash.hash);
+ }
+
+ @Override
+ public int hashCode() {
+ return Objects.hashCode(startInclusive, stopExclusive, hash);
+ }
+ }
+
+ public static final Log LOG = LogFactory.getLog(HadoopHashTableSource.class);
+
+ private final ValueProvider projectId;
+
+ // Path to the output of HashTable job. Usually in GCS.
+ private final ValueProvider sourceHashDir;
+
+ // Row range owned by this source.
+ // The Start and Stop row are serialized in a custom way.
+ @VisibleForTesting @Nullable transient ImmutableBytesWritable startRowInclusive;
+
+ @VisibleForTesting @Nullable transient ImmutableBytesWritable stopRowExclusive;
+
+ private final TableHashWrapperFactory tableHashWrapperFactory;
+
+ /**
+ * Creates a HadoopHashTableSource that reads HashTable data from hashTableOutputDir in GCS bucket
+ * in project $(projectId).
+ */
+ public HadoopHashTableSource(
+ ValueProvider projectId, ValueProvider sourceHashDir) {
+ this(projectId, sourceHashDir, /*startRowInclusive*/ null, /*stopRowExclusive*/ null);
+ }
+
+ /**
+ * Constructor to initialize a HadoopHashTableSource for a given row-range. Used for creating
+ * split sources.
+ */
+ @VisibleForTesting
+ HadoopHashTableSource(
+ ValueProvider projectId,
+ ValueProvider sourceHashDir,
+ @Nullable ImmutableBytesWritable startRowInclusive,
+ @Nullable ImmutableBytesWritable stopRowExclusive) {
+ this(
+ projectId,
+ sourceHashDir,
+ startRowInclusive,
+ stopRowExclusive,
+ new TableHashWrapperFactory());
+ }
+
+ @VisibleForTesting
+ HadoopHashTableSource(
+ ValueProvider projectId,
+ ValueProvider hadoopHashTableOutputDir,
+ @Nullable ImmutableBytesWritable startRowInclusive,
+ @Nullable ImmutableBytesWritable stopRowExclusive,
+ TableHashWrapperFactory tableHashWrapperFactory) {
+ this.projectId = projectId;
+ this.sourceHashDir = hadoopHashTableOutputDir;
+ // startRow and stopRow will be null when the template is initialized. startRow and stopRow are
+ // read from the hashTableOutputDir, which is only available at pipeline runtime.
+ this.startRowInclusive = startRowInclusive;
+ this.stopRowExclusive = stopRowExclusive;
+ this.tableHashWrapperFactory = tableHashWrapperFactory;
+ }
+
+ @Override
+ public List extends BoundedSource> split(
+ long desiredBundleSizeBytes, PipelineOptions options) throws IOException {
+ // This method relies on the partitioning done by HBase-HashTable job. There is a possibility
+ // of stragglers. SyncTable handles it by using a group by and further splitting workitems.
+ TableHashWrapper hash =
+ tableHashWrapperFactory.getTableHash(projectId.get(), sourceHashDir.get());
+
+ ImmutableList partitions = hash.getPartitions();
+ int numPartitions = partitions.size();
+
+ List splitSources = new ArrayList<>(numPartitions + 1);
+ if (numPartitions == 0) {
+ // There are 0 partitions and 1 hashfile, return single source with full key range.
+ splitSources.add(
+ new HadoopHashTableSource(
+ projectId,
+ sourceHashDir,
+ hash.getStartRow(),
+ hash.getStopRow(),
+ tableHashWrapperFactory));
+ return splitSources;
+ }
+
+ // Use the HashTable start key. The value is HConstants.EMPTY_START_ROW for full table scan.
+ ImmutableBytesWritable nextStartRow = hash.getStartRow();
+ ImmutableBytesWritable stopRow = hash.getStopRow();
+
+ // The output of HashTable is organized as partition file and a set of datafiles.
+ // Partition file contains a list of partitions, these partitions split the key-range of a table
+ // into roughly equal row-ranges and hashes for these row-ranges are stored in a single
+ // datafile.
+ //
+ // There are always numPartitions +1 data files. Datafile(i) covers hashes for [partition{i-1},
+ // partition{i}).
+ // So a partition file containing entries [b,f] for a table with row range [a,z] will have 3
+ // data files containing hashes.
+ // file0 will contain [a(nextStartRow), b), file1 will contain [b,f), and file2 will contain
+ // [f,z(stopRow))
+ for (int i = 0; i < numPartitions; i++) {
+ // TODO make a utility function that generates [start, end) format from start/end.
+ LOG.debug(
+ "Adding: ["
+ + immutableBytesToString(nextStartRow.get())
+ + ", "
+ + immutableBytesToString(partitions.get(i).get())
+ + ")");
+ splitSources.add(
+ new HadoopHashTableSource(
+ projectId, sourceHashDir, nextStartRow, partitions.get(i), tableHashWrapperFactory));
+ nextStartRow = partitions.get(i);
+ }
+ // Add the last range for [lastPartition, stopRow).
+ LOG.debug(
+ "Adding: ["
+ + immutableBytesToString(nextStartRow.get())
+ + ", "
+ + immutableBytesToString(stopRow.get())
+ + ")");
+ // Add the last range for [lastPartition, stopRow).
+ splitSources.add(
+ new HadoopHashTableSource(
+ projectId, sourceHashDir, nextStartRow, stopRow, tableHashWrapperFactory));
+ LOG.info("Returning " + splitSources.size() + " sources from " + numPartitions + " partitions");
+ return splitSources;
+ }
+
+ @Override
+ public Coder getOutputCoder() {
+ return CODER;
+ }
+
+ @Override
+ public long getEstimatedSizeBytes(PipelineOptions options) throws Exception {
+ // HashTable data files don't expose a method to estimate size or lineCount.
+ return 0;
+ }
+
+ @Override
+ public BoundedReader createReader(PipelineOptions options) throws IOException {
+ TableHashWrapper hash =
+ tableHashWrapperFactory.getTableHash(projectId.get(), sourceHashDir.get());
+
+ // The row range for an un-split source is determined from the output of HashTable job.
+ // HashTableOutputDir is a runtime parameter and hence not available at construction time, so
+ // populate the start and stop here.
+ if (startRowInclusive == null || stopRowExclusive == null) {
+ startRowInclusive = hash.getStartRow();
+ stopRowExclusive = hash.getStopRow();
+ }
+
+ return new HashBasedReader(
+ this,
+ startRowInclusive,
+ stopRowExclusive,
+ hash.newReader(
+ SyncTableUtils.createConfiguration(this.projectId.get(), this.sourceHashDir.get()),
+ startRowInclusive));
+ }
+
+ @Override
+ public boolean equals(Object o) {
+ if (this == o) {
+ return true;
+ }
+ if (!(o instanceof HadoopHashTableSource)) {
+ return false;
+ }
+ HadoopHashTableSource that = (HadoopHashTableSource) o;
+ return Objects.equal(projectId, that.projectId)
+ && Objects.equal(sourceHashDir, that.sourceHashDir)
+ && Objects.equal(startRowInclusive, that.startRowInclusive)
+ && Objects.equal(stopRowExclusive, that.stopRowExclusive);
+ }
+
+ @Override
+ public int hashCode() {
+ return Objects.hashCode(projectId, sourceHashDir, startRowInclusive, stopRowExclusive);
+ }
+
+ @Override
+ public String toString() {
+ return "HadoopHashTableSource ["
+ + immutableBytesToString(startRowInclusive)
+ + ", "
+ + immutableBytesToString(stopRowExclusive)
+ + ')';
+ }
+
+ private void writeObject(ObjectOutputStream s) throws IOException {
+ s.defaultWriteObject();
+ // Start and Stop can be null, write a boolean to indicate if start/stop is expected.
+ if (startRowInclusive == null) {
+ s.writeBoolean(false);
+ } else {
+ s.writeBoolean(true);
+ s.writeObject(startRowInclusive.copyBytes());
+ }
+
+ if (stopRowExclusive == null) {
+ s.writeBoolean(false);
+ } else {
+ s.writeBoolean(true);
+ s.writeObject(stopRowExclusive.copyBytes());
+ }
+ }
+
+ /**
+ * Custom deserialization hook. Restores the default-serialized fields, then the transient
+ * start/stop rows in the layout written by writeObject: a presence flag, followed by the row
+ * bytes when the flag is true. A row whose flag is false is left unset.
+ */
+ private void readObject(ObjectInputStream s) throws IOException, ClassNotFoundException {
+ s.defaultReadObject();
+ // start/stop can be null, they are preceded by a boolean indicating their presence.
+ if (s.readBoolean()) {
+ startRowInclusive = new ImmutableBytesWritable((byte[]) s.readObject());
+ }
+ if (s.readBoolean()) {
+ stopRowExclusive = new ImmutableBytesWritable((byte[]) s.readObject());
+ }
+ }
+
+ @VisibleForTesting
+ static class HashBasedReader extends BoundedReader {
+
+ private final HadoopHashTableSource source;
+ private final TableHashReader reader;
+
+ @VisibleForTesting final ImmutableBytesWritable startRowInclusive;
+ @VisibleForTesting final ImmutableBytesWritable stopRowExclusive;
+
+ // Flag indicating that this workitem is finished.
+ private boolean isDone = false;
+ private ImmutableBytesWritable currentRangeStartKey;
+ // Hash for the current range.
+ private ImmutableBytesWritable currentHash;
+ private RangeHash currentRangeHash;
+
+ public HashBasedReader(
+ HadoopHashTableSource source,
+ ImmutableBytesWritable startRowInclusive,
+ ImmutableBytesWritable stopRowExclusive,
+ TableHashReader reader) {
+ this.source = source;
+ this.startRowInclusive = startRowInclusive;
+ this.stopRowExclusive = stopRowExclusive;
+ this.reader = reader;
+ }
+
+ @Override
+ public boolean start() throws IOException {
+ LOG.debug(
+ "Starting a new reader at key range ["
+ + immutableBytesToString(startRowInclusive)
+ + " ,"
+ + immutableBytesToString(stopRowExclusive)
+ + ").");
+
+ if (readNextKey()) {
+ // Dataflow calls start, followed by getCurrent. HashBased reader needs to read on TableHash
+ // twice to return a RangeHash since it specifies both range-start and range-end.
+ advance();
+ return true;
+ }
+
+ isDone = true;
+ return false;
+ }
+
+ @Override
+ public boolean advance() throws IOException {
+ if (isDone) {
+ LOG.debug("Ending workitem at key " + immutableBytesToString(currentRangeStartKey) + " .");
+ return false;
+ }
+
+ ImmutableBytesWritable startKey = this.currentRangeStartKey;
+ ImmutableBytesWritable hash = this.currentHash;
+
+ // if there is nothing to read, we are done. readNextKey advances the currentRangeStartKey.
+ isDone = !readNextKey();
+ currentRangeHash = RangeHash.of(startKey, currentRangeStartKey, hash);
+
+ return true;
+ }
+
+ // Returns true if a key can be read for this workitem.
+ //
+ // Updates currentRangeStartKey and currentHash as a side effect:
+ // - key read and in bounds: currentRangeStartKey is that key, currentHash is its hash.
+ // - key read but at/after stopRowExclusive: currentRangeStartKey is that out-of-bounds key,
+ // currentHash is null.
+ // - underlying reader exhausted: currentRangeStartKey is stopRowExclusive, currentHash is null.
+ // advance() uses the updated currentRangeStartKey as the stop key of the range it emits.
+ private boolean readNextKey() throws IOException {
+ if (reader.next()) {
+ currentRangeStartKey = reader.getCurrentKey();
+ if ( // StopRow is not set, everything is in bounds.
+ (stopRowExclusive.equals(HConstants.EMPTY_END_ROW)
+ || currentRangeStartKey.compareTo(stopRowExclusive) < 0)) { // currentKey < stopKey
+ // There is a key to read and the key is within the bounds of this workitem. Return true.
+ currentHash = reader.getCurrentHash();
+ return true;
+ } else {
+ // There is a key to read but its outside of the bounds of this workitem.
+ currentHash = null;
+ return false;
+ }
+ }
+
+ // Nothing left to read for this workitem. Next range would have started from
+ // stopRowExclusive.
+ currentRangeStartKey = stopRowExclusive;
+ currentHash = null;
+ return false;
+ }
+
+ @Override
+ public RangeHash getCurrent() {
+ return currentRangeHash;
+ }
+
+ @Override
+ public void close() throws IOException {
+ LOG.info(
+ "Finishing a reader for key range ["
+ + immutableBytesToString(startRowInclusive)
+ + " ,"
+ + immutableBytesToString(stopRowExclusive)
+ + "). Ending at "
+ + immutableBytesToString(currentRangeStartKey));
+ reader.close();
+ }
+
+ @Override
+ public BoundedSource getCurrentSource() {
+ return source;
+ }
+ }
+}
diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/RangeHashCoder.java b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/RangeHashCoder.java
new file mode 100644
index 0000000000..d6341a08f2
--- /dev/null
+++ b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/RangeHashCoder.java
@@ -0,0 +1,105 @@
+/*
+ * Copyright 2021 Google Inc. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package com.google.cloud.bigtable.beam.validation;
+
+import com.google.cloud.bigtable.beam.validation.HadoopHashTableSource.RangeHash;
+import java.io.DataInputStream;
+import java.io.DataOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InvalidObjectException;
+import java.io.OutputStream;
+import java.util.Collections;
+import java.util.List;
+import org.apache.beam.sdk.coders.Coder;
+import org.apache.beam.sdk.coders.CoderException;
+import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
+
+/** Coder used by Beam to encode/decode {@link RangeHash} objects. */
+public class RangeHashCoder extends Coder<RangeHash> {
+
+  /** Creates a coder for {@link RangeHash}. The coder has no state; all instances are equal. */
+  public static Coder<RangeHash> of() {
+    return new RangeHashCoder();
+  }
+
+  /**
+   * Writes the start key, stop key and hash of {@code value}, in that order, to {@code outStream}.
+   *
+   * @param value the range hash to encode; must not be null.
+   * @param outStream the stream to write to.
+   * @throws CoderException if {@code value} is null.
+   * @throws IOException if writing to the stream fails.
+   */
+  @Override
+  public void encode(RangeHash value, OutputStream outStream) throws IOException {
+    if (value == null) {
+      throw new CoderException("Can not encode null objects.");
+    }
+    DataOutputStream dataOutputStream = new DataOutputStream(outStream);
+    // RangeHash fields can never be null.
+    value.startInclusive.write(dataOutputStream);
+    value.stopExclusive.write(dataOutputStream);
+    value.hash.write(dataOutputStream);
+  }
+
+  /**
+   * Reads a {@link RangeHash} from {@code inStream}. Fields are read in the same order in which
+   * {@link #encode} writes them: start key, stop key, hash.
+   *
+   * @param inStream the stream to read from.
+   * @return the decoded range hash.
+   * @throws IOException if reading from the stream fails.
+   */
+  @Override
+  public RangeHash decode(InputStream inStream) throws IOException {
+    DataInputStream dataInputStream = new DataInputStream(inStream);
+
+    ImmutableBytesWritable startInclusive = new ImmutableBytesWritable();
+    startInclusive.readFields(dataInputStream);
+
+    ImmutableBytesWritable stopExclusive = new ImmutableBytesWritable();
+    stopExclusive.readFields(dataInputStream);
+
+    ImmutableBytesWritable hash = new ImmutableBytesWritable();
+    hash.readFields(dataInputStream);
+
+    return RangeHash.of(startInclusive, stopExclusive, hash);
+  }
+
+  /** This coder is not parameterized by other coders, so there are no coder arguments. */
+  @Override
+  public List<? extends Coder<?>> getCoderArguments() {
+    return Collections.emptyList();
+  }
+
+  @Override
+  public void verifyDeterministic() throws NonDeterministicException {
+    // This is a deterministic coder as it writes the byte[] in order.
+  }
+
+  /**
+   * !!! DO NOT DELETE !!!
+   *
+   * <p>See readObjectNoData method in:
+   * https://docs.oracle.com/javase/7/docs/platform/serialization/spec/input.html#6053.
+   *
+   * <p>Disable backwards compatibility with previous versions that were serialized.
+   *
+   * @throws InvalidObjectException always.
+   */
+  @SuppressWarnings("unused")
+  private void readObjectNoData() throws InvalidObjectException {
+    throw new InvalidObjectException("Hash data required");
+  }
+
+  @Override
+  protected Object clone() throws CloneNotSupportedException {
+    return super.clone();
+  }
+
+  /** Any two {@code RangeHashCoder} instances are equal, since the coder carries no state. */
+  @Override
+  public boolean equals(Object other) {
+    return other instanceof RangeHashCoder;
+  }
+
+  /** Consistent with {@link #equals}: every instance shares the hash code of the class. */
+  @Override
+  public int hashCode() {
+    return RangeHashCoder.class.hashCode();
+  }
+}
diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/SyncTableJob.java b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/SyncTableJob.java
new file mode 100644
index 0000000000..56b38fc3cb
--- /dev/null
+++ b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/SyncTableJob.java
@@ -0,0 +1,193 @@
+/*
+ * Copyright 2021 Google Inc. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package com.google.cloud.bigtable.beam.validation;
+
+import com.google.bigtable.repackaged.com.google.api.core.InternalExtensionOnly;
+import com.google.bigtable.repackaged.com.google.gson.Gson;
+import com.google.cloud.bigtable.beam.sequencefiles.Utils;
+import com.google.cloud.bigtable.beam.validation.HadoopHashTableSource.RangeHash;
+import com.google.common.annotations.VisibleForTesting;
+import java.util.List;
+import org.apache.beam.sdk.Pipeline;
+import org.apache.beam.sdk.PipelineResult;
+import org.apache.beam.sdk.extensions.gcp.options.GcpOptions;
+import org.apache.beam.sdk.io.Read;
+import org.apache.beam.sdk.io.TextIO;
+import org.apache.beam.sdk.metrics.MetricQueryResults;
+import org.apache.beam.sdk.metrics.MetricResult;
+import org.apache.beam.sdk.options.Default;
+import org.apache.beam.sdk.options.Description;
+import org.apache.beam.sdk.options.PipelineOptionsFactory;
+import org.apache.beam.sdk.options.ValueProvider;
+import org.apache.beam.sdk.transforms.GroupByKey;
+import org.apache.beam.sdk.transforms.MapElements;
+import org.apache.beam.sdk.transforms.ParDo;
+import org.apache.beam.sdk.transforms.SimpleFunction;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+
+/**
+ * A job that takes HBase HashTable output and compares the hashes from Cloud Bigtable table.
+ *
+ * <p>Execute the following command to run the job directly:
+ *
+ * <pre>
+ * mvn compile exec:java \
+ *   -DmainClass=com.google.cloud.bigtable.beam.validation.SyncTableJob \
+ *   -Dexec.args="--runner=DataflowRunner \
+ *            --project=$PROJECT \
+ *            --bigtableInstanceId=$INSTANCE \
+ *            --bigtableTableId=$TABLE \
+ *            --hashTableOutputDir=$SOURCE_HASH_DIR \
+ *            --outputPrefix=$OUTPUT_PREFIX \
+ *            --stagingLocation=$STAGING_LOC \
+ *            --tempLocation=$TMP_LOC \
+ *            --region=$REGION \
+ *            --workerZone=$WORKER_ZONE"
+ * </pre>
+ *
+ * <p>Execute the following command to create the Dataflow template:
+ *
+ * <pre>
+ * mvn compile exec:java \
+ *   -DmainClass=com.google.cloud.bigtable.beam.validation.SyncTableJob \
+ *   -Dexec.args="--runner=DataflowRunner \
+ *                --project=$PROJECT \
+ *                --stagingLocation=gs://$STAGING_PATH \
+ *                --templateLocation=gs://$TEMPLATE_PATH \
+ *                --wait=false"
+ * </pre>
+ *
+ * <p>There are a few ways to run the pipeline using the template. See Dataflow doc for details:
+ * https://cloud.google.com/dataflow/docs/templates/executing-templates. Optionally, you can upload
+ * a metadata file that contains information about the runtime parameters that can be used for
+ * parameter validation purpose and more. A sample metadata file can be found at
+ * "src/main/resources/SyncTableJob_metadata".
+ *
+ * <p>An example using gcloud command line:
+ *
+ * <pre>
+ * gcloud beta dataflow jobs run $JOB_NAME \
+ *   --gcs-location gs://$TEMPLATE_PATH \
+ *   --parameters bigtableProject=$PROJECT,bigtableInstanceId=$INSTANCE,bigtableTableId=$TABLE,hashTableOutputDir=gs://$SOURCE_HASH_DIR,outputPrefix=$OUTPUT_PREFIX
+ * </pre>
+ */
+@InternalExtensionOnly
+public class SyncTableJob {
+
+  private static final Log LOG = LogFactory.getLog(SyncTableJob.class);
+
+  /** Pipeline options for the job. Each getter defines a command line flag of the same name. */
+  public interface SyncTableOptions extends GcpOptions {
+
+    @Description("This Bigtable App Profile id.")
+    ValueProvider<String> getBigtableAppProfileId();
+
+    @SuppressWarnings("unused")
+    void setBigtableAppProfileId(ValueProvider<String> appProfileId);
+
+    @Description("The project that contains the table to validate. Defaults to --project.")
+    @Default.InstanceFactory(Utils.DefaultBigtableProjectFactory.class)
+    ValueProvider<String> getBigtableProject();
+
+    @SuppressWarnings("unused")
+    void setBigtableProject(ValueProvider<String> projectId);
+
+    @Description("The Bigtable instance id that contains the table to validate.")
+    ValueProvider<String> getBigtableInstanceId();
+
+    @SuppressWarnings("unused")
+    void setBigtableInstanceId(ValueProvider<String> instanceId);
+
+    @Description("The Bigtable table id to validate.")
+    ValueProvider<String> getBigtableTableId();
+
+    @SuppressWarnings("unused")
+    void setBigtableTableId(ValueProvider<String> tableId);
+
+    @Description("HBase HashTable job output dir.")
+    ValueProvider<String> getHashTableOutputDir();
+
+    @SuppressWarnings("unused")
+    // TODO: Rename it to sourceHashDir as in HBase sync table job.
+    void setHashTableOutputDir(ValueProvider<String> hashTableOutputDir);
+
+    @Description("File pattern for files containing mismatched row ranges.")
+    ValueProvider<String> getOutputPrefix();
+
+    @SuppressWarnings("unused")
+    void setOutputPrefix(ValueProvider<String> outputPrefix);
+
+    // When creating a template, this flag must be set to false.
+    @Description("Wait for pipeline to finish.")
+    @Default.Boolean(true)
+    boolean getWait();
+
+    @SuppressWarnings("unused")
+    void setWait(boolean wait);
+  }
+
+  /**
+   * Parses the options, builds and runs the pipeline. When {@code --wait} is true, blocks until the
+   * pipeline finishes and then logs its counters.
+   *
+   * @param args command line arguments, parsed into {@link SyncTableOptions}.
+   */
+  public static void main(String[] args) {
+    PipelineOptionsFactory.register(SyncTableOptions.class);
+
+    SyncTableOptions opts =
+        PipelineOptionsFactory.fromArgs(args).withValidation().as(SyncTableOptions.class);
+
+    LOG.info("===> Building Pipeline");
+    Pipeline pipeline = buildPipeline(opts);
+
+    LOG.info("===> Running Pipeline");
+    PipelineResult result = pipeline.run();
+
+    if (!opts.getWait()) {
+      // The job was only submitted (or a template was created); the match/mismatch counters are
+      // not final at this point, so there is nothing meaningful to report.
+      return;
+    }
+
+    Utils.waitForPipelineToFinish(result);
+    logCounters(result);
+  }
+
+  /** Logs all the counters of the finished pipeline: number of matches and mismatches. */
+  private static void logCounters(PipelineResult result) {
+    try {
+      MetricQueryResults metrics = result.metrics().allMetrics();
+      for (MetricResult<Long> counter : metrics.getCounters()) {
+        LOG.warn(counter.getName() + ":" + counter.getAttempted());
+      }
+    } catch (UnsupportedOperationException e) {
+      // PipelineResult#metrics is documented to throw this when metrics can not be retrieved.
+      // Reporting counters is best effort and must not fail an otherwise completed run.
+      LOG.warn("Pipeline metrics are not available for this run.", e);
+    }
+  }
+
+  /**
+   * Assembles the validation pipeline: read the HashTable output, group it into workitems, validate
+   * the hashes, serialize the resulting ranges and write them to text files under the output
+   * prefix.
+   *
+   * @param opts options describing the Bigtable table, the HashTable output and the output prefix.
+   * @return the assembled, not yet running, pipeline.
+   */
+  @VisibleForTesting
+  public static Pipeline buildPipeline(SyncTableOptions opts) {
+    Pipeline pipeline = Pipeline.create(Utils.tweakOptions(opts));
+    pipeline
+        .apply(
+            "Read HBase HashTable output",
+            Read.from(
+                new BufferedHadoopHashTableSource(
+                    new HadoopHashTableSource(
+                        opts.getBigtableProject(), opts.getHashTableOutputDir()))))
+        .apply(
+            "group by and create granular workitems",
+            GroupByKey.<String, List<RangeHash>>create())
+        .apply("validate hash", ParDo.of(new ComputeAndValidateHashFromBigtableDoFn(opts)))
+        .apply("Serialize the ranges", MapElements.via(new RangeHashToString()))
+        .apply("Write to file", TextIO.write().to(opts.getOutputPrefix()).withSuffix(".txt"));
+    return pipeline;
+  }
+
+  /** Serializes a {@link RangeHash} to its JSON representation using Gson. */
+  static class RangeHashToString extends SimpleFunction<RangeHash, String> {
+    // TODO maybe explore a sequenceFile sink for RangeHash. Hadoop jobs using this output may be
+    // easier to write for sequence file.
+    private static final Gson GSON = new Gson();
+
+    @Override
+    public String apply(RangeHash input) {
+      return GSON.toJson(input);
+    }
+  }
+}
diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/SyncTableUtils.java b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/SyncTableUtils.java
new file mode 100644
index 0000000000..cc92bea6a4
--- /dev/null
+++ b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/SyncTableUtils.java
@@ -0,0 +1,57 @@
+/*
+ * Copyright 2021 Google Inc. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package com.google.cloud.bigtable.beam.validation;
+
+import com.google.bigtable.repackaged.com.google.api.core.InternalApi;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.hbase.HBaseConfiguration;
+import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
+import org.apache.hadoop.hbase.util.Bytes;
+
+/** Utility class for SyncTable job. */
+@InternalApi
+public class SyncTableUtils {
+
+  // Static helpers only; not meant to be instantiated.
+  private SyncTableUtils() {}
+
+  /**
+   * Renders the bytes held by {@code bytes} as a printable string.
+   *
+   * @param bytes the writable to render; may be null.
+   * @return an empty string when {@code bytes} is null, otherwise the binary-escaped string form of
+   *     the bytes between the writable's offset and offset + length.
+   */
+  public static String immutableBytesToString(ImmutableBytesWritable bytes) {
+    if (bytes == null) {
+      return "";
+    }
+    // get() returns the whole backing array. Honor offset and length so that a writable wrapping
+    // a slice of a larger buffer is rendered as just that slice.
+    return Bytes.toStringBinary(bytes.get(), bytes.getOffset(), bytes.getLength());
+  }
+
+  /**
+   * Renders {@code bytes} as a printable string.
+   *
+   * @param bytes the bytes to render.
+   * @return the result of {@link Bytes#toStringBinary(byte[])} for {@code bytes}.
+   */
+  public static String immutableBytesToString(byte[] bytes) {
+    return Bytes.toStringBinary(bytes);
+  }
+
+  /**
+   * Creates a HBase configuration for reading HashTable output from GCS bucket located in
+   * projectId.
+   *
+   * @param projectId project containing the GCS bucket holding hashtable output.
+   * @param sourceHashDir location of hashtable output from HBase.
+   * @return a new HBase configuration with the GCS file system, project id, default file system
+   *     and service account authentication settings applied.
+   */
+  public static Configuration createConfiguration(String projectId, String sourceHashDir) {
+    Configuration conf = HBaseConfiguration.create();
+    conf.set("fs.AbstractFileSystem.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS");
+    conf.set("fs.gs.project.id", projectId);
+    conf.set("fs.defaultFS", sourceHashDir);
+    conf.set("google.cloud.auth.service.account.enable", "true");
+    return conf;
+  }
+}
diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/TableHashWrapper.java b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/TableHashWrapper.java
new file mode 100644
index 0000000000..55200570ed
--- /dev/null
+++ b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/TableHashWrapper.java
@@ -0,0 +1,55 @@
+/*
+ * Copyright 2021 Google Inc. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package com.google.cloud.bigtable.beam.validation;
+
+import com.google.bigtable.repackaged.com.google.api.core.InternalApi;
+import com.google.common.collect.ImmutableList;
+import java.io.Closeable;
+import java.io.IOException;
+import java.io.Serializable;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.hbase.client.Scan;
+import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
+
+/**
+ * Wraps HashTable.TableHash object and delegates the calls to it. This interface exposes the
+ * minimal surface required from TableHash. It is required for mocking purposes in unit tests.
+ */
+@InternalApi
+public interface TableHashWrapper extends Serializable {
+
+  /** Returns the number of hash files recorded in the wrapped TableHash. */
+  int getNumHashFiles();
+
+  /** Returns the partition keys recorded in the wrapped TableHash. */
+  ImmutableList<ImmutableBytesWritable> getPartitions();
+
+  /** Returns the start row recorded in the wrapped TableHash. */
+  ImmutableBytesWritable getStartRow();
+
+  /** Returns the stop row recorded in the wrapped TableHash. */
+  ImmutableBytesWritable getStopRow();
+
+  /** Returns a scan initialized from the wrapped TableHash. */
+  Scan getScan();
+
+  /**
+   * Opens a reader over the hashes.
+   *
+   * @param conf configuration used to open the reader.
+   * @param startRow row key handed to the underlying TableHash reader as its starting position.
+   * @return a reader; callers are responsible for closing it.
+   */
+  TableHashReader newReader(Configuration conf, ImmutableBytesWritable startRow);
+
+  /** Cursor-style reader over (key, hash) entries. */
+  interface TableHashReader extends Closeable {
+    /** Advances to the next entry; returns false when there is no further entry. */
+    boolean next() throws IOException;
+
+    /** Returns the key of the current entry. */
+    ImmutableBytesWritable getCurrentKey();
+
+    /** Returns the hash of the current entry. */
+    ImmutableBytesWritable getCurrentHash();
+
+    @Override
+    void close() throws IOException;
+  }
+}
diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/TableHashWrapperFactory.java b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/TableHashWrapperFactory.java
new file mode 100644
index 0000000000..a4e3544519
--- /dev/null
+++ b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/TableHashWrapperFactory.java
@@ -0,0 +1,35 @@
+/*
+ * Copyright 2021 Google Inc. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.google.cloud.bigtable.beam.validation;
+
+import static com.google.cloud.bigtable.beam.validation.SyncTableUtils.createConfiguration;
+
+import com.google.bigtable.repackaged.com.google.api.core.InternalApi;
+import java.io.IOException;
+import java.io.Serializable;
+
+/** Factory to create a TableHashWrapper. */
+@InternalApi
+public class TableHashWrapperFactory implements Serializable {
+
+ private static final long serialVersionUID = 265433454L;
+
+ /**
+ * Builds a configuration from projectId and sourceHashDir via
+ * SyncTableUtils.createConfiguration and uses it to create a TableHashWrapper for sourceHashDir.
+ *
+ * @param projectId passed to createConfiguration as the project id.
+ * @param sourceHashDir location of the HashTable output; used both for the configuration and as
+ * the directory handed to TableHashWrapperImpl.create.
+ * @return the wrapper created by TableHashWrapperImpl.create.
+ * @throws IOException propagated from TableHashWrapperImpl.create.
+ */
+ public TableHashWrapper getTableHash(String projectId, String sourceHashDir) throws IOException {
+ return TableHashWrapperImpl.create(
+ createConfiguration(projectId, sourceHashDir), sourceHashDir);
+ }
+}
diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/TableHashWrapperImpl.java b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/TableHashWrapperImpl.java
new file mode 100644
index 0000000000..b04bd538a6
--- /dev/null
+++ b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/TableHashWrapperImpl.java
@@ -0,0 +1,118 @@
+/*
+ * Copyright 2021 Google Inc. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package com.google.cloud.bigtable.beam.validation;
+
+import static com.google.cloud.bigtable.beam.validation.SyncTableUtils.immutableBytesToString;
+
+import com.google.common.base.Preconditions;
+import com.google.common.collect.ImmutableList;
+import java.io.IOException;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hbase.client.Scan;
+import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
+import org.apache.hadoop.hbase.mapreduce.BigtableTableHashAccessor;
+import org.apache.hadoop.hbase.mapreduce.HashTable.TableHash;
+import org.apache.hadoop.hbase.mapreduce.HashTable.TableHash.Reader;
+
+class TableHashWrapperImpl implements TableHashWrapper {
+
+ static TableHashWrapper create(Configuration conf, String hashTableOutputDir) throws IOException {
+ TableHash tableHash = TableHash.read(conf, new Path(hashTableOutputDir));
+
+ TableHashWrapper tableHashWrapper = new TableHashWrapperImpl(tableHash);
+ Preconditions.checkArgument(
+ tableHashWrapper.getNumHashFiles() == (tableHashWrapper.getPartitions().size() + 1),
+ "Corrupt hashtable output. %d hash files for %d partitions. Expected %d files.",
+ tableHashWrapper.getNumHashFiles(),
+ tableHashWrapper.getPartitions().size(),
+ tableHashWrapper.getPartitions().size() + 1);
+ return tableHashWrapper;
+ }
+
+ private final TableHash hash;
+
+ private TableHashWrapperImpl(TableHash hash) {
+ this.hash = hash;
+ }
+
+ public int getNumHashFiles() {
+ return BigtableTableHashAccessor.getNumHashFiles(hash);
+ }
+
+ public ImmutableList getPartitions() {
+ return BigtableTableHashAccessor.getPartitions(hash);
+ }
+
+ public ImmutableBytesWritable getStartRow() {
+ return BigtableTableHashAccessor.getStartRow(hash);
+ }
+
+ public ImmutableBytesWritable getStopRow() {
+ return BigtableTableHashAccessor.getStopRow(hash);
+ }
+
+ public Scan getScan() {
+ try {
+ return BigtableTableHashAccessor.getScan(hash);
+ } catch (IOException e) {
+ throw new RuntimeException("Failed to init a scan from TableHash: ", e);
+ }
+ }
+
+ public TableHashReader newReader(Configuration conf, ImmutableBytesWritable startRow) {
+ try {
+ return TableHashReaderImpl.create(hash.newReader(conf, startRow));
+ } catch (IOException e) {
+ throw new RuntimeException(
+ "Failed to open reader at " + immutableBytesToString(startRow.copyBytes()), e);
+ }
+ }
+
+ static class TableHashReaderImpl implements TableHashReader {
+
+ private final Reader reader;
+
+ static TableHashReaderImpl create(TableHash.Reader reader) {
+ Preconditions.checkNotNull(reader, "Reader can not be null.");
+ return new TableHashReaderImpl(reader);
+ }
+
+ private TableHashReaderImpl(TableHash.Reader reader) {
+ this.reader = reader;
+ }
+
+ @Override
+ public boolean next() throws IOException {
+ return reader.next();
+ }
+
+ @Override
+ public ImmutableBytesWritable getCurrentKey() {
+ return reader.getCurrentKey();
+ }
+
+ @Override
+ public ImmutableBytesWritable getCurrentHash() {
+ return reader.getCurrentHash();
+ }
+
+ @Override
+ public void close() throws IOException {
+ reader.close();
+ }
+ }
+}
diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/org/apache/hadoop/hbase/mapreduce/BigtableTableHashAccessor.java b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/org/apache/hadoop/hbase/mapreduce/BigtableTableHashAccessor.java
new file mode 100644
index 0000000000..a7db0add1c
--- /dev/null
+++ b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/org/apache/hadoop/hbase/mapreduce/BigtableTableHashAccessor.java
@@ -0,0 +1,79 @@
+/*
+ * Copyright 2021 Google Inc. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hbase.mapreduce;
+
+import com.google.bigtable.repackaged.com.google.api.core.InternalApi;
+import com.google.common.collect.ImmutableList;
+import java.io.IOException;
+import org.apache.hadoop.hbase.client.Result;
+import org.apache.hadoop.hbase.client.Scan;
+import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
+import org.apache.hadoop.hbase.mapreduce.HashTable.ResultHasher;
+import org.apache.hadoop.hbase.mapreduce.HashTable.TableHash;
+
+/**
+ * A helper class to access package private fields of HashTable.TableHash. It lives in the
+ * org.apache.hadoop.hbase.mapreduce package so that those members are visible to it.
+ */
+@InternalApi
+public class BigtableTableHashAccessor {
+
+  // Restrict object creation. This class should only be used to access state from TableHash.
+  private BigtableTableHashAccessor() {}
+
+  /** Returns the {@code numHashFiles} field of {@code hash}. */
+  public static int getNumHashFiles(TableHash hash) {
+    return hash.numHashFiles;
+  }
+
+  /** Returns an immutable copy of the {@code partitions} field of {@code hash}. */
+  public static ImmutableList<ImmutableBytesWritable> getPartitions(TableHash hash) {
+    return ImmutableList.copyOf(hash.partitions);
+  }
+
+  /** Returns the {@code startRow} field of {@code hash} wrapped in a new writable. */
+  public static ImmutableBytesWritable getStartRow(TableHash hash) {
+    return new ImmutableBytesWritable(hash.startRow);
+  }
+
+  /** Returns the {@code stopRow} field of {@code hash} wrapped in a new writable. */
+  public static ImmutableBytesWritable getStopRow(TableHash hash) {
+    return new ImmutableBytesWritable(hash.stopRow);
+  }
+
+  /**
+   * Returns the scan produced by {@code hash.initScan()}.
+   *
+   * @throws IOException propagated from {@code initScan()}.
+   */
+  public static Scan getScan(TableHash hash) throws IOException {
+    return hash.initScan();
+  }
+
+  // Wrapper to access package private class ResultHasher. Delegates all the calls to underlying
+  // TableHash.ResultHasher, helps in mocking for unit tests.
+  public static class BigtableResultHasher {
+    private final ResultHasher hasher;
+
+    public BigtableResultHasher() {
+      hasher = new ResultHasher();
+    }
+
+    /** Delegates to {@code ResultHasher.startBatch}. */
+    public void startBatch(ImmutableBytesWritable batchStartKey) {
+      hasher.startBatch(batchStartKey);
+    }
+
+    /** Delegates to {@code ResultHasher.finishBatch}. */
+    public void finishBatch() {
+      hasher.finishBatch();
+    }
+
+    /** Delegates to {@code ResultHasher.getBatchHash}. */
+    public ImmutableBytesWritable getBatchHash() {
+      return hasher.getBatchHash();
+    }
+
+    /** Delegates to {@code ResultHasher.hashResult}. */
+    public void hashResult(Result result) {
+      hasher.hashResult(result);
+    }
+  }
+}
diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/generate_test_data.txt b/bigtable-dataflow-parent/bigtable-beam-import/src/test/generate_test_data.txt
new file mode 100644
index 0000000000..6e66d3e096
--- /dev/null
+++ b/bigtable-dataflow-parent/bigtable-beam-import/src/test/generate_test_data.txt
@@ -0,0 +1,133 @@
+// Run from HBase shell. Run `hbase shell` from unix terminal on HBase master.
+create 'test', 'cf', {SPLITS => ["1", "2", "3", "4", "5", "6", "7", "8", "9"]}
+put 'test','1', 'cf:a', 'value1', 100
+put 'test','2', 'cf:a', 'value2', 100
+put 'test','3', 'cf:a', 'value3', 100
+put 'test','4', 'cf:a', 'value4', 100
+put 'test','5', 'cf:a', 'value5', 100
+put 'test','6', 'cf:a', 'value6', 100
+put 'test','7', 'cf:a', 'value7', 100
+put 'test','8', 'cf:a', 'value8', 100
+put 'test','9', 'cf:a', 'value9', 100
+put 'test','10', 'cf:a', 'value10', 100
+put 'test','11', 'cf:a', 'value11', 100
+put 'test','12', 'cf:a', 'value12', 100
+put 'test','13', 'cf:a', 'value13', 100
+put 'test','14', 'cf:a', 'value14', 100
+put 'test','15', 'cf:a', 'value15', 100
+put 'test','16', 'cf:a', 'value16', 100
+put 'test','17', 'cf:a', 'value17', 100
+put 'test','18', 'cf:a', 'value18', 100
+put 'test','19', 'cf:a', 'value19', 100
+put 'test','20', 'cf:a', 'value20', 100
+put 'test','21', 'cf:a', 'value21', 100
+put 'test','22', 'cf:a', 'value22', 100
+put 'test','23', 'cf:a', 'value23', 100
+put 'test','24', 'cf:a', 'value24', 100
+put 'test','25', 'cf:a', 'value25', 100
+put 'test','26', 'cf:a', 'value26', 100
+put 'test','27', 'cf:a', 'value27', 100
+put 'test','28', 'cf:a', 'value28', 100
+put 'test','29', 'cf:a', 'value29', 100
+put 'test','30', 'cf:a', 'value30', 100
+put 'test','31', 'cf:a', 'value31', 100
+put 'test','32', 'cf:a', 'value32', 100
+put 'test','33', 'cf:a', 'value33', 100
+put 'test','34', 'cf:a', 'value34', 100
+put 'test','35', 'cf:a', 'value35', 100
+put 'test','36', 'cf:a', 'value36', 100
+put 'test','37', 'cf:a', 'value37', 100
+put 'test','38', 'cf:a', 'value38', 100
+put 'test','39', 'cf:a', 'value39', 100
+put 'test','40', 'cf:a', 'value40', 100
+put 'test','41', 'cf:a', 'value41', 100
+put 'test','42', 'cf:a', 'value42', 100
+put 'test','43', 'cf:a', 'value43', 100
+put 'test','44', 'cf:a', 'value44', 100
+put 'test','45', 'cf:a', 'value45', 100
+put 'test','46', 'cf:a', 'value46', 100
+put 'test','47', 'cf:a', 'value47', 100
+put 'test','48', 'cf:a', 'value48', 100
+put 'test','49', 'cf:a', 'value49', 100
+put 'test','50', 'cf:a', 'value50', 100
+put 'test','51', 'cf:a', 'value51', 100
+put 'test','52', 'cf:a', 'value52', 100
+put 'test','53', 'cf:a', 'value53', 100
+put 'test','54', 'cf:a', 'value54', 100
+put 'test','55', 'cf:a', 'value55', 100
+put 'test','56', 'cf:a', 'value56', 100
+put 'test','57', 'cf:a', 'value57', 100
+put 'test','58', 'cf:a', 'value58', 100
+put 'test','59', 'cf:a', 'value59', 100
+put 'test','60', 'cf:a', 'value60', 100
+put 'test','61', 'cf:a', 'value61', 100
+put 'test','62', 'cf:a', 'value62', 100
+put 'test','63', 'cf:a', 'value63', 100
+put 'test','64', 'cf:a', 'value64', 100
+put 'test','65', 'cf:a', 'value65', 100
+put 'test','66', 'cf:a', 'value66', 100
+put 'test','67', 'cf:a', 'value67', 100
+put 'test','68', 'cf:a', 'value68', 100
+put 'test','69', 'cf:a', 'value69', 100
+put 'test','70', 'cf:a', 'value70', 100
+put 'test','71', 'cf:a', 'value71', 100
+put 'test','72', 'cf:a', 'value72', 100
+put 'test','73', 'cf:a', 'value73', 100
+put 'test','74', 'cf:a', 'value74', 100
+put 'test','75', 'cf:a', 'value75', 100
+put 'test','76', 'cf:a', 'value76', 100
+put 'test','77', 'cf:a', 'value77', 100
+put 'test','78', 'cf:a', 'value78', 100
+put 'test','79', 'cf:a', 'value79', 100
+put 'test','80', 'cf:a', 'value80', 100
+put 'test','81', 'cf:a', 'value81', 100
+put 'test','82', 'cf:a', 'value82', 100
+put 'test','83', 'cf:a', 'value83', 100
+put 'test','84', 'cf:a', 'value84', 100
+put 'test','85', 'cf:a', 'value85', 100
+put 'test','86', 'cf:a', 'value86', 100
+put 'test','87', 'cf:a', 'value87', 100
+put 'test','88', 'cf:a', 'value88', 100
+put 'test','89', 'cf:a', 'value89', 100
+put 'test','90', 'cf:a', 'value90', 100
+put 'test','91', 'cf:a', 'value91', 100
+put 'test','92', 'cf:a', 'value92', 100
+put 'test','93', 'cf:a', 'value93', 100
+put 'test','94', 'cf:a', 'value94', 100
+put 'test','95', 'cf:a', 'value95', 100
+put 'test','96', 'cf:a', 'value96', 100
+put 'test','97', 'cf:a', 'value97', 100
+put 'test','98', 'cf:a', 'value98', 100
+put 'test','99', 'cf:a', 'value99', 100
+put 'test','100', 'cf:a', 'value100', 100
+snapshot 'test', 'test-snapshot'
+list_snapshots
+
+
+////////////////////Run from Unix shell on HBase master node//////////////////
+// Export the snapshot
+hbase org.apache.hadoop.hbase.snapshot.ExportSnapshot -snapshot test-snapshot -copy-to /integration-test/data -mappers 16
+
+// Create the hashes for the table. Run the command from unix shell on an HBase
+// node.
+hbase org.apache.hadoop.hbase.mapreduce.HashTable --batchsize=10 --numhashfiles=10 test /integration-test/hashtable
+
+// Export the data into GCS. Replace <your-bucket> with the name of your GCS bucket.
+hadoop fs -copyToLocal /integration-test /tmp/
+gsutil cp -r /tmp/integration-test gs://<your-bucket>/
+
+// GCS bucket should look like this:
+$ gsutil ls gs://<your-bucket>/integration-test/data
+gs://<your-bucket>/integration-test/data/
+gs://<your-bucket>/integration-test/data/.hbase-snapshot/
+gs://<your-bucket>/integration-test/data/archive/
+$ gsutil ls gs://<your-bucket>/integration-test/hashtable
+gs://<your-bucket>/integration-test/hashtable/manifest
+gs://<your-bucket>/integration-test/hashtable/partitions
+gs://<your-bucket>/integration-test/hashtable/hashes/
+
+// Run from HBase shell. Run `hbase shell` from unix terminal on HBase master.
+// clean up the table
+disable 'test'
+drop 'test'
+exit
diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/data/.hbase-snapshot/test-snapshot/.snapshotinfo b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/data/.hbase-snapshot/test-snapshot/.snapshotinfo
new file mode 100644
index 0000000000..03ac02e452
--- /dev/null
+++ b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/data/.hbase-snapshot/test-snapshot/.snapshotinfo
@@ -0,0 +1,2 @@
+
+
test-snapshottestϹ���. (@���������
\ No newline at end of file
diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/data/.hbase-snapshot/test-snapshot/data.manifest b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/data/.hbase-snapshot/test-snapshot/data.manifest
new file mode 100644
index 0000000000..6439f06130
Binary files /dev/null and b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/data/.hbase-snapshot/test-snapshot/data.manifest differ
diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/data/archive/data/default/test/01340515889e8ec5014bbdbfa4fd4689/cf/0ad53893d268478f9b2484cbb6016d9b b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/data/archive/data/default/test/01340515889e8ec5014bbdbfa4fd4689/cf/0ad53893d268478f9b2484cbb6016d9b
new file mode 100644
index 0000000000..1b91b948d8
Binary files /dev/null and b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/data/archive/data/default/test/01340515889e8ec5014bbdbfa4fd4689/cf/0ad53893d268478f9b2484cbb6016d9b differ
diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/data/archive/data/default/test/156b320f3ebe472a1ae56a2f6930a676/cf/9926df0da08b4f51a33517afb040f82d b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/data/archive/data/default/test/156b320f3ebe472a1ae56a2f6930a676/cf/9926df0da08b4f51a33517afb040f82d
new file mode 100644
index 0000000000..951eb512ac
Binary files /dev/null and b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/data/archive/data/default/test/156b320f3ebe472a1ae56a2f6930a676/cf/9926df0da08b4f51a33517afb040f82d differ
diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/data/archive/data/default/test/313460ce1b714784d36c64bcd01f9e2c/cf/966e85699fdd4680a8c6fbf4b41b6e4b b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/data/archive/data/default/test/313460ce1b714784d36c64bcd01f9e2c/cf/966e85699fdd4680a8c6fbf4b41b6e4b
new file mode 100644
index 0000000000..dc89f02ec2
Binary files /dev/null and b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/data/archive/data/default/test/313460ce1b714784d36c64bcd01f9e2c/cf/966e85699fdd4680a8c6fbf4b41b6e4b differ
diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/data/archive/data/default/test/3bfc13b0a9bf8148a91788a8d2b60117/cf/bab07e8089634e629a4c111ea2b415fe b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/data/archive/data/default/test/3bfc13b0a9bf8148a91788a8d2b60117/cf/bab07e8089634e629a4c111ea2b415fe
new file mode 100644
index 0000000000..c7fb208f72
Binary files /dev/null and b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/data/archive/data/default/test/3bfc13b0a9bf8148a91788a8d2b60117/cf/bab07e8089634e629a4c111ea2b415fe differ
diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/data/archive/data/default/test/5bc31088b2daee7903f5b3d3a52f7ebf/cf/7fef5694213b4be0ad79f79c45200c2d b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/data/archive/data/default/test/5bc31088b2daee7903f5b3d3a52f7ebf/cf/7fef5694213b4be0ad79f79c45200c2d
new file mode 100644
index 0000000000..7638f6eabb
Binary files /dev/null and b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/data/archive/data/default/test/5bc31088b2daee7903f5b3d3a52f7ebf/cf/7fef5694213b4be0ad79f79c45200c2d differ
diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/data/archive/data/default/test/7c4a9137853573c8d671264dc0b31f89/cf/f8d40658d79b4a7191f21bcf14ae289b b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/data/archive/data/default/test/7c4a9137853573c8d671264dc0b31f89/cf/f8d40658d79b4a7191f21bcf14ae289b
new file mode 100644
index 0000000000..c6ba1f760b
Binary files /dev/null and b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/data/archive/data/default/test/7c4a9137853573c8d671264dc0b31f89/cf/f8d40658d79b4a7191f21bcf14ae289b differ
diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/data/archive/data/default/test/818d6b145a50cfc3bf8ee865486fdda3/cf/afe596ef5c61440983da2dcb54d581ab b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/data/archive/data/default/test/818d6b145a50cfc3bf8ee865486fdda3/cf/afe596ef5c61440983da2dcb54d581ab
new file mode 100644
index 0000000000..5a757daec8
Binary files /dev/null and b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/data/archive/data/default/test/818d6b145a50cfc3bf8ee865486fdda3/cf/afe596ef5c61440983da2dcb54d581ab differ
diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/data/archive/data/default/test/8c2101799fadc18613082a495d11e4ea/cf/2c766f1fc8eb460dbfa9a3803138c9b2 b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/data/archive/data/default/test/8c2101799fadc18613082a495d11e4ea/cf/2c766f1fc8eb460dbfa9a3803138c9b2
new file mode 100644
index 0000000000..d29619e3ec
Binary files /dev/null and b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/data/archive/data/default/test/8c2101799fadc18613082a495d11e4ea/cf/2c766f1fc8eb460dbfa9a3803138c9b2 differ
diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/data/archive/data/default/test/f1ef86b666a891d8c77f0eada4d1a15c/cf/e59edc08de6d441689288f04c7c0fe85 b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/data/archive/data/default/test/f1ef86b666a891d8c77f0eada4d1a15c/cf/e59edc08de6d441689288f04c7c0fe85
new file mode 100644
index 0000000000..337b5f9280
Binary files /dev/null and b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/data/archive/data/default/test/f1ef86b666a891d8c77f0eada4d1a15c/cf/e59edc08de6d441689288f04c7c0fe85 differ
diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/_SUCCESS b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/_SUCCESS
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00000/data b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00000/data
new file mode 100644
index 0000000000..26334294df
Binary files /dev/null and b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00000/data differ
diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00000/index b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00000/index
new file mode 100644
index 0000000000..f7ac1fc941
Binary files /dev/null and b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00000/index differ
diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00001/data b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00001/data
new file mode 100644
index 0000000000..87b715673c
Binary files /dev/null and b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00001/data differ
diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00001/index b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00001/index
new file mode 100644
index 0000000000..4edcbd1ed5
Binary files /dev/null and b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00001/index differ
diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00002/data b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00002/data
new file mode 100644
index 0000000000..4b59b346f0
Binary files /dev/null and b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00002/data differ
diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00002/index b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00002/index
new file mode 100644
index 0000000000..4169ee8258
Binary files /dev/null and b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00002/index differ
diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00003/data b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00003/data
new file mode 100644
index 0000000000..a05197b51d
Binary files /dev/null and b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00003/data differ
diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00003/index b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00003/index
new file mode 100644
index 0000000000..9228013bfa
Binary files /dev/null and b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00003/index differ
diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00004/data b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00004/data
new file mode 100644
index 0000000000..6e29b085e7
Binary files /dev/null and b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00004/data differ
diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00004/index b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00004/index
new file mode 100644
index 0000000000..245c2ceb3f
Binary files /dev/null and b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00004/index differ
diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00005/data b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00005/data
new file mode 100644
index 0000000000..40cbf30418
Binary files /dev/null and b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00005/data differ
diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00005/index b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00005/index
new file mode 100644
index 0000000000..dbbacaf8f0
Binary files /dev/null and b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00005/index differ
diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00006/data b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00006/data
new file mode 100644
index 0000000000..3f0e32269c
Binary files /dev/null and b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00006/data differ
diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00006/index b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00006/index
new file mode 100644
index 0000000000..a0818358eb
Binary files /dev/null and b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00006/index differ
diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00007/data b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00007/data
new file mode 100644
index 0000000000..effda57ece
Binary files /dev/null and b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00007/data differ
diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00007/index b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00007/index
new file mode 100644
index 0000000000..a8eb1a1748
Binary files /dev/null and b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00007/index differ
diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00008/data b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00008/data
new file mode 100644
index 0000000000..011b956c5f
Binary files /dev/null and b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00008/data differ
diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00008/index b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00008/index
new file mode 100644
index 0000000000..fada13a256
Binary files /dev/null and b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00008/index differ
diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00009/data b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00009/data
new file mode 100644
index 0000000000..f55fa79aca
Binary files /dev/null and b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00009/data differ
diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00009/index b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00009/index
new file mode 100644
index 0000000000..8c8793cef8
Binary files /dev/null and b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00009/index differ
diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/manifest b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/manifest
new file mode 100644
index 0000000000..a95421d027
--- /dev/null
+++ b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/manifest
@@ -0,0 +1,4 @@
+#Wed Dec 30 01:23:41 UTC 2020
+numHashFiles=10
+table=test
+targetBatchSize=10
diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/partitions b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/partitions
new file mode 100644
index 0000000000..1d447dd67a
Binary files /dev/null and b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/partitions differ
diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/hbasesnapshots/CleanupHBaseSnapshotRestoreFilesFnTest.java b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/hbasesnapshots/CleanupHBaseSnapshotRestoreFilesFnTest.java
new file mode 100644
index 0000000000..0183f856f1
--- /dev/null
+++ b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/hbasesnapshots/CleanupHBaseSnapshotRestoreFilesFnTest.java
@@ -0,0 +1,55 @@
+/*
+ * Copyright 2021 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package com.google.cloud.bigtable.beam.hbasesnapshots;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertThrows;
+
+import java.util.UUID;
+import org.junit.Test;
+
+/** Unit tests for the static path helpers of {@link CleanupHBaseSnapshotRestoreFilesFn}. */
+public class CleanupHBaseSnapshotRestoreFilesFnTest {
+  private static final String TEST_BUCKET_NAME = "test-bucket";
+  private static final String TEST_SNAPSHOT_PATH = "gs://" + TEST_BUCKET_NAME + "/hbase-export";
+  private static final String TEST_RESTORE_PATH =
+      HBaseSnapshotInputConfigBuilder.RESTORE_DIR + UUID.randomUUID();
+  // The restore path without its first character; this is the value getListPrefix is expected to
+  // return for TEST_RESTORE_PATH.
+  private static final String TEST_RESTORE_PREFIX = TEST_RESTORE_PATH.substring(1);
+
+  /** A gs:// snapshot path yields its bucket name; a bare bucket name is rejected. */
+  @Test
+  public void testGetWorkingBucketName() {
+    assertEquals(
+        TEST_BUCKET_NAME,
+        CleanupHBaseSnapshotRestoreFilesFn.getWorkingBucketName(TEST_SNAPSHOT_PATH));
+
+    assertThrows(
+        IllegalArgumentException.class,
+        () -> {
+          CleanupHBaseSnapshotRestoreFilesFn.getWorkingBucketName(TEST_BUCKET_NAME);
+        });
+  }
+
+  /** A restore path is converted to its list prefix; an already-stripped prefix is rejected. */
+  @Test
+  public void testGetListPrefix() {
+    assertEquals(
+        TEST_RESTORE_PREFIX, CleanupHBaseSnapshotRestoreFilesFn.getListPrefix(TEST_RESTORE_PATH));
+
+    // Exercise getListPrefix itself here (this previously re-tested getWorkingBucketName).
+    // NOTE(review): assumes getListPrefix validates its argument with IllegalArgumentException,
+    // like getWorkingBucketName does - confirm against the implementation.
+    assertThrows(
+        IllegalArgumentException.class,
+        () -> {
+          CleanupHBaseSnapshotRestoreFilesFn.getListPrefix(TEST_RESTORE_PREFIX);
+        });
+  }
+}
diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/hbasesnapshots/EndToEndIT.java b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/hbasesnapshots/EndToEndIT.java
new file mode 100644
index 0000000000..1a681a2e05
--- /dev/null
+++ b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/hbasesnapshots/EndToEndIT.java
@@ -0,0 +1,389 @@
+/*
+ * Copyright 2021 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package com.google.cloud.bigtable.beam.hbasesnapshots;
+
+import static com.google.common.base.Preconditions.checkNotNull;
+
+import com.google.api.services.storage.model.Objects;
+import com.google.bigtable.repackaged.com.google.gson.Gson;
+import com.google.cloud.bigtable.beam.hbasesnapshots.ImportJobFromHbaseSnapshot.ImportOptions;
+import com.google.cloud.bigtable.beam.sequencefiles.HBaseResultToMutationFn;
+import com.google.cloud.bigtable.beam.validation.HadoopHashTableSource.RangeHash;
+import com.google.cloud.bigtable.beam.validation.SyncTableJob;
+import com.google.cloud.bigtable.beam.validation.SyncTableJob.SyncTableOptions;
+import com.google.cloud.bigtable.hbase.BigtableConfiguration;
+import com.google.cloud.bigtable.hbase.BigtableOptionsFactory;
+import java.io.BufferedReader;
+import java.io.ByteArrayInputStream;
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.nio.ByteBuffer;
+import java.nio.channels.Channels;
+import java.nio.charset.StandardCharsets;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+import java.util.Map;
+import java.util.UUID;
+import java.util.stream.Collectors;
+import java.util.stream.StreamSupport;
+import org.apache.beam.runners.dataflow.DataflowRunner;
+import org.apache.beam.runners.dataflow.options.DataflowPipelineOptions;
+import org.apache.beam.sdk.PipelineResult;
+import org.apache.beam.sdk.PipelineResult.State;
+import org.apache.beam.sdk.extensions.gcp.options.GcpOptions;
+import org.apache.beam.sdk.extensions.gcp.util.GcsUtil;
+import org.apache.beam.sdk.extensions.gcp.util.gcsfs.GcsPath;
+import org.apache.beam.sdk.metrics.MetricQueryResults;
+import org.apache.beam.sdk.options.PipelineOptionsFactory;
+import org.apache.beam.sdk.options.ValueProvider.StaticValueProvider;
+import org.apache.hadoop.hbase.Cell;
+import org.apache.hadoop.hbase.CellUtil;
+import org.apache.hadoop.hbase.HColumnDescriptor;
+import org.apache.hadoop.hbase.HConstants;
+import org.apache.hadoop.hbase.HTableDescriptor;
+import org.apache.hadoop.hbase.TableName;
+import org.apache.hadoop.hbase.client.Connection;
+import org.apache.hadoop.hbase.client.Delete;
+import org.apache.hadoop.hbase.client.Get;
+import org.apache.hadoop.hbase.client.Put;
+import org.apache.hadoop.hbase.client.Table;
+import org.apache.hadoop.hbase.snapshot.SnapshotTestingUtils;
+import org.junit.After;
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/*
+ * End to end integration test for the pipeline that imports HBase snapshot data into Cloud
+ * Bigtable and validates the imported data with SyncTable.
+ * Prepare test data with gsutil (https://cloud.google.com/storage/docs/quickstart-gsutil):
+ * gsutil -m cp -r <repo-root>/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test \
+ * gs://<your-bucket>/
+ *
+ * Set up GCP credentials: https://cloud.google.com/docs/authentication
+ * Ensure your credentials have access to Bigtable and Dataflow.
+ *
+ * Run with (all five system properties are required by setup()):
+ * mvn integration-test -PhbasesnapshotsIntegrationTest \
+ * -Dgoogle.bigtable.project.id=<project-id> \
+ * -Dgoogle.bigtable.instance.id=<instance-id> \
+ * -Dgoogle.dataflow.stagingLocation=gs://<your-bucket>/staging \
+ * -Dcloud.test.data.folder=gs://<your-bucket>/integration-test/ \
+ * -Dregion=<dataflow-region>
+ */
+public class EndToEndIT {
+
+  private static final Logger LOG = LoggerFactory.getLogger(EndToEndIT.class);
+  private static final String TEST_SNAPSHOT_NAME = "test-snapshot";
+  // Location of test data hosted on Google Cloud Storage, for on-cloud dataflow tests.
+  private static final String CLOUD_TEST_DATA_FOLDER = "cloud.test.data.folder";
+  // Name of the system property holding the region set on the Dataflow pipeline options.
+  private static final String DATAFLOW_REGION = "region";
+
+  // Column family name used in all test bigtables.
+  private static final String CF = "cf";
+
+  // Full path of the Cloud Storage folder where dataflow jars are uploaded to.
+  private static final String GOOGLE_DATAFLOW_STAGING_LOCATION = "google.dataflow.stagingLocation";
+
+  // Cloud Storage URIs always use "/", independent of the local platform's file separator.
+  private static final String GCS_PATH_SEPARATOR = "/";
+
+  private Connection connection;
+  private String projectId;
+  private String instanceId;
+  private String tableId;
+  private String region;
+
+  private GcsUtil gcsUtil;
+  private String dataflowStagingLocation;
+  private String workDir;
+  private byte[][] keySplits;
+
+  // Snapshot data setup
+  private String hbaseSnapshotDir;
+  private String hashDir;
+  private String syncTableOutputDir;
+
+  /**
+   * Reads the required system properties, derives the Cloud Storage locations used by the test and
+   * creates a fresh, uniquely named Bigtable table with the {@code cf} column family.
+   *
+   * @throws Exception if a required property is missing or the table cannot be created
+   */
+  @Before
+  public void setup() throws Exception {
+    projectId = getTestProperty(BigtableOptionsFactory.PROJECT_ID_KEY);
+    instanceId = getTestProperty(BigtableOptionsFactory.INSTANCE_ID_KEY);
+    dataflowStagingLocation = getTestProperty(GOOGLE_DATAFLOW_STAGING_LOCATION);
+    region = getTestProperty(DATAFLOW_REGION);
+    String cloudTestDataFolder = withTrailingSlash(getTestProperty(CLOUD_TEST_DATA_FOLDER));
+
+    hbaseSnapshotDir = cloudTestDataFolder + "data/";
+    hashDir = cloudTestDataFolder + "hashtable/";
+
+    // A random sub-folder per test run; it always ends with "/".
+    UUID testUuid = UUID.randomUUID();
+    syncTableOutputDir =
+        withTrailingSlash(dataflowStagingLocation) + "sync-table-output/" + testUuid + "/";
+
+    // Cloud Storage config
+    GcpOptions gcpOptions = PipelineOptionsFactory.create().as(GcpOptions.class);
+    gcpOptions.setProject(projectId);
+    gcsUtil = new GcsUtil.GcsUtilFactory().create(gcpOptions);
+
+    // Bigtable config
+    connection = BigtableConfiguration.connect(projectId, instanceId);
+    tableId = "test_" + UUID.randomUUID().toString();
+
+    LOG.info("Setting up integration tests");
+
+    // NOTE(review): keySplits is populated here but never passed to createTable below, which uses
+    // SnapshotTestingUtils.getSplitKeys() instead - confirm which set of splits is intended.
+    String[] keys = new String[] {"1", "2", "3", "4", "5", "6", "7", "8", "9"};
+    keySplits = new byte[keys.length][];
+    for (int i = 0; i < keys.length; i++) {
+      keySplits[i] = keys[i].getBytes();
+    }
+
+    // Create table in Bigtable
+    TableName tableName = TableName.valueOf(tableId);
+    HTableDescriptor descriptor = new HTableDescriptor(tableName);
+    descriptor.addFamily(new HColumnDescriptor(CF));
+    connection.getAdmin().createTable(descriptor, SnapshotTestingUtils.getSplitKeys());
+  }
+
+  /** Returns the system property {@code name}, failing with a clear message when it is unset. */
+  private static String getTestProperty(String name) {
+    return checkNotNull(System.getProperty(name), "Required property missing: " + name);
+  }
+
+  /** Returns {@code gcsFolder} unchanged if it ends with "/", otherwise with "/" appended. */
+  private static String withTrailingSlash(String gcsFolder) {
+    return gcsFolder.endsWith(GCS_PATH_SEPARATOR) ? gcsFolder : gcsFolder + GCS_PATH_SEPARATOR;
+  }
+
+  /**
+   * Removes the SyncTable output files of this run, deletes the test table and closes the Bigtable
+   * connection. Table deletion and connection close run even if the Cloud Storage cleanup fails.
+   *
+   * @throws IOException if listing/removing the output files or deleting the table fails
+   */
+  @After
+  public void teardown() throws IOException {
+    try {
+      // syncTableOutputDir already ends with "/", so only the wildcard is appended; this is the
+      // same glob readMismatchesFromOutputFiles() uses to find the files.
+      final List<GcsPath> paths = gcsUtil.expand(GcsPath.fromUri(syncTableOutputDir + "*"));
+
+      if (!paths.isEmpty()) {
+        final List<String> pathStrs = new ArrayList<>();
+
+        for (GcsPath path : paths) {
+          pathStrs.add(path.toString());
+        }
+        // TODO: cleanup fails when tests time out. Add a orphan cleaner in the setup()
+        // https://github.com/googleapis/java-bigtable/blob/35588d89b9b243eb691a29d3aff16b9f5a08fbb8/google-cloud-bigtable/src/test/java/com/google/cloud/bigtable/test_helpers/env/AbstractTestEnv.java#L108-L119
+        this.gcsUtil.remove(pathStrs);
+      }
+    } finally {
+      try {
+        // delete test table, reusing the connection opened in setup()
+        connection.getAdmin().deleteTable(TableName.valueOf(tableId));
+      } finally {
+        connection.close();
+      }
+    }
+  }
+
+  /**
+   * Builds the options for a SyncTable Dataflow job that compares the test table with the hashes
+   * under {@code hashDir} and writes its output under {@code syncTableOutputDir}.
+   */
+  private SyncTableOptions createSyncTableOptions() {
+    DataflowPipelineOptions syncTableOpts =
+        PipelineOptionsFactory.as(DataflowPipelineOptions.class);
+    syncTableOpts.setRunner(DataflowRunner.class);
+    syncTableOpts.setGcpTempLocation(dataflowStagingLocation);
+    syncTableOpts.setNumWorkers(1);
+    syncTableOpts.setProject(projectId);
+    syncTableOpts.setRegion(region);
+
+    SyncTableOptions syncOpts = syncTableOpts.as(SyncTableOptions.class);
+    // Setup Bigtable params
+    syncOpts.setBigtableProject(StaticValueProvider.of(projectId));
+    syncOpts.setBigtableInstanceId(StaticValueProvider.of(instanceId));
+    syncOpts.setBigtableTableId(StaticValueProvider.of(tableId));
+    syncOpts.setBigtableAppProfileId(null);
+
+    // Setup Hashes
+    syncOpts.setHashTableOutputDir(StaticValueProvider.of(hashDir));
+    syncOpts.setOutputPrefix(StaticValueProvider.of(syncTableOutputDir));
+    return syncOpts;
+  }
+
+  /**
+   * Builds the options for a Dataflow job that imports the {@code test-snapshot} snapshot found
+   * under {@code hbaseSnapshotDir} into the test table.
+   */
+  private ImportOptions createImportOptions() {
+    DataflowPipelineOptions importPipelineOpts =
+        PipelineOptionsFactory.as(DataflowPipelineOptions.class);
+    importPipelineOpts.setRunner(DataflowRunner.class);
+    importPipelineOpts.setGcpTempLocation(dataflowStagingLocation);
+    importPipelineOpts.setNumWorkers(1);
+    importPipelineOpts.setProject(projectId);
+    importPipelineOpts.setRegion(region);
+
+    ImportOptions importOpts = importPipelineOpts.as(ImportOptions.class);
+
+    // setup Bigtable options
+    importOpts.setBigtableProject(StaticValueProvider.of(projectId));
+    importOpts.setBigtableInstanceId(StaticValueProvider.of(instanceId));
+    importOpts.setBigtableTableId(StaticValueProvider.of(tableId));
+
+    // setup HBase snapshot info
+    importOpts.setHbaseSnapshotSourceDir(hbaseSnapshotDir);
+    importOpts.setSnapshotName(TEST_SNAPSHOT_NAME);
+    return importOpts;
+  }
+
+  /** Returns the attempted value of every counter reported by {@code result}, keyed by name. */
+  private Map<String, Long> getCountMap(PipelineResult result) {
+    MetricQueryResults metrics = result.metrics().allMetrics();
+    return StreamSupport.stream(metrics.getCounters().spliterator(), false)
+        .collect(Collectors.toMap((m) -> m.getName().getName(), (m) -> m.getAttempted()));
+  }
+
+  /**
+   * Reads the output of SyncTable job and returns a list of mismatched RangeHashes.
+   *
+   * @return one {@link RangeHash} per line found in the files under {@code syncTableOutputDir}
+   * @throws IOException if an output file cannot be listed or read
+   */
+  private List<RangeHash> readMismatchesFromOutputFiles() throws IOException {
+    Gson gson = new Gson();
+    // Find output files
+    List<GcsPath> outputFiles = gcsUtil.expand(GcsPath.fromUri(syncTableOutputDir + "*"));
+    List<RangeHash> rangeHashes = new ArrayList<>();
+
+    // Read each file line by line and create a RangeHash from it.
+    for (GcsPath outputFile : outputFiles) {
+      // Stream the channel so the whole file is consumed (a single channel read may return fewer
+      // bytes than requested); closing the reader also closes the underlying channel.
+      try (BufferedReader reader =
+          new BufferedReader(
+              new InputStreamReader(
+                  Channels.newInputStream(gcsUtil.open(outputFile)), StandardCharsets.UTF_8))) {
+        String serializedRangeHash;
+        while ((serializedRangeHash = reader.readLine()) != null) {
+          try {
+            rangeHashes.add(gson.fromJson(serializedRangeHash.trim(), RangeHash.class));
+          } catch (RuntimeException e) {
+            LOG.error("Failed to parse JSON: [{}]", serializedRangeHash, e);
+            throw e;
+          }
+        }
+      }
+    }
+    return rangeHashes;
+  }
+
+  // Asserts that all the rowKeys belong in mismatches.
+  // Throws AssertionError otherwise.
+  private void validateRowInRangeHashes(List<byte[]> rowKeys, Iterable<RangeHash> mismatches) {
+    for (byte[] mismatchedRowKey : rowKeys) {
+      Assert.assertTrue(containsRow(mismatchedRowKey, mismatches));
+    }
+  }
+
+  // Returns true if the rowKey belongs in one of the ranges contained in rangeHashes.
+  private boolean containsRow(byte[] rowKey, Iterable<RangeHash> rangeHashes) {
+    for (RangeHash mismatchedRange : rangeHashes) {
+      // TODO: There maybe a better Range.belongs() utility function somewhere?
+      // Empty start/end key means that there is no start/end key.
+      boolean afterStart =
+          mismatchedRange.startInclusive.equals(HConstants.EMPTY_BYTE_ARRAY)
+              || mismatchedRange.startInclusive.compareTo(rowKey) <= 0;
+      boolean beforeStop =
+          mismatchedRange.stopExclusive.equals(HConstants.EMPTY_BYTE_ARRAY)
+              || mismatchedRange.stopExclusive.compareTo(rowKey) > 0;
+      if (afterStart && beforeStop) {
+        return true;
+      }
+    }
+    return false;
+  }
+
+  /**
+   * Imports the snapshot into Bigtable and validates with SyncTable that the imported data matches
+   * the source hashes exactly.
+   */
+  @Test
+  public void testHBaseSnapshotImport() throws Exception {
+
+    // Start import
+    ImportOptions importOpts = createImportOptions();
+
+    // run pipeline
+    State state = ImportJobFromHbaseSnapshot.buildPipeline(importOpts).run().waitUntilFinish();
+    Assert.assertEquals(State.DONE, state);
+
+    // check that the .restore dir used for temp files has been removed
+    Objects objects =
+        gcsUtil.listObjects(
+            GcsPath.fromUri(hbaseSnapshotDir).getBucket(),
+            CleanupHBaseSnapshotRestoreFilesFn.getListPrefix(
+                HBaseSnapshotInputConfigBuilder.RESTORE_DIR),
+            null);
+    Assert.assertNull(objects.getItems());
+
+    SyncTableOptions syncOpts = createSyncTableOptions();
+
+    PipelineResult result = SyncTableJob.buildPipeline(syncOpts).run();
+    state = result.waitUntilFinish();
+    Assert.assertEquals(State.DONE, state);
+
+    // Read the output files and validate that there are no mismatches.
+    Assert.assertEquals(0, readMismatchesFromOutputFiles().size());
+
+    // Validate the counters.
+    Map<String, Long> counters = getCountMap(result);
+    Assert.assertEquals(Long.valueOf(101L), counters.get("ranges_matched"));
+    Assert.assertEquals(Long.valueOf(0L), counters.get("ranges_not_matched"));
+  }
+
+  /**
+   * Introduces multiple corruptions in imported table and validates that sync-table can detect
+   * them.
+   */
+  @Test
+  public void testHBaseSnapshotImportWithCorruptions() throws Exception {
+    // Import snapshot
+    ImportOptions importOpts = createImportOptions();
+    State state = ImportJobFromHbaseSnapshot.buildPipeline(importOpts).run().waitUntilFinish();
+    Assert.assertEquals(State.DONE, state);
+
+    // Rows where corruptions will be added.
+    byte[] mismatchRowAtStart = "000".getBytes();
+    byte[] mismatchRowInMiddle = "24".getBytes();
+    byte[] mismatchRowDeleted = "64".getBytes();
+    byte[] mismatchRowAtTheEnd = "999".getBytes();
+
+    // Introduce corruptions to the data in Bigtable. Delete data from Bigtable to simulate Bigtable
+    // missing data. Add data to Bigtable to simulate extra data in Bigtable. It is easier to update
+    // Bigtable than change the snapshots.
+    Table table = connection.getTable(TableName.valueOf(tableId));
+    Cell cellInMiddle = table.get(new Get(mismatchRowInMiddle)).rawCells()[0];
+    List<Put> puts =
+        Arrays.asList(
+            // Add a row at the start
+            new Put(mismatchRowAtStart)
+                .addColumn(CF.getBytes(), "random_col".getBytes(), 1L, "value000".getBytes())
+                .addColumn(CF.getBytes(), "random_col".getBytes(), 2L, "value001".getBytes()),
+            // change a cell in middle; CellUtil.clone* copies exactly the row/family/qualifier
+            // bytes, whereas the Cell get*Array() accessors expose a backing array that must be
+            // read together with its offset and length.
+            new Put(CellUtil.cloneRow(cellInMiddle))
+                .addColumn(
+                    CellUtil.cloneFamily(cellInMiddle),
+                    CellUtil.cloneQualifier(cellInMiddle),
+                    cellInMiddle.getTimestamp(),
+                    "corrupted_val".getBytes()),
+            // add a new row in the end
+            new Put(mismatchRowAtTheEnd)
+                .addColumn(CF.getBytes(), "random_col".getBytes(), 100L, "value999".getBytes()));
+
+    table.put(puts);
+    // Delete a random row in the middle. We should see 4 ranges mismatch as table is split on
+    // 1,2...9. All the updates are happening on a different split.
+    table.delete(new Delete(mismatchRowDeleted));
+
+    // Run SyncTable job and expect 4 mismatches.
+    SyncTableOptions syncOpts = createSyncTableOptions();
+    PipelineResult result = SyncTableJob.buildPipeline(syncOpts).run();
+    state = result.waitUntilFinish();
+    Assert.assertEquals(State.DONE, state);
+
+    List<RangeHash> syncTableOutputMismatches = readMismatchesFromOutputFiles();
+    Assert.assertEquals(4, syncTableOutputMismatches.size());
+    validateRowInRangeHashes(
+        Arrays.asList(
+            mismatchRowAtStart, mismatchRowAtTheEnd, mismatchRowDeleted, mismatchRowInMiddle),
+        syncTableOutputMismatches);
+
+    // Assert that the output collection is the right one.
+    Map<String, Long> counters = getCountMap(result);
+    Assert.assertEquals(Long.valueOf(97L), counters.get("ranges_matched"));
+    Assert.assertEquals(Long.valueOf(4L), counters.get("ranges_not_matched"));
+  }
+}
diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/hbasesnapshots/HBaseSnapshotInputConfigBuilderTest.java b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/hbasesnapshots/HBaseSnapshotInputConfigBuilderTest.java
new file mode 100644
index 0000000000..579a57c238
--- /dev/null
+++ b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/hbasesnapshots/HBaseSnapshotInputConfigBuilderTest.java
@@ -0,0 +1,48 @@
+/*
+ * Copyright 2021 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package com.google.cloud.bigtable.beam.hbasesnapshots;
+
+import static org.junit.Assert.assertEquals;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.hbase.mapreduce.TableSnapshotInputFormat;
+import org.apache.hadoop.mapreduce.InputFormat;
+import org.junit.Test;
+
+public class HBaseSnapshotInputConfigBuilderTest {
+
+  private static final String TEST_PROJECT = "test_project";
+  private static final String TEST_SNAPSHOT_DIR = "gs://test-bucket/hbase-export";
+  private static final String TEST_SNAPSHOT_NAME = "test_snapshot";
+
+  @Test
+  public void testBuildingHBaseSnapshotInputConfigBuilder() {
+    Configuration conf =
+        new HBaseSnapshotInputConfigBuilder()
+            .setProjectId(TEST_PROJECT)
+            .setHbaseSnapshotSourceDir(TEST_SNAPSHOT_DIR)
+            .setSnapshotName(TEST_SNAPSHOT_NAME)
+            .createHBaseConfiguration();
+    assertEquals(
+        "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS", conf.get("fs.AbstractFileSystem.gs.impl"));
+    assertEquals(TEST_PROJECT, conf.get("fs.gs.project.id"));
+    assertEquals(TEST_SNAPSHOT_DIR, conf.get("hbase.rootdir"));
+    // Use a null default so this assertion fails if the builder leaves the key unset.
+    assertEquals(
+        TableSnapshotInputFormat.class,
+        conf.getClass("mapreduce.job.inputformat.class", null, InputFormat.class));
+  }
+}
diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/it/CloudBigtableBeamITTest.java b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/it/CloudBigtableBeamITTest.java
index d2a095a5e3..fd9909f37f 100644
--- a/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/it/CloudBigtableBeamITTest.java
+++ b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/it/CloudBigtableBeamITTest.java
@@ -102,13 +102,13 @@ public class CloudBigtableBeamITTest {
private final Log LOG = LogFactory.getLog(getClass());
- private static final String STAGING_LOCATION_KEY = "dataflowStagingLocation";
- private static final String ZONE_ID_KEY = "dataflowZoneId";
+ private static final String STAGING_LOCATION_KEY = "google.dataflow.stagingLocation";
+ private static final String REGION_KEY = "region";
private static final String projectId = System.getProperty(PROJECT_ID_KEY);
private static final String instanceId = System.getProperty(INSTANCE_ID_KEY);
private static final String stagingLocation = System.getProperty(STAGING_LOCATION_KEY);
- private static final String zoneId = System.getProperty(ZONE_ID_KEY);
+ private static final String region = System.getProperty(REGION_KEY);
private static final String workerMachineType =
System.getProperty("workerMachineType", "n1" + "-standard-8");
@@ -129,7 +129,7 @@ public class CloudBigtableBeamITTest {
@BeforeClass
public static void setUpConfiguration() {
Preconditions.checkArgument(stagingLocation != null, "Set -D" + STAGING_LOCATION_KEY + ".");
- Preconditions.checkArgument(zoneId != null, "Set -D" + ZONE_ID_KEY + ".");
+ Preconditions.checkArgument(region != null, "Set -D" + REGION_KEY + ".");
Preconditions.checkArgument(projectId != null, "Set -D" + PROJECT_ID_KEY + ".");
Preconditions.checkArgument(instanceId != null, "Set -D" + INSTANCE_ID_KEY + ".");
}
@@ -255,7 +255,7 @@ private static byte[] createRandomValue() {
private DataflowPipelineOptions createOptions() {
DataflowPipelineOptions options = PipelineOptionsFactory.as(DataflowPipelineOptions.class);
options.setProject(projectId);
- options.setZone(zoneId);
+ options.setRegion(region);
options.setStagingLocation(stagingLocation + "/stage");
options.setTempLocation(stagingLocation + "/temp");
options.setRunner(DataflowRunner.class);
diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/sequencefiles/EndToEndIT.java b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/sequencefiles/EndToEndIT.java
index 8f5cd823c7..1958e04307 100644
--- a/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/sequencefiles/EndToEndIT.java
+++ b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/sequencefiles/EndToEndIT.java
@@ -55,6 +55,7 @@
public class EndToEndIT {
// Location of test data hosted on Google Cloud Storage, for on-cloud dataflow tests.
private static final String CLOUD_TEST_DATA_FOLDER = "cloud.test.data.folder";
+ private static final String DATAFLOW_REGION = "region";
// Column family name used in all test bigtables.
private static final String CF = "column_family";
@@ -66,6 +67,7 @@ public class EndToEndIT {
private String projectId;
private String instanceId;
private String tableId;
+ private String region;
private GcsUtil gcsUtil;
private String cloudTestDataFolder;
@@ -76,7 +78,7 @@ public class EndToEndIT {
public void setup() throws Exception {
projectId = getTestProperty(BigtableOptionsFactory.PROJECT_ID_KEY);
instanceId = getTestProperty(BigtableOptionsFactory.INSTANCE_ID_KEY);
-
+ region = getTestProperty(DATAFLOW_REGION);
dataflowStagingLocation = getTestProperty(GOOGLE_DATAFLOW_STAGING_LOCATION);
cloudTestDataFolder = getTestProperty(CLOUD_TEST_DATA_FOLDER);
@@ -152,6 +154,7 @@ public void testExportImport() throws Exception {
pipelineOpts.setGcpTempLocation(dataflowStagingLocation);
pipelineOpts.setNumWorkers(1);
pipelineOpts.setProject(projectId);
+ pipelineOpts.setRegion(region);
ExportOptions exportOpts = pipelineOpts.as(ExportOptions.class);
exportOpts.setBigtableInstanceId(StaticValueProvider.of(instanceId));
@@ -172,6 +175,7 @@ public void testExportImport() throws Exception {
PipelineOptionsFactory.as(DataflowPipelineOptions.class);
createTablePipelineOpts.setRunner(DataflowRunner.class);
createTablePipelineOpts.setProject(projectId);
+ createTablePipelineOpts.setRegion(region);
CreateTableHelper.CreateTableOpts createOpts =
createTablePipelineOpts.as(CreateTableHelper.CreateTableOpts.class);
@@ -188,6 +192,7 @@ public void testExportImport() throws Exception {
importPipelineOpts.setGcpTempLocation(dataflowStagingLocation);
importPipelineOpts.setNumWorkers(1);
importPipelineOpts.setProject(projectId);
+ importPipelineOpts.setRegion(region);
ImportJob.ImportOptions importOpts = importPipelineOpts.as(ImportJob.ImportOptions.class);
importOpts.setBigtableProject(StaticValueProvider.of(projectId));
diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/validation/BufferedHadoopHashTableSourceTest.java b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/validation/BufferedHadoopHashTableSourceTest.java
new file mode 100644
index 0000000000..96d5960423
--- /dev/null
+++ b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/validation/BufferedHadoopHashTableSourceTest.java
@@ -0,0 +1,162 @@
+/*
+ * Copyright 2021 Google Inc. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package com.google.cloud.bigtable.beam.validation;
+
+import static org.junit.Assert.assertEquals;
+
+import com.google.cloud.bigtable.beam.validation.HadoopHashTableSource.RangeHash;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+import org.apache.beam.sdk.options.ValueProvider.StaticValueProvider;
+import org.apache.beam.sdk.testing.SourceTestUtils;
+import org.apache.beam.sdk.values.KV;
+import org.apache.hadoop.hbase.HConstants;
+import org.apache.hadoop.hbase.client.Scan;
+import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
+import org.apache.hadoop.hbase.util.Bytes;
+import org.junit.Before;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.JUnit4;
+
+@RunWith(JUnit4.class)
+public class BufferedHadoopHashTableSourceTest {
+
+  private BufferedHadoopHashTableSource bufferedSource;
+  private FakeTableHashWrapper fakeTableHashWrapper;
+
+  private static final String HASH_TABLE_OUTPUT_PATH_DIR = "gs://my-bucket/outputDir";
+  private static final ImmutableBytesWritable START_ROW =
+      new ImmutableBytesWritable("AAAA".getBytes());
+  private static final ImmutableBytesWritable STOP_ROW =
+      new ImmutableBytesWritable("ZZZZ".getBytes());
+  private static final ImmutableBytesWritable POST_STOP_ROW =
+      new ImmutableBytesWritable("z".getBytes()); // Lowercase z is lexicographically > uppercase Z
+  private static final ImmutableBytesWritable EMPTY_ROW =
+      new ImmutableBytesWritable(HConstants.EMPTY_BYTE_ARRAY);
+  private static final ImmutableBytesWritable START_HASH =
+      new ImmutableBytesWritable("START-HASH".getBytes());
+  private static final int BATCH_SIZE = 5;
+
+  @Before
+  public void setUp() throws Exception {
+    fakeTableHashWrapper =
+        new FakeTableHashWrapper(
+            START_ROW, STOP_ROW, new ArrayList<>(), new ArrayList<>(), new Scan());
+    bufferedSource =
+        new BufferedHadoopHashTableSource(
+            new HadoopHashTableSource(
+                StaticValueProvider.of("cbt-dev"),
+                StaticValueProvider.of(HASH_TABLE_OUTPUT_PATH_DIR),
+                START_ROW,
+                STOP_ROW,
+                new FakeTableHashWrapperFactory(fakeTableHashWrapper)),
+            BATCH_SIZE);
+  }
+
+  protected static ImmutableBytesWritable getKey(int keyIndex) {
+    return new ImmutableBytesWritable(("KEY-" + keyIndex).getBytes());
+  }
+
+  protected static ImmutableBytesWritable getHash(int hashIndex) {
+    return new ImmutableBytesWritable(("HASH-" + hashIndex).getBytes());
+  }
+
+  /**
+   * Populates fakeTableHashWrapper with {@code numEntries} hash entries, the first keyed by
+   * {@code startRow}, and returns the expected batches: RangeHashes in groups of up to BATCH_SIZE,
+   * keyed by the batch's first start row. numEntries=1 yields (startRow, stopRow, START_HASH).
+   */
+  protected List<KV<String, List<RangeHash>>> setupTestData(
+      ImmutableBytesWritable startRow, ImmutableBytesWritable stopRow, int numEntries) {
+    fakeTableHashWrapper.startRowInclusive = startRow;
+    fakeTableHashWrapper.stopRowExclusive = stopRow;
+    fakeTableHashWrapper.hashes.add(KV.of(startRow, START_HASH));
+    for (int i = 0; i < numEntries - 1; i++) {
+      fakeTableHashWrapper.hashes.add(KV.of(getKey(i), getHash(i)));
+    }
+
+    List<KV<String, List<RangeHash>>> out = new ArrayList<>();
+    // Setup RangeHashes to be returned
+    List<RangeHash> expectedRangeHashes = new ArrayList<>();
+    ImmutableBytesWritable key = startRow;
+    ImmutableBytesWritable hash = START_HASH;
+    for (int i = 0; i < numEntries - 1; i++) {
+      expectedRangeHashes.add(RangeHash.of(key, getKey(i), hash));
+      key = getKey(i);
+      hash = getHash(i);
+      if (expectedRangeHashes.size() % BATCH_SIZE == 0) {
+        out.add(
+            KV.of(
+                Bytes.toStringBinary(expectedRangeHashes.get(0).startInclusive.copyBytes()),
+                expectedRangeHashes));
+        expectedRangeHashes = new ArrayList<>();
+      }
+    }
+    // Process the last range, which always ends at stopRow
+    expectedRangeHashes.add(RangeHash.of(key, stopRow, hash));
+    // Finalize the last batch
+    out.add(
+        KV.of(
+            Bytes.toStringBinary(expectedRangeHashes.get(0).startInclusive.copyBytes()),
+            expectedRangeHashes));
+
+    return out;
+  }
+
+  @Test
+  public void testHashReaderEmpty() throws IOException {
+    // The tableHashWrapper has no hashes, this should result in empty source.
+    assertEquals(Arrays.asList(), SourceTestUtils.readFromSource(bufferedSource, null));
+  }
+
+  @Test
+  public void testHashReaderPartialBuffer() throws IOException {
+    // Setup 4 entries (fewer than BATCH_SIZE), so a single partial batch is expected.
+    List<KV<String, List<RangeHash>>> expected = setupTestData(START_ROW, STOP_ROW, 4);
+    assertEquals(expected, SourceTestUtils.readFromSource(bufferedSource, null));
+  }
+
+  @Test
+  public void testHashReaderMultipleBatches() throws IOException {
+    // Setup 20 entries in this hashtable datafile: exactly 4 full batches of BATCH_SIZE.
+    List<KV<String, List<RangeHash>>> expected = setupTestData(START_ROW, STOP_ROW, 20);
+    assertEquals(expected, SourceTestUtils.readFromSource(bufferedSource, null));
+  }
+
+  @Test
+  public void testHashReaderMultipleBatchesWithPartialBatchAtEnd() throws IOException {
+    // Setup 23 entries in this hashtable datafile: 4 full batches plus a final batch of 3.
+    List<KV<String, List<RangeHash>>> expected = setupTestData(START_ROW, STOP_ROW, 23);
+    assertEquals(expected, SourceTestUtils.readFromSource(bufferedSource, null));
+  }
+
+  @Test
+  public void testSplitEqualsUnsplit() throws Exception {
+    fakeTableHashWrapper.partitions = Arrays.asList(getKey(4), getKey(9));
+    SourceTestUtils.assertSourcesEqualReferenceSource(
+        bufferedSource, bufferedSource.split(0, null), null);
+  }
+
+  @Test
+  public void testUnstartedReaderEqualsStarted() throws Exception {
+    setupTestData(START_ROW, STOP_ROW, 6);
+    SourceTestUtils.assertUnstartedReaderReadsSameAsItsSource(
+        bufferedSource.createReader(null), null);
+  }
+}
diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/validation/ComputeAndValidateHashFromBigtableDoFnTest.java b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/validation/ComputeAndValidateHashFromBigtableDoFnTest.java
new file mode 100644
index 0000000000..a27288f7da
--- /dev/null
+++ b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/validation/ComputeAndValidateHashFromBigtableDoFnTest.java
@@ -0,0 +1,469 @@
+/*
+ * Copyright 2021 Google Inc. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package com.google.cloud.bigtable.beam.validation;
+
+import static com.google.bigtable.repackaged.com.google.cloud.bigtable.admin.v2.models.GCRules.GCRULES;
+
+import com.google.bigtable.repackaged.com.google.cloud.bigtable.admin.v2.BigtableTableAdminClient;
+import com.google.bigtable.repackaged.com.google.cloud.bigtable.admin.v2.BigtableTableAdminSettings;
+import com.google.bigtable.repackaged.com.google.cloud.bigtable.admin.v2.models.CreateTableRequest;
+import com.google.cloud.bigtable.beam.CloudBigtableTableConfiguration;
+import com.google.cloud.bigtable.beam.validation.HadoopHashTableSource.RangeHash;
+import com.google.cloud.bigtable.emulator.v2.BigtableEmulatorRule;
+import com.google.cloud.bigtable.hbase.BigtableConfiguration;
+import com.google.cloud.bigtable.hbase.BigtableOptionsFactory;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+import java.util.Map;
+import java.util.stream.Collectors;
+import java.util.stream.StreamSupport;
+import org.apache.beam.sdk.PipelineResult;
+import org.apache.beam.sdk.metrics.MetricQueryResults;
+import org.apache.beam.sdk.options.ValueProvider.StaticValueProvider;
+import org.apache.beam.sdk.testing.PAssert;
+import org.apache.beam.sdk.testing.TestPipeline;
+import org.apache.beam.sdk.transforms.Create;
+import org.apache.beam.sdk.transforms.ParDo;
+import org.apache.beam.sdk.values.KV;
+import org.apache.beam.sdk.values.PCollection;
+import org.apache.hadoop.hbase.HConstants;
+import org.apache.hadoop.hbase.TableName;
+import org.apache.hadoop.hbase.client.Connection;
+import org.apache.hadoop.hbase.client.Delete;
+import org.apache.hadoop.hbase.client.Put;
+import org.apache.hadoop.hbase.client.Result;
+import org.apache.hadoop.hbase.client.Scan;
+import org.apache.hadoop.hbase.client.Table;
+import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
+import org.apache.hadoop.hbase.mapreduce.BigtableTableHashAccessor.BigtableResultHasher;
+import org.junit.After;
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.JUnit4;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+@RunWith(JUnit4.class)
+public class ComputeAndValidateHashFromBigtableDoFnTest {
+
+  private static final byte[] EMPTY_ROW_KEY = HConstants.EMPTY_BYTE_ARRAY;
+  protected final Logger LOG = LoggerFactory.getLogger(getClass());
+
+  public static final String FAKE_TABLE = "fake-table";
+  private static final String ROW_KEY_PREFIX = "row-";
+  private static final String VALUE_PREFIX = "value-";
+  private static final byte[] EXTRA_VALUE = "add".getBytes();
+  private static final byte[] CF = "cf".getBytes();
+  private static final byte[] CF2 = "cf2".getBytes(); // Distinct from CF so a cell can change family
+  private static final byte[] COL = "col".getBytes();
+  private static final long TS = 1000L;
+  private static final int FIRST_ROW_INDEX = 20;
+  private static final int LAST_ROW_INDEX = 31;
+
+  @Rule public final BigtableEmulatorRule bigtableEmulator = BigtableEmulatorRule.create();
+
+  @Rule public final transient TestPipeline p = TestPipeline.create();
+
+  private ComputeAndValidateHashFromBigtableDoFn doFn;
+
+  // Clients that will be connected to the emulator
+  private BigtableTableAdminClient tableAdminClient;
+  private Table table;
+  // Fake a TableHashWrapper.
+  private FakeTableHashWrapper fakeTableHashWrapper;
+
+  private List<RangeHash> hashes;
+
+  @Before
+  public void setUp() throws IOException {
+    hashes = new ArrayList<>();
+    // Initialize the clients to connect to the emulator
+    tableAdminClient =
+        BigtableTableAdminClient.create(
+            BigtableTableAdminSettings.newBuilderForEmulator(bigtableEmulator.getPort())
+                .setProjectId("fake-project")
+                .setInstanceId("fake-instance")
+                .build());
+
+    CloudBigtableTableConfiguration config =
+        new CloudBigtableTableConfiguration.Builder()
+            .withProjectId("fake-project")
+            .withInstanceId("fake-instance")
+            .withTableId(FAKE_TABLE)
+            .withConfiguration(
+                BigtableOptionsFactory.BIGTABLE_EMULATOR_HOST_KEY,
+                "localhost:" + bigtableEmulator.getPort())
+            .build();
+
+    Connection connection = BigtableConfiguration.connect(config.toHBaseConfig());
+    table = connection.getTable(TableName.valueOf(FAKE_TABLE));
+    fakeTableHashWrapper = new FakeTableHashWrapper();
+    // Scan all the cells for the column, HBase scan fetches 1 cell/column by default
+    fakeTableHashWrapper.scan = new Scan().setMaxVersions();
+
+    FakeTableHashWrapperFactory fakeFactory = new FakeTableHashWrapperFactory(fakeTableHashWrapper);
+
+    doFn =
+        new ComputeAndValidateHashFromBigtableDoFn(
+            config,
+            StaticValueProvider.of(FAKE_TABLE),
+            StaticValueProvider.of("proj"),
+            StaticValueProvider.of("hash"),
+            fakeFactory);
+
+    // Create a test table that can be used in tests
+    tableAdminClient.createTable(
+        CreateTableRequest.of(FAKE_TABLE)
+            .addFamily(new String(CF), GCRULES.maxVersions(100))
+            .addFamily(new String(CF2), GCRULES.maxVersions(100)));
+
+    p.getCoderRegistry().registerCoderForClass(RangeHash.class, new RangeHashCoder());
+
+    // Fill CBT table with data.
+    writeDataToTable();
+  }
+
+  @After
+  public void tearDown() {
+    // TODO should we delete the table for each test?
+    tableAdminClient.deleteTable(FAKE_TABLE);
+  }
+
+  private byte[] getRowKey(int i) {
+    return (ROW_KEY_PREFIX + i).getBytes();
+  }
+
+  private byte[] getValue(int rowIndex, int cellIndex) {
+    return (VALUE_PREFIX + rowIndex + "-" + cellIndex).getBytes();
+  }
+
+  private void writeDataToTable() throws IOException {
+    List<Put> puts = new ArrayList<>();
+    // Tests use the rows 21-30. Set up some extra rows to simulate the real-world scenario where
+    // other work items are working on the table in parallel.
+    for (int i = FIRST_ROW_INDEX; i <= LAST_ROW_INDEX; i++) {
+      for (int j = 0; j < 2; j++) {
+        // Insert rows with 2 cells each (same column, timestamps TS and TS + 1)
+        Put put = new Put(getRowKey(i));
+        put.addColumn(CF, COL, TS + j, getValue(i, j));
+        puts.add(put);
+      }
+    }
+    table.put(puts);
+  }
+
+  /** Deletes the row range [startIndex, stopIndex) */
+  private void deleteRange(int startIndex, int stopIndex) throws IOException {
+    for (int i = startIndex; i < stopIndex; i++) {
+      table.delete(new Delete(getRowKey(i)));
+    }
+  }
+
+  // Creates a RangeHash for range [startRow, stopRow) by hashing the rows currently in the table.
+  private RangeHash createHash(byte[] startRow, byte[] stopRow) throws IOException {
+    LOG.debug("Creating hash for rows {} to {}", new String(startRow), new String(stopRow));
+    BigtableResultHasher hasher = new BigtableResultHasher();
+    hasher.startBatch(new ImmutableBytesWritable(startRow));
+
+    // Scan all the cells for a column.
+    Scan scan = new Scan().setMaxVersions().withStartRow(startRow).withStopRow(stopRow, false);
+
+    // Read the rows from Bigtable and compute the expected hash.
+    for (Result result : table.getScanner(scan)) {
+      LOG.debug("Adding result to hash: {}", result);
+      hasher.hashResult(result);
+    }
+    hasher.finishBatch();
+    return RangeHash.of(
+        new ImmutableBytesWritable(startRow),
+        new ImmutableBytesWritable(stopRow),
+        hasher.getBatchHash());
+  }
+
+  private void validateCounters(
+      PipelineResult result, Long expectedMatches, Long expectedMismatches) {
+    MetricQueryResults metrics = result.metrics().allMetrics();
+    Map<String, Long> counters =
+        StreamSupport.stream(metrics.getCounters().spliterator(), false)
+            .collect(Collectors.toMap((m) -> m.getName().getName(), (m) -> m.getAttempted()));
+    Assert.assertEquals(expectedMatches, counters.get("ranges_matched"));
+    Assert.assertEquals(expectedMismatches, counters.get("ranges_not_matched"));
+  }
+
+  ////////// Happy case tests for various setups//////////////////////
+  @Test
+  public void testHashMatchesForMultipleRange() throws Exception {
+    hashes.add(createHash(getRowKey(21), getRowKey(24)));
+    hashes.add(createHash(getRowKey(24), getRowKey(28)));
+
+    PCollection<KV<String, List<List<RangeHash>>>> input =
+        p.apply(Create.of(KV.of(new String(getRowKey(21)), Arrays.asList(hashes))));
+
+    PCollection<RangeHash> output = input.apply(ParDo.of(doFn));
+    PAssert.that(output).empty();
+    PipelineResult result = p.run();
+    validateCounters(result, 2L, 0L);
+  }
+
+  @Test
+  public void testHashMatchesForSingleRange() throws Exception {
+    hashes.add(createHash(getRowKey(21), getRowKey(24)));
+
+    PCollection<KV<String, List<List<RangeHash>>>> input =
+        p.apply(Create.of(KV.of(new String(getRowKey(21)), Arrays.asList(hashes))));
+
+    PCollection<RangeHash> output = input.apply(ParDo.of(doFn));
+    PAssert.that(output).empty();
+    PipelineResult result = p.run();
+    validateCounters(result, 1L, 0L);
+  }
+
+  @Test
+  public void testHashMatchesForFullTableScanWithMultipleRange() throws Exception {
+    hashes.add(createHash(EMPTY_ROW_KEY, getRowKey(24)));
+    hashes.add(createHash(getRowKey(24), EMPTY_ROW_KEY));
+
+    PCollection<KV<String, List<List<RangeHash>>>> input =
+        p.apply(Create.of(KV.of(new String(EMPTY_ROW_KEY), Arrays.asList(hashes))));
+
+    PCollection<RangeHash> output = input.apply(ParDo.of(doFn));
+    PAssert.that(output).empty();
+    PipelineResult result = p.run();
+    validateCounters(result, 2L, 0L);
+  }
+
+  @Test
+  public void testHashMatchesForMultipleSingleRowRange() throws Exception {
+    hashes.add(createHash(getRowKey(22), getRowKey(23)));
+    hashes.add(createHash(getRowKey(23), getRowKey(24)));
+    hashes.add(createHash(getRowKey(24), getRowKey(25)));
+
+    PCollection<KV<String, List<List<RangeHash>>>> input =
+        p.apply(Create.of(KV.of(new String(getRowKey(22)), Arrays.asList(hashes))));
+
+    PCollection<RangeHash> output = input.apply(ParDo.of(doFn));
+    PAssert.that(output).empty();
+    PipelineResult result = p.run();
+    validateCounters(result, 3L, 0L);
+  }
+
+  ///////////////// Test mismatches when Bigtable has extra rows ////////////////////
+  @Test
+  public void testAdditionalCellInMiddle() throws Exception {
+    hashes.add(createHash(getRowKey(21), getRowKey(24)));
+    hashes.add(createHash(getRowKey(24), getRowKey(27)));
+    hashes.add(createHash(getRowKey(27), getRowKey(30)));
+
+    // Add an extra cell in the table
+    table.put(new Put(getRowKey(25)).addColumn(CF, COL, EXTRA_VALUE));
+
+    PCollection<KV<String, List<List<RangeHash>>>> input =
+        p.apply(Create.of(KV.of(new String(getRowKey(21)), Arrays.asList(hashes))));
+
+    PCollection<RangeHash> output = input.apply(ParDo.of(doFn));
+    PAssert.that(output).containsInAnyOrder(hashes.get(1));
+    PipelineResult result = p.run();
+    validateCounters(result, 2L, 1L);
+  }
+
+  @Test
+  public void testAdditionalRowsAtEnds() throws Exception {
+    hashes.add(createHash(EMPTY_ROW_KEY, getRowKey(24)));
+    hashes.add(createHash(getRowKey(24), getRowKey(27)));
+    hashes.add(createHash(getRowKey(27), EMPTY_ROW_KEY));
+
+    // Add an extra row in the beginning ("row-1" sorts before "row-20")
+    table.put(new Put(getRowKey(1)).addColumn(CF, COL, EXTRA_VALUE));
+
+    // Add an extra row at the end ("row-5" sorts after "row-31").
+    table.put(new Put(getRowKey(5)).addColumn(CF, COL, EXTRA_VALUE));
+
+    PCollection<KV<String, List<List<RangeHash>>>> input =
+        p.apply(Create.of(KV.of(new String(EMPTY_ROW_KEY), Arrays.asList(hashes))));
+
+    PCollection<RangeHash> output = input.apply(ParDo.of(doFn));
+    PAssert.that(output).containsInAnyOrder(hashes.get(0), hashes.get(2));
+    PipelineResult result = p.run();
+    validateCounters(result, 1L, 2L);
+  }
+
+  ///////////////////// Test different values ///////////////////////////
+  @Test
+  public void testDifferentValues() throws Exception {
+    hashes.add(createHash(EMPTY_ROW_KEY, getRowKey(21)));
+    hashes.add(createHash(getRowKey(21), getRowKey(23)));
+    hashes.add(createHash(getRowKey(23), getRowKey(25)));
+    hashes.add(createHash(getRowKey(25), getRowKey(27)));
+    hashes.add(createHash(getRowKey(27), EMPTY_ROW_KEY));
+
+    // Modify the CF: move row 20's cell at TS from CF to CF2
+    table.delete(new Delete(getRowKey(20)).addColumns(CF, COL, TS));
+    table.put(new Put(getRowKey(20)).addColumn(CF2, COL, TS, getValue(20, 0)));
+
+    // Modify the qualifier
+    table.delete(new Delete(getRowKey(22)).addColumns(CF, COL, TS));
+    table.put(new Put(getRowKey(22)).addColumn(CF, "random-col".getBytes(), TS, getValue(22, 0)));
+
+    // Modify the timestamp
+    table.delete(new Delete(getRowKey(24)).addColumns(CF, COL, TS));
+    table.put(new Put(getRowKey(24)).addColumn(CF, COL, 1, getValue(24, 0)));
+
+    // Modify the value
+    table.delete(new Delete(getRowKey(26)).addColumns(CF, COL, TS));
+    table.put(new Put(getRowKey(26)).addColumn(CF, COL, getValue(26, 0)));
+
+    PCollection<KV<String, List<List<RangeHash>>>> input =
+        p.apply(Create.of(KV.of(new String(EMPTY_ROW_KEY), Arrays.asList(hashes))));
+
+    PCollection<RangeHash> output = input.apply(ParDo.of(doFn));
+    PAssert.that(output)
+        .containsInAnyOrder(hashes.get(0), hashes.get(1), hashes.get(2), hashes.get(3));
+    PipelineResult result = p.run();
+    validateCounters(result, 1L, 4L);
+  }
+
+  ////////////////// Tests with CBT missing data //////////////////////////////
+  @Test
+  public void testMissingRows() throws Exception {
+    hashes.add(createHash(EMPTY_ROW_KEY, getRowKey(21)));
+    hashes.add(createHash(getRowKey(21), getRowKey(23)));
+    hashes.add(createHash(getRowKey(23), getRowKey(25)));
+    hashes.add(createHash(getRowKey(25), getRowKey(27)));
+    hashes.add(createHash(getRowKey(27), EMPTY_ROW_KEY));
+
+    // Delete a row at the beginning
+    table.delete(new Delete(getRowKey(FIRST_ROW_INDEX)));
+
+    // Delete a row at the middle
+    table.delete(new Delete(getRowKey(24)));
+
+    // Delete a row at the end
+    table.delete(new Delete(getRowKey(LAST_ROW_INDEX)));
+
+    PCollection<KV<String, List<List<RangeHash>>>> input =
+        p.apply(Create.of(KV.of(new String(EMPTY_ROW_KEY), Arrays.asList(hashes))));
+
+    PCollection<RangeHash> output = input.apply(ParDo.of(doFn));
+    PAssert.that(output).containsInAnyOrder(hashes.get(0), hashes.get(2), hashes.get(4));
+    PipelineResult result = p.run();
+    validateCounters(result, 2L, 3L);
+  }
+
+  @Test
+  public void testMissingRanges() throws Exception {
+    hashes.add(createHash(EMPTY_ROW_KEY, getRowKey(21)));
+    hashes.add(createHash(getRowKey(21), getRowKey(23)));
+    hashes.add(createHash(getRowKey(23), getRowKey(25)));
+    hashes.add(createHash(getRowKey(25), getRowKey(27)));
+    hashes.add(createHash(getRowKey(27), getRowKey(29)));
+    hashes.add(createHash(getRowKey(29), EMPTY_ROW_KEY));
+
+    // Delete a range at the beginning
+    deleteRange(FIRST_ROW_INDEX, 21);
+
+    // Delete a range in middle
+    deleteRange(23, 25);
+
+    // Delete row ranges at the end, bigtable scanner will finish with multiple row-ranges to
+    // process.
+    deleteRange(27, LAST_ROW_INDEX + 1);
+
+    PCollection<KV<String, List<List<RangeHash>>>> input =
+        p.apply(Create.of(KV.of(new String(EMPTY_ROW_KEY), Arrays.asList(hashes))));
+
+    PCollection<RangeHash> output = input.apply(ParDo.of(doFn));
+    PAssert.that(output)
+        .containsInAnyOrder(hashes.get(0), hashes.get(2), hashes.get(4), hashes.get(5));
+    PipelineResult result = p.run();
+    validateCounters(result, 2L, 4L);
+  }
+
+  @Test
+  public void testCbtEmpty() throws Exception {
+    hashes.add(createHash(EMPTY_ROW_KEY, getRowKey(25)));
+    hashes.add(createHash(getRowKey(25), getRowKey(29)));
+    hashes.add(createHash(getRowKey(29), EMPTY_ROW_KEY));
+
+    // Delete all data from bigtable (deleteRange's stop index is exclusive)
+    deleteRange(FIRST_ROW_INDEX, LAST_ROW_INDEX + 1);
+
+    PCollection<KV<String, List<List<RangeHash>>>> input =
+        p.apply(Create.of(KV.of(new String(EMPTY_ROW_KEY), Arrays.asList(hashes))));
+
+    PCollection<RangeHash> output = input.apply(ParDo.of(doFn));
+    PAssert.that(output).containsInAnyOrder(hashes);
+    PipelineResult result = p.run();
+    validateCounters(result, 0L, 3L);
+  }
+
+  ////////////////////// Test that scan is used from TableHash.////////////////////////
+  @Test
+  public void testScanFromTableHash() throws Exception {
+    hashes.add(createHash(getRowKey(21), getRowKey(24)));
+    hashes.add(createHash(getRowKey(24), getRowKey(27)));
+    hashes.add(createHash(getRowKey(27), getRowKey(30)));
+
+    // Update the TableHashWrapper Scan to default. Scan from HashTable.TableHash determines the
+    // cells used to compute hash. CBT has to use the same cells for validation.
+    fakeTableHashWrapper.scan = new Scan();
+
+    PCollection<KV<String, List<List<RangeHash>>>> input =
+        p.apply(Create.of(KV.of(new String(getRowKey(21)), Arrays.asList(hashes))));
+
+    PCollection<RangeHash> output = input.apply(ParDo.of(doFn));
+    PAssert.that(output).containsInAnyOrder(hashes);
+    PipelineResult result = p.run();
+    validateCounters(result, 0L, 3L);
+  }
+
+  ////////////////////// Combination of different cases //////////////////////////////////
+  @Test
+  public void testMismatchesComprehensive() throws Exception {
+    hashes.add(createHash(EMPTY_ROW_KEY, getRowKey(21)));
+    hashes.add(createHash(getRowKey(21), getRowKey(23)));
+    hashes.add(createHash(getRowKey(23), getRowKey(25)));
+    hashes.add(createHash(getRowKey(25), getRowKey(27)));
+    hashes.add(createHash(getRowKey(27), getRowKey(29)));
+    hashes.add(createHash(getRowKey(29), EMPTY_ROW_KEY));
+
+    // Delete a range at the beginning from CBT
+    deleteRange(FIRST_ROW_INDEX, 21);
+
+    // Delete a row in middle from CBT
+    table.delete(new Delete(getRowKey(23)));
+
+    // Update a value in CBT
+    table.delete(new Delete(getRowKey(27)).addColumns(CF, COL, TS));
+    table.put(new Put(getRowKey(27)).addColumn(CF, COL, getValue(27, 0)));
+
+    // Add an extra row at the end ("row-5" sorts after "row-31").
+    table.put(new Put(getRowKey(5)).addColumn(CF, COL, EXTRA_VALUE));
+
+    PCollection<KV<String, List<List<RangeHash>>>> input =
+        p.apply(Create.of(KV.of(new String(EMPTY_ROW_KEY), Arrays.asList(hashes))));
+
+    PCollection<RangeHash> output = input.apply(ParDo.of(doFn));
+    PAssert.that(output)
+        .containsInAnyOrder(hashes.get(0), hashes.get(2), hashes.get(4), hashes.get(5));
+    PipelineResult result = p.run();
+    validateCounters(result, 2L, 4L);
+  }
+}
diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/validation/FakeTableHashWrapper.java b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/validation/FakeTableHashWrapper.java
new file mode 100644
index 0000000000..ee2b6814e2
--- /dev/null
+++ b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/validation/FakeTableHashWrapper.java
@@ -0,0 +1,153 @@
+/*
+ * Copyright 2021 Google Inc. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package com.google.cloud.bigtable.beam.validation;
+
+import com.google.bigtable.repackaged.com.google.gson.Gson;
+import com.google.common.collect.ImmutableList;
+import java.io.IOException;
+import java.io.ObjectInputStream;
+import java.io.ObjectOutputStream;
+import java.util.ArrayList;
+import java.util.List;
+import org.apache.beam.sdk.values.KV;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.hbase.client.Scan;
+import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
+
+/**
+ * A fake for TableHashWrapper that allows us to mock the behavior of hbase's HashTable.TableHash
+ */
+public class FakeTableHashWrapper implements TableHashWrapper {
+
+ // Sorted list of partition keys splitting the key range.
+ public List partitions;
+ // List of (key, hash) pairs sorted by key.
+ public List> hashes;
+ public ImmutableBytesWritable startRowInclusive;
+ public ImmutableBytesWritable stopRowExclusive;
+ public Scan scan;
+ private static final long serialVersionUID = 34876543L;
+
+ public FakeTableHashWrapper() {
+ this(
+ new ImmutableBytesWritable(),
+ new ImmutableBytesWritable(),
+ new ArrayList<>(),
+ new ArrayList<>(),
+ new Scan());
+ }
+
+ public FakeTableHashWrapper(
+ ImmutableBytesWritable startRowInclusive,
+ ImmutableBytesWritable stopRowExclusive,
+ List partitions,
+ List> hashes,
+ Scan scan) {
+ super();
+ this.startRowInclusive = startRowInclusive;
+ this.stopRowExclusive = stopRowExclusive;
+ this.partitions = partitions;
+ this.hashes = hashes;
+ this.scan = scan;
+ }
+
+ @Override
+ public int getNumHashFiles() {
+ return partitions.size() + 1;
+ }
+
+ @Override
+ public ImmutableList getPartitions() {
+ return ImmutableList.copyOf(partitions);
+ }
+
+ @Override
+ public ImmutableBytesWritable getStartRow() {
+ return startRowInclusive;
+ }
+
+ @Override
+ public ImmutableBytesWritable getStopRow() {
+ return stopRowExclusive;
+ }
+
+ @Override
+ public Scan getScan() {
+ return scan;
+ }
+
+ @Override
+ public TableHashReader newReader(Configuration conf, ImmutableBytesWritable startRow) {
+ return new FakeTableHashReader(startRow);
+ }
+
+ private void writeObject(ObjectOutputStream s) throws IOException {
+ Gson gson = new Gson();
+ s.writeObject(gson.toJson(scan));
+ s.writeObject(gson.toJson(startRowInclusive));
+ s.writeObject(gson.toJson(stopRowExclusive));
+ s.writeObject(gson.toJson(partitions));
+ s.writeObject(gson.toJson(hashes));
+ }
+
+ private void readObject(ObjectInputStream s) throws IOException, ClassNotFoundException {
+ Gson gson = new Gson();
+ scan = gson.fromJson((String) s.readObject(), Scan.class);
+ startRowInclusive = gson.fromJson((String) s.readObject(), ImmutableBytesWritable.class);
+ stopRowExclusive = gson.fromJson((String) s.readObject(), ImmutableBytesWritable.class);
+ partitions = gson.fromJson((String) s.readObject(), ArrayList.class);
+ hashes = gson.fromJson((String) s.readObject(), ArrayList.class);
+ }
+
+ public class FakeTableHashReader implements TableHashReader {
+ private final ImmutableBytesWritable startRow;
+ // Copy of items to be read by this reader.
+ private final List> entriesToRead;
+ // First next() will make index = 0, and compare it with the size of entriesToRead.
+ private int index = -1;
+
+ public FakeTableHashReader(ImmutableBytesWritable startRow) {
+ this.startRow = startRow;
+ entriesToRead = new ArrayList<>();
+ for (KV hash : hashes) {
+ // Collect all the entries at or after startRow (the comparison is inclusive).
+ if (hash.getKey().compareTo(startRow) >= 0) {
+ entriesToRead.add(hash);
+ }
+ }
+ }
+
+ @Override
+ public boolean next() throws IOException {
+ return ++index < entriesToRead.size();
+ }
+
+ @Override
+ public ImmutableBytesWritable getCurrentKey() {
+ return entriesToRead.get(index).getKey();
+ }
+
+ @Override
+ public ImmutableBytesWritable getCurrentHash() {
+ return entriesToRead.get(index).getValue();
+ }
+
+ @Override
+ public void close() throws IOException {
+ // NOOP
+ }
+ }
+}
diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/validation/FakeTableHashWrapperFactory.java b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/validation/FakeTableHashWrapperFactory.java
new file mode 100644
index 0000000000..2e65e3b855
--- /dev/null
+++ b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/validation/FakeTableHashWrapperFactory.java
@@ -0,0 +1,32 @@
+/*
+ * Copyright 2021 Google Inc. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package com.google.cloud.bigtable.beam.validation;
+
+public class FakeTableHashWrapperFactory extends TableHashWrapperFactory {
+
+ private static final long serialVersionUID = 269854624L;
+
+ private final FakeTableHashWrapper fakeTableHashWrapper;
+
+ public FakeTableHashWrapperFactory(FakeTableHashWrapper wrapper) {
+ this.fakeTableHashWrapper = wrapper;
+ }
+
+ @Override
+ public TableHashWrapper getTableHash(String projectId, String sourceHashDir) {
+ return fakeTableHashWrapper;
+ }
+}
diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/validation/HadoopHashBasedReaderTest.java b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/validation/HadoopHashBasedReaderTest.java
new file mode 100644
index 0000000000..fa88a56d14
--- /dev/null
+++ b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/validation/HadoopHashBasedReaderTest.java
@@ -0,0 +1,179 @@
+/*
+ * Copyright 2021 Google Inc. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package com.google.cloud.bigtable.beam.validation;
+
+import static org.junit.Assert.assertEquals;
+
+import com.google.cloud.bigtable.beam.validation.HadoopHashTableSource.RangeHash;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+import org.apache.beam.sdk.options.ValueProvider.StaticValueProvider;
+import org.apache.beam.sdk.testing.SourceTestUtils;
+import org.apache.beam.sdk.values.KV;
+import org.apache.hadoop.hbase.HConstants;
+import org.apache.hadoop.hbase.client.Scan;
+import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
+import org.junit.Before;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.JUnit4;
+
+@RunWith(JUnit4.class)
+public class HadoopHashBasedReaderTest {
+
+ private HadoopHashTableSource hashTableSource;
+ private FakeTableHashWrapper fakeTableHashWrapper;
+
+ private static final String HASH_TABLE_OUTPUT_PATH_DIR = "gs://my-bucket/outputDir";
+ private static final ImmutableBytesWritable START_ROW =
+ new ImmutableBytesWritable("AAAA".getBytes());
+ private static final ImmutableBytesWritable STOP_ROW =
+ new ImmutableBytesWritable("ZZZZ".getBytes());
+ private static final ImmutableBytesWritable EMPTY_ROW =
+ new ImmutableBytesWritable(HConstants.EMPTY_BYTE_ARRAY);
+ private static final ImmutableBytesWritable START_HASH =
+ new ImmutableBytesWritable("START-HASH".getBytes());
+
+ @Before
+ public void setUp() throws Exception {
+ fakeTableHashWrapper =
+ new FakeTableHashWrapper(
+ START_ROW, STOP_ROW, new ArrayList<>(), new ArrayList<>(), new Scan());
+ hashTableSource =
+ new HadoopHashTableSource(
+ StaticValueProvider.of("cbt-dev"),
+ StaticValueProvider.of(HASH_TABLE_OUTPUT_PATH_DIR),
+ START_ROW,
+ STOP_ROW,
+ new FakeTableHashWrapperFactory(fakeTableHashWrapper));
+ }
+
+ protected static ImmutableBytesWritable getKey(int keyIndex) {
+ return new ImmutableBytesWritable(("KEY-" + keyIndex).getBytes());
+ }
+
+ protected static ImmutableBytesWritable getHash(int hashIndex) {
+ return new ImmutableBytesWritable(("HASH-" + hashIndex).getBytes());
+ }
+
+ /**
+ * Populates the fakeTableHashWrapper with {@code numEntries} entries, the first one at startRow.
+ * Returns the List of RangeHashes expected for this data; for numEntries=1, a single RangeHash
+ * (startRow, stopRow, START_HASH) is returned.
+ */
+ protected List setupTestData(
+ ImmutableBytesWritable startRow, ImmutableBytesWritable stopRow, int numEntries) {
+ fakeTableHashWrapper.startRowInclusive = startRow;
+ fakeTableHashWrapper.stopRowExclusive = stopRow;
+ fakeTableHashWrapper.hashes.add(KV.of(startRow, START_HASH));
+ for (int i = 0; i < numEntries - 1; i++) {
+ fakeTableHashWrapper.hashes.add(KV.of(getKey(i), getHash(i)));
+ }
+
+ // Build the RangeHashes to be returned; each range ends where the next entry's key starts.
+ List expectedRangeHashes = new ArrayList<>();
+ ImmutableBytesWritable key = startRow;
+ ImmutableBytesWritable hash = START_HASH;
+ for (int i = 0; i < numEntries - 1; i++) {
+ expectedRangeHashes.add(RangeHash.of(key, getKey(i), hash));
+ key = getKey(i);
+ hash = getHash(i);
+ }
+ expectedRangeHashes.add(RangeHash.of(key, stopRow, hash));
+ return expectedRangeHashes;
+ }
+
+ /////////////////////////////// Test the end of HashTable Output /////////////////////////
+
+ @Test
+ public void testHashReaderEmpty() throws IOException {
+ // The tableHashWrapper has no hashes, this should result in empty source.
+ assertEquals(Arrays.asList(), SourceTestUtils.readFromSource(hashTableSource, null));
+ }
+
+ @Test
+ public void testHashReaderSingleHashBatch() throws IOException {
+ // Setup 1 entry in this hashtable datafile. The test is setup so that HashTable datafile has
+ // only 1 entry.
+ List expected = setupTestData(START_ROW, STOP_ROW, 1);
+
+ assertEquals(expected, SourceTestUtils.readFromSource(hashTableSource, null));
+ }
+
+ @Test
+ public void testHashReaderMultipleHashBatch() throws IOException {
+ // Setup 4 entries in this hashtable datafile.
+ List expected = setupTestData(START_ROW, STOP_ROW, 4);
+ assertEquals(expected, SourceTestUtils.readFromSource(hashTableSource, null));
+ }
+
+ //////////////////// Test the end of HashTable output when end of range is ""/////////////////
+ @Test
+ public void testHashReaderWithEmptyEndRow() throws IOException {
+ // Setup 4 entries in this hashtable datafile with no start or stop keys set.
+ List expected = setupTestData(EMPTY_ROW, EMPTY_ROW, 4);
+ hashTableSource.startRowInclusive = EMPTY_ROW;
+ hashTableSource.stopRowExclusive = EMPTY_ROW;
+ assertEquals(expected, SourceTestUtils.readFromSource(hashTableSource, null));
+ }
+
+ /////////////////////////////// Test reader.getCurrent() >= stopRow /////////////////////////
+
+ @Test
+ public void testHashReaderWorkItemEndedOnFirstBatch() throws IOException {
+ // Setup 1 entry in this hashtable datafile. This entry is outside of the workitem's row
+ fakeTableHashWrapper.hashes.add(KV.of(STOP_ROW, START_HASH));
+ // Source will be empty as no hashes fall in its bounds.
+ assertEquals(new ArrayList(), SourceTestUtils.readFromSource(hashTableSource, null));
+ }
+
+ @Test
+ public void testHashReaderWorkItemEndedOnSecondEntry() throws IOException {
+ // Setup 1 entry in this hashtable datafile. The test is setup so that HashTable datafile has
+ // only 1 entry.
+ List expected = setupTestData(START_ROW, STOP_ROW, 1);
+ // Add a next entry at the stop row. Reader should stop and read just 1 entry.
+ fakeTableHashWrapper.hashes.add(KV.of(STOP_ROW, getHash(100)));
+
+ assertEquals(expected, SourceTestUtils.readFromSource(hashTableSource, null));
+ }
+
+ @Test
+ public void testHashReaderWorkItemEndedAfterMultipleBatches() throws IOException {
+ // Setup 4 entries in this hashtable datafile.
+ List expected = setupTestData(START_ROW, STOP_ROW, 4);
+ // Add a next entry at the stop row. Reader should stop and read just the 4 entries.
+ fakeTableHashWrapper.hashes.add(KV.of(STOP_ROW, getHash(100)));
+ assertEquals(expected, SourceTestUtils.readFromSource(hashTableSource, null));
+ }
+
+ @Test
+ public void testSplitEqualsUnsplit() throws Exception {
+ setupTestData(START_ROW, STOP_ROW, 6);
+ fakeTableHashWrapper.partitions = Arrays.asList(getKey(2), getKey(4));
+ SourceTestUtils.assertSourcesEqualReferenceSource(
+ hashTableSource, hashTableSource.split(1, null), null);
+ }
+
+ @Test
+ public void testUnstartedReaderEqualsStarted() throws Exception {
+ setupTestData(START_ROW, STOP_ROW, 6);
+ SourceTestUtils.assertUnstartedReaderReadsSameAsItsSource(
+ hashTableSource.createReader(null), null);
+ }
+}
diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/validation/HadoopHashTableSourceTest.java b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/validation/HadoopHashTableSourceTest.java
new file mode 100644
index 0000000000..a3aba3f756
--- /dev/null
+++ b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/validation/HadoopHashTableSourceTest.java
@@ -0,0 +1,209 @@
+/*
+ * Copyright 2021 Google Inc. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package com.google.cloud.bigtable.beam.validation;
+
+import com.google.cloud.bigtable.beam.validation.HadoopHashTableSource.HashBasedReader;
+import com.google.cloud.bigtable.beam.validation.HadoopHashTableSource.RangeHash;
+import com.google.common.collect.ImmutableList;
+import java.io.IOException;
+import java.util.List;
+import junit.framework.TestCase;
+import org.apache.beam.sdk.io.BoundedSource;
+import org.apache.beam.sdk.io.BoundedSource.BoundedReader;
+import org.apache.beam.sdk.options.ValueProvider;
+import org.apache.beam.sdk.options.ValueProvider.StaticValueProvider;
+import org.apache.hadoop.hbase.HConstants;
+import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
+import org.junit.Before;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.JUnit4;
+
+@RunWith(JUnit4.class)
+public class HadoopHashTableSourceTest extends TestCase {
+
+ HadoopHashTableSource source;
+ FakeTableHashWrapper fakeTableHashWrapper;
+
+ private static final ValueProvider PROJECT_ID = StaticValueProvider.of("test-project");
+ private static final ValueProvider HASH_TABLE_OUTPUT_PATH_DIR =
+ StaticValueProvider.of("gs://my-bucket/outputDir");
+ private static final ImmutableBytesWritable START_ROW =
+ new ImmutableBytesWritable("a".getBytes());
+ private static final ImmutableBytesWritable STOP_ROW = new ImmutableBytesWritable("z".getBytes());
+ private static final ImmutableBytesWritable PARTITION1 =
+ new ImmutableBytesWritable("d".getBytes());
+ private static final ImmutableBytesWritable PARTITION2 =
+ new ImmutableBytesWritable("g".getBytes());
+ private static final ImmutableBytesWritable EMPTY_ROW_KEY =
+ new ImmutableBytesWritable(HConstants.EMPTY_BYTE_ARRAY);
+
+ @Before
+ public void setUp() throws Exception {
+ super.setUp();
+ fakeTableHashWrapper = new FakeTableHashWrapper();
+ }
+
+ private List> getSplitSources(
+ List partitions,
+ ImmutableBytesWritable startRow,
+ ImmutableBytesWritable stopRow)
+ throws IOException {
+ fakeTableHashWrapper.startRowInclusive = startRow;
+ fakeTableHashWrapper.stopRowExclusive = stopRow;
+ fakeTableHashWrapper.partitions = partitions;
+
+ source =
+ new HadoopHashTableSource(
+ PROJECT_ID,
+ HASH_TABLE_OUTPUT_PATH_DIR,
+ startRow,
+ stopRow,
+ new FakeTableHashWrapperFactory(fakeTableHashWrapper));
+ return (List>) source.split(0, null);
+ }
+
+ private void testSourceSplits(
+ List partitions,
+ ImmutableBytesWritable startRow,
+ ImmutableBytesWritable stopRow,
+ List> expectedSources)
+ throws IOException {
+ assertEquals(expectedSources, getSplitSources(partitions, startRow, stopRow));
+ }
+
+ @Test
+ public void testSplitZeroPartitions() throws IOException {
+ // Row range [a-z) with no splits.
+ List> expected =
+ ImmutableList.of(
+ new HadoopHashTableSource(PROJECT_ID, HASH_TABLE_OUTPUT_PATH_DIR, START_ROW, STOP_ROW));
+ testSourceSplits(ImmutableList.of(), START_ROW, STOP_ROW, expected);
+ }
+
+ @Test
+ public void testSplitOnePartition() throws IOException {
+ // Row range [a-z) with 1 split on {d}. The expected sources cover {[a,d), [d,z)}.
+ List> expected =
+ ImmutableList.of(
+ new HadoopHashTableSource(
+ PROJECT_ID, HASH_TABLE_OUTPUT_PATH_DIR, START_ROW, PARTITION1),
+ new HadoopHashTableSource(
+ PROJECT_ID, HASH_TABLE_OUTPUT_PATH_DIR, PARTITION1, STOP_ROW));
+ testSourceSplits(ImmutableList.of(PARTITION1), START_ROW, STOP_ROW, expected);
+ }
+
+ @Test
+ public void testMultiplePartitons() throws IOException {
+ // Row range [a-z) with splits on {d,g}. The data files will be for {[a,d), [d,g), [g,z)}.
+ List> expected =
+ ImmutableList.of(
+ new HadoopHashTableSource(
+ PROJECT_ID, HASH_TABLE_OUTPUT_PATH_DIR, START_ROW, PARTITION1),
+ new HadoopHashTableSource(
+ PROJECT_ID, HASH_TABLE_OUTPUT_PATH_DIR, PARTITION1, PARTITION2),
+ new HadoopHashTableSource(
+ PROJECT_ID, HASH_TABLE_OUTPUT_PATH_DIR, PARTITION2, STOP_ROW));
+ testSourceSplits(ImmutableList.of(PARTITION1, PARTITION2), START_ROW, STOP_ROW, expected);
+ }
+
+ @Test
+ public void testSplitEmptyStartRow() throws IOException {
+ // Row range [""-z) with splits on {d,g}. The data files will be for {["",d), [d,g), [g,z)}.
+ List> expected =
+ ImmutableList.of(
+ new HadoopHashTableSource(
+ PROJECT_ID, HASH_TABLE_OUTPUT_PATH_DIR, EMPTY_ROW_KEY, PARTITION1),
+ new HadoopHashTableSource(
+ PROJECT_ID, HASH_TABLE_OUTPUT_PATH_DIR, PARTITION1, PARTITION2),
+ new HadoopHashTableSource(
+ PROJECT_ID, HASH_TABLE_OUTPUT_PATH_DIR, PARTITION2, STOP_ROW));
+ testSourceSplits(ImmutableList.of(PARTITION1, PARTITION2), EMPTY_ROW_KEY, STOP_ROW, expected);
+ }
+
+ @Test
+ public void testSplitEmptyStopRow() throws IOException {
+ // Row range [a-"") with splits on {d,g}. The data files will be for {[a,d), [d,g), [g,"")}.
+ List> expected =
+ ImmutableList.of(
+ new HadoopHashTableSource(
+ PROJECT_ID, HASH_TABLE_OUTPUT_PATH_DIR, START_ROW, PARTITION1),
+ new HadoopHashTableSource(
+ PROJECT_ID, HASH_TABLE_OUTPUT_PATH_DIR, PARTITION1, PARTITION2),
+ new HadoopHashTableSource(
+ PROJECT_ID, HASH_TABLE_OUTPUT_PATH_DIR, PARTITION2, EMPTY_ROW_KEY));
+ testSourceSplits(ImmutableList.of(PARTITION1, PARTITION2), START_ROW, EMPTY_ROW_KEY, expected);
+ }
+
+ @Test
+ public void testSplitFullTableScan() throws IOException {
+ // Row range [""-"") with splits on {d,g}. The data files will be for {["",d), [d,g), [g,"")}.
+ List> expected =
+ ImmutableList.of(
+ new HadoopHashTableSource(
+ PROJECT_ID, HASH_TABLE_OUTPUT_PATH_DIR, EMPTY_ROW_KEY, PARTITION1),
+ new HadoopHashTableSource(
+ PROJECT_ID, HASH_TABLE_OUTPUT_PATH_DIR, PARTITION1, PARTITION2),
+ new HadoopHashTableSource(
+ PROJECT_ID, HASH_TABLE_OUTPUT_PATH_DIR, PARTITION2, EMPTY_ROW_KEY));
+ testSourceSplits(
+ ImmutableList.of(PARTITION1, PARTITION2), EMPTY_ROW_KEY, EMPTY_ROW_KEY, expected);
+ }
+
+ @Test
+ public void testCreateReaderWithoutSplit() throws IOException {
+ source =
+ new HadoopHashTableSource(
+ PROJECT_ID,
+ HASH_TABLE_OUTPUT_PATH_DIR,
+ // When split is not called, start/stop are uninitialized. Start/stop are runtime params
+ // and are initialized in split/createReader.
+ null,
+ null,
+ new FakeTableHashWrapperFactory(fakeTableHashWrapper));
+ // Setup boundaries on the TableHashWrapper to be used in Source.
+ fakeTableHashWrapper.startRowInclusive = START_ROW;
+ fakeTableHashWrapper.stopRowExclusive = STOP_ROW;
+
+ // Create a new Reader
+ BoundedReader reader = source.createReader(null);
+
+ // Validate that the reader was properly created.
+ assertEquals(HashBasedReader.class, reader.getClass());
+ assertEquals(source, reader.getCurrentSource());
+ HashBasedReader hashBasedReader = (HashBasedReader) reader;
+ assertEquals(START_ROW, hashBasedReader.startRowInclusive);
+ assertEquals(STOP_ROW, hashBasedReader.stopRowExclusive);
+ }
+
+ @Test
+ public void testCreateReaderAfterSplit() throws IOException {
+ // A single partition will return 2 sources.
+ List> splitSources =
+ getSplitSources(ImmutableList.of(PARTITION1), START_ROW, STOP_ROW);
+ BoundedSource splitHashSource = splitSources.get(0);
+
+ // Create a new Reader
+ BoundedReader reader = splitHashSource.createReader(null);
+
+ // Validate that the reader was properly created.
+ assertEquals(HashBasedReader.class, reader.getClass());
+ assertEquals(splitHashSource, reader.getCurrentSource());
+ HashBasedReader hashBasedReader = (HashBasedReader) reader;
+ assertEquals(START_ROW, hashBasedReader.startRowInclusive);
+ assertEquals(PARTITION1, hashBasedReader.stopRowExclusive);
+ }
+}
diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/validation/HashBasedSourceSerializationTest.java b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/validation/HashBasedSourceSerializationTest.java
new file mode 100644
index 0000000000..f58becf3cb
--- /dev/null
+++ b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/validation/HashBasedSourceSerializationTest.java
@@ -0,0 +1,122 @@
+/*
+ * Copyright 2021 Google Inc. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package com.google.cloud.bigtable.beam.validation;
+
+import static com.google.common.truth.Truth.assertWithMessage;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.ObjectInputStream;
+import java.io.ObjectOutputStream;
+import java.lang.reflect.Field;
+import java.lang.reflect.Modifier;
+import junit.framework.TestCase;
+import org.apache.beam.sdk.options.ValueProvider.StaticValueProvider;
+import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
+import org.junit.Before;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.JUnit4;
+
+@RunWith(JUnit4.class)
+public class HashBasedSourceSerializationTest extends TestCase {
+
+ public static final String SOURCE_HASH_DIR = "gs://my-bucket/outputDir";
+ public static final String PROJECT_ID = "test-project";
+ private static final ImmutableBytesWritable START_ROW =
+ new ImmutableBytesWritable("a".getBytes());
+ private static final ImmutableBytesWritable STOP_ROW = new ImmutableBytesWritable("y".getBytes());
+
+ @Before
+ public void setUp() throws Exception {
+ super.setUp();
+ }
+
+ @Test
+ public void testSerializeWithValueProviders() throws IOException {
+ checkSerialization(
+ new HadoopHashTableSource(
+ StaticValueProvider.of(PROJECT_ID), StaticValueProvider.of(SOURCE_HASH_DIR)));
+ }
+
+ @Test
+ public void testSerializeWithStartStop() throws IOException {
+ checkSerialization(
+ new HadoopHashTableSource(
+ StaticValueProvider.of(PROJECT_ID),
+ StaticValueProvider.of(SOURCE_HASH_DIR),
+ new ImmutableBytesWritable(START_ROW),
+ new ImmutableBytesWritable(STOP_ROW)));
+ }
+
+ @Test
+ public void testBufferedSourceSerialize() {
+ checkSerialization(
+ new BufferedHadoopHashTableSource(
+ new HadoopHashTableSource(
+ StaticValueProvider.of(PROJECT_ID), StaticValueProvider.of(SOURCE_HASH_DIR))));
+ }
+
+ @Test
+ public void testBufferedSourceSerializeWithBatchSize() {
+ checkSerialization(
+ new BufferedHadoopHashTableSource(
+ new HadoopHashTableSource(
+ StaticValueProvider.of(PROJECT_ID), StaticValueProvider.of(SOURCE_HASH_DIR)),
+ 5));
+ }
+
+ private static void checkSerialization(Object source) {
+ try {
+ Object deserialized = serializeDeserialize(source);
+ checkClassDeclaresSerialVersionUid(source.getClass());
+ assertEquals(source, deserialized);
+ } catch (IOException | ClassNotFoundException e) {
+ fail(e.toString());
+ }
+ }
+
+ private static void checkClassDeclaresSerialVersionUid(Class cls) {
+ String uid = "serialVersionUID"; // Field every Serializable source class must declare.
+ for (Field field : cls.getDeclaredFields()) {
+ if (uid.equals(field.getName())) { // Compare by value; == relies on string interning.
+ int modifiers = field.getModifiers();
+ assertWithMessage(field + " is not static").that(Modifier.isStatic(modifiers)).isTrue();
+ assertWithMessage(field + " is not final").that(Modifier.isFinal(modifiers)).isTrue();
+ assertWithMessage(field + " is not private").that(Modifier.isPrivate(modifiers)).isTrue();
+ assertWithMessage(field + " must be long")
+ .that(field.getType().getSimpleName())
+ .isEqualTo("long");
+ return; // Found and validated; skip the failure below.
+ }
+ }
+ fail(cls + " does not declare serialVersionUID");
+ }
+
+ private static Object serializeDeserialize(Object obj)
+ throws IOException, ClassNotFoundException {
+ ByteArrayOutputStream bos = new ByteArrayOutputStream();
+ try (ObjectOutputStream outStream = new ObjectOutputStream(bos)) {
+ outStream.writeObject(obj);
+ }
+
+ ByteArrayInputStream bis = new ByteArrayInputStream(bos.toByteArray());
+ try (ObjectInputStream inStream = new ObjectInputStream(bis)) {
+ return inStream.readObject();
+ }
+ }
+}
diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/validation/RangeHashCoderTest.java b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/validation/RangeHashCoderTest.java
new file mode 100644
index 0000000000..5f644e3b50
--- /dev/null
+++ b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/validation/RangeHashCoderTest.java
@@ -0,0 +1,51 @@
+/*
+ * Copyright 2021 Google Inc. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package com.google.cloud.bigtable.beam.validation;
+
+import com.google.cloud.bigtable.beam.validation.HadoopHashTableSource.RangeHash;
+import org.apache.beam.sdk.coders.CoderException;
+import org.apache.beam.sdk.testing.CoderProperties;
+import org.apache.beam.sdk.util.CoderUtils;
+import org.apache.beam.sdk.values.TypeDescriptor;
+import org.apache.hadoop.hbase.HConstants;
+import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
+import org.junit.Assert;
+import org.junit.Test;
+
+public class RangeHashCoderTest {
+ private static final RangeHashCoder TEST_CODER = new RangeHashCoder();
+ private static final ImmutableBytesWritable START =
+ new ImmutableBytesWritable("Start".getBytes());
+ private static final ImmutableBytesWritable STOP = new ImmutableBytesWritable("Stop".getBytes());
+ private static final ImmutableBytesWritable HASH = new ImmutableBytesWritable("hash".getBytes());
+ private static final ImmutableBytesWritable EMPTY =
+ new ImmutableBytesWritable(HConstants.EMPTY_BYTE_ARRAY);
+
+ @Test
+ public void encodeRangeHash() throws Exception {
+ CoderProperties.coderDecodeEncodeEqual(TEST_CODER, RangeHash.of(START, STOP, HASH));
+ }
+
+ @Test(expected = CoderException.class)
+ public void encodeNullThrowsCoderException() throws Exception {
+ CoderUtils.encodeToByteArray(TEST_CODER, null);
+ }
+
+ @Test
+ public void testEncodedTypeDescriptor() throws Exception {
+ Assert.assertEquals(TEST_CODER.getEncodedTypeDescriptor(), TypeDescriptor.of(RangeHash.class));
+ }
+}
diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/resources/README.md b/bigtable-dataflow-parent/bigtable-beam-import/src/test/resources/README.md
new file mode 100644
index 0000000000..3d9b722bb9
--- /dev/null
+++ b/bigtable-dataflow-parent/bigtable-beam-import/src/test/resources/README.md
@@ -0,0 +1,18 @@
+# Generating the test HBase snapshot for HBase snapshot import integration tests
+
+The file `generate_test_data.txt` is an HBase command line command sequence
+used to generate the testing HBase snapshot data.
+
+If you need to modify the test data used by `bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/hbasesnapshots/EndToEndIT.java`,
+please make sure you have HBase installed and export `<hbase_home>/bin` to your PATH.
+
+Then:
+
+ $ hbase shell ./generate_test_data.txt
+ $ hbase org.apache.hadoop.hbase.snapshot.ExportSnapshot -Dmapreduce.framework.name=local -snapshot test-snapshot -copy-to file:///<path_to_test_data_dir>/data
+
+ $ cd <path_to_test_data_dir>
+ $ gsutil -m cp -r ./data/ gs://<test_bucket>/integration-test/
+
+After this, you should be able to run the integration test with your new data by specifying
+`-Dcloud.test.data.folder=gs://<test_bucket>/integration-test/`
\ No newline at end of file
diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/resources/log4j.properties b/bigtable-dataflow-parent/bigtable-beam-import/src/test/resources/log4j.properties
index 7f9118c7bc..c609eb001a 100644
--- a/bigtable-dataflow-parent/bigtable-beam-import/src/test/resources/log4j.properties
+++ b/bigtable-dataflow-parent/bigtable-beam-import/src/test/resources/log4j.properties
@@ -22,3 +22,7 @@ log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n
log4j.category.org.apache.beam.sdk.io.FileBasedSource=WARN
log4j.category.com.google.cloud.bigtable.beam.sequencefiles.SequenceFileSource=WARN
+# make hbase snapshot import integration tests output less verbose.
+log4j.category.org.apache.hadoop=WARN
+log4j.category.org.apache.beam.runners.dataflow.util.MonitoringUtil=WARN
+log4j.category.org.apache.beam.runners.dataflow.util.MonitoringUtil.LoggingHandler=WARN
\ No newline at end of file
diff --git a/bigtable-hbase-1.x-parent/bigtable-hbase-1.x-mapreduce/pom.xml b/bigtable-hbase-1.x-parent/bigtable-hbase-1.x-mapreduce/pom.xml
index ef0b866ec6..7cf1993350 100644
--- a/bigtable-hbase-1.x-parent/bigtable-hbase-1.x-mapreduce/pom.xml
+++ b/bigtable-hbase-1.x-parent/bigtable-hbase-1.x-mapreduce/pom.xml
@@ -39,6 +39,16 @@ limitations under the License.
provided
+
+
+
+ org.apache.hadoop
+ hadoop-common
+ ${hadoop.version}
+ ${hadoop.scope}
+
+
+
${project.groupId}
diff --git a/pom.xml b/pom.xml
index 215a7bc69e..9e6d60dc08 100644
--- a/pom.xml
+++ b/pom.xml
@@ -81,6 +81,7 @@ limitations under the License.
30.0-android
20.0
1.7
+ 29.0-jre
1.29.0
@@ -165,7 +166,7 @@ limitations under the License.
org.apache.maven.plugins
maven-shade-plugin
- 3.2.2
+ 3.2.4
org.apache.maven.plugins
@@ -175,7 +176,7 @@ limitations under the License.
org.apache.maven.plugins
maven-javadoc-plugin
- 3.1.1
+ 3.2.0
none