diff --git a/bigtable-dataflow-parent/bigtable-beam-import/pom.xml b/bigtable-dataflow-parent/bigtable-beam-import/pom.xml index 218dc06db8..778083f0b9 100644 --- a/bigtable-dataflow-parent/bigtable-beam-import/pom.xml +++ b/bigtable-dataflow-parent/bigtable-beam-import/pom.xml @@ -76,7 +76,6 @@ limitations under the License. - org.apache.beam beam-sdks-java-core @@ -217,6 +216,23 @@ limitations under the License. ${hbase.version} test + + com.google.truth + truth + 1.0.1 + test + + + com.google.cloud + google-cloud-bigtable-emulator + 0.124.0 + test + + + com.google.code.findbugs + jsr305 + ${jsr305.version} + diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/Main.java b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/Main.java index b346b90837..1f52f5125a 100644 --- a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/Main.java +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/Main.java @@ -21,6 +21,7 @@ import com.google.cloud.bigtable.beam.sequencefiles.CreateTableHelper; import com.google.cloud.bigtable.beam.sequencefiles.ExportJob; import com.google.cloud.bigtable.beam.sequencefiles.ImportJob; +import com.google.cloud.bigtable.beam.validation.SyncTableJob; import java.io.File; import java.net.URISyntaxException; import java.util.Arrays; @@ -53,6 +54,9 @@ public static void main(String[] args) throws Exception { case "create-table": CreateTableHelper.main(subArgs); break; + case "sync-table": + SyncTableJob.main(subArgs); + break; default: usage(); System.exit(1); diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/TemplateUtils.java b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/TemplateUtils.java index e64507317b..f839a50b23 100644 --- 
a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/TemplateUtils.java +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/TemplateUtils.java @@ -26,6 +26,7 @@ import com.google.bigtable.repackaged.com.google.cloud.bigtable.data.v2.models.Query; import com.google.cloud.bigtable.beam.sequencefiles.ExportJob.ExportOptions; import com.google.cloud.bigtable.beam.sequencefiles.ImportJob.ImportOptions; +import com.google.cloud.bigtable.beam.validation.SyncTableJob.SyncTableOptions; import com.google.cloud.bigtable.hbase.BigtableOptionsFactory; import com.google.cloud.bigtable.hbase.adapters.Adapters; import com.google.cloud.bigtable.hbase.adapters.read.DefaultReadHooks; @@ -72,6 +73,19 @@ public static CloudBigtableTableConfiguration BuildImportConfig(ImportOptions op return builder.build(); } + /** Builds CloudBigtableTableConfiguration from input runtime parameters for the sync-table job. The app profile id is applied only when the option is non-null. */ + public static CloudBigtableTableConfiguration BuildSyncTableConfig(SyncTableOptions opts) { + CloudBigtableTableConfiguration.Builder builder = + new CloudBigtableTableConfiguration.Builder() + .withProjectId(opts.getBigtableProject()) + .withInstanceId(opts.getBigtableInstanceId()) + .withTableId(opts.getBigtableTableId()); + if (opts.getBigtableAppProfileId() != null) { + builder.withAppProfileId(opts.getBigtableAppProfileId()); + } + return builder.build(); + } + /** Provides a request that is constructed with some attributes. 
*/ private static class RequestValueProvider implements ValueProvider, Serializable { diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/BufferedHadoopHashTableSource.java b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/BufferedHadoopHashTableSource.java new file mode 100644 index 0000000000..e62b3c8215 --- /dev/null +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/BufferedHadoopHashTableSource.java @@ -0,0 +1,199 @@ +/* + * Copyright 2021 Google Inc. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package com.google.cloud.bigtable.beam.validation; + +import static com.google.cloud.bigtable.beam.validation.SyncTableUtils.immutableBytesToString; + +import com.google.cloud.bigtable.beam.validation.HadoopHashTableSource.RangeHash; +import com.google.common.base.Objects; +import com.google.common.base.Preconditions; +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; +import org.apache.beam.sdk.coders.Coder; +import org.apache.beam.sdk.coders.KvCoder; +import org.apache.beam.sdk.coders.ListCoder; +import org.apache.beam.sdk.coders.StringUtf8Coder; +import org.apache.beam.sdk.io.BoundedSource; +import org.apache.beam.sdk.options.PipelineOptions; +import org.apache.beam.sdk.values.KV; +import org.apache.hadoop.hbase.util.Bytes; + +/** + * Buffers the RangeHashes generated by {@link HadoopHashTableSource}. This is an optimization that + * allows {@link ComputeAndValidateHashFromBigtableDoFn} to issue fewer ReadRow APIs with larger row + * ranges. + * + *

Hadoop HashTable output is sorted by row-key and contains a row-range and hash. Beam + * PCollections do not guarantee any ordering. To fetch a batch of ranges in 1 ReadRows operation, + * this source buffers them and outputs a List guaranteeing the sorted order of ranges. + * + *

Emits a batch of sorted RangeHashes keyed by the start key of the first range. + */ +class BufferedHadoopHashTableSource extends BoundedSource>> { + + private static final long serialVersionUID = 39842743L; + + private static final int DEFAULT_BATCH_SIZE = 50; + private static final Coder>> CODER = + KvCoder.of(StringUtf8Coder.of(), ListCoder.of(RangeHashCoder.of())); + + // Max number of RangeHashes to buffer. + private final int maxBufferSize; + private final HadoopHashTableSource hashTableSource; + + /** Wraps {@code source}, emitting batches of at most DEFAULT_BATCH_SIZE RangeHashes. */ + public BufferedHadoopHashTableSource(HadoopHashTableSource source) { + this(source, DEFAULT_BATCH_SIZE); + } + + /** + * Wraps {@code hashTableSource}, emitting batches of at most {@code maxBufferSize} RangeHashes. + * + * @throws IllegalArgumentException if {@code maxBufferSize} is not positive. + */ + public BufferedHadoopHashTableSource(HadoopHashTableSource hashTableSource, int maxBufferSize) { + // A non-positive buffer size would make the reader emit only the first RangeHash and silently + // drop every other one, because advance() could never buffer anything. + Preconditions.checkArgument( + maxBufferSize > 0, "maxBufferSize must be positive, got %s", maxBufferSize); + this.hashTableSource = hashTableSource; + this.maxBufferSize = maxBufferSize; + } + + /** Splits exactly like the wrapped HadoopHashTableSource and wraps every split in a buffered source. */ + @Override + public List>>> split( + long desiredBundleSizeBytes, PipelineOptions options) throws IOException { + + @SuppressWarnings("unchecked") + List splitHashTableSources = + (List) hashTableSource.split(desiredBundleSizeBytes, options); + + List splitSources = + new ArrayList<>(splitHashTableSources.size()); + // Keep the splits same as HashTableSource. + for (HadoopHashTableSource splitHashTableSource : splitHashTableSources) { + // Propagate maxBufferSize so that split sources keep the configured batch size instead of + // silently falling back to DEFAULT_BATCH_SIZE. + splitSources.add(new BufferedHadoopHashTableSource(splitHashTableSource, maxBufferSize)); + } + return splitSources; + } + + @Override + public Coder>> getOutputCoder() { + return CODER; + } + + @Override + public long getEstimatedSizeBytes(PipelineOptions options) throws Exception { + // Buffering does not change the amount of data, delegate to the wrapped source. 
+ return hashTableSource.getEstimatedSizeBytes(options); + } + + @Override + public BoundedReader>> createReader(PipelineOptions options) + throws IOException { + return new BufferedHashBasedReader(this, hashTableSource.createReader(options)); + } + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + if (!(o instanceof BufferedHadoopHashTableSource)) { + return false; + } + BufferedHadoopHashTableSource that = (BufferedHadoopHashTableSource) o; + return maxBufferSize == that.maxBufferSize + && Objects.equal(hashTableSource, that.hashTableSource); + } + + @Override + public int hashCode() { + return Objects.hashCode(maxBufferSize, hashTableSource); + } + + @Override + public String toString() { + return "BufferedHadoopHashTableSource [" + + immutableBytesToString(hashTableSource.startRowInclusive) + + ", " + + immutableBytesToString(hashTableSource.stopRowExclusive) + + "), maxBufferSize=" + + maxBufferSize; + } + + /** Reader that drains the wrapped RangeHash reader in batches of at most maxBufferSize items. */ + private static class BufferedHashBasedReader extends BoundedReader>> { + + private final BoundedReader hashReader; + private final BufferedHadoopHashTableSource source; + + private List buffer; + + public BufferedHashBasedReader( + BufferedHadoopHashTableSource source, BoundedReader hashReader) { + this.source = source; + this.hashReader = hashReader; + this.buffer = new ArrayList<>(source.maxBufferSize); + } + + @Override + public boolean start() throws IOException { + if (!hashReader.start()) { + // HashReader does not have any hashes, return empty reader. + return false; + } + // Start returned true, consume the current RangeHash. + buffer.add(hashReader.getCurrent()); + bufferRangeHashes(); + // Buffer is not empty, return true to consume the current buffer. + return true; + } + + // Reads from hashReader and buffers the RangeHashes. + // Returns true if any RangeHashes were read from hashReader. 
+ private boolean bufferRangeHashes() throws IOException { + boolean readRangeHashes = false; + while (buffer.size() < source.maxBufferSize && hashReader.advance()) { + readRangeHashes = true; + buffer.add(hashReader.getCurrent()); + } + return readRangeHashes; + } + + @Override + public boolean advance() throws IOException { + // Reset the buffer for next batch. A new list is allocated (instead of clear()) because the + // previous list was handed out by getCurrent(). + buffer = new ArrayList<>(source.maxBufferSize); + + return bufferRangeHashes(); + } + + @Override + public KV> getCurrent() { + // getCurrent only gets called when buffer is not empty. + Preconditions.checkState( + !buffer.isEmpty(), "getCurrent() should only be called when start/advance return true."); + // GroupBy key is a string and not ImmutableBytesWritable because the WritableCoder is not + // deterministic. The outputted PCollection is grouped by the K and needs a deterministic + // coder. Having a String K leads to an unfortunate double encoding, ImmutableBytesWritable-> + // HEX string -> UTF8 encoded string. The number of batches are significantly smaller than + // data fetched from Bigtable and should not have meaningful impact on the job performance. + return KV.of(Bytes.toStringBinary(buffer.get(0).startInclusive.copyBytes()), buffer); + } + + @Override + public void close() throws IOException { + hashReader.close(); + } + + @Override + public BoundedSource>> getCurrentSource() { + return source; + } + } +} diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/ComputeAndValidateHashFromBigtableDoFn.java b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/ComputeAndValidateHashFromBigtableDoFn.java new file mode 100644 index 0000000000..a75833b022 --- /dev/null +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/ComputeAndValidateHashFromBigtableDoFn.java @@ -0,0 +1,217 @@ +/* + * Copyright 2021 Google Inc. 
All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.google.cloud.bigtable.beam.validation; + +import static com.google.cloud.bigtable.beam.validation.SyncTableUtils.immutableBytesToString; + +import com.google.bigtable.repackaged.com.google.common.base.Preconditions; +import com.google.bigtable.repackaged.com.google.common.collect.Lists; +import com.google.cloud.bigtable.beam.AbstractCloudBigtableTableDoFn; +import com.google.cloud.bigtable.beam.CloudBigtableConfiguration; +import com.google.cloud.bigtable.beam.TemplateUtils; +import com.google.cloud.bigtable.beam.validation.HadoopHashTableSource.RangeHash; +import com.google.cloud.bigtable.beam.validation.SyncTableJob.SyncTableOptions; +import com.google.common.annotations.VisibleForTesting; +import java.io.IOException; +import java.util.Iterator; +import java.util.List; +import org.apache.beam.sdk.metrics.Counter; +import org.apache.beam.sdk.metrics.Metrics; +import org.apache.beam.sdk.options.ValueProvider; +import org.apache.beam.sdk.transforms.DoFn; +import org.apache.beam.sdk.values.KV; +import org.apache.hadoop.hbase.HConstants; +import org.apache.hadoop.hbase.TableName; +import org.apache.hadoop.hbase.client.Result; +import org.apache.hadoop.hbase.client.ResultScanner; +import org.apache.hadoop.hbase.client.Scan; +import org.apache.hadoop.hbase.client.Table; +import org.apache.hadoop.hbase.io.ImmutableBytesWritable; +import 
org.apache.hadoop.hbase.mapreduce.BigtableTableHashAccessor.BigtableResultHasher; + +/** + * A {@link DoFn} that takes a row range and hash from HBase and validates the hash from rows read + * from Cloud Bigtable. + */ +class ComputeAndValidateHashFromBigtableDoFn + extends AbstractCloudBigtableTableDoFn>>, RangeHash> { + + private static final long serialVersionUID = 2349094L; + private final ValueProvider tableName; + private final ValueProvider projectId; + private final ValueProvider sourceHashDir; + + private final TableHashWrapperFactory tableHashWrapperFactory; + + // Counter for reporting matching and mismatching ranges. Names are similar to HBase sync-table + // job. + private final Counter matches = Metrics.counter("cbt-dataflow-validate", "ranges_matched"); + private final Counter mismatches = Metrics.counter("cbt-dataflow-validate", "ranges_not_matched"); + + /** Creates the DoFn from the sync-table pipeline options. */ + public ComputeAndValidateHashFromBigtableDoFn(SyncTableOptions options) { + super(TemplateUtils.BuildSyncTableConfig(options)); + this.tableName = options.getBigtableTableId(); + // Create a local copy of ValueProviders, PipelineOptions are not serializable. + projectId = options.getBigtableProject(); + sourceHashDir = options.getHashTableOutputDir(); + tableHashWrapperFactory = new TableHashWrapperFactory(); + } + + /** Test-only constructor that allows injecting the configuration and the TableHashWrapperFactory. */ + @VisibleForTesting + ComputeAndValidateHashFromBigtableDoFn( + CloudBigtableConfiguration config, + ValueProvider tableName, + ValueProvider projectId, + ValueProvider sourceHashDir, + TableHashWrapperFactory factory) { + super(config); + this.tableName = tableName; + this.tableHashWrapperFactory = factory; + // Each parameter is stored in the field of the same name; swapping them would hand the + // project id to getTableHash() as the hash directory and vice versa. + this.projectId = projectId; + this.sourceHashDir = sourceHashDir; + } + + /** + * Scans Cloud Bigtable once over [start of the first range, stop of the last range), hashes the + * rows of every buffered range and outputs each RangeHash whose hash does not match. Matching and + * mismatching ranges are also counted in the metrics. + */ + @ProcessElement + public void processElement(ProcessContext context) throws Exception { + List> wrapperdRangeHashes = Lists.newArrayList(context.element().getValue()); + // BufferedHadoopHashTableSource generates only 1 item per groupby key, key is startKey for the + // Sorted ranges. 
+ Preconditions.checkState( + wrapperdRangeHashes.size() == 1, "Can not have muiple entries for a key"); + List rangeHashes = wrapperdRangeHashes.get(0); + Preconditions.checkState(!rangeHashes.isEmpty(), "Can not have empty ranges in DO_FN"); + + ImmutableBytesWritable rangeStartInclusive = rangeHashes.get(0).startInclusive; + ImmutableBytesWritable rangeEndExclusive = + rangeHashes.get(rangeHashes.size() - 1).stopExclusive; + + BigtableResultHasher resultHasher = new BigtableResultHasher(); + resultHasher.startBatch(rangeStartInclusive); + + // Since all the row-ranges are sorted in HashTable's data files, 1 big scan can be used + // to read all the row ranges. Parallelism is achieved by splitting the HashTable's data + // files into smaller bundle of row-ranges in GroupBy. + ResultScanner scanner = + createBigtableScan(rangeStartInclusive.copyBytes(), rangeEndExclusive.copyBytes()); + + Iterator rangeHashIterator = rangeHashes.iterator(); + long numRows = 0; + + RangeHash currentRangeHash = rangeHashIterator.next(); + + // Process each row and validate hashes. The scanner is closed in the finally block so the + // scan is released even when validation throws mid-scan. + try { + for (Result result : scanner) { + numRows++; + if (numRows % 10_000 == 0) { + // Heartbeat in logs in case a large scan gets hung. + DOFN_LOG.debug("Processed " + numRows + " rows "); + } + + ImmutableBytesWritable rowKey = new ImmutableBytesWritable(result.getRow()); + + // Check if the rowKey belongs to current range, if not keep iterating through the + // rangeHashes until rowKey's range is found. + while (!isWithinUpperBound(currentRangeHash.stopExclusive, rowKey)) { + validateBatchHash(context, resultHasher, currentRangeHash); + // THIS SHOULD NEVER HAPPEN. Bigtable is being scanned till the last + // RangeHash.endKeyExclusive(), so bigtable's result should not outlast the + // rangeHashes. + // The values are passed as template arguments; concatenating them onto the template would + // leave the %s placeholders unfilled. + Preconditions.checkState( + rangeHashIterator.hasNext(), + "Buffer reached to end while scan is still active at row : %s. " + + "Affected Range: [%s, %s).", 
+ immutableBytesToString(result.getRow()), + immutableBytesToString(rangeStartInclusive), + immutableBytesToString(rangeEndExclusive)); + currentRangeHash = rangeHashIterator.next(); + } + + // Always Hash the current row. + resultHasher.hashResult(result); + } + } finally { + scanner.close(); + } + + // Bigtable scan is finished at this point and rangeHashes may contain additional row ranges. + // Last range will always be unverified as the range end is exclusive and + // currentRow > rangeEndExclusive will never be true. Verify the last range. + validateBatchHash(context, resultHasher, currentRangeHash); + + // If there are remaining ranges in the rangeHashes they all need to reported as mismatched as + // there is nothing in Cloud Bigtable for those row ranges. + while (rangeHashIterator.hasNext()) { + currentRangeHash = rangeHashIterator.next(); + reportMismatch(context, currentRangeHash); + } + + DOFN_LOG.debug( + "Finishing context by outputting {} keys in range [{}, {}).", + rangeHashes.size(), + immutableBytesToString(rangeStartInclusive), + immutableBytesToString(rangeEndExclusive)); + } + + /** + * Opens a scanner that uses the Scan obtained from the TableHash, bounded to [startKeyInclusive, + * stopKeyExclusive). An empty start or stop key leaves that bound as configured by the TableHash + * scan. The caller is responsible for closing the returned scanner. + */ + private ResultScanner createBigtableScan(byte[] startKeyInclusive, byte[] stopKeyExclusive) + throws IOException { + Table table = getConnection().getTable(TableName.valueOf(tableName.get())); + // Get the scan from TableHash, HashTable can be run to hash a small part of data (selected + // column families, timestamp range, maxVersions etc), this scan allows us to fetch the same + // data from Cloud Bigtable to match. + TableHashWrapper tableHash = + tableHashWrapperFactory.getTableHash(projectId.get(), sourceHashDir.get()); + Scan scan = tableHash.getScan(); + // Set the workitem boundaries on the scan. 
+ if (startKeyInclusive.length > 0) { + scan.withStartRow(startKeyInclusive, true); + } + if (stopKeyExclusive.length > 0) { + scan.withStopRow(stopKeyExclusive, false); + } + + return table.getScanner(scan); + } + + /** + * Determines if row < stopExclusive, i.e. the row is below the exclusive upper bound of a row + * range [start, stopExclusive). Empty stopExclusive represents a range with no upper bound, so + * every row is within it. + */ + private static boolean isWithinUpperBound( + ImmutableBytesWritable stopExclusive, ImmutableBytesWritable row) { + return stopExclusive.equals(HConstants.EMPTY_END_ROW) || row.compareTo(stopExclusive) < 0; + } + + /** + * Finishes the current hash batch, compares it with the expected hash of currentRangeHash, + * records a match or reports a mismatch, and starts the next batch at + * currentRangeHash.stopExclusive. + */ + private void validateBatchHash( + ProcessContext context, BigtableResultHasher resultHasher, RangeHash currentRangeHash) { + // The batch is always started, so its safe to finish the batch. If there were no rows, we will + // get a hash for empty batch. + resultHasher.finishBatch(); + if (!resultHasher.getBatchHash().equals(currentRangeHash.hash)) { + reportMismatch(context, currentRangeHash); + } else { + matches.inc(); + } + // Start a new batch + resultHasher.startBatch(currentRangeHash.stopExclusive); + } + + /** Increments the mismatch counter, logs the range and outputs it from the DoFn. */ + private void reportMismatch(ProcessContext context, RangeHash currentRangeHash) { + mismatches.inc(); + DOFN_LOG.info( + "MISMATCH ON RANGE [{}, {}).", + immutableBytesToString(currentRangeHash.startInclusive), + immutableBytesToString(currentRangeHash.stopExclusive)); + context.output(currentRangeHash); + } +} diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/HadoopHashTableSource.java b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/HadoopHashTableSource.java new file mode 100644 index 0000000000..f6ecf21e24 --- /dev/null +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/HadoopHashTableSource.java @@ -0,0 +1,440 @@ +/* + * Copyright 2021 Google Inc. All Rights Reserved. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.google.cloud.bigtable.beam.validation; + +import static com.google.cloud.bigtable.beam.validation.SyncTableUtils.immutableBytesToString; + +import com.google.bigtable.repackaged.com.google.api.core.InternalApi; +import com.google.bigtable.repackaged.com.google.common.annotations.VisibleForTesting; +import com.google.cloud.bigtable.beam.validation.HadoopHashTableSource.RangeHash; +import com.google.cloud.bigtable.beam.validation.TableHashWrapper.TableHashReader; +import com.google.common.base.Objects; +import com.google.common.base.Preconditions; +import com.google.common.collect.ImmutableList; +import java.io.IOException; +import java.io.ObjectInputStream; +import java.io.ObjectOutputStream; +import java.io.Serializable; +import java.util.ArrayList; +import java.util.List; +import javax.annotation.Nullable; +import org.apache.beam.sdk.coders.Coder; +import org.apache.beam.sdk.coders.DefaultCoder; +import org.apache.beam.sdk.io.BoundedSource; +import org.apache.beam.sdk.options.PipelineOptions; +import org.apache.beam.sdk.options.ValueProvider; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.hbase.HConstants; +import org.apache.hadoop.hbase.io.ImmutableBytesWritable; + +/** + * A beam source to read output of Hadoop HashTable job. The source creates 1 workitem per HashTable + * data file and emits a row-range/hash pair. 
+ */ +@InternalApi +public class HadoopHashTableSource extends BoundedSource implements Serializable { + + private static final long serialVersionUID = 2383724L; + + private static final Coder CODER = RangeHashCoder.of(); + + /** + * A simple POJO encapsulating a row range and the corresponding hash generated by HashTable job. + * TODO Evaluate if we can use AutoValue for this class. + */ + @DefaultCoder(RangeHashCoder.class) + public static class RangeHash { + + public final ImmutableBytesWritable startInclusive; + public final ImmutableBytesWritable stopExclusive; + public final ImmutableBytesWritable hash; + + private RangeHash( + ImmutableBytesWritable startInclusive, + ImmutableBytesWritable stopExclusive, + ImmutableBytesWritable hash) { + this.startInclusive = startInclusive; + this.stopExclusive = stopExclusive; + this.hash = hash; + } + + /** Creates a RangeHash; all three arguments must be non-null. */ + static RangeHash of( + ImmutableBytesWritable startInclusive, + ImmutableBytesWritable stopExclusive, + ImmutableBytesWritable hash) { + Preconditions.checkNotNull(startInclusive); + Preconditions.checkNotNull(stopExclusive); + Preconditions.checkNotNull(hash); + return new RangeHash(startInclusive, stopExclusive, hash); + } + + @Override + public String toString() { + return String.format( + "RangeHash{ range = [ %s, %s), hash: %s }", + immutableBytesToString(startInclusive), + immutableBytesToString(stopExclusive), + immutableBytesToString(hash)); + } + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + if (!(o instanceof RangeHash)) { + return false; + } + RangeHash rangeHash = (RangeHash) o; + return Objects.equal(startInclusive, rangeHash.startInclusive) + && Objects.equal(stopExclusive, rangeHash.stopExclusive) + && Objects.equal(hash, rangeHash.hash); + } + + @Override + public int hashCode() { + return Objects.hashCode(startInclusive, stopExclusive, hash); + } + } + + public static final Log LOG = LogFactory.getLog(HadoopHashTableSource.class); + + private final ValueProvider 
projectId; + + // Path to the output of HashTable job. Usually in GCS. + private final ValueProvider sourceHashDir; + + // Row range owned by this source. + // The Start and Stop row are serialized in a custom way. + @VisibleForTesting @Nullable transient ImmutableBytesWritable startRowInclusive; + + @VisibleForTesting @Nullable transient ImmutableBytesWritable stopRowExclusive; + + private final TableHashWrapperFactory tableHashWrapperFactory; + + /** + * Creates a HadoopHashTableSource that reads HashTable data from hashTableOutputDir in GCS bucket + * in project $(projectId). + */ + public HadoopHashTableSource( + ValueProvider projectId, ValueProvider sourceHashDir) { + this(projectId, sourceHashDir, /*startRowInclusive*/ null, /*stopRowExclusive*/ null); + } + + /** + * Constructor to initialize a HadoopHashTableSource for a given row-range. Used for creating + * split sources. + */ + @VisibleForTesting + HadoopHashTableSource( + ValueProvider projectId, + ValueProvider sourceHashDir, + @Nullable ImmutableBytesWritable startRowInclusive, + @Nullable ImmutableBytesWritable stopRowExclusive) { + this( + projectId, + sourceHashDir, + startRowInclusive, + stopRowExclusive, + new TableHashWrapperFactory()); + } + + /** Same as the row-range constructor, with an injectable TableHashWrapperFactory. */ + @VisibleForTesting + HadoopHashTableSource( + ValueProvider projectId, + ValueProvider hadoopHashTableOutputDir, + @Nullable ImmutableBytesWritable startRowInclusive, + @Nullable ImmutableBytesWritable stopRowExclusive, + TableHashWrapperFactory tableHashWrapperFactory) { + this.projectId = projectId; + this.sourceHashDir = hadoopHashTableOutputDir; + // startRow and stopRow will be null when the template is initialized. startRow and stopRow are + // read from the hashTableOutputDir, which is only available at pipeline runtime. 
+ this.startRowInclusive = startRowInclusive; + this.stopRowExclusive = stopRowExclusive; + this.tableHashWrapperFactory = tableHashWrapperFactory; + } + + /** + * Returns one source per HashTable data file: numPartitions + 1 sources, or a single source + * covering the HashTable start/stop row when there are no partitions. + * + * NOTE(review): the ranges are derived from the TableHash start/stop rows and partitions only; + * this source's own startRowInclusive/stopRowExclusive are not consulted. Confirm split() is + * never invoked on an already-split source. + */ + @Override + public List> split( + long desiredBundleSizeBytes, PipelineOptions options) throws IOException { + // This method relies on the partitioning done by HBase-HashTable job. There is a possibility + // of stragglers. SyncTable handles it by using a group by and further splitting workitems. + TableHashWrapper hash = + tableHashWrapperFactory.getTableHash(projectId.get(), sourceHashDir.get()); + + ImmutableList partitions = hash.getPartitions(); + int numPartitions = partitions.size(); + + List splitSources = new ArrayList<>(numPartitions + 1); + if (numPartitions == 0) { + // There are 0 partitions and 1 hashfile, return single source with full key range. + splitSources.add( + new HadoopHashTableSource( + projectId, + sourceHashDir, + hash.getStartRow(), + hash.getStopRow(), + tableHashWrapperFactory)); + return splitSources; + } + + // Use the HashTable start key. The value is HConstants.EMPTY_START_ROW for full table scan. + ImmutableBytesWritable nextStartRow = hash.getStartRow(); + ImmutableBytesWritable stopRow = hash.getStopRow(); + + // The output of HashTable is organized as partition file and a set of datafiles. + // Partition file contains a list of partitions, these partitions split the key-range of a table + // into roughly equal row-ranges and hashes for these row-ranges are stored in a single + // datafile. + // + // There are always numPartitions +1 data files. Datafile(i) covers hashes for [partition{i-1}, + // partition{i}). + // So a partition file containing entries [b,f] for a table with row range [a,z] will have 3 + // data files containing hashes. 
+ // file0 will contain [a(nextStartRow), b), file1 will contain [b,f), and file2 will contain + // [f,z(stopRow)) + for (int i = 0; i < numPartitions; i++) { + // TODO make a utility function that generates [start, end) format from start/end. + LOG.debug( + "Adding: [" + + immutableBytesToString(nextStartRow.get()) + + ", " + + immutableBytesToString(partitions.get(i).get()) + + ")"); + splitSources.add( + new HadoopHashTableSource( + projectId, sourceHashDir, nextStartRow, partitions.get(i), tableHashWrapperFactory)); + nextStartRow = partitions.get(i); + } + // Log the last range for [lastPartition, stopRow). + LOG.debug( + "Adding: [" + + immutableBytesToString(nextStartRow.get()) + + ", " + + immutableBytesToString(stopRow.get()) + + ")"); + // Add the last range for [lastPartition, stopRow). + splitSources.add( + new HadoopHashTableSource( + projectId, sourceHashDir, nextStartRow, stopRow, tableHashWrapperFactory)); + LOG.info("Returning " + splitSources.size() + " sources from " + numPartitions + " partitions"); + return splitSources; + } + + @Override + public Coder getOutputCoder() { + return CODER; + } + + @Override + public long getEstimatedSizeBytes(PipelineOptions options) throws Exception { + // HashTable data files don't expose a method to estimate size or lineCount. + return 0; + } + + /** + * Creates a reader positioned at startRowInclusive. For an un-split source (null start or stop + * row) the range is first populated from the TableHash, which mutates this source. + */ + @Override + public BoundedReader createReader(PipelineOptions options) throws IOException { + TableHashWrapper hash = + tableHashWrapperFactory.getTableHash(projectId.get(), sourceHashDir.get()); + + // The row range for an un-split source is determined from the output of HashTable job. + // HashTableOutputDir is a runtime parameter and hence not available at construction time, so + // populate the start and stop here. 
+ if (startRowInclusive == null || stopRowExclusive == null) { + startRowInclusive = hash.getStartRow(); + stopRowExclusive = hash.getStopRow(); + } + + return new HashBasedReader( + this, + startRowInclusive, + stopRowExclusive, + hash.newReader( + SyncTableUtils.createConfiguration(this.projectId.get(), this.sourceHashDir.get()), + startRowInclusive)); + } + + /** Equality covers project, hash dir and row range; tableHashWrapperFactory is not compared. */ + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + if (!(o instanceof HadoopHashTableSource)) { + return false; + } + HadoopHashTableSource that = (HadoopHashTableSource) o; + return Objects.equal(projectId, that.projectId) + && Objects.equal(sourceHashDir, that.sourceHashDir) + && Objects.equal(startRowInclusive, that.startRowInclusive) + && Objects.equal(stopRowExclusive, that.stopRowExclusive); + } + + @Override + public int hashCode() { + return Objects.hashCode(projectId, sourceHashDir, startRowInclusive, stopRowExclusive); + } + + @Override + public String toString() { + return "HadoopHashTableSource [" + + immutableBytesToString(startRowInclusive) + + ", " + + immutableBytesToString(stopRowExclusive) + + ')'; + } + + /** Custom serialization: the transient start/stop rows are written as presence flag + bytes. */ + private void writeObject(ObjectOutputStream s) throws IOException { + s.defaultWriteObject(); + // Start and Stop can be null, write a boolean to indicate if start/stop is expected. + if (startRowInclusive == null) { + s.writeBoolean(false); + } else { + s.writeBoolean(true); + s.writeObject(startRowInclusive.copyBytes()); + } + + if (stopRowExclusive == null) { + s.writeBoolean(false); + } else { + s.writeBoolean(true); + s.writeObject(stopRowExclusive.copyBytes()); + } + } + + /** Mirror of writeObject: reads the presence flag and, when set, the bytes of each row. */ + private void readObject(ObjectInputStream s) throws IOException, ClassNotFoundException { + s.defaultReadObject(); + // start/stop can be null, they are preceded by a boolean indicating their presence. 
+ if (s.readBoolean() == true) { + startRowInclusive = new ImmutableBytesWritable((byte[]) s.readObject()); + } + if (s.readBoolean() == true) { + stopRowExclusive = new ImmutableBytesWritable((byte[]) s.readObject()); + } + } + + /** + * Reads key/hash entries from a TableHashReader and emits them as RangeHashes. Each emitted range + * starts at a key read from the reader and ends at the next key read, or at stopRowExclusive + * when the reader has nothing left. + */ + @VisibleForTesting + static class HashBasedReader extends BoundedReader { + + private final HadoopHashTableSource source; + private final TableHashReader reader; + + @VisibleForTesting final ImmutableBytesWritable startRowInclusive; + @VisibleForTesting final ImmutableBytesWritable stopRowExclusive; + + // Flag indicating that this workitem is finished. + private boolean isDone = false; + private ImmutableBytesWritable currentRangeStartKey; + // Hash for the current range. + private ImmutableBytesWritable currentHash; + private RangeHash currentRangeHash; + + public HashBasedReader( + HadoopHashTableSource source, + ImmutableBytesWritable startRowInclusive, + ImmutableBytesWritable stopRowExclusive, + TableHashReader reader) { + this.source = source; + this.startRowInclusive = startRowInclusive; + this.stopRowExclusive = stopRowExclusive; + this.reader = reader; + } + + @Override + public boolean start() throws IOException { + LOG.debug( + "Starting a new reader at key range [" + + immutableBytesToString(startRowInclusive) + + " ," + + immutableBytesToString(stopRowExclusive) + + ")."); + + if (readNextKey()) { + // Dataflow calls start, followed by getCurrent. HashBased reader needs to read on TableHash + // twice to return a RangeHash since it specifies both range-start and range-end. + advance(); + return true; + } + + isDone = true; + return false; + } + + @Override + public boolean advance() throws IOException { + if (isDone) { + LOG.debug("Ending workitem at key " + immutableBytesToString(currentRangeStartKey) + " ."); + return false; + } + + // Capture the start key and hash of the range before readNextKey() overwrites them. + ImmutableBytesWritable startKey = this.currentRangeStartKey; + ImmutableBytesWritable hash = this.currentHash; + + // if there is nothing to read, we are done. 
readNextKey advances the currentRangeStartKey. + isDone = !readNextKey(); + currentRangeHash = RangeHash.of(startKey, currentRangeStartKey, hash); + + return true; + } + + // Returns true if a key can be read for this workitem. + private boolean readNextKey() throws IOException { + if (reader.next()) { + currentRangeStartKey = reader.getCurrentKey(); + if ( // StopRow is not set, everything is in bounds. + (stopRowExclusive.equals(HConstants.EMPTY_END_ROW) + || currentRangeStartKey.compareTo(stopRowExclusive) < 0)) { // currentKey < stopKey + // There is a key to read and the key is within the bounds of this workitem. Return true. + currentHash = reader.getCurrentHash(); + return true; + } else { + // There is a key to read but its outside of the bounds of this workitem. + currentHash = null; + return false; + } + } + + // Nothing left to read for this workitem. Next range would have started from + // stopRowExclusive. + currentRangeStartKey = stopRowExclusive; + currentHash = null; + return false; + } + + @Override + public RangeHash getCurrent() { + return currentRangeHash; + } + + @Override + public void close() throws IOException { + LOG.info( + "Finishing a reader for key range [" + + immutableBytesToString(startRowInclusive) + + " ," + + immutableBytesToString(stopRowExclusive) + + "). Ending at " + + immutableBytesToString(currentRangeStartKey)); + reader.close(); + } + + @Override + public BoundedSource getCurrentSource() { + return source; + } + } +} diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/RangeHashCoder.java b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/RangeHashCoder.java new file mode 100644 index 0000000000..d6341a08f2 --- /dev/null +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/RangeHashCoder.java @@ -0,0 +1,105 @@ +/* + * Copyright 2021 Google Inc.
All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.google.cloud.bigtable.beam.validation; + +import com.google.cloud.bigtable.beam.validation.HadoopHashTableSource.RangeHash; +import java.io.DataInputStream; +import java.io.DataOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.InvalidObjectException; +import java.io.OutputStream; +import java.util.Collections; +import java.util.List; +import org.apache.beam.sdk.coders.Coder; +import org.apache.beam.sdk.coders.CoderException; +import org.apache.hadoop.hbase.io.ImmutableBytesWritable; + +/** Coder used by beam to encode/decode @{@link RangeHash} objects. */ +public class RangeHashCoder extends Coder { + + public static Coder of() { + return new RangeHashCoder(); + } + + @Override + public void encode(RangeHash value, OutputStream outStream) throws IOException { + if (value == null) { + throw new CoderException("Can not encode null objects."); + } + DataOutputStream dataOutputStream = new DataOutputStream(outStream); + // RangeHash fields can never be null. 
+ value.startInclusive.write(dataOutputStream); + value.stopExclusive.write(dataOutputStream); + value.hash.write(dataOutputStream); + } + + @Override + public RangeHash decode(InputStream inStream) throws IOException { + DataInputStream dataInputStream = new DataInputStream(inStream); + + ImmutableBytesWritable startInclusive = new ImmutableBytesWritable(); + startInclusive.readFields(dataInputStream); + + ImmutableBytesWritable stopExclusive = new ImmutableBytesWritable(); + stopExclusive.readFields(dataInputStream); + + ImmutableBytesWritable hash = new ImmutableBytesWritable(); + hash.readFields(dataInputStream); + + return RangeHash.of(startInclusive, stopExclusive, hash); + } + + @Override + public List> getCoderArguments() { + return Collections.emptyList(); + } + + @Override + public void verifyDeterministic() throws NonDeterministicException { + // This is a deterministic coder as it writes the byte[] in order. + } + + /** + * !!! DO NOT DELETE !!! + * + *

See readObjectNoData method in: + * https://docs.oracle.com/javase/7/docs/platform/serialization/spec/input.html#6053. + * + *

Disable backwards compatibility with previous versions that were serialized. + * + * @throws InvalidObjectException + */ + @SuppressWarnings("unused") + private void readObjectNoData() throws InvalidObjectException { + throw new InvalidObjectException("Hash data required"); + } + + @Override + protected Object clone() throws CloneNotSupportedException { + return super.clone(); + } + + @Override + public boolean equals(Object other) { + return other instanceof RangeHashCoder; + } + + @Override + public int hashCode() { + return RangeHashCoder.class.hashCode(); + } +} diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/SyncTableJob.java b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/SyncTableJob.java new file mode 100644 index 0000000000..56b38fc3cb --- /dev/null +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/SyncTableJob.java @@ -0,0 +1,193 @@ +/* + * Copyright 2021 Google Inc. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package com.google.cloud.bigtable.beam.validation; + +import com.google.bigtable.repackaged.com.google.api.core.InternalExtensionOnly; +import com.google.bigtable.repackaged.com.google.gson.Gson; +import com.google.cloud.bigtable.beam.sequencefiles.Utils; +import com.google.cloud.bigtable.beam.validation.HadoopHashTableSource.RangeHash; +import com.google.common.annotations.VisibleForTesting; +import java.util.List; +import org.apache.beam.sdk.Pipeline; +import org.apache.beam.sdk.PipelineResult; +import org.apache.beam.sdk.extensions.gcp.options.GcpOptions; +import org.apache.beam.sdk.io.Read; +import org.apache.beam.sdk.io.TextIO; +import org.apache.beam.sdk.metrics.MetricQueryResults; +import org.apache.beam.sdk.metrics.MetricResult; +import org.apache.beam.sdk.options.Default; +import org.apache.beam.sdk.options.Description; +import org.apache.beam.sdk.options.PipelineOptionsFactory; +import org.apache.beam.sdk.options.ValueProvider; +import org.apache.beam.sdk.transforms.GroupByKey; +import org.apache.beam.sdk.transforms.MapElements; +import org.apache.beam.sdk.transforms.ParDo; +import org.apache.beam.sdk.transforms.SimpleFunction; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; + +/** + * A job that takes HBase HashTable output and compares the hashes from Cloud Bigtable table. + * + *

Execute the following command to run the job directly: + * + *

+ *   mvn compile exec:java \
+ *      -DmainClass=com.google.cloud.bigtable.beam.validation.SyncTableJob \
+ *      -Dexec.args="--runner=DataflowRunner \
+ *            --project=$PROJECT \
+ *            --bigtableInstanceId=$INSTANCE \
+ *            --bigtableTableId=$TABLE \
+ *            --hashTableOutputDir=$SOURCE_HASH_DIR \
+ *            --outputPrefix=$OUTPUT_PREFIX \
+ *            --stagingLocation=$STAGING_LOC \
+ *            --tempLocation=$TMP_LOC \
+ *            --region=$REGION \
+ *            --workerZone=$WORKER_ZONE"
+ * 
+ * + *

Execute the following command to create the Dataflow template: + * + *

+ * mvn compile exec:java \
+ *   -DmainClass=com.google.cloud.bigtable.beam.validation.SyncTableJob \
+ *   -Dexec.args="--runner=DataflowRunner \
+ *                --project=$PROJECT \
+ *                --stagingLocation=gs://$STAGING_PATH \
+ *                --templateLocation=gs://$TEMPLATE_PATH \
+ *                --wait=false"
+ * 
+ * + *

There are a few ways to run the pipeline using the template. See Dataflow doc for details: + * https://cloud.google.com/dataflow/docs/templates/executing-templates. Optionally, you can upload + * a metadata file that contains information about the runtime parameters that can be used for + * parameter validation purpose and more. A sample metadata file can be found at + * "src/main/resources/SyncTableJob_metadata". + * + *

An example using gcloud command line: + * + *

+ * gcloud beta dataflow jobs run $JOB_NAME \
+ *   --gcs-location gs://$TEMPLATE_PATH \
+ *   --parameters bigtableProject=$PROJECT,bigtableInstanceId=$INSTANCE,bigtableTableId=$TABLE,hashTableOutputDir=gs://$SOURCE_HASH_DIR,outputPrefix=$OUTPUT_PREFIX
+ * 
+ */
+@InternalExtensionOnly
+public class SyncTableJob {
+
+  private static final Log LOG = LogFactory.getLog(SyncTableJob.class);
+
+  /** Runtime options accepted by the sync-table validation pipeline. */
+  public interface SyncTableOptions extends GcpOptions {
+
+    @Description("The Bigtable App Profile id.")
+    ValueProvider<String> getBigtableAppProfileId();
+
+    @SuppressWarnings("unused")
+    void setBigtableAppProfileId(ValueProvider<String> appProfileId);
+
+    @Description("The project that contains the table to validate. Defaults to --project.")
+    @Default.InstanceFactory(Utils.DefaultBigtableProjectFactory.class)
+    ValueProvider<String> getBigtableProject();
+
+    @SuppressWarnings("unused")
+    void setBigtableProject(ValueProvider<String> projectId);
+
+    @Description("The Bigtable instance id that contains the table to validate.")
+    ValueProvider<String> getBigtableInstanceId();
+
+    @SuppressWarnings("unused")
+    void setBigtableInstanceId(ValueProvider<String> instanceId);
+
+    @Description("The Bigtable table id to validate.")
+    ValueProvider<String> getBigtableTableId();
+
+    @SuppressWarnings("unused")
+    void setBigtableTableId(ValueProvider<String> tableId);
+
+    @Description("HBase HashTable job output dir.")
+    ValueProvider<String> getHashTableOutputDir();
+
+    @SuppressWarnings("unused")
+    // TODO: Rename it to sourceHashDir as in HBase sync table job.
+    void setHashTableOutputDir(ValueProvider<String> hashTableOutputDir);
+
+    @Description("File pattern for files containing mismatched row ranges.")
+    ValueProvider<String> getOutputPrefix();
+
+    @SuppressWarnings("unused")
+    void setOutputPrefix(ValueProvider<String> outputPrefix);
+
+    // When creating a template, this flag must be set to false.
+    @Description("Wait for pipeline to finish.")
+    @Default.Boolean(true)
+    boolean getWait();
+
+    @SuppressWarnings("unused")
+    void setWait(boolean wait);
+  }
+
+  /**
+   * Parses the command line into {@link SyncTableOptions}, builds the pipeline and runs it.
+   *
+   * When {@code --wait} is true (the default) this blocks until the pipeline finishes and then
+   * logs every counter reported by the job. When {@code --wait} is false the method returns right
+   * after submission without touching the job metrics.
+   *
+   * @param args command line arguments, parsed with validation enabled.
+   */
+  public static void main(String[] args) {
+    PipelineOptionsFactory.register(SyncTableOptions.class);
+
+    SyncTableOptions opts =
+        PipelineOptionsFactory.fromArgs(args).withValidation().as(SyncTableOptions.class);
+
+    LOG.info("===> Building Pipeline");
+    Pipeline pipeline = buildPipeline(opts);
+
+    LOG.info("===> Running Pipeline");
+    PipelineResult result = pipeline.run();
+
+    if (!opts.getWait()) {
+      // NOTE(review): with --wait=false (required for template creation) there is presumably no
+      // finished job behind the result, so querying its metrics can fail or report incomplete
+      // counters. Skip the counter dump in that case.
+      return;
+    }
+    Utils.waitForPipelineToFinish(result);
+
+    // Log all the counters for number of matches and number of mismatches.
+    MetricQueryResults metrics = result.metrics().allMetrics();
+    for (MetricResult<Long> counter : metrics.getCounters()) {
+      LOG.warn(counter.getName() + ":" + counter.getAttempted());
+    }
+  }
+
+  /**
+   * Assembles the validation pipeline: read the HashTable output, regroup it into work items,
+   * validate each range hash against Bigtable, serialize the resulting ranges to JSON and write
+   * them as text files under the configured output prefix.
+   *
+   * @param opts options describing the Bigtable table, the HashTable output and the sink.
+   * @return the constructed, not yet running, pipeline.
+   */
+  @VisibleForTesting
+  public static Pipeline buildPipeline(SyncTableOptions opts) {
+    Pipeline pipeline = Pipeline.create(Utils.tweakOptions(opts));
+    pipeline
+        .apply(
+            "Read HBase HashTable output",
+            Read.from(
+                new BufferedHadoopHashTableSource(
+                    new HadoopHashTableSource(
+                        opts.getBigtableProject(), opts.getHashTableOutputDir()))))
+        .apply(
+            "group by and create granular workitems",
+            GroupByKey.<String, List<RangeHash>>create())
+        .apply("validate hash", ParDo.of(new ComputeAndValidateHashFromBigtableDoFn(opts)))
+        .apply("Serialize the ranges", MapElements.via(new RangeHashToString()))
+        .apply("Write to file", TextIO.write().to(opts.getOutputPrefix()).withSuffix(".txt"));
+    return pipeline;
+  }
+
+  /** Serializes a {@link RangeHash} to its Gson JSON representation. */
+  static class RangeHashToString extends SimpleFunction<RangeHash, String> {
+    // TODO maybe explore a sequenceFile sink for RangeHash. Hadoop jobs using this output may be
+    // easier to write for sequence file.
+    private static final Gson GSON = new Gson();
+
+    @Override
+    public String apply(RangeHash input) {
+      return GSON.toJson(input);
+    }
+  }
+}
diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/SyncTableUtils.java b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/SyncTableUtils.java
new file mode 100644
index 0000000000..cc92bea6a4
--- /dev/null
+++ b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/SyncTableUtils.java
@@ -0,0 +1,57 @@
+/*
+ * Copyright 2021 Google Inc. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package com.google.cloud.bigtable.beam.validation;
+
+import com.google.bigtable.repackaged.com.google.api.core.InternalApi;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.hbase.HBaseConfiguration;
+import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
+import org.apache.hadoop.hbase.util.Bytes;
+
+/** Utility class for SyncTable job.
*/ +@InternalApi +public class SyncTableUtils { + + private SyncTableUtils() {} + + public static String immutableBytesToString(ImmutableBytesWritable bytes) { + if (bytes == null) { + return ""; + } + return immutableBytesToString(bytes.get()); + } + + public static String immutableBytesToString(byte[] bytes) { + return Bytes.toStringBinary(bytes); + } + + /** + * Creates a HBase configuration for reading HashTable output from GCS bucket located in + * projectId. + * + * @param projectId project containing the GCS bucket holding hashtable output. + * @param sourceHashDir location of hashtable output from HBase. + * @return + */ + public static Configuration createConfiguration(String projectId, String sourceHashDir) { + Configuration conf = HBaseConfiguration.create(); + conf.set("fs.AbstractFileSystem.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS"); + conf.set("fs.gs.project.id", projectId); + conf.set("fs.defaultFS", sourceHashDir); + conf.set("google.cloud.auth.service.account.enable", "true"); + return conf; + } +} diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/TableHashWrapper.java b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/TableHashWrapper.java new file mode 100644 index 0000000000..55200570ed --- /dev/null +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/TableHashWrapper.java @@ -0,0 +1,55 @@ +/* + * Copyright 2021 Google Inc. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.google.cloud.bigtable.beam.validation; + +import com.google.bigtable.repackaged.com.google.api.core.InternalApi; +import com.google.common.collect.ImmutableList; +import java.io.Closeable; +import java.io.IOException; +import java.io.Serializable; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hbase.client.Scan; +import org.apache.hadoop.hbase.io.ImmutableBytesWritable; + +/** + * Wraps HashTable.TableHash object and delegates the calls to it. This class exposes the minimal + * interface required from TableHash. This class is required for mocking purposes in unit tests. 
+ */ +@InternalApi +public interface TableHashWrapper extends Serializable { + + int getNumHashFiles(); + + ImmutableList getPartitions(); + + ImmutableBytesWritable getStartRow(); + + ImmutableBytesWritable getStopRow(); + + Scan getScan(); + + TableHashReader newReader(Configuration conf, ImmutableBytesWritable startRow); + + interface TableHashReader extends Closeable { + boolean next() throws IOException; + + ImmutableBytesWritable getCurrentKey(); + + ImmutableBytesWritable getCurrentHash(); + + void close() throws IOException; + } +} diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/TableHashWrapperFactory.java b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/TableHashWrapperFactory.java new file mode 100644 index 0000000000..a4e3544519 --- /dev/null +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/TableHashWrapperFactory.java @@ -0,0 +1,35 @@ +/* + * Copyright 2021 Google Inc. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.google.cloud.bigtable.beam.validation; + +import static com.google.cloud.bigtable.beam.validation.SyncTableUtils.createConfiguration; + +import com.google.bigtable.repackaged.com.google.api.core.InternalApi; +import java.io.IOException; +import java.io.Serializable; + +/** Factory to create a TableHashWrapper. 
*/ +@InternalApi +public class TableHashWrapperFactory implements Serializable { + + private static final long serialVersionUID = 265433454L; + + public TableHashWrapper getTableHash(String projectId, String sourceHashDir) throws IOException { + return TableHashWrapperImpl.create( + createConfiguration(projectId, sourceHashDir), sourceHashDir); + } +} diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/TableHashWrapperImpl.java b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/TableHashWrapperImpl.java new file mode 100644 index 0000000000..b04bd538a6 --- /dev/null +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/TableHashWrapperImpl.java @@ -0,0 +1,118 @@ +/* + * Copyright 2021 Google Inc. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+package com.google.cloud.bigtable.beam.validation;
+
+import static com.google.cloud.bigtable.beam.validation.SyncTableUtils.immutableBytesToString;
+
+import com.google.common.base.Preconditions;
+import com.google.common.collect.ImmutableList;
+import java.io.IOException;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hbase.client.Scan;
+import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
+import org.apache.hadoop.hbase.mapreduce.BigtableTableHashAccessor;
+import org.apache.hadoop.hbase.mapreduce.HashTable.TableHash;
+import org.apache.hadoop.hbase.mapreduce.HashTable.TableHash.Reader;
+
+/**
+ * {@link TableHashWrapper} that delegates to an HBase {@link TableHash}, reading its
+ * package-private state through {@link BigtableTableHashAccessor}.
+ */
+class TableHashWrapperImpl implements TableHashWrapper {
+
+  /**
+   * Reads the HashTable output located at {@code hashTableOutputDir} and wraps it.
+   *
+   * @param conf Hadoop configuration used to read the HashTable output.
+   * @param hashTableOutputDir directory holding the HashTable job output.
+   * @return a wrapper around the loaded {@link TableHash}.
+   * @throws IOException if {@code TableHash.read} fails.
+   * @throws IllegalArgumentException if the number of hash files is not one more than the number
+   *     of partitions.
+   */
+  static TableHashWrapper create(Configuration conf, String hashTableOutputDir) throws IOException {
+    TableHash tableHash = TableHash.read(conf, new Path(hashTableOutputDir));
+
+    TableHashWrapper tableHashWrapper = new TableHashWrapperImpl(tableHash);
+    // Guava's Preconditions only substitutes %s placeholders; any other specifier is printed
+    // verbatim and the arguments are appended in brackets.
+    Preconditions.checkArgument(
+        tableHashWrapper.getNumHashFiles() == (tableHashWrapper.getPartitions().size() + 1),
+        "Corrupt hashtable output. %s hash files for %s partitions. Expected %s files.",
+        tableHashWrapper.getNumHashFiles(),
+        tableHashWrapper.getPartitions().size(),
+        tableHashWrapper.getPartitions().size() + 1);
+    return tableHashWrapper;
+  }
+
+  private final TableHash hash;
+
+  private TableHashWrapperImpl(TableHash hash) {
+    this.hash = hash;
+  }
+
+  @Override
+  public int getNumHashFiles() {
+    return BigtableTableHashAccessor.getNumHashFiles(hash);
+  }
+
+  @Override
+  public ImmutableList<ImmutableBytesWritable> getPartitions() {
+    return BigtableTableHashAccessor.getPartitions(hash);
+  }
+
+  @Override
+  public ImmutableBytesWritable getStartRow() {
+    return BigtableTableHashAccessor.getStartRow(hash);
+  }
+
+  @Override
+  public ImmutableBytesWritable getStopRow() {
+    return BigtableTableHashAccessor.getStopRow(hash);
+  }
+
+  /** Returns the scan initialized from the wrapped TableHash, rethrowing IOException unchecked. */
+  @Override
+  public Scan getScan() {
+    try {
+      return BigtableTableHashAccessor.getScan(hash);
+    } catch (IOException e) {
+      throw new RuntimeException("Failed to init a scan from TableHash: ", e);
+    }
+  }
+
+  /** Opens a reader positioned at {@code startRow}, rethrowing IOException unchecked. */
+  @Override
+  public TableHashReader newReader(Configuration conf, ImmutableBytesWritable startRow) {
+    try {
+      return TableHashReaderImpl.create(hash.newReader(conf, startRow));
+    } catch (IOException e) {
+      throw new RuntimeException(
+          "Failed to open reader at " + immutableBytesToString(startRow.copyBytes()), e);
+    }
+  }
+
+  /** {@link TableHashReader} that forwards every call to an HBase {@code TableHash.Reader}. */
+  static class TableHashReaderImpl implements TableHashReader {
+
+    private final Reader reader;
+
+    static TableHashReaderImpl create(TableHash.Reader reader) {
+      Preconditions.checkNotNull(reader, "Reader can not be null.");
+      return new TableHashReaderImpl(reader);
+    }
+
+    private TableHashReaderImpl(TableHash.Reader reader) {
+      this.reader = reader;
+    }
+
+    @Override
+    public boolean next() throws IOException {
+      return reader.next();
+    }
+
+    @Override
+    public ImmutableBytesWritable getCurrentKey() {
+      return reader.getCurrentKey();
+    }
+
+    @Override
+    public ImmutableBytesWritable getCurrentHash() {
+      return reader.getCurrentHash();
+    }
+
+    @Override
+    public void close() throws IOException {
+      reader.close();
+    }
+  }
+}
diff --git
a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/org/apache/hadoop/hbase/mapreduce/BigtableTableHashAccessor.java b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/org/apache/hadoop/hbase/mapreduce/BigtableTableHashAccessor.java new file mode 100644 index 0000000000..a7db0add1c --- /dev/null +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/org/apache/hadoop/hbase/mapreduce/BigtableTableHashAccessor.java @@ -0,0 +1,79 @@ +/* + * Copyright 2021 Google Inc. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hbase.mapreduce; + +import com.google.bigtable.repackaged.com.google.api.core.InternalApi; +import com.google.common.collect.ImmutableList; +import java.io.IOException; +import org.apache.hadoop.hbase.client.Result; +import org.apache.hadoop.hbase.client.Scan; +import org.apache.hadoop.hbase.io.ImmutableBytesWritable; +import org.apache.hadoop.hbase.mapreduce.HashTable.ResultHasher; +import org.apache.hadoop.hbase.mapreduce.HashTable.TableHash; + +/** A helper class to access package private fields of HashTable.TableHash. */ +@InternalApi +public class BigtableTableHashAccessor { + + // Restrict object creation. This class should only be used to access state from TableHash. 
+ private BigtableTableHashAccessor() {} + + public static int getNumHashFiles(TableHash hash) { + return hash.numHashFiles; + } + + public static ImmutableList getPartitions(TableHash hash) { + return ImmutableList.copyOf(hash.partitions); + } + + public static ImmutableBytesWritable getStartRow(TableHash hash) { + return new ImmutableBytesWritable(hash.startRow); + } + + public static ImmutableBytesWritable getStopRow(TableHash hash) { + return new ImmutableBytesWritable(hash.stopRow); + } + + public static Scan getScan(TableHash hash) throws IOException { + return hash.initScan(); + } + + // Wrapper to access package private class ResultHasher. Delegates all the calls to underlying + // TableHash.ResultHasher, helps in mocking for unit tests. + public static class BigtableResultHasher { + private final ResultHasher hasher; + + public BigtableResultHasher() { + hasher = new ResultHasher(); + } + + public void startBatch(ImmutableBytesWritable batchStartKey) { + hasher.startBatch(batchStartKey); + } + + public void finishBatch() { + hasher.finishBatch(); + } + + public ImmutableBytesWritable getBatchHash() { + return hasher.getBatchHash(); + } + + public void hashResult(Result result) { + hasher.hashResult(result); + } + } +} diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/data/.hbase-snapshot/test-snapshot/..snapshotinfo.crc b/bigtable-dataflow-parent/bigtable-beam-import/src/test/data/.hbase-snapshot/test-snapshot/..snapshotinfo.crc deleted file mode 100644 index 8fe4533a01..0000000000 Binary files a/bigtable-dataflow-parent/bigtable-beam-import/src/test/data/.hbase-snapshot/test-snapshot/..snapshotinfo.crc and /dev/null differ diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/data/.hbase-snapshot/test-snapshot/.data.manifest.crc b/bigtable-dataflow-parent/bigtable-beam-import/src/test/data/.hbase-snapshot/test-snapshot/.data.manifest.crc deleted file mode 100644 index 1467a17f1f..0000000000 Binary files 
a/bigtable-dataflow-parent/bigtable-beam-import/src/test/data/.hbase-snapshot/test-snapshot/.data.manifest.crc and /dev/null differ diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/data/.hbase-snapshot/test-snapshot/.snapshotinfo b/bigtable-dataflow-parent/bigtable-beam-import/src/test/data/.hbase-snapshot/test-snapshot/.snapshotinfo deleted file mode 100644 index 83e482aac0..0000000000 --- a/bigtable-dataflow-parent/bigtable-beam-import/src/test/data/.hbase-snapshot/test-snapshot/.snapshotinfo +++ /dev/null @@ -1,2 +0,0 @@ - - test-snapshottest�����. ( \ No newline at end of file diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/data/archive/data/default/test/01ef4b8bb8d79f360bf182fedfb1c0e8/cf/.b0f68aca966b48f1b171614e582b1cbb.crc b/bigtable-dataflow-parent/bigtable-beam-import/src/test/data/archive/data/default/test/01ef4b8bb8d79f360bf182fedfb1c0e8/cf/.b0f68aca966b48f1b171614e582b1cbb.crc deleted file mode 100644 index ea5b25e778..0000000000 Binary files a/bigtable-dataflow-parent/bigtable-beam-import/src/test/data/archive/data/default/test/01ef4b8bb8d79f360bf182fedfb1c0e8/cf/.b0f68aca966b48f1b171614e582b1cbb.crc and /dev/null differ diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/data/archive/data/default/test/1a1358ba82be4a98feff54032986bbf2/cf/.8aff180e3a244dcc807e4de8b6fce0a7.crc b/bigtable-dataflow-parent/bigtable-beam-import/src/test/data/archive/data/default/test/1a1358ba82be4a98feff54032986bbf2/cf/.8aff180e3a244dcc807e4de8b6fce0a7.crc deleted file mode 100644 index 51cacdd03b..0000000000 Binary files a/bigtable-dataflow-parent/bigtable-beam-import/src/test/data/archive/data/default/test/1a1358ba82be4a98feff54032986bbf2/cf/.8aff180e3a244dcc807e4de8b6fce0a7.crc and /dev/null differ diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/data/archive/data/default/test/1bf20ce0551df953331936d20dbd18fa/cf/.c2945aa8dac34922913a1f60fedb6154.crc 
b/bigtable-dataflow-parent/bigtable-beam-import/src/test/data/archive/data/default/test/1bf20ce0551df953331936d20dbd18fa/cf/.c2945aa8dac34922913a1f60fedb6154.crc deleted file mode 100644 index 2c4de3ac0e..0000000000 Binary files a/bigtable-dataflow-parent/bigtable-beam-import/src/test/data/archive/data/default/test/1bf20ce0551df953331936d20dbd18fa/cf/.c2945aa8dac34922913a1f60fedb6154.crc and /dev/null differ diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/data/archive/data/default/test/2c25a1cedf575cd08267e0013e45872e/cf/.cda93ca899f3475fb1c0f8989a8f0d18.crc b/bigtable-dataflow-parent/bigtable-beam-import/src/test/data/archive/data/default/test/2c25a1cedf575cd08267e0013e45872e/cf/.cda93ca899f3475fb1c0f8989a8f0d18.crc deleted file mode 100644 index 931ebfb545..0000000000 Binary files a/bigtable-dataflow-parent/bigtable-beam-import/src/test/data/archive/data/default/test/2c25a1cedf575cd08267e0013e45872e/cf/.cda93ca899f3475fb1c0f8989a8f0d18.crc and /dev/null differ diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/data/archive/data/default/test/3264826a5972b18c5a59b2f612678316/cf/.d8b49b374391407ba35d5e0db1c835c9.crc b/bigtable-dataflow-parent/bigtable-beam-import/src/test/data/archive/data/default/test/3264826a5972b18c5a59b2f612678316/cf/.d8b49b374391407ba35d5e0db1c835c9.crc deleted file mode 100644 index 32f450dba4..0000000000 Binary files a/bigtable-dataflow-parent/bigtable-beam-import/src/test/data/archive/data/default/test/3264826a5972b18c5a59b2f612678316/cf/.d8b49b374391407ba35d5e0db1c835c9.crc and /dev/null differ diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/data/archive/data/default/test/3d397f3b97e7fd2358fb5c93060b3a60/cf/.32053565831341128b8d8f5567d48fdc.crc b/bigtable-dataflow-parent/bigtable-beam-import/src/test/data/archive/data/default/test/3d397f3b97e7fd2358fb5c93060b3a60/cf/.32053565831341128b8d8f5567d48fdc.crc deleted file mode 100644 index 80317a1515..0000000000 Binary files 
a/bigtable-dataflow-parent/bigtable-beam-import/src/test/data/archive/data/default/test/3d397f3b97e7fd2358fb5c93060b3a60/cf/.32053565831341128b8d8f5567d48fdc.crc and /dev/null differ diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/data/archive/data/default/test/7466202f701dc0e3af8cc747c9a37ec8/cf/.36798a163ed046b193818e21dd7516b4.crc b/bigtable-dataflow-parent/bigtable-beam-import/src/test/data/archive/data/default/test/7466202f701dc0e3af8cc747c9a37ec8/cf/.36798a163ed046b193818e21dd7516b4.crc deleted file mode 100644 index 00a9d7720d..0000000000 Binary files a/bigtable-dataflow-parent/bigtable-beam-import/src/test/data/archive/data/default/test/7466202f701dc0e3af8cc747c9a37ec8/cf/.36798a163ed046b193818e21dd7516b4.crc and /dev/null differ diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/data/archive/data/default/test/958c660f0e406404ffdfc81110e7eaf9/cf/.65b9c6860f5f4de39d61d1674947b030.crc b/bigtable-dataflow-parent/bigtable-beam-import/src/test/data/archive/data/default/test/958c660f0e406404ffdfc81110e7eaf9/cf/.65b9c6860f5f4de39d61d1674947b030.crc deleted file mode 100644 index 1d7e3d8653..0000000000 Binary files a/bigtable-dataflow-parent/bigtable-beam-import/src/test/data/archive/data/default/test/958c660f0e406404ffdfc81110e7eaf9/cf/.65b9c6860f5f4de39d61d1674947b030.crc and /dev/null differ diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/data/archive/data/default/test/dab1d611586e861818af77de74073d47/cf/.b83044f76ba6474aa829e3bae7fd82d1.crc b/bigtable-dataflow-parent/bigtable-beam-import/src/test/data/archive/data/default/test/dab1d611586e861818af77de74073d47/cf/.b83044f76ba6474aa829e3bae7fd82d1.crc deleted file mode 100644 index ca57c97e2d..0000000000 Binary files a/bigtable-dataflow-parent/bigtable-beam-import/src/test/data/archive/data/default/test/dab1d611586e861818af77de74073d47/cf/.b83044f76ba6474aa829e3bae7fd82d1.crc and /dev/null differ diff --git 
a/bigtable-dataflow-parent/bigtable-beam-import/src/test/generate_test_data.txt b/bigtable-dataflow-parent/bigtable-beam-import/src/test/generate_test_data.txt index 7f8f8fc2db..6e66d3e096 100644 --- a/bigtable-dataflow-parent/bigtable-beam-import/src/test/generate_test_data.txt +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/test/generate_test_data.txt @@ -1,107 +1,133 @@ +// Run from HBase shell. Run `hbase shell` from unix terminal on HBase master. create 'test', 'cf', {SPLITS => ["1", "2", "3", "4", "5", "6", "7", "8", "9"]} -put 'test','1', 'cf:a', 'value1' -put 'test','2', 'cf:a', 'value2' -put 'test','3', 'cf:a', 'value3' -put 'test','4', 'cf:a', 'value4' -put 'test','5', 'cf:a', 'value5' -put 'test','6', 'cf:a', 'value6' -put 'test','7', 'cf:a', 'value7' -put 'test','8', 'cf:a', 'value8' -put 'test','9', 'cf:a', 'value9' -put 'test','10', 'cf:a', 'value10' -put 'test','11', 'cf:a', 'value11' -put 'test','12', 'cf:a', 'value12' -put 'test','13', 'cf:a', 'value13' -put 'test','14', 'cf:a', 'value14' -put 'test','15', 'cf:a', 'value15' -put 'test','16', 'cf:a', 'value16' -put 'test','17', 'cf:a', 'value17' -put 'test','18', 'cf:a', 'value18' -put 'test','19', 'cf:a', 'value19' -put 'test','20', 'cf:a', 'value20' -put 'test','21', 'cf:a', 'value21' -put 'test','22', 'cf:a', 'value22' -put 'test','23', 'cf:a', 'value23' -put 'test','24', 'cf:a', 'value24' -put 'test','25', 'cf:a', 'value25' -put 'test','26', 'cf:a', 'value26' -put 'test','27', 'cf:a', 'value27' -put 'test','28', 'cf:a', 'value28' -put 'test','29', 'cf:a', 'value29' -put 'test','30', 'cf:a', 'value30' -put 'test','31', 'cf:a', 'value31' -put 'test','32', 'cf:a', 'value32' -put 'test','33', 'cf:a', 'value33' -put 'test','34', 'cf:a', 'value34' -put 'test','35', 'cf:a', 'value35' -put 'test','36', 'cf:a', 'value36' -put 'test','37', 'cf:a', 'value37' -put 'test','38', 'cf:a', 'value38' -put 'test','39', 'cf:a', 'value39' -put 'test','40', 'cf:a', 'value40' -put 'test','41', 'cf:a', 
'value41' -put 'test','42', 'cf:a', 'value42' -put 'test','43', 'cf:a', 'value43' -put 'test','44', 'cf:a', 'value44' -put 'test','45', 'cf:a', 'value45' -put 'test','46', 'cf:a', 'value46' -put 'test','47', 'cf:a', 'value47' -put 'test','48', 'cf:a', 'value48' -put 'test','49', 'cf:a', 'value49' -put 'test','50', 'cf:a', 'value50' -put 'test','51', 'cf:a', 'value51' -put 'test','52', 'cf:a', 'value52' -put 'test','53', 'cf:a', 'value53' -put 'test','54', 'cf:a', 'value54' -put 'test','55', 'cf:a', 'value55' -put 'test','56', 'cf:a', 'value56' -put 'test','57', 'cf:a', 'value57' -put 'test','58', 'cf:a', 'value58' -put 'test','59', 'cf:a', 'value59' -put 'test','60', 'cf:a', 'value60' -put 'test','61', 'cf:a', 'value61' -put 'test','62', 'cf:a', 'value62' -put 'test','63', 'cf:a', 'value63' -put 'test','64', 'cf:a', 'value64' -put 'test','65', 'cf:a', 'value65' -put 'test','66', 'cf:a', 'value66' -put 'test','67', 'cf:a', 'value67' -put 'test','68', 'cf:a', 'value68' -put 'test','69', 'cf:a', 'value69' -put 'test','70', 'cf:a', 'value70' -put 'test','71', 'cf:a', 'value71' -put 'test','72', 'cf:a', 'value72' -put 'test','73', 'cf:a', 'value73' -put 'test','74', 'cf:a', 'value74' -put 'test','75', 'cf:a', 'value75' -put 'test','76', 'cf:a', 'value76' -put 'test','77', 'cf:a', 'value77' -put 'test','78', 'cf:a', 'value78' -put 'test','79', 'cf:a', 'value79' -put 'test','80', 'cf:a', 'value80' -put 'test','81', 'cf:a', 'value81' -put 'test','82', 'cf:a', 'value82' -put 'test','83', 'cf:a', 'value83' -put 'test','84', 'cf:a', 'value84' -put 'test','85', 'cf:a', 'value85' -put 'test','86', 'cf:a', 'value86' -put 'test','87', 'cf:a', 'value87' -put 'test','88', 'cf:a', 'value88' -put 'test','89', 'cf:a', 'value89' -put 'test','90', 'cf:a', 'value90' -put 'test','91', 'cf:a', 'value91' -put 'test','92', 'cf:a', 'value92' -put 'test','93', 'cf:a', 'value93' -put 'test','94', 'cf:a', 'value94' -put 'test','95', 'cf:a', 'value95' -put 'test','96', 'cf:a', 'value96' -put 
'test','97', 'cf:a', 'value97' -put 'test','98', 'cf:a', 'value98' -put 'test','99', 'cf:a', 'value99' -put 'test','100', 'cf:a', 'value100' +put 'test','1', 'cf:a', 'value1', 100 +put 'test','2', 'cf:a', 'value2', 100 +put 'test','3', 'cf:a', 'value3', 100 +put 'test','4', 'cf:a', 'value4', 100 +put 'test','5', 'cf:a', 'value5', 100 +put 'test','6', 'cf:a', 'value6', 100 +put 'test','7', 'cf:a', 'value7', 100 +put 'test','8', 'cf:a', 'value8', 100 +put 'test','9', 'cf:a', 'value9', 100 +put 'test','10', 'cf:a', 'value10', 100 +put 'test','11', 'cf:a', 'value11', 100 +put 'test','12', 'cf:a', 'value12', 100 +put 'test','13', 'cf:a', 'value13', 100 +put 'test','14', 'cf:a', 'value14', 100 +put 'test','15', 'cf:a', 'value15', 100 +put 'test','16', 'cf:a', 'value16', 100 +put 'test','17', 'cf:a', 'value17', 100 +put 'test','18', 'cf:a', 'value18', 100 +put 'test','19', 'cf:a', 'value19', 100 +put 'test','20', 'cf:a', 'value20', 100 +put 'test','21', 'cf:a', 'value21', 100 +put 'test','22', 'cf:a', 'value22', 100 +put 'test','23', 'cf:a', 'value23', 100 +put 'test','24', 'cf:a', 'value24', 100 +put 'test','25', 'cf:a', 'value25', 100 +put 'test','26', 'cf:a', 'value26', 100 +put 'test','27', 'cf:a', 'value27', 100 +put 'test','28', 'cf:a', 'value28', 100 +put 'test','29', 'cf:a', 'value29', 100 +put 'test','30', 'cf:a', 'value30', 100 +put 'test','31', 'cf:a', 'value31', 100 +put 'test','32', 'cf:a', 'value32', 100 +put 'test','33', 'cf:a', 'value33', 100 +put 'test','34', 'cf:a', 'value34', 100 +put 'test','35', 'cf:a', 'value35', 100 +put 'test','36', 'cf:a', 'value36', 100 +put 'test','37', 'cf:a', 'value37', 100 +put 'test','38', 'cf:a', 'value38', 100 +put 'test','39', 'cf:a', 'value39', 100 +put 'test','40', 'cf:a', 'value40', 100 +put 'test','41', 'cf:a', 'value41', 100 +put 'test','42', 'cf:a', 'value42', 100 +put 'test','43', 'cf:a', 'value43', 100 +put 'test','44', 'cf:a', 'value44', 100 +put 'test','45', 'cf:a', 'value45', 100 +put 'test','46', 'cf:a', 
'value46', 100 +put 'test','47', 'cf:a', 'value47', 100 +put 'test','48', 'cf:a', 'value48', 100 +put 'test','49', 'cf:a', 'value49', 100 +put 'test','50', 'cf:a', 'value50', 100 +put 'test','51', 'cf:a', 'value51', 100 +put 'test','52', 'cf:a', 'value52', 100 +put 'test','53', 'cf:a', 'value53', 100 +put 'test','54', 'cf:a', 'value54', 100 +put 'test','55', 'cf:a', 'value55', 100 +put 'test','56', 'cf:a', 'value56', 100 +put 'test','57', 'cf:a', 'value57', 100 +put 'test','58', 'cf:a', 'value58', 100 +put 'test','59', 'cf:a', 'value59', 100 +put 'test','60', 'cf:a', 'value60', 100 +put 'test','61', 'cf:a', 'value61', 100 +put 'test','62', 'cf:a', 'value62', 100 +put 'test','63', 'cf:a', 'value63', 100 +put 'test','64', 'cf:a', 'value64', 100 +put 'test','65', 'cf:a', 'value65', 100 +put 'test','66', 'cf:a', 'value66', 100 +put 'test','67', 'cf:a', 'value67', 100 +put 'test','68', 'cf:a', 'value68', 100 +put 'test','69', 'cf:a', 'value69', 100 +put 'test','70', 'cf:a', 'value70', 100 +put 'test','71', 'cf:a', 'value71', 100 +put 'test','72', 'cf:a', 'value72', 100 +put 'test','73', 'cf:a', 'value73', 100 +put 'test','74', 'cf:a', 'value74', 100 +put 'test','75', 'cf:a', 'value75', 100 +put 'test','76', 'cf:a', 'value76', 100 +put 'test','77', 'cf:a', 'value77', 100 +put 'test','78', 'cf:a', 'value78', 100 +put 'test','79', 'cf:a', 'value79', 100 +put 'test','80', 'cf:a', 'value80', 100 +put 'test','81', 'cf:a', 'value81', 100 +put 'test','82', 'cf:a', 'value82', 100 +put 'test','83', 'cf:a', 'value83', 100 +put 'test','84', 'cf:a', 'value84', 100 +put 'test','85', 'cf:a', 'value85', 100 +put 'test','86', 'cf:a', 'value86', 100 +put 'test','87', 'cf:a', 'value87', 100 +put 'test','88', 'cf:a', 'value88', 100 +put 'test','89', 'cf:a', 'value89', 100 +put 'test','90', 'cf:a', 'value90', 100 +put 'test','91', 'cf:a', 'value91', 100 +put 'test','92', 'cf:a', 'value92', 100 +put 'test','93', 'cf:a', 'value93', 100 +put 'test','94', 'cf:a', 'value94', 100 +put 
'test','95', 'cf:a', 'value95', 100 +put 'test','96', 'cf:a', 'value96', 100 +put 'test','97', 'cf:a', 'value97', 100 +put 'test','98', 'cf:a', 'value98', 100 +put 'test','99', 'cf:a', 'value99', 100 +put 'test','100', 'cf:a', 'value100', 100 snapshot 'test', 'test-snapshot' list_snapshots + +////////////////////Run from Unix shell on HBase master node////////////////// +// Export the snapshot +hbase org.apache.hadoop.hbase.snapshot.ExportSnapshot -snapshot test-snapshot -copy-to /integration-test/data -mappers 16 + +// Create the hashes for the table. Run the command from unix shell on an HBase +// node. +hbase org.apache.hadoop.hbase.mapreduce.HashTable --batchsize=10 --numhashfiles=10 test /integration-test/hashtable + +// Export the data into GCS +hadoop fs -copyToLocal /integration-test /tmp/ +gsutil cp -r /tmp/integration-test gs:/// + +// GCS bucket should look like this: +$ gsutil ls gs:///integration-test/data +gs:///integration-test/data/ +gs:///integration-test/data/.hbase-snapshot/ +gs:///integration-test/data/archive/ +$ gsutil ls gs:///integration-test/hashtable +gs:///integration-test/hashtable/manifest +gs:///integration-test/hashtable/partitions +gs:///integration-test/hashtable/hashes/ + +// Run from HBase shell. Run `hbase shell` from unix terminal on HBase master. +// clean up the table disable 'test' drop 'test' exit diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/data/.hbase-snapshot/test-snapshot/.snapshotinfo b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/data/.hbase-snapshot/test-snapshot/.snapshotinfo new file mode 100644 index 0000000000..03ac02e452 --- /dev/null +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/data/.hbase-snapshot/test-snapshot/.snapshotinfo @@ -0,0 +1,2 @@ + + test-snapshottestϹ���. 
(@��������� \ No newline at end of file diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/data/.hbase-snapshot/test-snapshot/data.manifest b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/data/.hbase-snapshot/test-snapshot/data.manifest similarity index 55% rename from bigtable-dataflow-parent/bigtable-beam-import/src/test/data/.hbase-snapshot/test-snapshot/data.manifest rename to bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/data/.hbase-snapshot/test-snapshot/data.manifest index 180516dc03..6439f06130 100644 Binary files a/bigtable-dataflow-parent/bigtable-beam-import/src/test/data/.hbase-snapshot/test-snapshot/data.manifest and b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/data/.hbase-snapshot/test-snapshot/data.manifest differ diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/data/archive/data/default/test/3d397f3b97e7fd2358fb5c93060b3a60/cf/32053565831341128b8d8f5567d48fdc b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/data/archive/data/default/test/01340515889e8ec5014bbdbfa4fd4689/cf/0ad53893d268478f9b2484cbb6016d9b similarity index 86% rename from bigtable-dataflow-parent/bigtable-beam-import/src/test/data/archive/data/default/test/3d397f3b97e7fd2358fb5c93060b3a60/cf/32053565831341128b8d8f5567d48fdc rename to bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/data/archive/data/default/test/01340515889e8ec5014bbdbfa4fd4689/cf/0ad53893d268478f9b2484cbb6016d9b index 5320c6c58d..1b91b948d8 100644 Binary files a/bigtable-dataflow-parent/bigtable-beam-import/src/test/data/archive/data/default/test/3d397f3b97e7fd2358fb5c93060b3a60/cf/32053565831341128b8d8f5567d48fdc and b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/data/archive/data/default/test/01340515889e8ec5014bbdbfa4fd4689/cf/0ad53893d268478f9b2484cbb6016d9b differ diff --git 
a/bigtable-dataflow-parent/bigtable-beam-import/src/test/data/archive/data/default/test/1a1358ba82be4a98feff54032986bbf2/cf/8aff180e3a244dcc807e4de8b6fce0a7 b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/data/archive/data/default/test/156b320f3ebe472a1ae56a2f6930a676/cf/9926df0da08b4f51a33517afb040f82d similarity index 87% rename from bigtable-dataflow-parent/bigtable-beam-import/src/test/data/archive/data/default/test/1a1358ba82be4a98feff54032986bbf2/cf/8aff180e3a244dcc807e4de8b6fce0a7 rename to bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/data/archive/data/default/test/156b320f3ebe472a1ae56a2f6930a676/cf/9926df0da08b4f51a33517afb040f82d index cbd9f539b3..951eb512ac 100644 Binary files a/bigtable-dataflow-parent/bigtable-beam-import/src/test/data/archive/data/default/test/1a1358ba82be4a98feff54032986bbf2/cf/8aff180e3a244dcc807e4de8b6fce0a7 and b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/data/archive/data/default/test/156b320f3ebe472a1ae56a2f6930a676/cf/9926df0da08b4f51a33517afb040f82d differ diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/data/archive/data/default/test/7466202f701dc0e3af8cc747c9a37ec8/cf/36798a163ed046b193818e21dd7516b4 b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/data/archive/data/default/test/313460ce1b714784d36c64bcd01f9e2c/cf/966e85699fdd4680a8c6fbf4b41b6e4b similarity index 87% rename from bigtable-dataflow-parent/bigtable-beam-import/src/test/data/archive/data/default/test/7466202f701dc0e3af8cc747c9a37ec8/cf/36798a163ed046b193818e21dd7516b4 rename to bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/data/archive/data/default/test/313460ce1b714784d36c64bcd01f9e2c/cf/966e85699fdd4680a8c6fbf4b41b6e4b index ee586c252e..dc89f02ec2 100644 Binary files 
a/bigtable-dataflow-parent/bigtable-beam-import/src/test/data/archive/data/default/test/7466202f701dc0e3af8cc747c9a37ec8/cf/36798a163ed046b193818e21dd7516b4 and b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/data/archive/data/default/test/313460ce1b714784d36c64bcd01f9e2c/cf/966e85699fdd4680a8c6fbf4b41b6e4b differ diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/data/archive/data/default/test/1bf20ce0551df953331936d20dbd18fa/cf/c2945aa8dac34922913a1f60fedb6154 b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/data/archive/data/default/test/3bfc13b0a9bf8148a91788a8d2b60117/cf/bab07e8089634e629a4c111ea2b415fe similarity index 87% rename from bigtable-dataflow-parent/bigtable-beam-import/src/test/data/archive/data/default/test/1bf20ce0551df953331936d20dbd18fa/cf/c2945aa8dac34922913a1f60fedb6154 rename to bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/data/archive/data/default/test/3bfc13b0a9bf8148a91788a8d2b60117/cf/bab07e8089634e629a4c111ea2b415fe index 05a0cac912..c7fb208f72 100644 Binary files a/bigtable-dataflow-parent/bigtable-beam-import/src/test/data/archive/data/default/test/1bf20ce0551df953331936d20dbd18fa/cf/c2945aa8dac34922913a1f60fedb6154 and b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/data/archive/data/default/test/3bfc13b0a9bf8148a91788a8d2b60117/cf/bab07e8089634e629a4c111ea2b415fe differ diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/data/archive/data/default/test/958c660f0e406404ffdfc81110e7eaf9/cf/65b9c6860f5f4de39d61d1674947b030 b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/data/archive/data/default/test/5bc31088b2daee7903f5b3d3a52f7ebf/cf/7fef5694213b4be0ad79f79c45200c2d similarity index 87% rename from bigtable-dataflow-parent/bigtable-beam-import/src/test/data/archive/data/default/test/958c660f0e406404ffdfc81110e7eaf9/cf/65b9c6860f5f4de39d61d1674947b030 rename to 
bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/data/archive/data/default/test/5bc31088b2daee7903f5b3d3a52f7ebf/cf/7fef5694213b4be0ad79f79c45200c2d index e8d9789f5e..7638f6eabb 100644 Binary files a/bigtable-dataflow-parent/bigtable-beam-import/src/test/data/archive/data/default/test/958c660f0e406404ffdfc81110e7eaf9/cf/65b9c6860f5f4de39d61d1674947b030 and b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/data/archive/data/default/test/5bc31088b2daee7903f5b3d3a52f7ebf/cf/7fef5694213b4be0ad79f79c45200c2d differ diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/data/archive/data/default/test/01ef4b8bb8d79f360bf182fedfb1c0e8/cf/b0f68aca966b48f1b171614e582b1cbb b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/data/archive/data/default/test/7c4a9137853573c8d671264dc0b31f89/cf/f8d40658d79b4a7191f21bcf14ae289b similarity index 87% rename from bigtable-dataflow-parent/bigtable-beam-import/src/test/data/archive/data/default/test/01ef4b8bb8d79f360bf182fedfb1c0e8/cf/b0f68aca966b48f1b171614e582b1cbb rename to bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/data/archive/data/default/test/7c4a9137853573c8d671264dc0b31f89/cf/f8d40658d79b4a7191f21bcf14ae289b index dc8da56c10..c6ba1f760b 100644 Binary files a/bigtable-dataflow-parent/bigtable-beam-import/src/test/data/archive/data/default/test/01ef4b8bb8d79f360bf182fedfb1c0e8/cf/b0f68aca966b48f1b171614e582b1cbb and b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/data/archive/data/default/test/7c4a9137853573c8d671264dc0b31f89/cf/f8d40658d79b4a7191f21bcf14ae289b differ diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/data/archive/data/default/test/2c25a1cedf575cd08267e0013e45872e/cf/cda93ca899f3475fb1c0f8989a8f0d18 
b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/data/archive/data/default/test/818d6b145a50cfc3bf8ee865486fdda3/cf/afe596ef5c61440983da2dcb54d581ab similarity index 87% rename from bigtable-dataflow-parent/bigtable-beam-import/src/test/data/archive/data/default/test/2c25a1cedf575cd08267e0013e45872e/cf/cda93ca899f3475fb1c0f8989a8f0d18 rename to bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/data/archive/data/default/test/818d6b145a50cfc3bf8ee865486fdda3/cf/afe596ef5c61440983da2dcb54d581ab index e77357601a..5a757daec8 100644 Binary files a/bigtable-dataflow-parent/bigtable-beam-import/src/test/data/archive/data/default/test/2c25a1cedf575cd08267e0013e45872e/cf/cda93ca899f3475fb1c0f8989a8f0d18 and b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/data/archive/data/default/test/818d6b145a50cfc3bf8ee865486fdda3/cf/afe596ef5c61440983da2dcb54d581ab differ diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/data/archive/data/default/test/dab1d611586e861818af77de74073d47/cf/b83044f76ba6474aa829e3bae7fd82d1 b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/data/archive/data/default/test/8c2101799fadc18613082a495d11e4ea/cf/2c766f1fc8eb460dbfa9a3803138c9b2 similarity index 87% rename from bigtable-dataflow-parent/bigtable-beam-import/src/test/data/archive/data/default/test/dab1d611586e861818af77de74073d47/cf/b83044f76ba6474aa829e3bae7fd82d1 rename to bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/data/archive/data/default/test/8c2101799fadc18613082a495d11e4ea/cf/2c766f1fc8eb460dbfa9a3803138c9b2 index c119dd13ef..d29619e3ec 100644 Binary files a/bigtable-dataflow-parent/bigtable-beam-import/src/test/data/archive/data/default/test/dab1d611586e861818af77de74073d47/cf/b83044f76ba6474aa829e3bae7fd82d1 and 
b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/data/archive/data/default/test/8c2101799fadc18613082a495d11e4ea/cf/2c766f1fc8eb460dbfa9a3803138c9b2 differ diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/data/archive/data/default/test/3264826a5972b18c5a59b2f612678316/cf/d8b49b374391407ba35d5e0db1c835c9 b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/data/archive/data/default/test/f1ef86b666a891d8c77f0eada4d1a15c/cf/e59edc08de6d441689288f04c7c0fe85 similarity index 86% rename from bigtable-dataflow-parent/bigtable-beam-import/src/test/data/archive/data/default/test/3264826a5972b18c5a59b2f612678316/cf/d8b49b374391407ba35d5e0db1c835c9 rename to bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/data/archive/data/default/test/f1ef86b666a891d8c77f0eada4d1a15c/cf/e59edc08de6d441689288f04c7c0fe85 index d640fc8498..337b5f9280 100644 Binary files a/bigtable-dataflow-parent/bigtable-beam-import/src/test/data/archive/data/default/test/3264826a5972b18c5a59b2f612678316/cf/d8b49b374391407ba35d5e0db1c835c9 and b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/data/archive/data/default/test/f1ef86b666a891d8c77f0eada4d1a15c/cf/e59edc08de6d441689288f04c7c0fe85 differ diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/_SUCCESS b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/_SUCCESS new file mode 100644 index 0000000000..e69de29bb2 diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00000/data b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00000/data new file mode 100644 index 0000000000..26334294df Binary files /dev/null and b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00000/data differ diff --git 
a/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00000/index b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00000/index new file mode 100644 index 0000000000..f7ac1fc941 Binary files /dev/null and b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00000/index differ diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00001/data b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00001/data new file mode 100644 index 0000000000..87b715673c Binary files /dev/null and b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00001/data differ diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00001/index b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00001/index new file mode 100644 index 0000000000..4edcbd1ed5 Binary files /dev/null and b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00001/index differ diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00002/data b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00002/data new file mode 100644 index 0000000000..4b59b346f0 Binary files /dev/null and b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00002/data differ diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00002/index b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00002/index new file mode 100644 index 0000000000..4169ee8258 Binary files /dev/null and 
b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00002/index differ diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00003/data b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00003/data new file mode 100644 index 0000000000..a05197b51d Binary files /dev/null and b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00003/data differ diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00003/index b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00003/index new file mode 100644 index 0000000000..9228013bfa Binary files /dev/null and b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00003/index differ diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00004/data b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00004/data new file mode 100644 index 0000000000..6e29b085e7 Binary files /dev/null and b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00004/data differ diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00004/index b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00004/index new file mode 100644 index 0000000000..245c2ceb3f Binary files /dev/null and b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00004/index differ diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00005/data 
b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00005/data new file mode 100644 index 0000000000..40cbf30418 Binary files /dev/null and b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00005/data differ diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00005/index b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00005/index new file mode 100644 index 0000000000..dbbacaf8f0 Binary files /dev/null and b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00005/index differ diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00006/data b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00006/data new file mode 100644 index 0000000000..3f0e32269c Binary files /dev/null and b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00006/data differ diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00006/index b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00006/index new file mode 100644 index 0000000000..a0818358eb Binary files /dev/null and b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00006/index differ diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00007/data b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00007/data new file mode 100644 index 0000000000..effda57ece Binary files /dev/null and b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00007/data differ 
diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00007/index b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00007/index new file mode 100644 index 0000000000..a8eb1a1748 Binary files /dev/null and b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00007/index differ diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00008/data b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00008/data new file mode 100644 index 0000000000..011b956c5f Binary files /dev/null and b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00008/data differ diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00008/index b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00008/index new file mode 100644 index 0000000000..fada13a256 Binary files /dev/null and b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00008/index differ diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00009/data b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00009/data new file mode 100644 index 0000000000..f55fa79aca Binary files /dev/null and b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00009/data differ diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00009/index b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00009/index new file mode 100644 index 0000000000..8c8793cef8 Binary files /dev/null 
and b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/hashes/part-r-00009/index differ diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/manifest b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/manifest new file mode 100644 index 0000000000..a95421d027 --- /dev/null +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/manifest @@ -0,0 +1,4 @@ +#Wed Dec 30 01:23:41 UTC 2020 +numHashFiles=10 +table=test +targetBatchSize=10 diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/partitions b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/partitions new file mode 100644 index 0000000000..1d447dd67a Binary files /dev/null and b/bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test/hashtable/partitions differ diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/hbasesnapshots/EndToEndIT.java b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/hbasesnapshots/EndToEndIT.java index 62f1cdced2..0320dd1a61 100644 --- a/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/hbasesnapshots/EndToEndIT.java +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/hbasesnapshots/EndToEndIT.java @@ -18,37 +18,61 @@ import static com.google.common.base.Preconditions.checkNotNull; import com.google.api.services.storage.model.Objects; -import com.google.cloud.bigtable.beam.sequencefiles.testing.BigtableTableUtils; +import com.google.bigtable.repackaged.com.google.gson.Gson; +import com.google.cloud.bigtable.beam.hbasesnapshots.ImportJobFromHbaseSnapshot.ImportOptions; +import com.google.cloud.bigtable.beam.sequencefiles.HBaseResultToMutationFn; +import 
com.google.cloud.bigtable.beam.validation.HadoopHashTableSource.RangeHash; +import com.google.cloud.bigtable.beam.validation.SyncTableJob; +import com.google.cloud.bigtable.beam.validation.SyncTableJob.SyncTableOptions; import com.google.cloud.bigtable.hbase.BigtableConfiguration; import com.google.cloud.bigtable.hbase.BigtableOptionsFactory; +import java.io.BufferedReader; +import java.io.ByteArrayInputStream; import java.io.File; import java.io.IOException; +import java.io.InputStreamReader; +import java.nio.ByteBuffer; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.Map; import java.util.UUID; +import java.util.stream.Collectors; +import java.util.stream.StreamSupport; import org.apache.beam.runners.dataflow.DataflowRunner; import org.apache.beam.runners.dataflow.options.DataflowPipelineOptions; +import org.apache.beam.sdk.PipelineResult; import org.apache.beam.sdk.PipelineResult.State; import org.apache.beam.sdk.extensions.gcp.options.GcpOptions; import org.apache.beam.sdk.extensions.gcp.util.GcsUtil; import org.apache.beam.sdk.extensions.gcp.util.gcsfs.GcsPath; +import org.apache.beam.sdk.metrics.MetricQueryResults; import org.apache.beam.sdk.options.PipelineOptionsFactory; import org.apache.beam.sdk.options.ValueProvider.StaticValueProvider; -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.hbase.Cell; import org.apache.hadoop.hbase.HColumnDescriptor; +import org.apache.hadoop.hbase.HConstants; import org.apache.hadoop.hbase.HTableDescriptor; import org.apache.hadoop.hbase.TableName; import org.apache.hadoop.hbase.client.Connection; +import org.apache.hadoop.hbase.client.Delete; +import org.apache.hadoop.hbase.client.Get; +import org.apache.hadoop.hbase.client.Put; +import org.apache.hadoop.hbase.client.Table; import org.apache.hadoop.hbase.snapshot.SnapshotTestingUtils; import org.junit.After; import org.junit.Assert; import org.junit.Before; 
import org.junit.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; /* - * End to end integration test for pipeline that import HBase snapshot data into Cloud Bigtable. + * End to end integration test for pipeline that import HBase snapshot data into Cloud Bigtable and + * validates the imported data with SyncTable. * Prepare test data with gsutil(https://cloud.google.com/storage/docs/quickstart-gsutil): - * gsutil -m cp -r /bigtable-dataflow-parent/bigtable-beam-import/src/test/data/ \ - * gs:///integration-test/ + * gsutil -m cp -r /bigtable-dataflow-parent/bigtable-beam-import/src/test/integration-test \ + * gs:/// * * Setup GCP credential: https://cloud.google.com/docs/authentication * Ensure your credential have access to Bigtable and Dataflow @@ -62,7 +86,7 @@ */ public class EndToEndIT { - private final Log LOG = LogFactory.getLog(getClass()); + private static Logger LOG = LoggerFactory.getLogger(HBaseResultToMutationFn.class); private static final String TEST_SNAPSHOT_NAME = "test-snapshot"; // Location of test data hosted on Google Cloud Storage, for on-cloud dataflow tests. 
private static final String CLOUD_TEST_DATA_FOLDER = "cloud.test.data.folder"; @@ -87,6 +111,8 @@ public class EndToEndIT { // Snapshot data setup private String hbaseSnapshotDir; + private String hashDir; + private String syncTableOutputDir; @Before public void setup() throws Exception { @@ -101,6 +127,13 @@ public void setup() throws Exception { hbaseSnapshotDir = cloudTestDataFolder + "data/"; UUID test_uuid = UUID.randomUUID(); + hashDir = cloudTestDataFolder + "hashtable/"; + + syncTableOutputDir = dataflowStagingLocation; + if (!syncTableOutputDir.endsWith(File.separator)) { + syncTableOutputDir = syncTableOutputDir + File.separator; + } + syncTableOutputDir = syncTableOutputDir + "sync-table-output/" + test_uuid + "/"; // Cloud Storage config GcpOptions gcpOptions = PipelineOptionsFactory.create().as(GcpOptions.class); @@ -118,6 +151,12 @@ public void setup() throws Exception { for (int i = 0; i < keys.length; i++) { keySplits[i] = keys[i].getBytes(); } + + // Create table in Bigtable + TableName tableName = TableName.valueOf(tableId); + HTableDescriptor descriptor = new HTableDescriptor(tableName); + descriptor.addFamily(new HColumnDescriptor(CF)); + connection.getAdmin().createTable(descriptor, SnapshotTestingUtils.getSplitKeys()); } private static String getTestProperty(String name) { @@ -126,6 +165,19 @@ private static String getTestProperty(String name) { @After public void teardown() throws IOException { + final List paths = gcsUtil.expand(GcsPath.fromUri(syncTableOutputDir + "/*")); + + if (!paths.isEmpty()) { + final List pathStrs = new ArrayList<>(); + + for (GcsPath path : paths) { + pathStrs.add(path.toString()); + } + // TODO: cleanup fails when tests time out. 
Add a orphan cleaner in the setup() + // https://github.com/googleapis/java-bigtable/blob/35588d89b9b243eb691a29d3aff16b9f5a08fbb8/google-cloud-bigtable/src/test/java/com/google/cloud/bigtable/test_helpers/env/AbstractTestEnv.java#L108-L119 + this.gcsUtil.remove(pathStrs); + } + connection.close(); // delete test table @@ -134,18 +186,28 @@ public void teardown() throws IOException { .deleteTable(TableName.valueOf(tableId)); } - @Test - public void testHBaseSnapshotImport() throws Exception { - - // Crete table - TableName tableName = TableName.valueOf(tableId); - HTableDescriptor descriptor = new HTableDescriptor(tableName); + private SyncTableOptions createSyncTableOptions() { + DataflowPipelineOptions syncTableOpts = + PipelineOptionsFactory.as(DataflowPipelineOptions.class); + syncTableOpts.setRunner(DataflowRunner.class); + syncTableOpts.setGcpTempLocation(dataflowStagingLocation); + syncTableOpts.setNumWorkers(1); + syncTableOpts.setProject(projectId); - descriptor.addFamily(new HColumnDescriptor(CF)); + SyncTableOptions syncOpts = syncTableOpts.as(SyncTableOptions.class); + // Setup Bigtable params + syncOpts.setBigtableProject(StaticValueProvider.of(projectId)); + syncOpts.setBigtableInstanceId(StaticValueProvider.of(instanceId)); + syncOpts.setBigtableTableId(StaticValueProvider.of(tableId)); + syncOpts.setBigtableAppProfileId(null); - connection.getAdmin().createTable(descriptor, SnapshotTestingUtils.getSplitKeys()); + // Setup Hashes + syncOpts.setHashTableOutputDir(StaticValueProvider.of(hashDir)); + syncOpts.setOutputPrefix(StaticValueProvider.of(syncTableOutputDir)); + return syncOpts; + } - // Start import + private ImportOptions createImportOptions() { DataflowPipelineOptions importPipelineOpts = PipelineOptionsFactory.as(DataflowPipelineOptions.class); importPipelineOpts.setRunner(DataflowRunner.class); @@ -154,10 +216,9 @@ public void testHBaseSnapshotImport() throws Exception { importPipelineOpts.setProject(projectId); 
importPipelineOpts.setRegion(region); - ImportJobFromHbaseSnapshot.ImportOptions importOpts = - importPipelineOpts.as(ImportJobFromHbaseSnapshot.ImportOptions.class); + ImportOptions importOpts = importPipelineOpts.as(ImportOptions.class); - // setup GCP and bigtable + // setup Bigtable options importOpts.setBigtableProject(StaticValueProvider.of(projectId)); importOpts.setBigtableInstanceId(StaticValueProvider.of(instanceId)); importOpts.setBigtableTableId(StaticValueProvider.of(tableId)); @@ -165,17 +226,79 @@ public void testHBaseSnapshotImport() throws Exception { // setup HBase snapshot info importOpts.setHbaseSnapshotSourceDir(hbaseSnapshotDir); importOpts.setSnapshotName(TEST_SNAPSHOT_NAME); + return importOpts; + } + + private Map getCountMap(PipelineResult result) { + MetricQueryResults metrics = result.metrics().allMetrics(); + return StreamSupport.stream(metrics.getCounters().spliterator(), false) + .collect(Collectors.toMap((m) -> m.getName().getName(), (m) -> m.getAttempted())); + } + + /** + * Reads the output of SyncTable job and returns a list of mismatched RangeHashes. + * + * @throws IOException + */ + private List readMismatchesFromOutputFiles() throws IOException { + Gson gson = new Gson(); + // Find output files + List outputFiles = gcsUtil.expand(GcsPath.fromUri(syncTableOutputDir + "*")); + List rangeHashes = new ArrayList<>(); + + // Read each file line by line and create a RangeHash from it. 
+ for (GcsPath outputFile : outputFiles) { + int size = (int) gcsUtil.fileSize(outputFile); + byte[] fileContents = new byte[size]; + gcsUtil.open(outputFile).read(ByteBuffer.wrap(fileContents)); + BufferedReader reader = + new BufferedReader(new InputStreamReader(new ByteArrayInputStream(fileContents))); + String serializedRangeHash; + while ((serializedRangeHash = reader.readLine()) != null) { + try { + rangeHashes.add(gson.fromJson(serializedRangeHash.trim(), RangeHash.class)); + } catch (Exception e) { + LOG.error("Failed to parse JSON: [" + serializedRangeHash + "]", e); + throw e; + } + } + } + return rangeHashes; + } + + // Asserts that every row key in rowKeys falls inside one of the mismatched ranges. + // Throws AssertionError (via Assert.assertTrue) otherwise. + private void validateRowInRangeHashes(List rowKeys, Iterable mismatches) { + for (byte[] mismatchedRowKey : rowKeys) { + Assert.assertTrue(containsRow(mismatchedRowKey, mismatches)); + } + } + + // Returns true if the rowKey belongs in one of the ranges contained in rangeHashes. + private boolean containsRow(byte[] rowKey, Iterable rangeHashes) { + for (RangeHash mismatchedRange : rangeHashes) { + // TODO: There may be a better Range.belongs() utility function somewhere? + // An empty start/stop key means the range is unbounded on that side.
+ if ((mismatchedRange.startInclusive.equals(HConstants.EMPTY_BYTE_ARRAY) + || mismatchedRange.startInclusive.compareTo(rowKey) <= 0) + && (mismatchedRange.stopExclusive.equals(HConstants.EMPTY_BYTE_ARRAY) + || mismatchedRange.stopExclusive.compareTo(rowKey) > 0)) { + return true; + } + } + return false; + } + + @Test + public void testHBaseSnapshotImport() throws Exception { + + // Start import + ImportOptions importOpts = createImportOptions(); // run pipeline State state = ImportJobFromHbaseSnapshot.buildPipeline(importOpts).run().waitUntilFinish(); Assert.assertEquals(State.DONE, state); - // check data in bigtable - BigtableTableUtils destTable = new BigtableTableUtils(connection, tableId, CF); - Assert.assertEquals( - 100 /* There are 100 rows in test snapshot*/, - destTable.readAllCellsFromTable().toArray().length); - // check that the .restore dir used for temp files has been removed Objects objects = gcsUtil.listObjects( @@ -185,6 +308,81 @@ public void testHBaseSnapshotImport() throws Exception { null); Assert.assertNull(objects.getItems()); - // TODO(vermas2012): Add more validations after this. + SyncTableOptions syncOpts = createSyncTableOptions(); + + PipelineResult result = SyncTableJob.buildPipeline(syncOpts).run(); + state = result.waitUntilFinish(); + Assert.assertEquals(State.DONE, state); + + // Read the output files and validate that there are no mismatches. + Assert.assertEquals(0, readMismatchesFromOutputFiles().size()); + + // Validate the counters. + Map counters = getCountMap(result); + Assert.assertEquals(counters.get("ranges_matched"), (Long) 101L); + Assert.assertNull(counters.get("ranges_not_matched")); + } + + /** + * Introduces multiple corruptions in imported table and validates that sync-table can detect + * them. 
+ */ + @Test + public void testHBaseSnapshotImportWithCorruptions() throws Exception { + // Import snapshot + ImportOptions importOpts = createImportOptions(); + State state = ImportJobFromHbaseSnapshot.buildPipeline(importOpts).run().waitUntilFinish(); + Assert.assertEquals(State.DONE, state); + + // Rows where corruptions will be added. + byte[] mismatchRowAtStart = "000".getBytes(); + byte[] mismatchRowInMiddle = "24".getBytes(); + byte[] mismatchRowDeleted = "64".getBytes(); + byte[] mismatchRowAtTheEnd = "999".getBytes(); + + // Introduce corruptions to the data in Bigtable. Delete data from Bigtable to simulate Bigtable + // missing data. Add data to Bigtable to simulate extra data in Bigtable. It is easier to update + // Bigtable than change the snapshots. + Table table = connection.getTable(TableName.valueOf(tableId)); + Cell cellInMiddle = table.get(new Get(mismatchRowInMiddle)).rawCells()[0]; + List puts = + Arrays.asList( + // Add a row at the start + new Put(mismatchRowAtStart) + .addColumn(CF.getBytes(), "random_col".getBytes(), 1L, "value000".getBytes()) + .addColumn(CF.getBytes(), "random_col".getBytes(), 2L, "value001".getBytes()), + // change a cell in middle + new Put(cellInMiddle.getRowArray()) + .addColumn( + cellInMiddle.getFamilyArray(), + cellInMiddle.getQualifierArray(), + cellInMiddle.getTimestamp(), + "corrupted_val".getBytes()), + // add a new row in the end + new Put(mismatchRowAtTheEnd) + .addColumn(CF.getBytes(), "random_col".getBytes(), 100L, "value999".getBytes())); + + table.put(puts); + // Delete a random row in the middle. We should see 4 ranges mismatch as table is split on + // 1,2...9. All the updates are happening on a different split. + table.delete(new Delete(mismatchRowDeleted)); + + // Run SyncTable job and expect 4 mismatches. 
+ SyncTableOptions syncOpts = createSyncTableOptions(); + PipelineResult result = SyncTableJob.buildPipeline(syncOpts).run(); + state = result.waitUntilFinish(); + Assert.assertEquals(State.DONE, state); + + List syncTableOutputMismatches = readMismatchesFromOutputFiles(); + Assert.assertEquals(4, syncTableOutputMismatches.size()); + validateRowInRangeHashes( + Arrays.asList( + mismatchRowAtStart, mismatchRowAtTheEnd, mismatchRowDeleted, mismatchRowInMiddle), + syncTableOutputMismatches); + + // Assert that the output collection is the right one. + Map counters = getCountMap(result); + Assert.assertEquals(counters.get("ranges_matched"), (Long) 97L); + Assert.assertEquals(counters.get("ranges_not_matched"), (Long) 4L); } } diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/validation/BufferedHadoopHashTableSourceTest.java b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/validation/BufferedHadoopHashTableSourceTest.java new file mode 100644 index 0000000000..96d5960423 --- /dev/null +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/validation/BufferedHadoopHashTableSourceTest.java @@ -0,0 +1,162 @@ +/* + * Copyright 2021 Google Inc. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package com.google.cloud.bigtable.beam.validation; + +import static org.junit.Assert.assertEquals; + +import com.google.cloud.bigtable.beam.validation.HadoopHashTableSource.RangeHash; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import org.apache.beam.sdk.options.ValueProvider.StaticValueProvider; +import org.apache.beam.sdk.testing.SourceTestUtils; +import org.apache.beam.sdk.values.KV; +import org.apache.hadoop.hbase.HConstants; +import org.apache.hadoop.hbase.client.Scan; +import org.apache.hadoop.hbase.io.ImmutableBytesWritable; +import org.apache.hadoop.hbase.util.Bytes; +import org.junit.Before; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.JUnit4; + +@RunWith(JUnit4.class) +public class BufferedHadoopHashTableSourceTest { + + private BufferedHadoopHashTableSource bufferedSource; + private FakeTableHashWrapper fakeTableHashWrapper; + + private static final String HASH_TABLE_OUTPUT_PATH_DIR = "gs://my-bucket/outputDir"; + private static final ImmutableBytesWritable START_ROW = + new ImmutableBytesWritable("AAAA".getBytes()); + private static final ImmutableBytesWritable STOP_ROW = + new ImmutableBytesWritable("ZZZZ".getBytes()); + private static final ImmutableBytesWritable POST_STOP_ROW = + new ImmutableBytesWritable("z".getBytes()); // Lowercase z is lexicographically > uppercase Z + private static final ImmutableBytesWritable EMPTY_ROW = + new ImmutableBytesWritable(HConstants.EMPTY_BYTE_ARRAY); + private static final ImmutableBytesWritable START_HASH = + new ImmutableBytesWritable("START-HASH".getBytes()); + private static final int BATCH_SIZE = 5; + + @Before + public void setUp() throws Exception { + fakeTableHashWrapper = + new FakeTableHashWrapper( + START_ROW, STOP_ROW, new ArrayList<>(), new ArrayList<>(), new Scan()); + bufferedSource = + new BufferedHadoopHashTableSource( + new HadoopHashTableSource( + StaticValueProvider.of("cbt-dev"), + 
StaticValueProvider.of(HASH_TABLE_OUTPUT_PATH_DIR), + START_ROW, + STOP_ROW, + new FakeTableHashWrapperFactory(fakeTableHashWrapper)), + BATCH_SIZE); + } + + protected static ImmutableBytesWritable getKey(int keyIndex) { + return new ImmutableBytesWritable(("KEY-" + keyIndex).getBytes()); + } + + protected static ImmutableBytesWritable getHash(int hashIndex) { + return new ImmutableBytesWritable(("HASH-" + hashIndex).getBytes()); + } + + /** + * Populates the fakeTableHashWrapper with {@code numEntries} entries starting with startKey. + * Returns a List of expected RangeHashes for this data, for numEntries=1, single RangeHash is + * returned (startRow, stopRow, START_HASH). + */ + protected List>> setupTestData( + ImmutableBytesWritable startRow, ImmutableBytesWritable stopRow, int numEntries) { + fakeTableHashWrapper.startRowInclusive = startRow; + fakeTableHashWrapper.stopRowExclusive = stopRow; + fakeTableHashWrapper.hashes.add(KV.of(startRow, START_HASH)); + for (int i = 0; i < numEntries - 1; i++) { + fakeTableHashWrapper.hashes.add(KV.of(getKey(i), getHash(i))); + } + + List>> out = new ArrayList<>(); + // Setup RangeHashes to be returned + List expectedRangeHashes = new ArrayList<>(); + ImmutableBytesWritable key = startRow; + ImmutableBytesWritable hash = START_HASH; + for (int i = 0; i < numEntries - 1; i++) { + expectedRangeHashes.add(RangeHash.of(key, getKey(i), hash)); + key = getKey(i); + hash = getHash(i); + if (expectedRangeHashes.size() % BATCH_SIZE == 0) { + out.add( + KV.of( + Bytes.toStringBinary(expectedRangeHashes.get(0).startInclusive.copyBytes()), + expectedRangeHashes)); + expectedRangeHashes = new ArrayList<>(); + } + } + // Process the last range + expectedRangeHashes.add(RangeHash.of(key, stopRow, hash)); + // Finalize the last batch + out.add( + KV.of( + Bytes.toStringBinary(expectedRangeHashes.get(0).startInclusive.copyBytes()), + expectedRangeHashes)); + + return out; + } + + @Test + public void testHashReaderEmpty() throws IOException 
{ + // The tableHashWrapper has no hashes, this should result in empty source. + assertEquals(Arrays.asList(), SourceTestUtils.readFromSource(bufferedSource, null)); + } + + @Test + public void testHashReaderPartialBuffer() throws IOException { + // Set up 4 entries: fewer than BATCH_SIZE, so a single partial batch is expected. + List>> expected = setupTestData(START_ROW, STOP_ROW, 4); + assertEquals(expected, SourceTestUtils.readFromSource(bufferedSource, null)); + } + + @Test + public void testHashReaderMultipleBatches() throws IOException { + // Set up 20 entries: an exact multiple of BATCH_SIZE, so only full batches are expected. + List>> expected = setupTestData(START_ROW, STOP_ROW, 20); + assertEquals(expected, SourceTestUtils.readFromSource(bufferedSource, null)); + } + + @Test + public void testHashReaderMultipleBatchesWithPartialBatchAtEnd() throws IOException { + // Set up 23 entries: four full batches followed by a partial batch at the end. + List>> expected = setupTestData(START_ROW, STOP_ROW, 23); + assertEquals(expected, SourceTestUtils.readFromSource(bufferedSource, null)); + } + + @Test + public void testSplitEqualsUnsplit() throws Exception { + fakeTableHashWrapper.partitions = Arrays.asList(getKey(4), getKey(9)); + SourceTestUtils.assertSourcesEqualReferenceSource( + bufferedSource, bufferedSource.split(0, null), null); + } + + @Test + public void testUnstartedReaderEqualsStarted() throws Exception { + setupTestData(START_ROW, STOP_ROW, 6); + SourceTestUtils.assertUnstartedReaderReadsSameAsItsSource( + bufferedSource.createReader(null), null); + } +} diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/validation/ComputeAndValidateHashFromBigtableDoFnTest.java b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/validation/ComputeAndValidateHashFromBigtableDoFnTest.java new file mode 100644 index 0000000000..8c608b74db --- /dev/null +++
b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/validation/ComputeAndValidateHashFromBigtableDoFnTest.java @@ -0,0 +1,473 @@ +/* + * Copyright 2021 Google Inc. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.google.cloud.bigtable.beam.validation; + +import static com.google.bigtable.repackaged.com.google.cloud.bigtable.admin.v2.models.GCRules.GCRULES; + +import com.google.bigtable.repackaged.com.google.cloud.bigtable.admin.v2.BigtableTableAdminClient; +import com.google.bigtable.repackaged.com.google.cloud.bigtable.admin.v2.BigtableTableAdminSettings; +import com.google.bigtable.repackaged.com.google.cloud.bigtable.admin.v2.models.CreateTableRequest; +import com.google.cloud.bigtable.beam.CloudBigtableTableConfiguration; +import com.google.cloud.bigtable.beam.validation.HadoopHashTableSource.RangeHash; +import com.google.cloud.bigtable.emulator.v2.BigtableEmulatorRule; +import com.google.cloud.bigtable.hbase.BigtableConfiguration; +import com.google.cloud.bigtable.hbase.BigtableOptionsFactory; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; +import java.util.stream.StreamSupport; +import org.apache.beam.sdk.PipelineResult; +import org.apache.beam.sdk.metrics.MetricQueryResults; +import org.apache.beam.sdk.options.ValueProvider.StaticValueProvider; +import 
org.apache.beam.sdk.testing.PAssert; +import org.apache.beam.sdk.testing.TestPipeline; +import org.apache.beam.sdk.transforms.Create; +import org.apache.beam.sdk.transforms.ParDo; +import org.apache.beam.sdk.values.KV; +import org.apache.beam.sdk.values.PCollection; +import org.apache.hadoop.hbase.HConstants; +import org.apache.hadoop.hbase.TableName; +import org.apache.hadoop.hbase.client.Connection; +import org.apache.hadoop.hbase.client.Delete; +import org.apache.hadoop.hbase.client.Put; +import org.apache.hadoop.hbase.client.Result; +import org.apache.hadoop.hbase.client.Scan; +import org.apache.hadoop.hbase.client.Table; +import org.apache.hadoop.hbase.io.ImmutableBytesWritable; +import org.apache.hadoop.hbase.mapreduce.BigtableTableHashAccessor.BigtableResultHasher; +import org.junit.After; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Rule; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.JUnit4; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +@RunWith(JUnit4.class) +public class ComputeAndValidateHashFromBigtableDoFnTest { + + private static final byte[] EMPTY_ROW_KEY = HConstants.EMPTY_BYTE_ARRAY; + protected final Logger LOG = LoggerFactory.getLogger(getClass()); + + public static final String FAKE_TABLE = "fake-table"; + private static final String ROW_KEY_PREFIX = "row-"; + private static final String VALUE_PREFIX = "value-"; + private static final byte[] EXTRA_VALUE = "add".getBytes(); + private static final byte[] CF = "cf".getBytes(); + private static final byte[] CF2 = "cf".getBytes(); + private static final byte[] COL = "col".getBytes(); + private static final long TS = 1000l; + private static final int FIRST_ROW_INDEX = 20; + private static final int LAST_ROW_INDEX = 31; + + @Rule public final BigtableEmulatorRule bigtableEmulator = BigtableEmulatorRule.create(); + + @Rule public final transient TestPipeline p = TestPipeline.create(); + + private 
ComputeAndValidateHashFromBigtableDoFn doFn; + + // Clients that will be connected to the emulator + private BigtableTableAdminClient tableAdminClient; + private Table table; + // Fake a TableHashWrapper. + private FakeTableHashWrapper fakeTableHashWrapper; + + private List hashes; + + @Before + public void setUp() throws IOException { + hashes = new ArrayList<>(); + // Initialize the clients to connect to the emulator + tableAdminClient = + BigtableTableAdminClient.create( + BigtableTableAdminSettings.newBuilderForEmulator(bigtableEmulator.getPort()) + .setProjectId("fake-project") + .setInstanceId("fake-instance") + .build()); + + CloudBigtableTableConfiguration config = + new CloudBigtableTableConfiguration.Builder() + .withProjectId("fake-project") + .withInstanceId("fake-instance") + .withTableId(FAKE_TABLE) + .withConfiguration( + BigtableOptionsFactory.BIGTABLE_EMULATOR_HOST_KEY, + "localhost:" + bigtableEmulator.getPort()) + .build(); + + Connection connection = BigtableConfiguration.connect(config.toHBaseConfig()); + table = connection.getTable(TableName.valueOf(FAKE_TABLE)); + fakeTableHashWrapper = new FakeTableHashWrapper(); + // Scan all the cells for the column, HBase scan fetches 1 cell/column by default + fakeTableHashWrapper.scan = new Scan().setMaxVersions(); + + FakeTableHashWrapperFactory fakeFactory = new FakeTableHashWrapperFactory(fakeTableHashWrapper); + + doFn = + new ComputeAndValidateHashFromBigtableDoFn( + config, + StaticValueProvider.of(FAKE_TABLE), + StaticValueProvider.of("proj"), + StaticValueProvider.of("hash"), + fakeFactory); + + // Create a test table that can be used in tests + tableAdminClient.createTable( + CreateTableRequest.of(FAKE_TABLE) + .addFamily(new String(CF), GCRULES.maxVersions(100)) + .addFamily(new String(CF2), GCRULES.maxVersions(100))); + + p.getCoderRegistry().registerCoderForClass(RangeHash.class, new RangeHashCoder()); + + // Fill CBT table with data. 
+ writeDataToTable(); + } + + @After + public void tearDown() { + // TODO should we delete the table for each test? + tableAdminClient.deleteTable(FAKE_TABLE); + } + + private byte[] getRowKey(int i) { + return (ROW_KEY_PREFIX + i).getBytes(); + } + + private byte[] getValue(int rowIndex, int cellIndex) { + return (VALUE_PREFIX + rowIndex + "-" + cellIndex).getBytes(); + } + + private void writeDataToTable() throws IOException { + List puts = new ArrayList<>(); + // Tests use the rows 21-30. Set up some extra data to simulate the real world scenario where + // there will be other work items working in parallel on the table. + for (int i = 20; i < 32; i++) { + for (int j = 0; j < 2; j++) { + // Insert rows with 2 cells each + Put put = new Put(getRowKey(i)); + put.addColumn(CF, COL, TS + j, getValue(i, j)); + puts.add(put); + } + } + table.put(puts); + } + + /** Deletes the rows whose indices fall in [startIndex, stopIndex). */ + private void deleteRange(int startIndex, int stopIndex) throws IOException { + for (int i = startIndex; i < stopIndex; i++) { + table.delete(new Delete(getRowKey(i))); + } + } + + // Creates a RangeHash for range [startRow, stopRow). + private RangeHash createHash(byte[] startRow, byte[] stopRow) throws IOException { + LOG.debug("Creating hash for rows " + startRow + " to " + stopRow); + BigtableResultHasher hasher = new BigtableResultHasher(); + hasher.startBatch(new ImmutableBytesWritable(startRow)); + + // Scan all the cells for a column. + Scan scan = new Scan().setMaxVersions().withStartRow(startRow).withStopRow(stopRow, false); + + // Read the rows from Bigtable and compute the expected hash.
+ for (Result result : table.getScanner(scan)) { + LOG.debug("Adding result to hash: " + result); + hasher.hashResult(result); + } + hasher.finishBatch(); + return RangeHash.of( + new ImmutableBytesWritable(startRow), + new ImmutableBytesWritable(stopRow), + hasher.getBatchHash()); + } + + private void validateCounters( + PipelineResult result, Long expectedMatches, Long expectedMismatches) { + MetricQueryResults metrics = result.metrics().allMetrics(); + Map counters = + StreamSupport.stream(metrics.getCounters().spliterator(), false) + .collect(Collectors.toMap((m) -> m.getName().getName(), (m) -> m.getAttempted())); + if (expectedMatches > 0) { + Assert.assertEquals(expectedMatches, counters.get("ranges_matched")); + } + if (expectedMismatches > 0) { + Assert.assertEquals(expectedMismatches, counters.get("ranges_not_matched")); + } + } + + ////////// Happy case tests for various setups////////////////////// + @Test + public void testHashMatchesForMultipleRange() throws Exception { + hashes.add(createHash(getRowKey(21), getRowKey(24))); + hashes.add(createHash(getRowKey(24), getRowKey(28))); + + PCollection>>> input = + p.apply(Create.of(KV.of(new String(getRowKey(21)), Arrays.asList(hashes)))); + + PCollection output = input.apply(ParDo.of(doFn)); + PAssert.that(output).empty(); + PipelineResult result = p.run(); + validateCounters(result, 2L, 0L); + } + + @Test + public void testHashMatchesForSingleRange() throws Exception { + hashes.add(createHash(getRowKey(21), getRowKey(24))); + + PCollection>>> input = + p.apply(Create.of(KV.of(new String(getRowKey(21)), Arrays.asList(hashes)))); + + PCollection output = input.apply(ParDo.of(doFn)); + PAssert.that(output).containsInAnyOrder(); + PipelineResult result = p.run(); + validateCounters(result, 1L, 0L); + } + + @Test + public void testHashMatchesForFullTableScanWithMultipleRange() throws Exception { + hashes.add(createHash(EMPTY_ROW_KEY, getRowKey(24))); + hashes.add(createHash(getRowKey(24), EMPTY_ROW_KEY)); + + 
PCollection>>> input = + p.apply(Create.of(KV.of(new String(EMPTY_ROW_KEY), Arrays.asList(hashes)))); + + PCollection output = input.apply(ParDo.of(doFn)); + PAssert.that(output).empty(); + PipelineResult result = p.run(); + validateCounters(result, 2L, 0L); + } + + @Test + public void testHashMatchesForMultipleSingleRowRange() throws Exception { + hashes.add(createHash(getRowKey(22), getRowKey(23))); + hashes.add(createHash(getRowKey(23), getRowKey(24))); + hashes.add(createHash(getRowKey(24), getRowKey(25))); + + PCollection>>> input = + p.apply(Create.of(KV.of(new String(getRowKey(22)), Arrays.asList(hashes)))); + + PCollection output = input.apply(ParDo.of(doFn)); + PAssert.that(output).empty(); + PipelineResult result = p.run(); + validateCounters(result, 3L, 0L); + } + + ///////////////// Test mismatches when Bigtable has extra rows //////////////////// + @Test + public void testAdditionalCellInMiddle() throws Exception { + hashes.add(createHash(getRowKey(21), getRowKey(24))); + hashes.add(createHash(getRowKey(24), getRowKey(27))); + hashes.add(createHash(getRowKey(27), getRowKey(30))); + + // Add an extra cell in the table + table.put(new Put(getRowKey(25)).addColumn(CF, COL, EXTRA_VALUE)); + + PCollection>>> input = + p.apply(Create.of(KV.of(new String(getRowKey(21)), Arrays.asList(hashes)))); + + PCollection output = input.apply(ParDo.of(doFn)); + PAssert.that(output).containsInAnyOrder(hashes.get(1)); + PipelineResult result = p.run(); + validateCounters(result, 2L, 1L); + } + + @Test + public void testAdditionalRowsAtEnds() throws Exception { + hashes.add(createHash(EMPTY_ROW_KEY, getRowKey(24))); + hashes.add(createHash(getRowKey(24), getRowKey(27))); + hashes.add(createHash(getRowKey(27), EMPTY_ROW_KEY)); + + // Add an extra row in the beginning + table.put(new Put(getRowKey(1)).addColumn(CF, COL, EXTRA_VALUE)); + + // Add an extra row at the end. 
+ table.put(new Put(getRowKey(5)).addColumn(CF, COL, EXTRA_VALUE)); + + PCollection>>> input = + p.apply(Create.of(KV.of(new String(EMPTY_ROW_KEY), Arrays.asList(hashes)))); + + PCollection output = input.apply(ParDo.of(doFn)); + PAssert.that(output).containsInAnyOrder(hashes.get(0), hashes.get(2)); + PipelineResult result = p.run(); + validateCounters(result, 1L, 2L); + } + + ///////////////////// Test different values /////////////////////////// + @Test + public void testDifferentValues() throws Exception { + hashes.add(createHash(EMPTY_ROW_KEY, getRowKey(21))); + hashes.add(createHash(getRowKey(21), getRowKey(23))); + hashes.add(createHash(getRowKey(23), getRowKey(25))); + hashes.add(createHash(getRowKey(25), getRowKey(27))); + hashes.add(createHash(getRowKey(27), EMPTY_ROW_KEY)); + + // Modify the CF + table.delete(new Delete(getRowKey(20)).addColumns(CF, COL, TS)); + table.put(new Put(getRowKey(1)).addColumn(CF2, COL, TS, getValue(20, 0))); + + // Modify the qualifier + table.delete(new Delete(getRowKey(22)).addColumns(CF, COL, TS)); + table.put(new Put(getRowKey(22)).addColumn(CF, "random-col".getBytes(), TS, getValue(22, 0))); + + // Modify the timestamp + table.delete(new Delete(getRowKey(24)).addColumns(CF, COL, TS)); + table.put(new Put(getRowKey(24)).addColumn(CF, COL, 1, getValue(24, 0))); + + // Modify the value + table.delete(new Delete(getRowKey(26)).addColumns(CF, COL, TS)); + table.put(new Put(getRowKey(26)).addColumn(CF, COL, getValue(26, 0))); + + PCollection>>> input = + p.apply(Create.of(KV.of(new String(EMPTY_ROW_KEY), Arrays.asList(hashes)))); + + PCollection output = input.apply(ParDo.of(doFn)); + PAssert.that(output) + .containsInAnyOrder(hashes.get(0), hashes.get(1), hashes.get(2), hashes.get(3)); + PipelineResult result = p.run(); + validateCounters(result, 1L, 4L); + } + + ////////////////// Tests with CBT missing data ////////////////////////////// + @Test + public void testMissingRows() throws Exception { + 
hashes.add(createHash(EMPTY_ROW_KEY, getRowKey(21))); + hashes.add(createHash(getRowKey(21), getRowKey(23))); + hashes.add(createHash(getRowKey(23), getRowKey(25))); + hashes.add(createHash(getRowKey(25), getRowKey(27))); + hashes.add(createHash(getRowKey(27), EMPTY_ROW_KEY)); + + // Delete a row at the beginning + table.delete(new Delete(getRowKey(FIRST_ROW_INDEX))); + + // Delete a row at the middle + table.delete(new Delete(getRowKey(24))); + + // Delete a row at the end + table.delete(new Delete(getRowKey(LAST_ROW_INDEX))); + + PCollection>>> input = + p.apply(Create.of(KV.of(new String(EMPTY_ROW_KEY), Arrays.asList(hashes)))); + + PCollection output = input.apply(ParDo.of(doFn)); + PAssert.that(output).containsInAnyOrder(hashes.get(0), hashes.get(2), hashes.get(4)); + PipelineResult result = p.run(); + validateCounters(result, 2L, 3L); + } + + @Test + public void testMissingRanges() throws Exception { + hashes.add(createHash(EMPTY_ROW_KEY, getRowKey(21))); + hashes.add(createHash(getRowKey(21), getRowKey(23))); + hashes.add(createHash(getRowKey(23), getRowKey(25))); + hashes.add(createHash(getRowKey(25), getRowKey(27))); + hashes.add(createHash(getRowKey(27), getRowKey(29))); + hashes.add(createHash(getRowKey(29), EMPTY_ROW_KEY)); + + // Delete a range at the beginning + deleteRange(FIRST_ROW_INDEX, 21); + + // Delete a range in middle + deleteRange(23, 25); + + // Delete row ranges at the end, bigtable scanner will finish with multiple row-ranges to + // process. 
+ deleteRange(27, LAST_ROW_INDEX + 1); + + PCollection>>> input = + p.apply(Create.of(KV.of(new String(EMPTY_ROW_KEY), Arrays.asList(hashes)))); + + PCollection output = input.apply(ParDo.of(doFn)); + PAssert.that(output) + .containsInAnyOrder(hashes.get(0), hashes.get(2), hashes.get(4), hashes.get(5)); + PipelineResult result = p.run(); + validateCounters(result, 2L, 4L); + } + + @Test + public void testCbtEmpty() throws Exception { + hashes.add(createHash(EMPTY_ROW_KEY, getRowKey(25))); + hashes.add(createHash(getRowKey(25), getRowKey(29))); + hashes.add(createHash(getRowKey(29), EMPTY_ROW_KEY)); + + // Delete all data from bigtable + deleteRange(FIRST_ROW_INDEX, LAST_ROW_INDEX); + + PCollection>>> input = + p.apply(Create.of(KV.of(new String(EMPTY_ROW_KEY), Arrays.asList(hashes)))); + + PCollection output = input.apply(ParDo.of(doFn)); + PAssert.that(output).containsInAnyOrder(hashes); + PipelineResult result = p.run(); + validateCounters(result, 0L, 3L); + } + + ////////////////////// Test that scan is used from TableHash.//////////////////////// + @Test + public void testScanFromTableHash() throws Exception { + hashes.add(createHash(getRowKey(21), getRowKey(24))); + hashes.add(createHash(getRowKey(24), getRowKey(27))); + hashes.add(createHash(getRowKey(27), getRowKey(30))); + + // Update the TableHashWrapper Scan to default. Scan from HashTable.TableHash determines the + // cells used to compute hash. CBT has to use the same cells for validation. 
+ fakeTableHashWrapper.scan = new Scan(); + + PCollection>>> input = + p.apply(Create.of(KV.of(new String(getRowKey(21)), Arrays.asList(hashes)))); + + PCollection output = input.apply(ParDo.of(doFn)); + PAssert.that(output).containsInAnyOrder(hashes); + PipelineResult result = p.run(); + validateCounters(result, 0L, 3L); + } + + ////////////////////// Combination of different cases ////////////////////////////////// + @Test + public void testMismatchesComprehensive() throws Exception { + hashes.add(createHash(EMPTY_ROW_KEY, getRowKey(21))); + hashes.add(createHash(getRowKey(21), getRowKey(23))); + hashes.add(createHash(getRowKey(23), getRowKey(25))); + hashes.add(createHash(getRowKey(25), getRowKey(27))); + hashes.add(createHash(getRowKey(27), getRowKey(29))); + hashes.add(createHash(getRowKey(29), EMPTY_ROW_KEY)); + + // Delete a range at the beginning from CBT + deleteRange(FIRST_ROW_INDEX, 21); + + // Delete a row in middle from CBT + table.delete(new Delete(getRowKey(23))); + + // Update a value in CBT + table.delete(new Delete(getRowKey(27)).addColumns(CF, COL, TS)); + table.put(new Put(getRowKey(27)).addColumn(CF, COL, getValue(27, 0))); + + // Add an extra row at the end. 
+ table.put(new Put(getRowKey(5)).addColumn(CF, COL, EXTRA_VALUE)); + + PCollection>>> input = + p.apply(Create.of(KV.of(new String(EMPTY_ROW_KEY), Arrays.asList(hashes)))); + + PCollection output = input.apply(ParDo.of(doFn)); + PAssert.that(output) + .containsInAnyOrder(hashes.get(0), hashes.get(2), hashes.get(4), hashes.get(5)); + PipelineResult result = p.run(); + validateCounters(result, 2L, 4L); + } +} diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/validation/FakeTableHashWrapper.java b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/validation/FakeTableHashWrapper.java new file mode 100644 index 0000000000..ee2b6814e2 --- /dev/null +++ b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/validation/FakeTableHashWrapper.java @@ -0,0 +1,153 @@ +/* + * Copyright 2021 Google Inc. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+package com.google.cloud.bigtable.beam.validation;
+
+import com.google.bigtable.repackaged.com.google.gson.Gson;
+import com.google.common.collect.ImmutableList;
+import java.io.IOException;
+import java.io.ObjectInputStream;
+import java.io.ObjectOutputStream;
+import java.util.ArrayList;
+import java.util.List;
+import org.apache.beam.sdk.values.KV;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.hbase.client.Scan;
+import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
+
+/**
+ * A fake for TableHashWrapper that allows us to mock the behavior of hbase's HashTable.TableHash
+ *
+ * <p>State lives in public, mutable fields so that tests can change it after construction. Java
+ * serialization is customized to write every field as a Gson JSON string.
+ */
+public class FakeTableHashWrapper implements TableHashWrapper {
+
+  // Sorted list of partition keys splitting the key range.
+  public List<ImmutableBytesWritable> partitions;
+  // List of (key, hash) pairs, sorted by key.
+  public List<KV<ImmutableBytesWritable, ImmutableBytesWritable>> hashes;
+  public ImmutableBytesWritable startRowInclusive;
+  public ImmutableBytesWritable stopRowExclusive;
+  public Scan scan;
+  private static final long serialVersionUID = 34876543L;
+
+  /** Creates a fake with default-constructed rows, no partitions, no hashes and a new Scan. */
+  public FakeTableHashWrapper() {
+    this(
+        new ImmutableBytesWritable(),
+        new ImmutableBytesWritable(),
+        new ArrayList<>(),
+        new ArrayList<>(),
+        new Scan());
+  }
+
+  /**
+   * Creates a fake backed by the given state. The lists are stored as passed in (no copy), so
+   * later changes made by the caller are visible through this wrapper.
+   */
+  public FakeTableHashWrapper(
+      ImmutableBytesWritable startRowInclusive,
+      ImmutableBytesWritable stopRowExclusive,
+      List<ImmutableBytesWritable> partitions,
+      List<KV<ImmutableBytesWritable, ImmutableBytesWritable>> hashes,
+      Scan scan) {
+    super();
+    this.startRowInclusive = startRowInclusive;
+    this.stopRowExclusive = stopRowExclusive;
+    this.partitions = partitions;
+    this.hashes = hashes;
+    this.scan = scan;
+  }
+
+  /** Returns one more than the number of partition keys. */
+  @Override
+  public int getNumHashFiles() {
+    return partitions.size() + 1;
+  }
+
+  /** Returns an immutable snapshot of the current partition keys. */
+  @Override
+  public ImmutableList<ImmutableBytesWritable> getPartitions() {
+    return ImmutableList.copyOf(partitions);
+  }
+
+  @Override
+  public ImmutableBytesWritable getStartRow() {
+    return startRowInclusive;
+  }
+
+  @Override
+  public ImmutableBytesWritable getStopRow() {
+    return stopRowExclusive;
+  }
+
+  @Override
+  public Scan getScan() {
+    return scan;
+  }
+
+  /** Returns a reader over the entries of {@code hashes} whose key is >= {@code startRow}. */
+  @Override
+  public TableHashReader newReader(Configuration conf, ImmutableBytesWritable startRow) {
+    return new FakeTableHashReader(startRow);
+  }
+
+  // Custom serialization hook: every field is written as a JSON string, in a fixed order that
+  // readObject must mirror.
+  private void writeObject(ObjectOutputStream s) throws IOException {
+    Gson gson = new Gson();
+    s.writeObject(gson.toJson(scan));
+    s.writeObject(gson.toJson(startRowInclusive));
+    s.writeObject(gson.toJson(stopRowExclusive));
+    s.writeObject(gson.toJson(partitions));
+    s.writeObject(gson.toJson(hashes));
+  }
+
+  // Reads the fields back in the order used by writeObject.
+  // NOTE(review): ArrayList.class carries no element type, so Gson presumably cannot rebuild the
+  // list elements as ImmutableBytesWritable / KV after a round trip -- confirm before relying on
+  // the contents of a deserialized copy.
+  private void readObject(ObjectInputStream s) throws IOException, ClassNotFoundException {
+    Gson gson = new Gson();
+    scan = gson.fromJson((String) s.readObject(), Scan.class);
+    startRowInclusive = gson.fromJson((String) s.readObject(), ImmutableBytesWritable.class);
+    stopRowExclusive = gson.fromJson((String) s.readObject(), ImmutableBytesWritable.class);
+    partitions = gson.fromJson((String) s.readObject(), ArrayList.class);
+    hashes = gson.fromJson((String) s.readObject(), ArrayList.class);
+  }
+
+  /**
+   * Reader over a snapshot of the enclosing wrapper's {@code hashes}, taken at construction time.
+   * Callers must invoke {@link #next()} (and get {@code true}) before reading the current entry.
+   */
+  public class FakeTableHashReader implements TableHashReader {
+    private final ImmutableBytesWritable startRow;
+    // Copy of items to be read by this reader.
+    private final List<KV<ImmutableBytesWritable, ImmutableBytesWritable>> entriesToRead;
+    // First next() will make index = 0, and compare it with the size of entriesToRead.
+    private int index = -1;
+
+    public FakeTableHashReader(ImmutableBytesWritable startRow) {
+      this.startRow = startRow;
+      entriesToRead = new ArrayList<>();
+      for (KV<ImmutableBytesWritable, ImmutableBytesWritable> hash : hashes) {
+        // Collect all the entries at or after startRow.
+        if (hash.getKey().compareTo(startRow) >= 0) {
+          entriesToRead.add(hash);
+        }
+      }
+    }
+
+    /** Advances to the next entry; returns false once all entries have been consumed. */
+    @Override
+    public boolean next() throws IOException {
+      return ++index < entriesToRead.size();
+    }
+
+    @Override
+    public ImmutableBytesWritable getCurrentKey() {
+      return entriesToRead.get(index).getKey();
+    }
+
+    @Override
+    public ImmutableBytesWritable getCurrentHash() {
+      return entriesToRead.get(index).getValue();
+    }
+
+    @Override
+    public void close() throws IOException {
+      // NOOP
+    }
+  }
+}
diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/validation/FakeTableHashWrapperFactory.java b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/validation/FakeTableHashWrapperFactory.java
new file mode 100644
index 0000000000..2e65e3b855
--- /dev/null
+++ b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/validation/FakeTableHashWrapperFactory.java
@@ -0,0 +1,32 @@
+/*
+ * Copyright 2021 Google Inc. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package com.google.cloud.bigtable.beam.validation;
+
+/**
+ * A TableHashWrapperFactory for tests: it always hands out the FakeTableHashWrapper it was
+ * constructed with, regardless of the requested project or hash directory.
+ */
+public class FakeTableHashWrapperFactory extends TableHashWrapperFactory {
+
+  private static final long serialVersionUID = 269854624L;
+
+  // The single wrapper instance returned by every getTableHash call.
+  private final FakeTableHashWrapper fakeTableHashWrapper;
+
+  public FakeTableHashWrapperFactory(FakeTableHashWrapper wrapper) {
+    this.fakeTableHashWrapper = wrapper;
+  }
+
+  /** Returns the wrapper passed to the constructor; both arguments are ignored. */
+  @Override
+  public TableHashWrapper getTableHash(String projectId, String sourceHashDir) {
+    return fakeTableHashWrapper;
+  }
+}
diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/validation/HadoopHashBasedReaderTest.java b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/validation/HadoopHashBasedReaderTest.java
new file mode 100644
index 0000000000..fa88a56d14
--- /dev/null
+++ b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/validation/HadoopHashBasedReaderTest.java
@@ -0,0 +1,179 @@
+/*
+ * Copyright 2021 Google Inc. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package com.google.cloud.bigtable.beam.validation;
+
+import static org.junit.Assert.assertEquals;
+
+import com.google.cloud.bigtable.beam.validation.HadoopHashTableSource.RangeHash;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+import org.apache.beam.sdk.options.ValueProvider.StaticValueProvider;
+import org.apache.beam.sdk.testing.SourceTestUtils;
+import org.apache.beam.sdk.values.KV;
+import org.apache.hadoop.hbase.HConstants;
+import org.apache.hadoop.hbase.client.Scan;
+import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
+import org.junit.Before;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.JUnit4;
+
+/**
+ * Tests reading RangeHashes from a HadoopHashTableSource whose hash data is served by a
+ * FakeTableHashWrapper.
+ */
+@RunWith(JUnit4.class)
+public class HadoopHashBasedReaderTest {
+
+  private HadoopHashTableSource hashTableSource;
+  private FakeTableHashWrapper fakeTableHashWrapper;
+
+  private static final String HASH_TABLE_OUTPUT_PATH_DIR = "gs://my-bucket/outputDir";
+  private static final ImmutableBytesWritable START_ROW =
+      new ImmutableBytesWritable("AAAA".getBytes());
+  private static final ImmutableBytesWritable STOP_ROW =
+      new ImmutableBytesWritable("ZZZZ".getBytes());
+  private static final ImmutableBytesWritable EMPTY_ROW =
+      new ImmutableBytesWritable(HConstants.EMPTY_BYTE_ARRAY);
+  private static final ImmutableBytesWritable START_HASH =
+      new ImmutableBytesWritable("START-HASH".getBytes());
+
+  @Before
+  public void setUp() throws Exception {
+    fakeTableHashWrapper =
+        new FakeTableHashWrapper(
+            START_ROW, STOP_ROW, new ArrayList<>(), new ArrayList<>(), new Scan());
+    hashTableSource =
+        new HadoopHashTableSource(
+            StaticValueProvider.of("cbt-dev"),
+            StaticValueProvider.of(HASH_TABLE_OUTPUT_PATH_DIR),
+            START_ROW,
+            STOP_ROW,
+            new FakeTableHashWrapperFactory(fakeTableHashWrapper));
+  }
+
+  /** Returns the row key {@code "KEY-" + keyIndex} as bytes. */
+  protected static ImmutableBytesWritable getKey(int keyIndex) {
+    return new ImmutableBytesWritable(("KEY-" + keyIndex).getBytes());
+  }
+
+  /** Returns the hash value {@code "HASH-" + hashIndex} as bytes. */
+  protected static ImmutableBytesWritable getHash(int hashIndex) {
+    return new ImmutableBytesWritable(("HASH-" + hashIndex).getBytes());
+  }
+
+  /**
+   * Populates the fakeTableHashWrapper with {@code numEntries} entries starting with startKey.
+   * Returns a List of expected RangeHashes for this data, for numEntries=1, single RangeHash is
+   * returned (startRow, stopRow, START_HASH).
+   */
+  protected List<RangeHash> setupTestData(
+      ImmutableBytesWritable startRow, ImmutableBytesWritable stopRow, int numEntries) {
+    fakeTableHashWrapper.startRowInclusive = startRow;
+    fakeTableHashWrapper.stopRowExclusive = stopRow;
+    fakeTableHashWrapper.hashes.add(KV.of(startRow, START_HASH));
+    for (int i = 0; i < numEntries - 1; i++) {
+      fakeTableHashWrapper.hashes.add(KV.of(getKey(i), getHash(i)));
+    }
+
+    // Setup RangeHashes to be returned. Each entry's key closes the previous range and opens the
+    // next one; the last range is closed by stopRow.
+    List<RangeHash> expectedRangeHashes = new ArrayList<>();
+    ImmutableBytesWritable key = startRow;
+    ImmutableBytesWritable hash = START_HASH;
+    for (int i = 0; i < numEntries - 1; i++) {
+      expectedRangeHashes.add(RangeHash.of(key, getKey(i), hash));
+      key = getKey(i);
+      hash = getHash(i);
+    }
+    expectedRangeHashes.add(RangeHash.of(key, stopRow, hash));
+    return expectedRangeHashes;
+  }
+
+  /////////////////////////////// Test the end of HashTable Output /////////////////////////
+
+  @Test
+  public void testHashReaderEmpty() throws IOException {
+    // The tableHashWrapper has no hashes, this should result in empty source.
+    assertEquals(Arrays.asList(), SourceTestUtils.readFromSource(hashTableSource, null));
+  }
+
+  @Test
+  public void testHashReaderSingleHashBatch() throws IOException {
+    // Setup 1 entry in this hashtable datafile. The test is setup so that HashTable datafile has
+    // only 1 entry.
+    List<RangeHash> expected = setupTestData(START_ROW, STOP_ROW, 1);
+
+    assertEquals(expected, SourceTestUtils.readFromSource(hashTableSource, null));
+  }
+
+  @Test
+  public void testHashReaderMultipleHashBatch() throws IOException {
+    // Setup 4 entries in this hashtable datafile.
+    List<RangeHash> expected = setupTestData(START_ROW, STOP_ROW, 4);
+    assertEquals(expected, SourceTestUtils.readFromSource(hashTableSource, null));
+  }
+
+  //////////////////// Test the end of HashTable output when end of range is ""/////////////////
+  @Test
+  public void testHashReaderWithEmptyEndRow() throws IOException {
+    // Setup 4 entries in this hashtable datafile with no start or stop keys set.
+    List<RangeHash> expected = setupTestData(EMPTY_ROW, EMPTY_ROW, 4);
+    hashTableSource.startRowInclusive = EMPTY_ROW;
+    hashTableSource.stopRowExclusive = EMPTY_ROW;
+    assertEquals(expected, SourceTestUtils.readFromSource(hashTableSource, null));
+  }
+
+  /////////////////////////////// Test reader.getCurrent() >= stopRow /////////////////////////
+
+  @Test
+  public void testHashReaderWorkItemEndedOnFirstBatch() throws IOException {
+    // Setup 1 entry in this hashtable datafile. This entry is outside of the workitem's row
+    // range.
+    fakeTableHashWrapper.hashes.add(KV.of(STOP_ROW, START_HASH));
+    // Source will be empty as no hashes fall in its bounds.
+    assertEquals(
+        new ArrayList<RangeHash>(), SourceTestUtils.readFromSource(hashTableSource, null));
+  }
+
+  @Test
+  public void testHashReaderWorkItemEndedOnSecondEntry() throws IOException {
+    // Setup 1 entry in this hashtable datafile. The test is setup so that HashTable datafile has
+    // only 1 entry.
+    List<RangeHash> expected = setupTestData(START_ROW, STOP_ROW, 1);
+    // Add a next entry at the stop row. Reader should stop and read just 1 entry.
+    fakeTableHashWrapper.hashes.add(KV.of(STOP_ROW, getHash(100)));
+
+    assertEquals(expected, SourceTestUtils.readFromSource(hashTableSource, null));
+  }
+
+  @Test
+  public void testHashReaderWorkItemEndedAfterMultipleBatches() throws IOException {
+    // Setup 4 entries in this hashtable datafile.
+    List<RangeHash> expected = setupTestData(START_ROW, STOP_ROW, 4);
+    // Add a next entry at the stop row. Reader should stop and read just 4 entries.
+    fakeTableHashWrapper.hashes.add(KV.of(STOP_ROW, getHash(100)));
+    assertEquals(expected, SourceTestUtils.readFromSource(hashTableSource, null));
+  }
+
+  @Test
+  public void testSplitEqualsUnsplit() throws Exception {
+    setupTestData(START_ROW, STOP_ROW, 6);
+    fakeTableHashWrapper.partitions = Arrays.asList(getKey(2), getKey(4));
+    SourceTestUtils.assertSourcesEqualReferenceSource(
+        hashTableSource, hashTableSource.split(1, null), null);
+  }
+
+  @Test
+  public void testUnstartedReaderEqualsStarted() throws Exception {
+    setupTestData(START_ROW, STOP_ROW, 6);
+    SourceTestUtils.assertUnstartedReaderReadsSameAsItsSource(
+        hashTableSource.createReader(null), null);
+  }
+}
diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/validation/HadoopHashTableSourceTest.java b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/validation/HadoopHashTableSourceTest.java
new file mode 100644
index 0000000000..a3aba3f756
--- /dev/null
+++ b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/validation/HadoopHashTableSourceTest.java
@@ -0,0 +1,209 @@
+/*
+ * Copyright 2021 Google Inc. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package com.google.cloud.bigtable.beam.validation;
+
+import com.google.cloud.bigtable.beam.validation.HadoopHashTableSource.HashBasedReader;
+import com.google.cloud.bigtable.beam.validation.HadoopHashTableSource.RangeHash;
+import com.google.common.collect.ImmutableList;
+import java.io.IOException;
+import java.util.List;
+import junit.framework.TestCase;
+import org.apache.beam.sdk.io.BoundedSource;
+import org.apache.beam.sdk.io.BoundedSource.BoundedReader;
+import org.apache.beam.sdk.options.ValueProvider;
+import org.apache.beam.sdk.options.ValueProvider.StaticValueProvider;
+import org.apache.hadoop.hbase.HConstants;
+import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
+import org.junit.Before;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.JUnit4;
+
+/**
+ * Tests how HadoopHashTableSource splits itself along the partitions reported by the (fake)
+ * TableHashWrapper, and how it creates readers with and without a prior split.
+ */
+@RunWith(JUnit4.class)
+public class HadoopHashTableSourceTest extends TestCase {
+
+  HadoopHashTableSource source;
+  FakeTableHashWrapper fakeTableHashWrapper;
+
+  private static final ValueProvider<String> PROJECT_ID = StaticValueProvider.of("test-project");
+  private static final ValueProvider<String> HASH_TABLE_OUTPUT_PATH_DIR =
+      StaticValueProvider.of("gs://my-bucket/outputDir");
+  private static final ImmutableBytesWritable START_ROW =
+      new ImmutableBytesWritable("a".getBytes());
+  private static final ImmutableBytesWritable STOP_ROW = new ImmutableBytesWritable("z".getBytes());
+  private static final ImmutableBytesWritable PARTITION1 =
+      new ImmutableBytesWritable("d".getBytes());
+  private static final ImmutableBytesWritable PARTITION2 =
+      new ImmutableBytesWritable("g".getBytes());
+  private static final ImmutableBytesWritable EMPTY_ROW_KEY =
+      new ImmutableBytesWritable(HConstants.EMPTY_BYTE_ARRAY);
+
+  @Before
+  public void setUp() throws Exception {
+    super.setUp();
+    fakeTableHashWrapper = new FakeTableHashWrapper();
+  }
+
+  /**
+   * Configures the fake wrapper with the given range and partitions, builds {@code source} on top
+   * of it and returns the result of splitting that source.
+   */
+  // The cast narrows the declared return type of split(); presumably an unchecked cast from a
+  // wildcard list -- suppressing the warning does not change behavior.
+  @SuppressWarnings("unchecked")
+  private List<BoundedSource<RangeHash>> getSplitSources(
+      List<ImmutableBytesWritable> partitions,
+      ImmutableBytesWritable startRow,
+      ImmutableBytesWritable stopRow)
+      throws IOException {
+    fakeTableHashWrapper.startRowInclusive = startRow;
+    fakeTableHashWrapper.stopRowExclusive = stopRow;
+    fakeTableHashWrapper.partitions = partitions;
+
+    source =
+        new HadoopHashTableSource(
+            PROJECT_ID,
+            HASH_TABLE_OUTPUT_PATH_DIR,
+            startRow,
+            stopRow,
+            new FakeTableHashWrapperFactory(fakeTableHashWrapper));
+    return (List<BoundedSource<RangeHash>>) source.split(0, null);
+  }
+
+  /** Asserts that splitting the described source yields exactly {@code expectedSources}. */
+  private void testSourceSplits(
+      List<ImmutableBytesWritable> partitions,
+      ImmutableBytesWritable startRow,
+      ImmutableBytesWritable stopRow,
+      List<BoundedSource<RangeHash>> expectedSources)
+      throws IOException {
+    assertEquals(expectedSources, getSplitSources(partitions, startRow, stopRow));
+  }
+
+  @Test
+  public void testSplitZeroPartitions() throws IOException {
+    // Row range [a-z) with no splits.
+    List<BoundedSource<RangeHash>> expected =
+        ImmutableList.of(
+            new HadoopHashTableSource(PROJECT_ID, HASH_TABLE_OUTPUT_PATH_DIR, START_ROW, STOP_ROW));
+    testSourceSplits(ImmutableList.of(), START_ROW, STOP_ROW, expected);
+  }
+
+  @Test
+  public void testSplitOnePartition() throws IOException {
+    // Row range [a-z) with 1 split.
+    List<BoundedSource<RangeHash>> expected =
+        ImmutableList.of(
+            new HadoopHashTableSource(
+                PROJECT_ID, HASH_TABLE_OUTPUT_PATH_DIR, START_ROW, PARTITION1),
+            new HadoopHashTableSource(
+                PROJECT_ID, HASH_TABLE_OUTPUT_PATH_DIR, PARTITION1, STOP_ROW));
+    testSourceSplits(ImmutableList.of(PARTITION1), START_ROW, STOP_ROW, expected);
+  }
+
+  @Test
+  public void testMultiplePartitons() throws IOException {
+    // Row range [a-z) with splits on {d,g}. The data files will be for {[a,d), [d,g), [g,z)}.
+    List<BoundedSource<RangeHash>> expected =
+        ImmutableList.of(
+            new HadoopHashTableSource(
+                PROJECT_ID, HASH_TABLE_OUTPUT_PATH_DIR, START_ROW, PARTITION1),
+            new HadoopHashTableSource(
+                PROJECT_ID, HASH_TABLE_OUTPUT_PATH_DIR, PARTITION1, PARTITION2),
+            new HadoopHashTableSource(
+                PROJECT_ID, HASH_TABLE_OUTPUT_PATH_DIR, PARTITION2, STOP_ROW));
+    testSourceSplits(ImmutableList.of(PARTITION1, PARTITION2), START_ROW, STOP_ROW, expected);
+  }
+
+  @Test
+  public void testSplitEmptyStartRow() throws IOException {
+    // Row range [""-z) with splits on {d,g}. The data files will be for {["",d), [d,g), [g,z)}.
+    List<BoundedSource<RangeHash>> expected =
+        ImmutableList.of(
+            new HadoopHashTableSource(
+                PROJECT_ID, HASH_TABLE_OUTPUT_PATH_DIR, EMPTY_ROW_KEY, PARTITION1),
+            new HadoopHashTableSource(
+                PROJECT_ID, HASH_TABLE_OUTPUT_PATH_DIR, PARTITION1, PARTITION2),
+            new HadoopHashTableSource(
+                PROJECT_ID, HASH_TABLE_OUTPUT_PATH_DIR, PARTITION2, STOP_ROW));
+    testSourceSplits(ImmutableList.of(PARTITION1, PARTITION2), EMPTY_ROW_KEY, STOP_ROW, expected);
+  }
+
+  @Test
+  public void testSplitEmptyStopRow() throws IOException {
+    // Row range [a-"") with splits on {d,g}. The data files will be for {[a,d), [d,g), [g,"")}.
+    List<BoundedSource<RangeHash>> expected =
+        ImmutableList.of(
+            new HadoopHashTableSource(
+                PROJECT_ID, HASH_TABLE_OUTPUT_PATH_DIR, START_ROW, PARTITION1),
+            new HadoopHashTableSource(
+                PROJECT_ID, HASH_TABLE_OUTPUT_PATH_DIR, PARTITION1, PARTITION2),
+            new HadoopHashTableSource(
+                PROJECT_ID, HASH_TABLE_OUTPUT_PATH_DIR, PARTITION2, EMPTY_ROW_KEY));
+    testSourceSplits(ImmutableList.of(PARTITION1, PARTITION2), START_ROW, EMPTY_ROW_KEY, expected);
+  }
+
+  @Test
+  public void testSplitFullTableScan() throws IOException {
+    // Row range [""-"") with splits on {d,g}. The data files will be for {["",d), [d,g), [g,"")}.
+    List<BoundedSource<RangeHash>> expected =
+        ImmutableList.of(
+            new HadoopHashTableSource(
+                PROJECT_ID, HASH_TABLE_OUTPUT_PATH_DIR, EMPTY_ROW_KEY, PARTITION1),
+            new HadoopHashTableSource(
+                PROJECT_ID, HASH_TABLE_OUTPUT_PATH_DIR, PARTITION1, PARTITION2),
+            new HadoopHashTableSource(
+                PROJECT_ID, HASH_TABLE_OUTPUT_PATH_DIR, PARTITION2, EMPTY_ROW_KEY));
+    testSourceSplits(
+        ImmutableList.of(PARTITION1, PARTITION2), EMPTY_ROW_KEY, EMPTY_ROW_KEY, expected);
+  }
+
+  @Test
+  public void testCreateReaderWithoutSplit() throws IOException {
+    source =
+        new HadoopHashTableSource(
+            PROJECT_ID,
+            HASH_TABLE_OUTPUT_PATH_DIR,
+            // When split is not called, start/stop are uninitialized. Start/stop are runtime params
+            // and are initialized in split/createReader.
+            null,
+            null,
+            new FakeTableHashWrapperFactory(fakeTableHashWrapper));
+    // Setup boundaries on the TableHashWrapper to be used in Source.
+    fakeTableHashWrapper.startRowInclusive = START_ROW;
+    fakeTableHashWrapper.stopRowExclusive = STOP_ROW;
+
+    // Create a new Reader
+    BoundedReader<RangeHash> reader = source.createReader(null);
+
+    // Validate that the reader was properly created.
+    assertEquals(HashBasedReader.class, reader.getClass());
+    assertEquals(source, reader.getCurrentSource());
+    HashBasedReader hashBasedReader = (HashBasedReader) reader;
+    assertEquals(START_ROW, hashBasedReader.startRowInclusive);
+    assertEquals(STOP_ROW, hashBasedReader.stopRowExclusive);
+  }
+
+  @Test
+  public void testCreateReaderAfterSplit() throws IOException {
+    // A single partition will return 2 sources.
+    List<BoundedSource<RangeHash>> splitSources =
+        getSplitSources(ImmutableList.of(PARTITION1), START_ROW, STOP_ROW);
+    BoundedSource<RangeHash> splitHashSource = splitSources.get(0);
+
+    // Create a new Reader
+    BoundedReader<RangeHash> reader = splitHashSource.createReader(null);
+
+    // Validate that the reader was properly created.
+    assertEquals(HashBasedReader.class, reader.getClass());
+    assertEquals(splitHashSource, reader.getCurrentSource());
+    HashBasedReader hashBasedReader = (HashBasedReader) reader;
+    assertEquals(START_ROW, hashBasedReader.startRowInclusive);
+    assertEquals(PARTITION1, hashBasedReader.stopRowExclusive);
+  }
+}
diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/validation/HashBasedSourceSerializationTest.java b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/validation/HashBasedSourceSerializationTest.java
new file mode 100644
index 0000000000..f58becf3cb
--- /dev/null
+++ b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/validation/HashBasedSourceSerializationTest.java
@@ -0,0 +1,122 @@
+/*
+ * Copyright 2021 Google Inc. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package com.google.cloud.bigtable.beam.validation;
+
+import static com.google.common.truth.Truth.assertWithMessage;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.ObjectInputStream;
+import java.io.ObjectOutputStream;
+import java.lang.reflect.Field;
+import java.lang.reflect.Modifier;
+import junit.framework.TestCase;
+import org.apache.beam.sdk.options.ValueProvider.StaticValueProvider;
+import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
+import org.junit.Before;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.JUnit4;
+
+/**
+ * Verifies that the hash-table sources survive a Java serialization round trip unchanged and that
+ * their classes declare a well-formed {@code serialVersionUID}.
+ */
+@RunWith(JUnit4.class)
+public class HashBasedSourceSerializationTest extends TestCase {
+
+  public static final String SOURCE_HASH_DIR = "gs://my-bucket/outputDir";
+  public static final String PROJECT_ID = "test-project";
+  private static final ImmutableBytesWritable START_ROW =
+      new ImmutableBytesWritable("a".getBytes());
+  private static final ImmutableBytesWritable STOP_ROW = new ImmutableBytesWritable("y".getBytes());
+
+  @Before
+  public void setUp() throws Exception {
+    super.setUp();
+  }
+
+  @Test
+  public void testSerializeWithValueProviders() throws IOException {
+    checkSerialization(
+        new HadoopHashTableSource(
+            StaticValueProvider.of(PROJECT_ID), StaticValueProvider.of(SOURCE_HASH_DIR)));
+  }
+
+  @Test
+  public void testSerializeWithStartStop() throws IOException {
+    checkSerialization(
+        new HadoopHashTableSource(
+            StaticValueProvider.of(PROJECT_ID),
+            StaticValueProvider.of(SOURCE_HASH_DIR),
+            new ImmutableBytesWritable(START_ROW),
+            new ImmutableBytesWritable(STOP_ROW)));
+  }
+
+  @Test
+  public void testBufferedSourceSerialize() {
+    checkSerialization(
+        new BufferedHadoopHashTableSource(
+            new HadoopHashTableSource(
+                StaticValueProvider.of(PROJECT_ID), StaticValueProvider.of(SOURCE_HASH_DIR))));
+  }
+
+  @Test
+  public void testBufferedSourceSerializeWithBatchSize() {
+    checkSerialization(
+        new BufferedHadoopHashTableSource(
+            new HadoopHashTableSource(
+                StaticValueProvider.of(PROJECT_ID), StaticValueProvider.of(SOURCE_HASH_DIR)),
+            5));
+  }
+
+  /**
+   * Serializes and deserializes {@code source}, then asserts that its class declares a proper
+   * {@code serialVersionUID} and that the deserialized copy equals the original. Serialization
+   * exceptions are reported as test failures.
+   */
+  private static void checkSerialization(Object source) {
+    try {
+      Object deserialized = serializeDeserialize(source);
+      checkClassDeclaresSerialVersionUid(source.getClass());
+      assertEquals(source, deserialized);
+    } catch (IOException | ClassNotFoundException e) {
+      fail(e.toString());
+    }
+  }
+
+  /**
+   * Fails unless {@code cls} itself declares a {@code private static final long serialVersionUID}
+   * field. Only fields declared directly on {@code cls} are inspected.
+   */
+  private static void checkClassDeclaresSerialVersionUid(Class<?> cls) {
+    String uid = "serialVersionUID";
+    for (Field field : cls.getDeclaredFields()) {
+      // Compare by value: == on Strings tests reference identity and only works by accident of
+      // interning.
+      if (uid.equals(field.getName())) {
+        int modifiers = field.getModifiers();
+        assertWithMessage(field + " is not static").that(Modifier.isStatic(modifiers)).isTrue();
+        assertWithMessage(field + " is not final").that(Modifier.isFinal(modifiers)).isTrue();
+        assertWithMessage(field + " is not private").that(Modifier.isPrivate(modifiers)).isTrue();
+        assertWithMessage(field + " must be long")
+            .that(field.getType().getSimpleName())
+            .isEqualTo("long");
+        return;
+      }
+    }
+    fail(cls + " does not declare serialVersionUID");
+  }
+
+  /** Writes {@code obj} to an in-memory stream with Java serialization and reads it back. */
+  private static Object serializeDeserialize(Object obj)
+      throws IOException, ClassNotFoundException {
+    ByteArrayOutputStream bos = new ByteArrayOutputStream();
+    try (ObjectOutputStream outStream = new ObjectOutputStream(bos)) {
+      outStream.writeObject(obj);
+    }
+
+    ByteArrayInputStream bis = new ByteArrayInputStream(bos.toByteArray());
+    try (ObjectInputStream inStream = new ObjectInputStream(bis)) {
+      return inStream.readObject();
+    }
+  }
+}
diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/validation/RangeHashCoderTest.java b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/validation/RangeHashCoderTest.java
new file mode 100644
index 0000000000..5f644e3b50
--- /dev/null
+++
b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/validation/RangeHashCoderTest.java @@ -0,0 +1,51 @@ +/* + * Copyright 2021 Google Inc. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.google.cloud.bigtable.beam.validation; + +import com.google.cloud.bigtable.beam.validation.HadoopHashTableSource.RangeHash; +import org.apache.beam.sdk.coders.CoderException; +import org.apache.beam.sdk.testing.CoderProperties; +import org.apache.beam.sdk.util.CoderUtils; +import org.apache.beam.sdk.values.TypeDescriptor; +import org.apache.hadoop.hbase.HConstants; +import org.apache.hadoop.hbase.io.ImmutableBytesWritable; +import org.junit.Assert; +import org.junit.Test; + +public class RangeHashCoderTest { + private static final RangeHashCoder TEST_CODER = new RangeHashCoder(); + private static final ImmutableBytesWritable START = + new ImmutableBytesWritable("Start".getBytes()); + private static final ImmutableBytesWritable STOP = new ImmutableBytesWritable("Stop".getBytes()); + private static final ImmutableBytesWritable HASH = new ImmutableBytesWritable("hash".getBytes()); + private static final ImmutableBytesWritable EMPTY = + new ImmutableBytesWritable(HConstants.EMPTY_BYTE_ARRAY); + + @Test + public void encodeRangeHash() throws Exception { + CoderProperties.coderDecodeEncodeEqual(TEST_CODER, RangeHash.of(START, STOP, HASH)); + } + + @Test(expected = CoderException.class) + public void 
encodeNullThrowsCoderException() throws Exception { + CoderUtils.encodeToByteArray(TEST_CODER, null); + } + + @Test + public void testEncodedTypeDescriptor() throws Exception { + Assert.assertEquals(TEST_CODER.getEncodedTypeDescriptor(), TypeDescriptor.of(RangeHash.class)); + } +}