diff --git a/CMakeLists.txt b/CMakeLists.txt index 5aa8b930..2b69b109 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -675,6 +675,8 @@ set(LIBPARQUET_SRCS src/parquet/column_reader.cc src/parquet/column_scanner.cc src/parquet/column_writer.cc + src/parquet/murmur3.cc + src/parquet/bloom.cc src/parquet/file/metadata.cc src/parquet/file/printer.cc diff --git a/src/parquet/CMakeLists.txt b/src/parquet/CMakeLists.txt index a2e283e2..a8a87323 100644 --- a/src/parquet/CMakeLists.txt +++ b/src/parquet/CMakeLists.txt @@ -27,6 +27,8 @@ install(FILES schema.h statistics.h types.h + bloom.h + murmur3.h DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/parquet") configure_file(parquet_version.h.in @@ -56,6 +58,7 @@ ADD_PARQUET_TEST(public-api-test) ADD_PARQUET_TEST(types-test) ADD_PARQUET_TEST(reader-test) ADD_PARQUET_TEST(schema-test) +ADD_PARQUET_TEST(bloom-test) ADD_PARQUET_BENCHMARK(column-io-benchmark) ADD_PARQUET_BENCHMARK(encoding-benchmark) diff --git a/src/parquet/bloom-test.cc b/src/parquet/bloom-test.cc new file mode 100644 index 00000000..5621d13b --- /dev/null +++ b/src/parquet/bloom-test.cc @@ -0,0 +1,73 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+
+#include <gtest/gtest.h>
+
+#include <cstdint>
+#include <cstring>
+#include <memory>
+
+#include "parquet/bloom.h"
+#include "parquet/murmur3.h"
+
+#include "parquet/util/memory.h"
+
+namespace parquet {
+namespace test {
+
+namespace {
+
+// Reads one native-endian uint32_t header field from `source` into `out`.
+// The value is copied with memcpy so the test does not depend on the stream
+// buffer being suitably aligned for uint32_t.
+// Returns false when the stream could not supply exactly four bytes.
+bool ReadUint32(InMemoryInputStream* source, uint32_t* out) {
+  int64_t bytes_available = 0;
+  const uint8_t* data = source->Read(sizeof(uint32_t), &bytes_available);
+  if (data == nullptr || bytes_available != static_cast<int64_t>(sizeof(uint32_t))) {
+    return false;
+  }
+  std::memcpy(out, data, sizeof(uint32_t));
+  return true;
+}
+
+}  // namespace
+
+// Pins the first 64-bit word produced by MurmurHash3_x64_128 for a fixed input.
+TEST(Murmur3Test, TestBloomFilter) {
+  const uint8_t bitset[8] = {0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7};
+  // MurmurHash3_x64_128 stores two 64-bit words through `out`, so the
+  // destination must be 16 bytes; a single int64_t would be overrun.
+  int64_t result[2] = {0, 0};
+  MurmurHash3_x64_128(bitset, 8, Bloom::DEFAULT_SEED, result);
+  ASSERT_EQ(result[0], static_cast<int64_t>(-3850979349427597861LL));
+}
+
+// Round-trips a filter through writeTo() and the deserialising constructor and
+// checks that every inserted value is still reported as present.
+TEST(FindTest, TestBloomFilter) {
+  std::unique_ptr<Bloom> bloom(new Bloom(1024));
+  for (int i = 0; i < 10; i++) {
+    const uint64_t hash_value = bloom->hash(i);
+    bloom->insert(hash_value);
+  }
+
+  std::shared_ptr<InMemoryOutputStream> sink(new InMemoryOutputStream());
+  bloom->writeTo(sink);
+
+  std::shared_ptr<InMemoryInputStream> source(
+      new InMemoryInputStream(sink->GetBuffer()));
+
+  // Header layout written by Bloom::writeTo: byte length, hash strategy,
+  // algorithm, each as a 4-byte value, followed by the bitset itself.
+  uint32_t length = 0;
+  ASSERT_TRUE(ReadUint32(source.get(), &length));
+  ASSERT_EQ(length, 1024u);
+
+  uint32_t hash = 0;
+  ASSERT_TRUE(ReadUint32(source.get(), &hash));
+  ASSERT_EQ(hash, 0u);
+
+  uint32_t algo = 0;
+  ASSERT_TRUE(ReadUint32(source.get(), &algo));
+  ASSERT_EQ(algo, 0u);
+
+  int64_t bytes_available = 0;
+  const uint8_t* bitset = source->Read(length, &bytes_available);
+  ASSERT_TRUE(bitset != nullptr);
+  ASSERT_EQ(bytes_available, static_cast<int64_t>(length));
+
+  std::unique_ptr<Bloom> de_bloom(new Bloom(bitset, length));
+
+  for (int i = 0; i < 10; i++) {
+    ASSERT_TRUE(de_bloom->find(bloom->hash(i)));
+  }
+}
+
+}  // namespace test
+}  // namespace parquet
diff --git a/src/parquet/bloom.cc b/src/parquet/bloom.cc
new file mode 100644
index 00000000..cac71e45
--- /dev/null
+++ b/src/parquet/bloom.cc
@@ -0,0 +1,216 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.
You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+#include "parquet/bloom.h"
+
+#include <cmath>
+#include <cstdint>
+#include <cstring>
+
+#include "arrow/util/bit-util.h"
+
+#include "parquet/exception.h"
+#include "parquet/murmur3.h"
+#include "parquet/util/logging.h"
+
+namespace parquet {
+
+namespace {
+
+// Unsigned copies of the class constants, so the size arithmetic below never
+// mixes signed and unsigned operands.
+constexpr uint32_t kBytesPerBlock =
+    static_cast<uint32_t>(Bloom::BYTES_PER_FILTER_BLOCK);
+constexpr uint32_t kMaxFilterBytes =
+    static_cast<uint32_t>(Bloom::DEFAULT_MAXIMUM_BLOOM_FILTER_BYTES);
+
+// Returns true when `value` has exactly one bit set.
+inline bool IsPowerOfTwo(uint32_t value) {
+  return value != 0 && (value & (value - 1)) == 0;
+}
+
+// Hashes `len` bytes at `data` with `func` and the default seed and returns the
+// first 64-bit word of the 128-bit result.
+inline uint64_t HashBytes(Bloom::HashFunc func, const void* data, int len) {
+  uint64_t out[2] = {0, 0};
+  (*func)(data, len, Bloom::DEFAULT_SEED, out);
+  return out[0];
+}
+
+// Selects the 32-byte block for `hash`: the upper 32 bits of the hash, masked
+// by (number of blocks - 1). Requires `num_bytes` to be a power-of-two
+// multiple of the block size, which both constructors guarantee.
+inline uint32_t BucketIndex(uint64_t hash, uint32_t num_bytes) {
+  return static_cast<uint32_t>(hash >> 32) & (num_bytes / kBytesPerBlock - 1);
+}
+
+}  // namespace
+
+constexpr uint32_t Bloom::SALT[8];
+
+// Creates an empty filter. The requested size is clamped and rounded by
+// initBitset(); `num_bytes` afterwards holds the size actually allocated.
+Bloom::Bloom(uint32_t num_bytes)
+    : num_bytes(num_bytes),
+      hash_strategy(MURMUR3_X64_128),
+      algorithm(BLOCK),
+      bitset(nullptr),
+      hashFunc(nullptr) {
+  // Resolve the hash function before allocating, so a throw cannot leak the
+  // bitset (the destructor does not run for a partially constructed object).
+  switch (hash_strategy) {
+    case MURMUR3_X64_128:
+      this->hashFunc = &MurmurHash3_x64_128;
+      break;
+    default:
+      throw parquet::ParquetException("Unknown hash strategy.");
+  }
+
+  initBitset(num_bytes);
+}
+
+// Allocates a zeroed bitset of at least BYTES_PER_FILTER_BLOCK and at most
+// DEFAULT_MAXIMUM_BLOOM_FILTER_BYTES bytes, rounded up to a power of two, and
+// records the final size in the `num_bytes` member.
+void Bloom::initBitset(uint32_t num_bytes) {
+  if (num_bytes < kBytesPerBlock) {
+    num_bytes = kBytesPerBlock;
+  }
+
+  if (num_bytes > kMaxFilterBytes) {
+    num_bytes = kMaxFilterBytes;
+  }
+
+  // Get next power of 2 if it is not power of 2.
+  if (!IsPowerOfTwo(num_bytes)) {
+    num_bytes = static_cast<uint32_t>(::arrow::BitUtil::NextPower2(num_bytes));
+  }
+
+  // The member must match the allocation: it is used for the bucket mask and
+  // is written to the serialised header.
+  this->num_bytes = num_bytes;
+
+  // Value-initialisation zeroes the buffer; an uninitialised bitset would
+  // report arbitrary values as present.
+  this->bitset = new uint8_t[num_bytes]();
+}
+
+// Rebuilds a filter from a serialised bitset by copying `num_bytes` bytes.
+// Throws ParquetException when the bitset is null or its length is not a
+// power of two of at least BYTES_PER_FILTER_BLOCK, because the bucket mask
+// computation would otherwise index outside the buffer.
+Bloom::Bloom(const uint8_t* bitset, uint32_t num_bytes)
+    : num_bytes(num_bytes),
+      hash_strategy(MURMUR3_X64_128),
+      algorithm(BLOCK),
+      bitset(nullptr),
+      hashFunc(nullptr) {
+  if (bitset == nullptr || num_bytes < kBytesPerBlock || !IsPowerOfTwo(num_bytes)) {
+    throw parquet::ParquetException("Invalid bloom filter bitset.");
+  }
+
+  switch (hash_strategy) {
+    case MURMUR3_X64_128:
+      this->hashFunc = &MurmurHash3_x64_128;
+      break;
+    default:
+      // Throw by value; `throw new ...` would throw a pointer that
+      // catch (const ParquetException&) never matches.
+      throw parquet::ParquetException("Not supported hash strategy");
+  }
+
+  this->bitset = new uint8_t[num_bytes];
+  std::memcpy(this->bitset, bitset, num_bytes);
+}
+
+// Derives the eight single-bit masks for `key`, one per 32-bit word of a
+// block: the top five bits of key * SALT[i] pick the bit position in word i.
+void Bloom::setMask(uint32_t key, uint32_t mask[8]) {
+  for (int i = 0; i < 8; ++i) {
+    const uint32_t bit_index = (key * SALT[i]) >> 27;
+    mask[i] = 0x1U << bit_index;
+  }
+}
+
+// Returns the number of bits for `ndv` distinct values at false positive
+// probability `fpp`: m = -8 * ndv / ln(1 - fpp^(1/8)), rounded up to a power
+// of two and clamped to [BYTES_PER_FILTER_BLOCK * 8,
+// DEFAULT_MAXIMUM_BLOOM_FILTER_BYTES * 8].
+uint32_t Bloom::optimalNumOfBits(uint32_t ndv, double fpp) {
+  DCHECK(fpp > 0.0 && fpp < 1.0);
+  // -8.0 forces floating-point arithmetic; `-8 * ndv` would be evaluated as
+  // unsigned and wrap around.
+  const double m = -8.0 * ndv / std::log(1 - std::pow(fpp, 1.0 / 8));
+  const double max_bits = static_cast<double>(kMaxFilterBytes) * 8;
+  const uint32_t min_bits = kBytesPerBlock * 8;
+
+  // Check the range before converting; the negated comparison also sends NaN
+  // to the maximum.
+  uint32_t num_bits;
+  if (!(m >= 0.0) || m > max_bits) {
+    num_bits = static_cast<uint32_t>(max_bits);
+  } else {
+    num_bits = static_cast<uint32_t>(m);
+  }
+
+  // Get next power of 2 if bits is not power of 2.
+  if (num_bits != 0 && !IsPowerOfTwo(num_bits)) {
+    num_bits = static_cast<uint32_t>(::arrow::BitUtil::NextPower2(num_bits));
+  }
+
+  // Minimum
+  if (num_bits < min_bits) {
+    num_bits = min_bits;
+  }
+
+  return num_bits;
+}
+
+// Sets the eight bits selected by `hash` in its block.
+void Bloom::addElement(uint64_t hash) {
+  uint32_t* const bitset32 = reinterpret_cast<uint32_t*>(bitset);
+  const uint32_t bucket_index = BucketIndex(hash, num_bytes);
+  const uint32_t key = static_cast<uint32_t>(hash);
+
+  // Calculate mask for bucket.
+  uint32_t mask[8];
+  setMask(key, mask);
+
+  for (int i = 0; i < 8; ++i) {
+    bitset32[bucket_index * 8 + i] |= mask[i];
+  }
+}
+
+// Returns false as soon as one of the eight bits selected by `hash` is clear.
+bool Bloom::contains(uint64_t hash) {
+  const uint32_t* const bitset32 = reinterpret_cast<const uint32_t*>(bitset);
+  const uint32_t bucket_index = BucketIndex(hash, num_bytes);
+  const uint32_t key = static_cast<uint32_t>(hash);
+
+  // Calculate mask for bucket.
+  uint32_t mask[8];
+  setMask(key, mask);
+
+  for (int i = 0; i < 8; ++i) {
+    if (0 == (bitset32[8 * bucket_index + i] & mask[i])) {
+      return false;
+    }
+  }
+  return true;
+}
+
+bool Bloom::find(uint64_t hash) {
+  return contains(hash);
+}
+
+void Bloom::insert(unsigned long long hash) {
+  addElement(hash);
+}
+
+uint64_t Bloom::hash(int value) {
+  return HashBytes(hashFunc, &value, sizeof(value));
+}
+
+uint64_t Bloom::hash(const long value) {
+  // Hash a fixed 8-byte representation so the result does not depend on
+  // sizeof(long), which is 4 on some platforms.
+  const int64_t widened = value;
+  return HashBytes(hashFunc, &widened, sizeof(widened));
+}
+
+uint64_t Bloom::hash(const float value) {
+  return HashBytes(hashFunc, &value, sizeof(value));
+}
+
+uint64_t Bloom::hash(const double value) {
+  return HashBytes(hashFunc, &value, sizeof(value));
+}
+
+uint64_t Bloom::hash(const Int96 &value) {
+  return HashBytes(hashFunc, value.value, sizeof(value.value));
+}
+
+uint64_t Bloom::hash(const ByteArray &value) {
+  return HashBytes(hashFunc, value.ptr, static_cast<int>(value.len));
+}
+
+uint64_t Bloom::hash(const FLBA &value, uint32_t len) {
+  return HashBytes(hashFunc, value.ptr, static_cast<int>(len));
+}
+
+// Writes the 12-byte header (bitset length, hash strategy, algorithm; each a
+// 4-byte value in native byte order) followed by the bitset.
+void Bloom::writeTo(const std::shared_ptr<OutputStream>& sink) {
+  // Convert the enums to uint32_t explicitly rather than reinterpreting their
+  // storage, whose size is implementation-defined.
+  const uint32_t header[3] = {num_bytes, static_cast<uint32_t>(hash_strategy),
+                              static_cast<uint32_t>(algorithm)};
+  sink->Write(reinterpret_cast<const uint8_t*>(header), sizeof(header));
+  sink->Write(bitset, num_bytes);
+}
+
+Bloom::~Bloom() {
+  // The bitset is allocated with new[], so it must be released with delete[]
+  // (not free()). delete[] on a null pointer is a no-op.
+  delete[] bitset;
+}
+}  // namespace parquet
+
diff --git a/src/parquet/bloom.h b/src/parquet/bloom.h
new file mode 100644
index 00000000..fb3a7996
--- /dev/null
+++ b/src/parquet/bloom.h
@@ -0,0 +1,177 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifndef PARQUET_BLOOM_H
+#define PARQUET_BLOOM_H
+
+#include <cstdint>
+#include <memory>
+
+#include "parquet/types.h"
+#include "parquet/util/memory.h"
+
+namespace parquet{
+  class OutputStream;
+
+  // A Bloom filter is a compact structure that indicates whether an item is
+  // definitely not in a set or probably in it. The Bloom class is the underlying
+  // class of the Bloom filter; it stores a bit set representing the set of
+  // elements, the hash strategy and the Bloom filter algorithm.
+
+  // The Bloom filter algorithm is implemented using block Bloom filters from
+  // Putze et al.'s "Cache-, Hash- and Space-Efficient Bloom Filters". The basic
+  // idea is to hash the item to a tiny Bloom filter whose size fits a single
+  // cache line or smaller. This implementation sets 8 bits in each tiny Bloom
+  // filter. Tiny Bloom filters are 32 bytes to take advantage of 32-byte SIMD
+  // instructions.
+
+class Bloom {
+public:
+  // Hash strategy available for bloom filter. The numeric value is written to
+  // the serialised header by writeTo(), so it is pinned explicitly.
+  enum HashStrategy {
+    MURMUR3_X64_128 = 0
+  };
+
+  // Bloom filter algorithm. The numeric value is written to the serialised
+  // header by writeTo(), so it is pinned explicitly.
+  enum Algorithm {
+    BLOCK = 0
+  };
+
+  /**
+   * Default false positive probability value used to calculate the optimal
+   * number of bits used by the bloom filter.
+   */
+  static constexpr double DEFAULT_FPP = 0.01;
+
+  // Bloom filter data header, including number of bytes, hash strategy and algorithm.
+  static constexpr int HEADER_SIZE = 12;
+
+  // Bytes in a tiny bloom filter block.
+  static constexpr int BYTES_PER_FILTER_BLOCK = 32;
+
+  // Default seed for hash function
+  static constexpr int DEFAULT_SEED = 104729;
+
+  // Default maximum bloom filter size (need to discuss)
+  static constexpr int DEFAULT_MAXIMUM_BLOOM_FILTER_BYTES = 16 * 1024 * 1024;
+
+  // The block based algorithm needs 8 odd SALT values to calculate eight
+  // bit indexes to set, one bit in each 32-bit word.
+  static constexpr uint32_t SALT[8] = { 0x47b6137bU, 0x44974d91U, 0x8824ad5bU,
+      0xa2b7289dU, 0x705495c7U, 0x2df1424bU, 0x9efc4947U, 0x5c6bfb31U };
+
+  // Signature shared by the MurmurHash3 functions: (key, len, seed, out).
+  typedef void (*HashFunc)(const void *, int, uint32_t, void*);
+public:
+  /// Constructor of bloom filter. It uses murmur3_x64_128 as its hash function
+  /// and the block based algorithm. The bitset is allocated immediately.
+  /// @param num_bytes The requested number of bytes for the bloom filter
+  /// bitset. The allocation is at least BYTES_PER_FILTER_BLOCK bytes, at most
+  /// DEFAULT_MAXIMUM_BLOOM_FILTER_BYTES bytes, and rounded up to a power of 2.
+  explicit Bloom(uint32_t num_bytes);
+
+
+  /// Construct the bloom filter from a given bit set; used when reconstructing
+  /// a bloom filter from a parquet file. It uses murmur3_x64_128 as its hash
+  /// function and the block based algorithm. The bit set is copied.
+  /// @param bitset The given bitset to construct bloom filter.
+  /// @param len Length of bitset in bytes.
+  Bloom(const uint8_t* bitset, uint32_t len);
+
+  // The class owns a raw buffer, so both copy operations are disabled to
+  // prevent two objects from releasing the same bitset.
+  Bloom(const Bloom& orig) = delete;
+  Bloom& operator=(const Bloom& orig) = delete;
+  virtual ~Bloom();
+
+  // Calculate optimal size according to the number of distinct values and false
+  // positive probability.
+  // @param ndv: The number of distinct values.
+  // @param fpp: The false positive probability.
+  // @return optimal number of bits for the given ndv and fpp.
+  static uint32_t optimalNumOfBits(uint32_t ndv, double fpp);
+
+  // Determine whether an element exists in the set or not.
+  // @param hash the hash of the element to look up.
+  // @return false if value is definitely not in set, and true means PROBABLY in set.
+  bool find(uint64_t hash);
+
+  // Insert element to set represented by bloom bitset.
+  // @param hash the hash of value to insert into bloom filter.
+  void insert(unsigned long long hash);
+
+  // Compute hash for int value by using its plain encoding result.
+  // @param value the value to hash.
+  // @return hash result.
+  uint64_t hash(const int value);
+
+  // Compute hash for long value by using its plain encoding result.
+  // @param value the value to hash.
+  // @return hash result.
+  uint64_t hash(const long value);
+
+  // Compute hash for float value by using its plain encoding result.
+  // @param value the value to hash.
+  // @return hash result.
+  uint64_t hash(const float value);
+
+  // Compute hash for double value by using its plain encoding result.
+  // @param value the value to hash.
+  // @return hash result.
+  uint64_t hash(const double value);
+
+  // Compute hash for Int96 value by using its plain encoding result.
+  // @param value the value to hash.
+  // @return hash result.
+  uint64_t hash(const Int96 &value);
+
+  // Compute hash for ByteArray value by using its plain encoding result.
+  // @param value the value to hash.
+  // @return hash result.
+  uint64_t hash(const ByteArray &value);
+
+  // Compute hash for Fixed Length Byte Array value by using its plain encoding result.
+  // @param value the value to hash.
+  // @param len the length of the fixed length byte array in bytes.
+  // @return hash result.
+  uint64_t hash(const FLBA &value, uint32_t len);
+
+  // Write bloom filter to output stream. A bloom filter structure should include
+  // bitset length, hash strategy, algorithm, and bitset.
+  // @param sink output stream to write
+  void writeTo(const std::shared_ptr<OutputStream>& sink);
+
+private:
+  // Create a new bitset for the bloom filter; at least 256 bits are created.
+  // @param num_bytes number of bytes for bitset
+  void initBitset(uint32_t num_bytes);
+
+  // Compute the eight per-word bit masks for `key`.
+  void setMask(uint32_t key, uint32_t mask[8]);
+
+  // Set the bits selected by `hash`.
+  void addElement(uint64_t hash);
+
+  // Test the bits selected by `hash`.
+  bool contains(uint64_t hash);
+
+  // The number of bytes of bloom filter bitset.
+  uint32_t num_bytes;
+
+  // Hash strategy used in this bloom filter.
+  HashStrategy hash_strategy;
+
+  // Algorithm applied of this bloom filter.
+  Algorithm algorithm;
+
+  // The underlying byte array for bloom filter bitset.
+  uint8_t* bitset;
+
+  // Hash function applied.
+  HashFunc hashFunc;
+};
+}  // namespace parquet
+#endif  // PARQUET_BLOOM_H
+
diff --git a/src/parquet/murmur3.cc b/src/parquet/murmur3.cc
new file mode 100644
index 00000000..ab4cfa1e
--- /dev/null
+++ b/src/parquet/murmur3.cc
@@ -0,0 +1,349 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//-----------------------------------------------------------------------------
+// MurmurHash3 was written by Austin Appleby, and is placed in the public
+// domain. The author hereby disclaims copyright to this source code.
+
+// Note - The x86 and x64 versions do _not_ produce the same results, as the
+// algorithms are optimized for their respective platforms. You can still
+// compile and run any of them on any platform, but your performance with the
+// non-native version will be less than optimal.
+
+#include "parquet/murmur3.h"
+
+#include <string.h>
+
+//-----------------------------------------------------------------------------
+// Platform-specific functions and macros
+
+// Microsoft Visual Studio
+
+#if defined(_MSC_VER)
+
+#define FORCE_INLINE __forceinline
+
+// Declares _rotl and _rotl64.
+#include <stdlib.h>
+
+#define ROTL32(x,y) _rotl(x,y)
+#define ROTL64(x,y) _rotl64(x,y)
+
+#define BIG_CONSTANT(x) (x)
+
+// Other compilers
+
+#else // defined(_MSC_VER)
+
+#define FORCE_INLINE inline __attribute__((always_inline))
+
+// Rotates x left by r bits. Every call in this file passes a constant r
+// strictly between 0 and the word width, so neither shift is out of range.
+inline uint32_t rotl32 ( uint32_t x, int8_t r )
+{
+  return (x << r) | (x >> (32 - r));
+}
+
+inline uint64_t rotl64 ( uint64_t x, int8_t r )
+{
+  return (x << r) | (x >> (64 - r));
+}
+
+#define ROTL32(x,y) rotl32(x,y)
+#define ROTL64(x,y) rotl64(x,y)
+
+#define BIG_CONSTANT(x) (x##LLU)
+
+#endif // !defined(_MSC_VER)
+
+//-----------------------------------------------------------------------------
+// Block read - if your platform needs to do endian-swapping or can only
+// handle aligned reads, do the conversion here
+
+// Returns the i-th 32-bit block relative to p (i may be negative) in native
+// byte order. The bytes are copied with memcpy because the key passed to the
+// hash functions is an arbitrary byte pointer that need not be aligned for
+// uint32_t.
+FORCE_INLINE uint32_t getblock32 ( const uint32_t * p, int i )
+{
+  uint32_t block;
+  memcpy(&block, p + i, sizeof(block));
+  return block;
+}
+
+// Returns the i-th 64-bit block relative to p in native byte order; see
+// getblock32 for why memcpy is used.
+FORCE_INLINE uint64_t getblock64 ( const uint64_t * p, int i )
+{
+  uint64_t block;
+  memcpy(&block, p + i, sizeof(block));
+  return block;
+}
+
+//-----------------------------------------------------------------------------
+// Finalization mix - force all bits of a hash block to avalanche
+
+FORCE_INLINE uint32_t fmix32 ( uint32_t h )
+{
+  h ^= h >> 16;
+  h *= 0x85ebca6b;
+  h ^= h >> 13;
+  h *= 0xc2b2ae35;
+  h ^= h >> 16;
+
+  return h;
+}
+
+//----------
+
+// 64-bit finalization mix: alternating xor-shifts and multiplications by the
+// two constants below.
+FORCE_INLINE uint64_t fmix64 ( uint64_t k )
+{
+  k ^= k >> 33;
+  k *= BIG_CONSTANT(0xff51afd7ed558ccd);
+  k ^= k >> 33;
+  k *= BIG_CONSTANT(0xc4ceb9fe1a85ec53);
+  k ^= k >> 33;
+
+  return k;
+}
+
+//-----------------------------------------------------------------------------
+
+// Hashes `len` bytes at `key` with `seed` and stores one uint32_t (4 bytes)
+// through `out`.
+void MurmurHash3_x86_32 ( const void * key, int len,
+                          uint32_t seed, void * out )
+{
+  const uint8_t * data = (const uint8_t*)key;
+  const int nblocks = len / 4;
+
+  uint32_t h1 = seed;
+
+  const uint32_t c1 = 0xcc9e2d51;
+  const uint32_t c2 = 0x1b873593;
+
+  //----------
+  // body
+
+  // `blocks` points just past the last full 4-byte block; the loop walks the
+  // blocks with negative indices from -nblocks up to -1.
+  const uint32_t * blocks = (const uint32_t *)(data + nblocks*4);
+
+  for(int i = -nblocks; i; i++)
+  {
+    uint32_t k1 = getblock32(blocks,i);
+
+    k1 *= c1;
+    k1 = ROTL32(k1,15);
+    k1 *= c2;
+
+    h1 ^= k1;
+    h1 = ROTL32(h1,13);
+    h1 = h1*5+0xe6546b64;
+  }
+
+  //----------
+  // tail
+
+  const uint8_t * tail = (const uint8_t*)(data + nblocks*4);
+
+  uint32_t k1 = 0;
+
+  // The cases have no break: each one falls through to the cases below it, so
+  // all remaining 1-3 tail bytes are folded into k1 before it is mixed.
+  switch(len & 3)
+  {
+  case 3: k1 ^= tail[2] << 16;
+  case 2: k1 ^= tail[1] << 8;
+  case 1: k1 ^= tail[0];
+          k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1;
+  };
+
+  //----------
+  // finalization
+
+  h1 ^= len;
+
+  h1 = fmix32(h1);
+
+  *(uint32_t*)out = h1;
+}
+
+//-----------------------------------------------------------------------------
+
+// Hashes `len` bytes at `key` with `seed` using four 32-bit lanes and stores
+// four uint32_t values (16 bytes) through `out`.
+void MurmurHash3_x86_128 ( const void * key, const int len,
+                           uint32_t seed, void * out )
+{
+  const uint8_t * data = (const uint8_t*)key;
+  const int nblocks = len / 16;
+
+  uint32_t h1 = seed;
+  uint32_t h2 = seed;
+  uint32_t h3 = seed;
+  uint32_t h4 = seed;
+
+  const uint32_t c1 = 0x239b961b;
+  const uint32_t c2 = 0xab0e9789;
+  const uint32_t c3 = 0x38b34ae5;
+  const uint32_t c4 = 0xa1e38b93;
+
+  //----------
+  // body
+
+  // `blocks` points just past the last full 16-byte block; blocks are read
+  // with negative indices, four 32-bit words per iteration.
+  const uint32_t * blocks = (const uint32_t *)(data + nblocks*16);
+
+  for(int i = -nblocks; i; i++)
+  {
+    uint32_t k1 = getblock32(blocks,i*4+0);
+    uint32_t k2 = getblock32(blocks,i*4+1);
+    uint32_t k3 = getblock32(blocks,i*4+2);
+    uint32_t k4 = getblock32(blocks,i*4+3);
+
+    k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1;
+
+    h1 = ROTL32(h1,19); h1 += h2; h1 = h1*5+0x561ccd1b;
+
+    k2 *= c2; k2 = ROTL32(k2,16); k2 *= c3; h2 ^= k2;
+
+    h2 = ROTL32(h2,17); h2 += h3; h2 = h2*5+0x0bcaa747;
+
+    k3 *= c3; k3 = ROTL32(k3,17); k3 *= c4; h3 ^= k3;
+
+    h3 = ROTL32(h3,15); h3 += h4; h3 = h3*5+0x96cd1c35;
+
+    k4 *= c4; k4 = ROTL32(k4,18); k4 *= c1; h4 ^= k4;
+
+    h4 = ROTL32(h4,13); h4 += h1; h4 = h4*5+0x32ac3b17;
+  }
+
+  //----------
+  // tail
+
+  const uint8_t * tail = (const uint8_t*)(data + nblocks*16);
+
+  uint32_t k1 = 0;
+  uint32_t k2 = 0;
+  uint32_t k3 = 0;
+  uint32_t k4 = 0;
+
+  // The cases have no break: each one falls through to the cases below it, so
+  // the remaining 1-15 tail bytes are folded into k4, k3, k2 and k1 in turn.
+  switch(len & 15)
+  {
+  case 15: k4 ^= tail[14] << 16;
+  case 14: k4 ^= tail[13] << 8;
+  case 13: k4 ^= tail[12] << 0;
+           k4 *= c4; k4 = ROTL32(k4,18); k4 *= c1; h4 ^= k4;
+
+  case 12: k3 ^= tail[11] << 24;
+  case 11: k3 ^= tail[10] << 16;
+  case 10: k3 ^= tail[ 9] << 8;
+  case 9: k3 ^= tail[ 8] << 0;
+           k3 *= c3; k3 = ROTL32(k3,17); k3 *= c4; h3 ^= k3;
+
+  case 8: k2 ^= tail[ 7] << 24;
+  case 7: k2 ^= tail[ 6] << 16;
+  case 6: k2 ^= tail[ 5] << 8;
+  case 5: k2 ^= tail[ 4] << 0;
+           k2 *= c2; k2 = ROTL32(k2,16); k2 *= c3; h2 ^= k2;
+
+  case 4: k1 ^= tail[ 3] << 24;
+  case 3: k1 ^= tail[ 2] << 16;
+  case 2: k1 ^= tail[ 1] << 8;
+  case 1: k1 ^= tail[ 0] << 0;
+           k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1;
+  };
+
+  //----------
+  // finalization
+
+  h1 ^= len; h2 ^= len; h3 ^= len; h4 ^= len;
+
+  h1 += h2; h1 += h3; h1 += h4;
+  h2 += h1; h3 += h1; h4 += h1;
+
+  h1 = fmix32(h1);
+  h2 = fmix32(h2);
+  h3 = fmix32(h3);
+  h4 = fmix32(h4);
+
+  h1 += h2; h1 += h3; h1 += h4;
+  h2 += h1; h3 += h1; h4 += h1;
+
+  ((uint32_t*)out)[0] = h1;
+  ((uint32_t*)out)[1] = h2;
+  ((uint32_t*)out)[2] = h3;
+  ((uint32_t*)out)[3] = h4;
+}
+
+//-----------------------------------------------------------------------------
+
+// Hashes `len` bytes at `key` with `seed` using two 64-bit lanes and stores
+// two uint64_t values (16 bytes) through `out`; the caller must supply at
+// least 16 writable bytes.
+void MurmurHash3_x64_128 ( const void * key, const int len,
+                           const uint32_t seed, void * out )
+{
+  const uint8_t * data = (const uint8_t*)key;
+  const int nblocks = len / 16;
+
+  uint64_t h1 = seed;
+  uint64_t h2 = seed;
+
+  const uint64_t c1 = BIG_CONSTANT(0x87c37b91114253d5);
+  const uint64_t c2 = BIG_CONSTANT(0x4cf5ad432745937f);
+
+  //----------
+  // body
+
+  // Unlike the x86 variants above, blocks are indexed forward from the start
+  // of the data, two 64-bit words per iteration.
+  const uint64_t * blocks = (const uint64_t *)(data);
+
+  for(int i = 0; i < nblocks; i++)
+  {
+    uint64_t k1 = getblock64(blocks,i*2+0);
+    uint64_t k2 = getblock64(blocks,i*2+1);
+
+    k1 *= c1; k1 = ROTL64(k1,31); k1 *= c2; h1 ^= k1;
+
+    h1 = ROTL64(h1,27); h1 += h2; h1 = h1*5+0x52dce729;
+
+    k2 *= c2; k2 = ROTL64(k2,33); k2 *= c1; h2 ^= k2;
+
+    h2 = ROTL64(h2,31); h2 += h1; h2 = h2*5+0x38495ab5;
+  }
+
+  //----------
+  // tail
+
+  const uint8_t * tail = (const uint8_t*)(data + nblocks*16);
+
+  uint64_t k1 = 0;
+  uint64_t k2 = 0;
+
+  // The cases have no break: each one falls through to the cases below it, so
+  // the remaining 1-15 tail bytes are folded into k2 (bytes 8-14) and k1
+  // (bytes 0-7). Each byte is widened to uint64_t before shifting.
+  switch(len & 15)
+  {
+  case 15: k2 ^= ((uint64_t)tail[14]) << 48;
+  case 14: k2 ^= ((uint64_t)tail[13]) << 40;
+  case 13: k2 ^= ((uint64_t)tail[12]) << 32;
+  case 12: k2 ^= ((uint64_t)tail[11]) << 24;
+  case 11: k2 ^= ((uint64_t)tail[10]) << 16;
+  case 10: k2 ^= ((uint64_t)tail[ 9]) << 8;
+  case 9: k2 ^= ((uint64_t)tail[ 8]) << 0;
+           k2 *= c2; k2 = ROTL64(k2,33); k2 *= c1; h2 ^= k2;
+
+  case 8: k1 ^= ((uint64_t)tail[ 7]) << 56;
+  case 7: k1 ^= ((uint64_t)tail[ 6]) << 48;
+  case 6: k1 ^= ((uint64_t)tail[ 5]) << 40;
+  case 5: k1 ^= ((uint64_t)tail[ 4]) << 32;
+  case 4: k1 ^= ((uint64_t)tail[ 3]) << 24;
+  case 3: k1 ^= ((uint64_t)tail[ 2]) << 16;
+  case 2: k1 ^= ((uint64_t)tail[ 1]) << 8;
+  case 1: k1 ^= ((uint64_t)tail[ 0]) << 0;
+           k1 *= c1; k1 = ROTL64(k1,31); k1 *= c2; h1 ^= k1;
+  };
+
+  //----------
+  // finalization
+
+  h1 ^= len; h2 ^= len;
+
+  h1 += h2;
+  h2 += h1;
+
+  h1 = fmix64(h1);
+  h2 = fmix64(h2);
+
+  h1 += h2;
+  h2 += h1;
+
+  ((uint64_t*)out)[0] = h1;
+  ((uint64_t*)out)[1] = h2;
+}
diff --git a/src/parquet/murmur3.h b/src/parquet/murmur3.h
new file mode 100644
index 00000000..e823e7c6
--- /dev/null
+++ b/src/parquet/murmur3.h
@@ -0,0 +1,55 @@
+// Licensed to the Apache Software Foundation (ASF)
under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+
+//-----------------------------------------------------------------------------
+// MurmurHash3 was written by Austin Appleby, and is placed in the public
+// domain. The author hereby disclaims copyright to this source code.
+
+// The guard avoids a leading underscore followed by an uppercase letter;
+// such identifiers are reserved for the implementation.
+#ifndef PARQUET_MURMUR3_H
+#define PARQUET_MURMUR3_H
+
+//-----------------------------------------------------------------------------
+// Platform-specific functions and macros
+
+// Microsoft Visual Studio
+
+#if defined(_MSC_VER) && (_MSC_VER < 1600)
+
+typedef unsigned char uint8_t;
+typedef unsigned int uint32_t;
+typedef unsigned __int64 uint64_t;
+
+// Other compilers
+
+#else // defined(_MSC_VER)
+
+#include <stdint.h>
+
+#endif // !defined(_MSC_VER)
+
+//-----------------------------------------------------------------------------
+
+// Each function hashes `len` bytes starting at `key` using `seed` and writes
+// the result through `out`. The caller owns `out` and must provide enough
+// writable space for the variant called.
+
+// Writes one uint32_t: `out` must point to at least 4 bytes.
+void MurmurHash3_x86_32 ( const void * key, int len, uint32_t seed, void * out );
+
+// Writes four uint32_t values: `out` must point to at least 16 bytes.
+void MurmurHash3_x86_128 ( const void * key, int len, uint32_t seed, void * out );
+
+// Writes two uint64_t values: `out` must point to at least 16 bytes.
+void MurmurHash3_x64_128 ( const void * key, int len, uint32_t seed, void * out );
+
+//-----------------------------------------------------------------------------
+
+#endif // PARQUET_MURMUR3_H
\ No newline at end of file