Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
67 changes: 63 additions & 4 deletions cpp/src/arrow/util/compression_benchmark.cc
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,7 @@
#include "arrow/util/logging.h"
#include "arrow/util/macros.h"

namespace arrow {
namespace util {
namespace arrow::util {

#ifdef ARROW_WITH_BENCHMARKS_REFERENCE

Expand Down Expand Up @@ -133,6 +132,37 @@ static void ReferenceStreamingCompression(
StreamingCompression(COMPRESSION, data, state);
}

// One-shot compresses `data` with `codec`, storing the output in
// `compressed_data`.
//
// The output vector is first grown to the codec's reported upper bound
// (MaxCompressedLen) and afterwards shrunk to the number of bytes actually
// produced, so that on return `compressed_data->size()` always equals the
// returned value. Empty input skips the codec call and yields an empty
// output vector and a return value of 0.
//
// Parameters:
//   codec            codec used to compress; must not be null.
//   data             uncompressed input bytes.
//   compressed_data  output vector; its previous contents are discarded.
//
// Returns the number of compressed bytes written.
int64_t Compress(Codec* codec, const std::vector<uint8_t>& data,
                 std::vector<uint8_t>* compressed_data) {
  const uint8_t* input = data.data();
  const int64_t input_len = static_cast<int64_t>(data.size());
  int64_t compressed_size = 0;

  // Reserve the worst-case output size up front so the codec can write
  // directly into the vector's storage.
  const int64_t max_compressed_len = codec->MaxCompressedLen(input_len, input);
  compressed_data->resize(max_compressed_len);

  if (input_len > 0) {
    // NOTE(review): dereferencing the returned result assumes the call
    // succeeded; presumably this aborts on failure, which is acceptable in a
    // benchmark -- confirm against the Result API.
    compressed_size = *codec->Compress(input_len, input, max_compressed_len,
                                       compressed_data->data());
  }
  // Trim unconditionally: previously an empty input left the vector at
  // max_compressed_len bytes although 0 was returned, so callers reading
  // compressed_data->size() saw a length that did not match the result.
  compressed_data->resize(compressed_size);
  return compressed_size;
}

// Benchmarks one-shot (non-streaming) compression for the codec selected by
// the COMPRESSION template argument.
//
// Each iteration compresses the same 8 MB generated buffer into a freshly
// constructed output vector. The "ratio" counter reports
// uncompressed size / compressed size, and bytes processed is accounted in
// uncompressed bytes.
template <Compression::type COMPRESSION>
static void ReferenceCompression(benchmark::State& state) {  // NOLINT non-const reference
  auto uncompressed = MakeCompressibleData(8 * 1024 * 1024);  // 8 MB
  const auto uncompressed_size = uncompressed.size();
  const double uncompressed_bytes = static_cast<double>(uncompressed_size);

  auto codec = *Codec::Create(COMPRESSION);
  Codec* const codec_ptr = codec.get();

  while (state.KeepRunning()) {
    // The output buffer is intentionally created inside the loop, so every
    // iteration starts from an empty vector.
    std::vector<uint8_t> output;
    const auto output_size = Compress(codec_ptr, uncompressed, &output);
    state.counters["ratio"] = uncompressed_bytes / static_cast<double>(output_size);
  }

  state.SetBytesProcessed(state.iterations() * uncompressed_size);
}

static void StreamingDecompression(
Compression::type compression, const std::vector<uint8_t>& data,
benchmark::State& state) { // NOLINT non-const reference
Expand Down Expand Up @@ -175,27 +205,56 @@ static void ReferenceStreamingDecompression(
StreamingDecompression(COMPRESSION, data, state);
}

// Benchmarks one-shot (non-streaming) decompression for the codec selected by
// the COMPRESSION template argument.
//
// An 8 MB generated buffer is compressed once before the timed loop; each
// iteration then decompresses that payload into a preallocated buffer and
// checks that the call succeeded and produced exactly the original number of
// bytes. The "ratio" counter reports uncompressed size / compressed size, and
// bytes processed is accounted in uncompressed bytes.
template <Compression::type COMPRESSION>
static void ReferenceDecompression(
    benchmark::State& state) {  // NOLINT non-const reference
  auto original = MakeCompressibleData(8 * 1024 * 1024);  // 8 MB
  const auto original_size = original.size();

  auto codec = *Codec::Create(COMPRESSION);

  // Produce the compressed payload outside the timed loop; the byte count
  // returned by Compress is not needed because the vector's size is used.
  std::vector<uint8_t> compressed;
  ARROW_UNUSED(Compress(codec.get(), original, &compressed));
  const auto compressed_size = compressed.size();
  state.counters["ratio"] =
      static_cast<double>(original_size) / static_cast<double>(compressed_size);

  // Starts as a copy of the input so the destination has exactly the
  // uncompressed size.
  std::vector<uint8_t> decompressed_data(original);
  while (state.KeepRunning()) {
    auto result = codec->Decompress(compressed_size, compressed.data(),
                                    decompressed_data.size(), decompressed_data.data());
    ARROW_CHECK(result.ok());
    ARROW_CHECK(*result == static_cast<int64_t>(decompressed_data.size()));
  }

  state.SetBytesProcessed(state.iterations() * original_size);
}

// Benchmark registrations. For every codec, four benchmarks are registered:
// streaming compression, one-shot compression, streaming decompression and
// one-shot decompression. Each group is compiled only when the corresponding
// ARROW_WITH_* macro is defined.

// GZIP
#ifdef ARROW_WITH_ZLIB
BENCHMARK_TEMPLATE(ReferenceStreamingCompression, Compression::GZIP);
BENCHMARK_TEMPLATE(ReferenceCompression, Compression::GZIP);
BENCHMARK_TEMPLATE(ReferenceStreamingDecompression, Compression::GZIP);
BENCHMARK_TEMPLATE(ReferenceDecompression, Compression::GZIP);
#endif

// BROTLI
#ifdef ARROW_WITH_BROTLI
BENCHMARK_TEMPLATE(ReferenceStreamingCompression, Compression::BROTLI);
BENCHMARK_TEMPLATE(ReferenceCompression, Compression::BROTLI);
BENCHMARK_TEMPLATE(ReferenceStreamingDecompression, Compression::BROTLI);
BENCHMARK_TEMPLATE(ReferenceDecompression, Compression::BROTLI);
#endif

// ZSTD
#ifdef ARROW_WITH_ZSTD
BENCHMARK_TEMPLATE(ReferenceStreamingCompression, Compression::ZSTD);
BENCHMARK_TEMPLATE(ReferenceCompression, Compression::ZSTD);
BENCHMARK_TEMPLATE(ReferenceStreamingDecompression, Compression::ZSTD);
BENCHMARK_TEMPLATE(ReferenceDecompression, Compression::ZSTD);
#endif

// LZ4: compiled only when ARROW_WITH_LZ4 is defined. Only the LZ4_FRAME
// variant is registered here; no other LZ4 variant is benchmarked.
#ifdef ARROW_WITH_LZ4
BENCHMARK_TEMPLATE(ReferenceStreamingCompression, Compression::LZ4_FRAME);
BENCHMARK_TEMPLATE(ReferenceCompression, Compression::LZ4_FRAME);
BENCHMARK_TEMPLATE(ReferenceStreamingDecompression, Compression::LZ4_FRAME);
BENCHMARK_TEMPLATE(ReferenceDecompression, Compression::LZ4_FRAME);
Comment on lines +253 to +255

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is LZ4_FRAME OK?
It seems that Parquet doesn't use LZ4_FRAME.

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We can even benchmark both LZ4 variants.

@mapleFU mapleFU Oct 23, 2023

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It seems that Parquet doesn't use LZ4_FRAME

Aha, I remember that parquet-mr implemented LZ4 first, and Arrow implemented a different version (LZ4_FRAME). LZ4 stores an extra length here.

Maybe apache/parquet-format#168 helps

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

And I don't think they have too many differences...

Currently I haven't added LZ4, but feel free to add it if necessary.

#endif

#endif

} // namespace util
} // namespace arrow
} // namespace arrow::util