rapidsai
diff --git a/‎CMakeLists.txt‎
Lines changed: 21 additions & 2 deletions b/‎CMakeLists.txt‎
Lines changed: 21 additions & 2 deletions
diff --git a/‎benchmarks/CMakeLists.txt‎
Lines changed: 57 additions & 0 deletions b/‎benchmarks/CMakeLists.txt‎
Lines changed: 57 additions & 0 deletions
diff --git a/‎benchmarks/random_allocations/random_allocations.cpp‎
Lines changed: 195 additions & 0 deletions b/‎benchmarks/random_allocations/random_allocations.cpp‎
Lines changed: 195 additions & 0 deletions
diff --git a/‎benchmarks/synchronization/synchronization.cpp‎
Lines changed: 72 additions & 0 deletions b/‎benchmarks/synchronization/synchronization.cpp‎
Lines changed: 72 additions & 0 deletions
@@ -54,8 +54,9 @@ if(CMAKE_COMPILER_IS_GNUCXX)
     endif(CMAKE_CXX11_ABI)
 endif(CMAKE_COMPILER_IS_GNUCXX)
 
-option(BUILD_TESTS "Configure CMake to build tests"
-       ON)
+# Build switches: tests are configured by default (ON); Google Benchmark targets are opt-in (OFF).
+option(BUILD_TESTS "Configure CMake to build tests" ON)
+option(BUILD_BENCHMARKS "Configure CMake to build (google) benchmarks" OFF)
+
 
 ###################################################################################################
 # - cnmem ---------------------------------------------------------------------------------
@@ -89,6 +90,24 @@ if(BUILD_TESTS)
     endif(GTEST_FOUND)
 endif(BUILD_TESTS)
 
+###################################################################################################
+# - add google benchmark --------------------------------------------------------------------------
+
+if(BUILD_BENCHMARKS)
+
+  # NOTE(review): ConfigureGoogleBenchmark is presumably what sets GBENCH_FOUND, GBENCH_ROOT and
+  # GBENCH_INCLUDE_DIR, which are read below — confirm against that module.
+  include(ConfigureGoogleBenchmark)
+
+  if(GBENCH_FOUND)
+    message(STATUS "Google C++ Benchmarking Framework (Google Benchmark) found in ${GBENCH_ROOT}")
+    include_directories(${GBENCH_INCLUDE_DIR})
+    # The benchmark targets themselves are declared in benchmarks/CMakeLists.txt.
+    add_subdirectory(${CMAKE_SOURCE_DIR}/benchmarks)
+  else()
+    # Only the benchmarks are skipped here; the test configuration is handled separately above.
+    message(AUTHOR_WARNING "Google C++ Benchmarking Framework (Google Benchmark) not found: benchmarks are disabled.")
+  endif(GBENCH_FOUND)
+
+endif(BUILD_BENCHMARKS)
+
+
 ###################################################################################################
 # - include paths ---------------------------------------------------------------------------------
 
 
@@ -0,0 +1,57 @@
+# Build configuration for the benchmark executables. Requires CMake 3.12 or newer and enables
+# the C, C++ and CUDA languages for this project.
+cmake_minimum_required(VERSION 3.12 FATAL_ERROR)
+
+project(RMM_BENCHS LANGUAGES C CXX CUDA)
+
+###################################################################################################
+# - compiler function -----------------------------------------------------------------------------
+
+# ConfigureBench(<target name> <source list>)
+#
+# Declares one benchmark executable built from the given sources plus the shared
+# synchronization/synchronization.cpp source, builds it as position-independent code, links it
+# against benchmark, benchmark_main, pthread and rmm, and places the binary in
+# ${CMAKE_BINARY_DIR}/gbenchmarks.
+function(ConfigureBench CMAKE_BENCH_NAME CMAKE_BENCH_SRC)
+    # Source compiled into every benchmark in addition to the benchmark's own files.
+    set(BENCH_SYNC_SRC "${CMAKE_CURRENT_SOURCE_DIR}/synchronization/synchronization.cpp")
+
+    add_executable(${CMAKE_BENCH_NAME} ${CMAKE_BENCH_SRC} "${BENCH_SYNC_SRC}")
+
+    set_target_properties(${CMAKE_BENCH_NAME} PROPERTIES
+                            POSITION_INDEPENDENT_CODE ON
+                            RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/gbenchmarks")
+
+    target_link_libraries(${CMAKE_BENCH_NAME} benchmark benchmark_main pthread rmm)
+endfunction(ConfigureBench)
+
+###################################################################################################
+# - include paths ---------------------------------------------------------------------------------
+
+# Header search paths applied to every target declared in this directory.
+# NOTE(review): GTEST_INCLUDE_DIR is listed although ConfigureBench links no gtest library —
+# confirm whether the benchmarks actually need it.
+include_directories("${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}"
+                    "${CMAKE_BINARY_DIR}/include"
+                    "${CMAKE_SOURCE_DIR}/include"
+                    "${CMAKE_SOURCE_DIR}"
+                    "${CMAKE_SOURCE_DIR}/src"
+                    "${GTEST_INCLUDE_DIR}"
+                    "${GBENCH_INCLUDE_DIR}"
+                    "${CMAKE_CURRENT_SOURCE_DIR}")
+
+###################################################################################################
+# - library paths ---------------------------------------------------------------------------------
+
+# Library search paths applied to every target declared in this directory.
+# CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES is an undocumented/unsupported variable containing the
+# link directories for nvcc.
+link_directories("${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES}"
+                 "${CMAKE_BINARY_DIR}/lib"
+                 "${CMAKE_BINARY_DIR}"
+                 "${GTEST_LIBRARY_DIR}"
+                 "${GBENCH_LIBRARY_DIR}")
+
+###################################################################################################
+### test sources ##################################################################################
+###################################################################################################
+
+###################################################################################################
+# - test benchmark --------------------------------------------------------------------------------
+
+# Source list for the TEST_BENCH executable, which is declared through ConfigureBench.
+set(TEST_BENCH_SRC
+  "${CMAKE_CURRENT_SOURCE_DIR}/test/test_benchmark.cpp")
+
+ConfigureBench(TEST_BENCH "${TEST_BENCH_SRC}")
+
+###################################################################################################
+# - random allocations benchmark --------------------------------------------------------------------------------
+
+# Source list for the RANDOM_ALLOCATIONS_BENCH executable, which is declared through
+# ConfigureBench.
+set(RANDOM_ALLOCATIONS_BENCH_SRC
+  "${CMAKE_CURRENT_SOURCE_DIR}/random_allocations/random_allocations.cpp")
+
+ConfigureBench(RANDOM_ALLOCATIONS_BENCH "${RANDOM_ALLOCATIONS_BENCH_SRC}")
@@ -0,0 +1,195 @@
+/*
+ * Copyright (c) 2019, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <rmm/mr/device/cnmem_memory_resource.hpp>
+#include <rmm/mr/device/cuda_memory_resource.hpp>
+#include <rmm/mr/device/default_memory_resource.hpp>
+#include <rmm/mr/device/device_memory_resource.hpp>
+#include <rmm/mr/device/managed_memory_resource.hpp>
+#include <rmm/mr/device/sub_memory_resource.hpp>
+
+#include <benchmark/benchmark.h>
+
+#include <random>
+
+#define VERBOSE 0
+
+namespace {
+
+// Number of bytes in one MiB (2^20); used to convert MiB-denominated parameters to bytes.
+constexpr std::size_t size_mb{1 << 20};
+
+// Record of one allocation made by the benchmark: the pointer that was handed back and the
+// number of bytes that were requested, so the same size can be passed when it is released.
+struct allocation {
+  void* p{nullptr};     // allocated pointer; null for a default-constructed record
+  std::size_t size{0};  // size in bytes; zero for a default-constructed record
+
+  // Empty record.
+  allocation() = default;
+
+  // Record for an existing allocation of `bytes` bytes located at `ptr`.
+  allocation(void* ptr, std::size_t bytes) : p{ptr}, size{bytes} {}
+};
+
+// Container holding the allocation records the benchmark is currently tracking.
+using allocation_vector = std::vector<allocation>;
+
+// Removes the record at `index` from `allocs` and returns a copy of it.
+//
+// The removal does not preserve order: the target is swapped with the last element (unless it
+// already is the last one) and the vector is then shortened by one.
+// `index` must be smaller than `allocs.size()`; this is only checked by an assert.
+allocation remove_at(allocation_vector& allocs, std::size_t index) {
+  assert(index < allocs.size());
+  allocation const taken = allocs[index];
+
+  auto const count = allocs.size();
+  // True when there is nothing to swap with: a single-element vector or the last slot.
+  bool const already_last = (count <= 1) || (index >= count - 1);
+  if (!already_last) {
+    std::swap(allocs[index], allocs.back());
+  }
+  allocs.pop_back();
+
+  return taken;
+}
+
+// Performs a pseudo-random sequence of allocations and deallocations against `mr`.
+//
+// The loop runs 2 * num_allocations iterations. Each iteration draws a size from
+// `size_distribution` and then either allocates that many bytes or frees one randomly chosen
+// outstanding allocation. An allocation is always attempted when nothing is outstanding;
+// otherwise it is attempted with probability `allocation_probability` percent, provided fewer
+// than `num_allocations` allocations have succeeded so far and the outstanding byte total would
+// stay below `max_usage`. A failed allocation (std::bad_alloc) is treated as a deallocation
+// step. Anything still outstanding when the loop ends is released before returning.
+//
+// Parameters:
+//   mr                 memory resource under test
+//   size_distribution  callable as size_distribution(generator); yields the request size in bytes
+//   num_allocations    target number of successful allocations; 0 makes the call a no-op
+//   max_usage          cap on outstanding memory, in MiB
+//   stream             stream passed through to allocate/deallocate
+template <typename SizeDistribution>
+void random_allocation_free(rmm::mr::device_memory_resource& mr,
+                            SizeDistribution size_distribution,
+                            size_t num_allocations,
+                            size_t max_usage, // in MiB
+                            cudaStream_t stream = 0)
+{
+  // Nothing to do, and returning here keeps `num_allocations - 1` below from wrapping around
+  // into an invalid distribution bound.
+  if (num_allocations == 0) { return; }
+
+  // Default-constructed engine, so every call replays the same pseudo-random sequence.
+  std::default_random_engine generator;
+
+  max_usage *= size_mb; // convert to bytes
+
+  constexpr int allocation_probability = 73; // percent
+  std::uniform_int_distribution<int> op_distribution(0, 99);
+  std::uniform_int_distribution<int> index_distribution(0,
+                                                        static_cast<int>(num_allocations - 1));
+
+  size_t allocation_count{0};  // number of successful allocations so far
+  size_t allocation_size{0};   // bytes currently outstanding
+  allocation_vector allocations{};
+
+  for (size_t i = 0; i < num_allocations * 2; ++i) {
+    // The size is drawn on every iteration, even when the step ends up being a deallocation.
+    size_t const size = static_cast<size_t>(size_distribution(generator));
+
+    bool do_alloc = true;
+    if (!allocations.empty()) {
+      int const chance = op_distribution(generator);
+      do_alloc = (chance < allocation_probability) &&
+                 (allocation_count < num_allocations) &&
+                 (allocation_size + size < max_usage);
+    }
+
+    void* ptr = nullptr;
+    if (do_alloc) { // try to allocate
+      try {
+        ptr = mr.allocate(size, stream);
+      } catch (std::bad_alloc const&) {
+        do_alloc = false;
+      }
+    }
+
+    if (do_alloc) { // alloc succeeded
+      allocations.emplace_back(ptr, size);
+      allocation_count++;
+      allocation_size += size;
+
+      #if VERBOSE
+      std::cout << allocations.size() << " | " << allocation_count << " Allocating: " << size 
+                << " | total: " << allocation_size << "\n";
+      #endif
+    }
+    else if (!allocations.empty()) { // dealloc, or alloc failed
+      // The distribution spans [0, num_allocations - 1]; fold it onto the live range.
+      size_t const index =
+        static_cast<size_t>(index_distribution(generator)) % allocations.size();
+      allocation const to_free = remove_at(allocations, index);
+      mr.deallocate(to_free.p, to_free.size, stream);
+      allocation_size -= to_free.size;
+
+      #if VERBOSE
+      std::cout << allocations.size() << " | " << allocation_count << " Deallocating: " 
+                << to_free.size << " at " << index << " | total: " << allocation_size << "\n";
+      #endif
+    }
+  }
+
+  // An iteration is wasted when an allocation fails while nothing is outstanding, so the loop
+  // can end with live allocations. Release them instead of leaking into the next run.
+  for (auto const& leftover : allocations) {
+    mr.deallocate(leftover.p, leftover.size, stream);
+  }
+  allocations.clear();
+}
+}  // namespace
+
+// Runs random_allocation_free with request sizes drawn uniformly from
+// [1, max_allocation_size MiB] bytes.
+//
+//   mr                   memory resource under test
+//   num_allocations      forwarded unchanged to random_allocation_free
+//   max_allocation_size  largest single request, in MiB
+//   max_usage            forwarded unchanged to random_allocation_free (in MiB there)
+//   stream               stream forwarded to random_allocation_free
+void uniform_random_allocations(rmm::mr::device_memory_resource& mr,
+                                size_t num_allocations,
+                                size_t max_allocation_size, // in MiB
+                                size_t max_usage,
+                                cudaStream_t stream = 0) {
+  auto const largest_request_bytes = max_allocation_size * size_mb;
+  std::uniform_int_distribution<std::size_t> request_sizes(1, largest_request_bytes);
+
+  random_allocation_free(mr, request_sizes, num_allocations, max_usage, stream);
+}
+
+// TODO figure out how to map a normal distribution to integers between 1 and max_allocation_size
+/*void normal_random_allocations(rmm::mr::device_memory_resource& mr,
+                                size_t num_allocations = 1000,
+                                size_t mean_allocation_size = 500, // in MiB
+                                size_t stddev_allocation_size = 500, // in MiB
+                                size_t max_usage = 8 << 20,
+                                cudaStream_t stream) {
+  std::normal_distribution<std::size_t> size_distribution(, max_allocation_size * size_mb);
+}*/
+
+// Parameters shared by every benchmark below; they are passed, in this order, to
+// uniform_random_allocations.
+constexpr size_t num_allocations = 100000;  // allocations targeted per benchmark iteration
+constexpr size_t max_size = 2;              // largest single allocation, in MiB
+constexpr size_t max_usage = 16000;         // cap on outstanding memory, in MiB
+
+// Benchmarks the random allocation/free workload against rmm::mr::cuda_memory_resource.
+//
+// Each benchmark iteration runs one complete uniform_random_allocations sequence. If the
+// workload throws, the run is reported to Google Benchmark as failed through SkipWithError
+// rather than being printed and then treated as an ordinary early return.
+static void BM_RandomAllocationsCUDA(benchmark::State& state) {
+  rmm::mr::cuda_memory_resource mr;
+
+  try {
+    for (auto _ : state)
+      uniform_random_allocations(mr, num_allocations, max_size, max_usage);
+  } catch (std::exception const& e) {
+    // Leaving the ranged-for early is only valid once the error has been recorded on `state`.
+    state.SkipWithError(e.what());
+  }
+}
+//BENCHMARK(BM_RandomAllocationsCUDA)->Unit(benchmark::kMillisecond);
+
+// Benchmarks the random allocation/free workload against rmm::mr::sub_memory_resource.
+//
+// `State` only has to be iterable with a range-based for loop; every iteration runs one
+// complete uniform_random_allocations sequence. Any std::exception escaping the workload is
+// printed to standard output and ends the benchmark body.
+template <typename State>
+static void BM_RandomAllocationsSub(State& state) {
+  rmm::mr::sub_memory_resource mr;
+
+  try {
+    for (auto _ : state) {
+      uniform_random_allocations(mr, num_allocations, max_size, max_usage);
+    }
+  }
+  catch (std::exception const& failure) {
+    std::cout << "Error: " << failure.what() << "\n";
+  }
+}
+BENCHMARK(BM_RandomAllocationsSub)->Unit(benchmark::kMillisecond);
+
+// Benchmarks the random allocation/free workload against rmm::mr::cnmem_memory_resource.
+//
+// `State` only has to be iterable with a range-based for loop; every iteration runs one
+// complete uniform_random_allocations sequence. Any std::exception escaping the workload is
+// printed to standard output and ends the benchmark body.
+template <typename State>
+static void BM_RandomAllocationsCnmem(State& state) {
+  rmm::mr::cnmem_memory_resource mr;
+
+  try {
+    for (auto _ : state) {
+      uniform_random_allocations(mr, num_allocations, max_size, max_usage);
+    }
+  }
+  catch (std::exception const& failure) {
+    std::cout << "Error: " << failure.what() << "\n";
+  }
+}
+BENCHMARK(BM_RandomAllocationsCnmem)->Unit(benchmark::kMillisecond);
+
+/*int main(void) {
+  std::vector<int> state(1);
+  BM_RandomAllocationsSub(state);
+  return 0;
+}*/
+
+
+
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2019, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "synchronization.hpp"
+#include "rmm/rmm.h"
+
+// Evaluates a CUDA runtime call exactly once and throws std::runtime_error("CUDA error") when
+// the call does not return cudaSuccess.
+//
+// The expansion deliberately ends without a semicolon: the caller writes `CUDA_TRY(x);`, so the
+// macro behaves as a single statement and stays usable inside an unbraced if/else.
+#define CUDA_TRY(call)                                            \
+  do {                                                            \
+    cudaError_t const status = (call);                            \
+    if (cudaSuccess != status) {                                  \
+      throw std::runtime_error("CUDA error");                     \
+    }                                                             \
+  } while (0)
+
+// Evaluates an RMM call exactly once and throws std::runtime_error("RMM error") when the call
+// does not return RMM_SUCCESS.
+//
+// The expansion deliberately ends without a semicolon: the caller writes `RMM_TRY(x);`, so the
+// macro behaves as a single statement and stays usable inside an unbraced if/else.
+#define RMM_TRY(call)                                             \
+  do {                                                            \
+    rmmError_t const status = (call);                             \
+    if (RMM_SUCCESS != status) {                                  \
+      throw std::runtime_error("RMM error");                      \
+    }                                                             \
+  } while (0)
+
+// Starts a timed region on `stream`.
+//
+// Optionally overwrites a temporary buffer of L2-cache size with zeros first, then creates the
+// start/stop events and records the start event on `stream`.
+//
+//   state           benchmark state; its address is stored in p_state
+//   flush_l2_cache  when true, perform the L2-sized memset before timing starts
+//   stream          stream used for the memset and for recording the start event
+//
+// Throws std::runtime_error if any CUDA or RMM call fails. Resources acquired before the
+// failure are released first, because the destructor does not run for an object whose
+// constructor throws.
+cuda_event_timer::cuda_event_timer(benchmark::State& state,
+                                   bool flush_l2_cache,
+                                   cudaStream_t stream):
+  p_state(&state), stream(stream) {
+  // flush all of L2$
+  if(flush_l2_cache) {
+    int current_device = 0;
+    CUDA_TRY(cudaGetDevice(&current_device));
+
+    int l2_cache_bytes = 0;
+    CUDA_TRY(cudaDeviceGetAttribute(&l2_cache_bytes, cudaDevAttrL2CacheSize, current_device));
+
+    if (l2_cache_bytes > 0) {
+      const int memset_value = 0;
+      int* l2_cache_buffer = nullptr;
+      RMM_TRY(RMM_ALLOC(&l2_cache_buffer, l2_cache_bytes, stream));
+      // Hold on to the memset status and free the buffer before checking it, so a failed
+      // memset does not leak the temporary buffer.
+      cudaError_t const memset_status =
+        cudaMemsetAsync(l2_cache_buffer, memset_value, l2_cache_bytes, stream);
+      RMM_TRY(RMM_FREE(l2_cache_buffer, stream));
+      CUDA_TRY(memset_status);
+    }
+  }
+
+  CUDA_TRY(cudaEventCreate(&start));
+  try {
+    CUDA_TRY(cudaEventCreate(&stop));
+  } catch (...) {
+    // Best-effort cleanup; the original error is the one worth reporting.
+    (void)cudaEventDestroy(start);
+    throw;
+  }
+  try {
+    CUDA_TRY(cudaEventRecord(start, stream));
+  } catch (...) {
+    (void)cudaEventDestroy(start);
+    (void)cudaEventDestroy(stop);
+    throw;
+  }
+}
+
+// Ends the timed region: records the stop event on the stream, waits for it, and reports the
+// elapsed time between the two events to the benchmark state in seconds.
+//
+// Never throws. If a CUDA call fails, the error string is reported through
+// State::SkipWithError instead of an iteration time. Both events are destroyed on every path.
+cuda_event_timer::~cuda_event_timer() {
+  float milliseconds = 0.0f;
+
+  // Each step runs only if the previous one succeeded; `status` keeps the first failure.
+  cudaError_t status = cudaEventRecord(stop, stream);
+  if (cudaSuccess == status) {
+    status = cudaEventSynchronize(stop);
+  }
+  if (cudaSuccess == status) {
+    status = cudaEventElapsedTime(&milliseconds, start, stop);
+  }
+
+  if (cudaSuccess == status) {
+    // cudaEventElapsedTime yields milliseconds; SetIterationTime is given seconds.
+    p_state->SetIterationTime(milliseconds/(1000.0f));
+  } else {
+    p_state->SkipWithError(cudaGetErrorString(status));
+  }
+
+  // Best-effort cleanup: nothing useful can be done about a failure at this point.
+  (void)cudaEventDestroy(start);
+  (void)cudaEventDestroy(stop);
+}
+