Skip to content

Commit 89b5611

Browse files
committed
Added random_allocate microbenchmark
1 parent 36d97fc commit 89b5611

File tree

3 files changed

+315
-2
lines changed

3 files changed

+315
-2
lines changed

CMakeLists.txt

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -45,9 +45,9 @@ if(CMAKE_COMPILER_IS_GNUCXX)
4545

4646
option(CMAKE_CXX11_ABI "Enable the GLIBCXX11 ABI" ON)
4747
if(CMAKE_CXX11_ABI)
48-
message(STATUS "CUDF: Enabling the GLIBCXX11 ABI")
48+
message(STATUS "RMM: Enabling the GLIBCXX11 ABI")
4949
else()
50-
message(STATUS "CUDF: Disabling the GLIBCXX11 ABI")
50+
message(STATUS "RMM: Disabling the GLIBCXX11 ABI")
5151
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=0")
5252
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=0")
5353
endif(CMAKE_CXX11_ABI)

tests/CMakeLists.txt

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,14 @@
1515
#=============================================================================
1616
cmake_minimum_required(VERSION 3.12 FATAL_ERROR)
1717

18+
1819
project(RMM_TESTS LANGUAGES C CXX CUDA)
1920

21+
set(CMAKE_CXX_STANDARD 11)
22+
set(CMAKE_CXX_STANDARD_REQUIRED ON)
23+
set(CMAKE_CUDA_STANDARD 11)
24+
set(CMAKE_CUDA_STANDARD_REQUIRED ON)
25+
2026
###################################################################################################
2127
# - compiler function -----------------------------------------------------------------------------
2228

@@ -62,3 +68,19 @@ ConfigureTest(RMM_TEST "${RMM_TEST_SRC}")
6268
###################################################################################################
6369

6470
enable_testing()
71+
72+
###################################################################################################
73+
### Performance test sources ######################################################################
74+
###################################################################################################
75+
76+
###################################################################################################
77+
# - random_allocate microbenchmark ----------------------------------------------------------------
78+
79+
set(RANDOM_ALLOCATE_SRC
80+
"${CMAKE_CURRENT_SOURCE_DIR}/performance/random_allocate.cu")
81+
82+
add_executable(random_allocate ${RANDOM_ALLOCATE_SRC})
83+
set_target_properties(random_allocate PROPERTIES POSITION_INDEPENDENT_CODE ON)
84+
target_link_libraries(random_allocate rmm)
85+
set_target_properties(random_allocate PROPERTIES
86+
RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/bench")
Lines changed: 291 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,291 @@
1+
/*
2+
* Copyright (c) 2019, NVIDIA CORPORATION.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
#define _BSD_SOURCE
17+
#include <rmm/rmm.h>
18+
#include <stdio.h>
19+
#include <stdlib.h>
20+
#include <string.h>
21+
#include <assert.h>
22+
#include <time.h>
23+
#include <sys/time.h>
24+
#include <assert.h>
25+
26+
#include <iostream>
27+
#include <cstdio>
28+
29+
using namespace std;
30+
31+
#define cudaSucceeded(ans) { gpuAssert((ans), __FILE__, __LINE__); }
32+
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true) {
33+
if (code != cudaSuccess) {
34+
fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
35+
if (abort) exit(code);
36+
}
37+
}
38+
39+
#define rmmSucceeded(ans) { rmmAssert((ans), __FILE__, __LINE__); }
40+
inline void rmmAssert(rmmError_t code, const char *file, int line, bool abort=true) {
41+
if (code != RMM_SUCCESS) {
42+
fprintf(stderr, "RMMassert: %s %d\n", file, line);
43+
if (abort) exit(code);
44+
}
45+
}
46+
47+
cudaError_t (*gpuAlloc)(void** ptr, size_t sz) = cudaMalloc;
48+
cudaError_t (*gpuFree)(void* ptr) = cudaFree;
49+
50+
cudaError_t _rmmAlloc(void **ptr, size_t sz) {
51+
rmmError_t res = RMM_ALLOC(ptr, sz, 0);
52+
rmmSucceeded(res);
53+
if (res != RMM_SUCCESS) return cudaErrorMemoryAllocation;
54+
return cudaSuccess;
55+
}
56+
57+
cudaError_t _rmmFree(void *ptr) {
58+
rmmError_t res = RMM_FREE(ptr, 0);
59+
rmmSucceeded(res);
60+
if (res != RMM_SUCCESS) return cudaErrorMemoryAllocation;
61+
return cudaSuccess;
62+
}
63+
64+
enum Allocator {
65+
cudaDefault = 0,
66+
rmmDefault,
67+
rmmManaged,
68+
rmmDefaultPool,
69+
rmmManagedPool
70+
};
71+
72+
void setAllocator(const std::string alloc) {
73+
if (alloc == "cudaDefault") {
74+
gpuAlloc = cudaMalloc;
75+
gpuFree = cudaFree;
76+
return;
77+
}
78+
else {
79+
rmmOptions_t options{CudaDefaultAllocation, 0, false};
80+
if (alloc == "rmmManaged")
81+
options.allocation_mode = CudaManagedMemory;
82+
else if (alloc == "rmmDefaultPool")
83+
options.allocation_mode = PoolAllocation;
84+
else if (alloc == "rmmManagedPool")
85+
options.allocation_mode =
86+
static_cast<rmmAllocationMode_t>(PoolAllocation |
87+
CudaManagedMemory);
88+
else assert(alloc == "rmmDefault");
89+
rmmInitialize(&options);
90+
gpuAlloc = _rmmAlloc;
91+
gpuFree = _rmmFree;
92+
return;
93+
}
94+
}
95+
96+
int useconds() {
97+
struct timeval t;
98+
gettimeofday(&t, NULL);
99+
return t.tv_sec*1000000+t.tv_usec;
100+
}
101+
102+
#define ALLOC_PROBABILITY 53
103+
#define ALLOC 1
104+
#define FREE 2
105+
#define BAR_UNIT 80
106+
107+
#define MAX_BUFFER_SIZE_BYTE (1UL << 27)
108+
#define MIN_BUFFER_SIZE_BYTE (1UL << 10)
109+
110+
#define KB (1UL << 10)
111+
#define MB (1UL << 20)
112+
113+
// Using 88.7% of the memory to avoid OOM due to fragmentation
114+
#define MEM_USAGE_PERCENTAGE 8870
115+
116+
#define SEED 123898464
117+
118+
int main(int argc, char** argv) {
119+
120+
if (argc < 5) {
121+
printf("Usage: %s <allocator> <num allocations> <num unique sizes> <report average time every n allocations>\n", argv[0]);
122+
printf("Allocator is one of: cudaDefault, rmmDefault, rmmManaged, rmmDefaultPool, or rmmManagedPool\n");
123+
return 1;
124+
}
125+
126+
setAllocator(argv[1]);
127+
128+
int numAllocations = atoi(argv[2]);
129+
int numSizes = atoi(argv[3]);
130+
int averagePerN = atoi(argv[4]);
131+
printf("allocator: %s, numAllocations: %d, numSize: %d, report average every %d allocations\n", argv[1], numAllocations, numSizes, averagePerN);
132+
133+
cudaStream_t st1;
134+
cudaSucceeded(cudaStreamCreate(&st1)); // Not used in this version
135+
136+
//------------------------ creating some random sizes -------------------------//
137+
unsigned *sizes = (unsigned*) malloc(numSizes * sizeof(unsigned));
138+
srand(SEED);
139+
140+
printf("Randomizing sizes between %luKB and %luKB bytes\n", MIN_BUFFER_SIZE_BYTE / KB, MAX_BUFFER_SIZE_BYTE / KB);
141+
for (int i = 0; i < numSizes; i ++) {
142+
sizes[i] = (rand() % (MAX_BUFFER_SIZE_BYTE - MIN_BUFFER_SIZE_BYTE)) + MIN_BUFFER_SIZE_BYTE;
143+
}
144+
//-----------------------------------------------------------------------------//
145+
146+
147+
//----------------------- create a bunch of allocation sizes -----------------//
148+
srand(SEED);
149+
unsigned* allocations = (unsigned*) malloc(numAllocations * sizeof(unsigned));
150+
void** buffers = (void**) malloc(numAllocations * sizeof(void*));
151+
long long unsigned totalAllocatedSize = 0;
152+
for (int i = 0; i < numAllocations; i ++) {
153+
allocations[i] = sizes[rand() % numSizes];
154+
buffers[i] = NULL;
155+
totalAllocatedSize += allocations[i];
156+
}
157+
//----------------------------------------------------------------------------//
158+
159+
size_t totalMem, freeMem;
160+
cudaSucceeded(cudaMemGetInfo(&freeMem, &totalMem));
161+
162+
//----------------- create the exact allocation-free plan --------------------//
163+
const int numAllocFree = numAllocations * 2;
164+
int* allocFrees = (int*)malloc(numAllocFree * sizeof(int));
165+
166+
// This is the array the holds the valid allocations we currently have
167+
int* existingAllocations = (int*) malloc(numAllocations * sizeof(int));
168+
memset(existingAllocations, 0, numAllocations * sizeof(int));
169+
170+
int allocCounter = 0; // Ignore the first allocation index (so that we can negate the index)
171+
size_t currentSize = 0;
172+
size_t maxSize = (size_t)(((freeMem / MB) * MEM_USAGE_PERCENTAGE) / 10000) * MB;
173+
int existingCounter = 0;
174+
175+
// Printing the bar for max size, so user knows how to measure the usage based on the bar length
176+
printf("[ max size: (%8luMB) ] [", freeMem / MB);
177+
for (int j = 0; j < ((maxSize / KB) * BAR_UNIT) / (maxSize / KB); j ++) {
178+
printf("-");
179+
}
180+
printf("]100.0%%\n");
181+
182+
srand(SEED);
183+
for (int i = 0; i < numAllocFree; i++) {
184+
// Decide whether we want to allocate or free
185+
int allocOrFree = 0;
186+
int chance = rand() % 100;
187+
if (chance < ALLOC_PROBABILITY) {
188+
allocOrFree = ALLOC;
189+
if ((currentSize + allocations[allocCounter]) >= maxSize || allocCounter >= numAllocations) {
190+
allocOrFree = FREE;
191+
}
192+
}
193+
else {
194+
allocOrFree = FREE;
195+
if (currentSize <= 0) {
196+
allocOrFree = ALLOC;
197+
}
198+
}
199+
200+
201+
if (allocOrFree == ALLOC) {
202+
allocFrees[i] = allocCounter ++;
203+
204+
// Record this allocation and move on
205+
existingAllocations[existingCounter++] = allocFrees[i];
206+
currentSize += allocations[allocFrees[i]];
207+
assert(currentSize < maxSize);
208+
printf("[%3d] Alloc index %4d with size %7luKB (current sum: %7luMB)[", i, allocFrees[i], allocations[allocFrees[i]] / KB, currentSize / MB);
209+
for (int j = 0; j < ((currentSize / KB) * BAR_UNIT) / (maxSize / KB); j ++) {
210+
printf("-");
211+
}
212+
double usage = (double)((currentSize / KB) * 100) / (double)(maxSize / KB);
213+
printf("]%3.1f%%\n", usage);
214+
}
215+
else {
216+
// Let's randomly pick one of the allocations that is not already free'd
217+
int allocationToFreeIndex = rand() % existingCounter;
218+
allocFrees[i] = existingAllocations[allocationToFreeIndex] * (-1);
219+
220+
// Shift existingAllocations to remove the allocation
221+
for (int j = allocationToFreeIndex + 1; j < existingCounter; j ++) {
222+
existingAllocations[j - 1] = existingAllocations[j];
223+
}
224+
existingCounter --;
225+
currentSize -= allocations[allocFrees[i] * (-1)];
226+
printf("[%3d] Free index %4d with size %7luKB (current sum: %7luMB)[", i, allocFrees[i], allocations[allocFrees[i] * (-1)] / KB, currentSize / MB);
227+
for (int j = 0; j < (currentSize * BAR_UNIT) / maxSize; j ++) {
228+
printf("-");
229+
}
230+
double usage = (double)((currentSize / KB) * 100) / (double)(maxSize / KB);
231+
printf("]%3.1f%%\n", usage);
232+
}
233+
234+
}
235+
236+
printf("Allocation-free plan is created. Executing the plan.\n");
237+
238+
int this_time_malloc, this_time_free, start, aft, sum_time_malloc=0, sum_time_free=0, period_time_malloc=0, period_time_free=0;
239+
int period_count_malloc = 0;
240+
int period_count_free = 0;
241+
242+
start = useconds();
243+
// Do the first allocation outside the for, since its index is 0
244+
cudaSucceeded(gpuAlloc(&buffers[allocFrees[0]], allocations[allocFrees[0]]));
245+
aft = useconds();
246+
247+
this_time_malloc = aft-start;
248+
for (int i = 1; i < numAllocFree; i++) {
249+
if (allocFrees[i] > 0) {
250+
start = useconds();
251+
if (gpuAlloc(&buffers[allocFrees[i]], allocations[allocFrees[i]]) != cudaSuccess) {
252+
printf("failed to allocate %dth allocation with size %luKB\n", i, allocations[allocFrees[i]] / KB);
253+
exit(1);
254+
}
255+
aft = useconds();
256+
this_time_malloc = aft-start;
257+
sum_time_malloc += this_time_malloc;
258+
259+
period_count_malloc ++;
260+
period_time_malloc += this_time_malloc;
261+
if (period_count_malloc >= averagePerN) {
262+
printf("Average malloc: %0.1f us\n", (double)period_time_malloc / (double)period_count_malloc);
263+
period_count_malloc = 0;
264+
period_time_malloc = 0;
265+
}
266+
}
267+
else {
268+
start = useconds();
269+
cudaSucceeded(gpuFree(buffers[allocFrees[i] * (-1)]));
270+
aft = useconds();
271+
this_time_free = aft-start;
272+
sum_time_free += this_time_free;
273+
274+
period_count_free ++;
275+
period_time_free += this_time_free;
276+
if (period_count_free >= averagePerN) {
277+
printf("Average free: %0.1f us\n", (double)period_time_free / (double)period_count_free);
278+
period_count_free = 0;
279+
period_time_free = 0;
280+
}
281+
}
282+
}
283+
284+
cudaSucceeded(cudaStreamSynchronize(st1));
285+
printf("sum allocation size: %llu MB\n", totalAllocatedSize / MB);
286+
printf("Average allocation size: %llu KB\n", (totalAllocatedSize / numAllocations) / KB);
287+
printf("sum malloc: %d ms (average: %0.1f us)\n", sum_time_malloc / 1000, (double)sum_time_malloc / (double)numAllocations);
288+
printf("sum free: %d ms (average: %0.1f us)\n", sum_time_free / 1000, (double)sum_time_free / (double)numAllocations);
289+
290+
return 0;
291+
}

0 commit comments

Comments
 (0)