cudaLATCH/gpuFacade.cpp at master · csp256/cudaLATCH · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
#include <vector>
#include <iostream>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include "cuda.h"
#include "cuda_runtime.h"
#include "opencv2/opencv.hpp"
using namespace std;
using namespace cv;
#include "latch.h"
#include "bitMatcher.h"
#include "gpuFacade.hpp"

// images
// keypoints
// descriptors
// matches

using namespace std;

// Device-side "calloc": allocate B bytes with cudaMalloc into *A, then zero them
// with cudaMemset.
//   A - pointer to the device pointer to fill in (same role as cudaMalloc's first argument)
//   B - size of the allocation in bytes
// Each argument is evaluated exactly once. If either CUDA call fails, the error
// string and call site are printed to stderr; execution then continues, so the
// caller keeps the same control flow it had when failures were ignored.
#define cudaCalloc(A, B) \
    do { \
        void **cudaCalloc_ptr_ = (void **)(A); \
        const size_t cudaCalloc_bytes_ = (B); \
        cudaError_t cudaCalloc_err_ = cudaMalloc(cudaCalloc_ptr_, cudaCalloc_bytes_); \
        if (cudaCalloc_err_ == cudaSuccess) { \
            cudaCalloc_err_ = cudaMemset(*cudaCalloc_ptr_, 0, cudaCalloc_bytes_); \
        } \
        if (cudaCalloc_err_ != cudaSuccess) { \
            fprintf(stderr, "cudaCalloc: %s %s %d\n", \
                    cudaGetErrorString(cudaCalloc_err_), __FILE__, __LINE__); \
        } \
    } while (0)

// Wrap a CUDA runtime call, e.g. checkError(cudaMalloc(...));
// The status is handed to gpuAssert together with the call site's file and line.
// The do/while(0) wrapper makes the macro behave as a single statement, so it is
// safe inside an unbraced if/else; it must be followed by a semicolon.
#define checkError(ans) do { gpuAssert((ans), __FILE__, __LINE__); } while (0)

// Report a failed CUDA status.
//   code  - status returned by a CUDA runtime call
//   file  - source file of the call site (for the diagnostic)
//   line  - source line of the call site (for the diagnostic)
//   abort - when true (the default), terminate the process after printing
// Does nothing when code == cudaSuccess. On failure the message goes to stderr
// and, if abort is set, exit() is called with the numeric error code as status.
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true) {
   if (code != cudaSuccess) {
      fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
      if (abort) exit(code);
   }
}

// Verify the most recent kernel launch; intended to be placed right after a
// <<<...>>> launch. Two checks are made, and either failure prints the error
// string with file/line to stderr and exits with EXIT_FAILURE:
//   1. cudaGetLastError()      - errors already recorded, e.g. a bad launch configuration
//   2. cudaDeviceSynchronize() - blocks until the device is idle and returns any
//                                error raised while the kernel was executing
// Because of the synchronize this serializes host and device.
#define checkLaunchError()                                            \
do {                                                                  \
    /* Check synchronous errors, i.e. pre-launch */                   \
    cudaError_t err = cudaGetLastError();                             \
    if (cudaSuccess != err) {                                         \
        fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\
                 __FILE__, __LINE__, cudaGetErrorString(err) );       \
        exit(EXIT_FAILURE);                                           \
    }                                                                 \
    /* Check asynchronous errors, i.e. kernel failed (ULF). */        \
    /* cudaDeviceSynchronize supersedes the deprecated */             \
    /* cudaThreadSynchronize. */                                      \
    err = cudaDeviceSynchronize();                                    \
    if (cudaSuccess != err) {                                         \
        fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\
                 __FILE__, __LINE__, cudaGetErrorString( err) );      \
        exit(EXIT_FAILURE);                                           \
    }                                                                 \
} while (0)

// Release the resources acquired by the constructor, then reset the device.
//
// NOTE(review): cudaDeviceReset() tears down the whole CUDA context for this
// process, not only this object's resources; that pre-existing behaviour is kept.
gpuFacade::~gpuFacade() {
    // cudaFreeArray(patchTriplets); // This crashes..?

    // Device buffers allocated with cudaCalloc in the constructor.
    cudaFree(d_K);
    cudaFree(d_D1);
    cudaFree(d_D2);
    cudaFree(d_M1);
    cudaFree(d_M2);
    cudaFree(d_mask);
    // d_I is set up by initImage(), whose allocation method is not visible in
    // this file, so it is left for cudaDeviceReset() below to reclaim.

    // Streams and events created in the constructor.
    cudaStreamDestroy(streamKP1);
    cudaStreamDestroy(streamKP2);
    cudaEventDestroy(latchFinished);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);

    // Pinned host buffers (cudaMallocHost) and plain host buffers (malloc).
    cudaFreeHost(h_K1);
    cudaFreeHost(h_K2);
    free(h_M1);
    free(h_M2);

    cudaDeviceReset();
}

// Set up all host and device state used by LATCH() and match().
//   maxKeypoints - capacity, in keypoints, of every keypoint/descriptor/match buffer
//   input_WIDTH  - image width in pixels used to size the device image
//   input_HEIGHT - image height in pixels used to size the device image
//   imageSlots   - currently unused; kept so existing callers keep compiling
// A failing event/stream creation or host allocation is reported on stderr and
// terminates the process (via checkError / exit), instead of leaving the object
// half-initialized.
gpuFacade::gpuFacade(int maxKeypoints, int input_WIDTH, int input_HEIGHT, int imageSlots) {
    (void) imageSlots; // not used by this implementation

    maxKP = maxKeypoints;
    WIDTH = input_WIDTH;
    HEIGHT = input_HEIGHT;

    checkError(cudaEventCreate(&start));
    checkError(cudaEventCreate(&stop));

    // Sizes for device and host pointers
    sizeK = maxKP * sizeof(float) * 4; // K for keypoints: 4 floats each
    sizeI = WIDTH * HEIGHT * sizeof(unsigned char); // I for Image: one byte per pixel
    sizeD = maxKP * (2048 / 32) * sizeof(unsigned int); // D for Descriptor: 64 unsigned ints each
    sizeM = maxKP * sizeof(int); // M for Matches: one int per keypoint
    sizeMask = 64 * sizeof(float); // 64-entry float mask (matches h_mask below)

    // Host pointers: keypoints live in pinned memory, matches in ordinary heap memory.
    checkError(cudaMallocHost((void **) &h_K1, sizeK));
    checkError(cudaMallocHost((void **) &h_K2, sizeK));
    h_M1 = (int*) malloc(sizeM);
    h_M2 = (int*) malloc(sizeM);
    // malloc(0) may legitimately return NULL, so only treat NULL as fatal for a non-empty request.
    if (sizeM != 0 && (h_M1 == NULL || h_M2 == NULL)) {
        fprintf(stderr, "gpuFacade: host allocation of match buffers failed %s %d\n", __FILE__, __LINE__);
        exit(EXIT_FAILURE);
    }
    for (int i=0; i<64; i++) { h_mask[i] = 1.0f; }

    // Device pointers (zero-initialized)
    cudaCalloc((void **) &d_K, sizeK);
    cudaCalloc((void **) &d_D1, sizeD);
    cudaCalloc((void **) &d_D2, sizeD);
    cudaCalloc((void **) &d_M1, sizeM);
    cudaCalloc((void **) &d_M2, sizeM);
    // The mask holds 64 floats, so it is sized with sizeMask; sizing it with
    // sizeM under-allocates whenever maxKP < 64.
    // NOTE(review): initMask() below receives &d_mask and may allocate the
    // buffer itself — if so this allocation is redundant; confirm in latch.
    cudaCalloc((void **) &d_mask, sizeMask);

    // The patch triplet locations for LATCH fits in texture memory cache.
    initPatchTriplets(patchTriplets);
    initImage(&d_I, WIDTH, HEIGHT, &pitch);
    initMask(&d_mask, h_mask);

    // Events allow asynchronous, nonblocking launch of subsequent kernels after a given event has happened,
    // such as completion of a different kernel on a different stream.
    checkError(cudaEventCreate(&latchFinished));
    // You should create a new stream for each bitMatcher kernel you want to launch at once.
    checkError(cudaStreamCreate(&streamKP1));
    checkError(cudaStreamCreate(&streamKP2));
}

// Forward one image to latch().
//   img          - input image, passed through unchanged
//   d_descriptor - device descriptor buffer chosen by the caller
//   keypoints    - int pointer passed through to latch()
//   vectorKP     - KeyPoint vector passed through to latch()
// Everything else comes from this object: the device image and its pitch,
// the h_K1 host keypoint buffer, the d_K device keypoint buffer, maxKP,
// the device mask and the latchFinished event.
void gpuFacade::LATCH(
                Mat img,
                unsigned int* d_descriptor,
                int* keypoints,
                vector<KeyPoint>* vectorKP) {
    latch(img,
          d_I,
          pitch,
          h_K1,
          d_descriptor,
          keypoints,
          maxKP,
          d_K,
          vectorKP,
          d_mask,
          latchFinished);
}

// Forward a matching request to bitMatcher().
//   d_descriptorQ / d_descriptorT - the two device descriptor buffers
//   numKP_Q / numKP_T             - keypoint counts given for each buffer
//   d_matches                     - device buffer passed as the match output argument
//   threshold                     - passed through unchanged
//   stream                        - CUDA stream handed to bitMatcher()
// This object additionally supplies maxKP and the latchFinished event.
void gpuFacade::match(
                unsigned int* d_descriptorQ,
                unsigned int* d_descriptorT,
                int numKP_Q,
                int numKP_T,
                int* d_matches,
                int threshold,
                cudaStream_t stream) {
    bitMatcher(d_descriptorQ,
               d_descriptorT,
               numKP_Q,
               numKP_T,
               maxKP,
               d_matches,
               threshold,
               stream,
               latchFinished);
}

// Forward to getMatches() with this object's maxKP.
//   h_matches - host-side match buffer
//   d_matches - device-side match buffer
void gpuFacade::getResults(int* h_matches, int* d_matches) {
    getMatches(
        maxKP,
        h_matches,
        d_matches);
}