-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathMinHash.cpp
More file actions
49 lines (42 loc) · 1.31 KB
/
MinHash.cpp
File metadata and controls
49 lines (42 loc) · 1.31 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
//
// Created by fedor on 03.06.16.
//
#include <iostream>
#include <values.h>
#include <unistd.h>
#include "MinHash.h"
// Appends `numHashFunctions` new lookup tables to `hashTables`. Each table
// holds `numWordsInDictionary` entries, and every entry is filled with a
// value drawn from rand().
//
// numHashFunctions      number of tables to append; values <= 0 append nothing.
// numWordsInDictionary  number of entries in each appended table.
//
// Side effect: re-seeds the C PRNG from the current time on every call.
// time() has one-second resolution, so two calls made within the same second
// produce identical tables.
void MinHash::generateTables(int numHashFunctions, unsigned long numWordsInDictionary) {
    /* initialize random seed: */
    srand(static_cast<unsigned int>(time(NULL)));

    for (int i = 0; i < numHashFunctions; ++i) {
        hashTables.push_back(vector<int>(numWordsInDictionary));

        // Fill the table that was just appended rather than hashTables[i]:
        // when hashTables already holds tables on entry, index i would point
        // at an old table and leave the new one zero-filled.
        vector<int> &table = hashTables.back();
        for (unsigned long j = 0; j < numWordsInDictionary; ++j) {
            table[j] = rand();
        }
    }
}
// Selects one element of `words` per table stored in `hashTables`.
//
// For each table, applyHashFunction() returns a position inside `words`; the
// element at that position is appended to the result, so the result has one
// entry per table, in table order.
//
// Returns an empty vector when `words` is empty.
vector<int> MinHash::chooseWords(vector<int> &words) {
    vector<int> picked;

    if (!words.empty()) {
        for (size_t tableIdx = 0; tableIdx != hashTables.size(); ++tableIdx) {
            const int winnerPos = applyHashFunction(hashTables[tableIdx], words);
            picked.push_back(words[winnerPos]);
        }
    }

    return picked;
}
// Returns the position in `words` whose table value is smallest.
//
// Each element of `words` is used directly as an index into `hashFunction`;
// the looked-up values are compared and the position (not the word itself)
// of the minimum is returned. The comparison is strict, so on ties the
// earliest position wins. Returns 0 when `words` is empty or when no
// looked-up value is below MAXINT.
//
// NOTE(review): indexing is unchecked; this assumes every element of `words`
// satisfies 0 <= id < hashFunction.size() -- confirm against callers.
// NOTE(review): both parameters are taken by value, so each call copies the
// table and the word list. Switching to const references would also require
// changing the declaration in MinHash.h.
int MinHash::applyHashFunction(vector<int> hashFunction, vector<int> words) {
    int bestPos = 0;
    int bestVal = MAXINT;

    for (size_t pos = 0; pos < words.size(); ++pos) {
        const int candidate = hashFunction[words[pos]];
        if (candidate < bestVal) {
            bestVal = candidate;
            bestPos = static_cast<int>(pos);
        }
    }

    return bestPos;
}
// Constructs a MinHash by delegating to generateTables() with the same
// arguments.
//
// numHashFunctions      forwarded to generateTables() as the table count.
// numWordsInDictionary  forwarded to generateTables() as the per-table size.
MinHash::MinHash(int numHashFunctions, unsigned long numWordsInDictionary) {
    this->generateTables(numHashFunctions, numWordsInDictionary);
}