-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtokenizer.cpp
More file actions
126 lines (95 loc) · 3.8 KB
/
tokenizer.cpp
File metadata and controls
126 lines (95 loc) · 3.8 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
#include "tokenizer.hpp"
#include "performance.hpp"
/**
 * Learns the merge table from the raw tokens and then builds the vocabulary.
 *
 * Loads the raw tokens via generateRawTokens(), then runs up to `vocab_size`
 * merge rounds on a working copy of `tokens`. Each round picks a pair with
 * utils::getMax() from the adjacent-pair counts, replaces its occurrences with
 * a new id (`default_vocab_size + i`) and records the merge. The recorded
 * merges are stored in `merges` and passed to makeVocab().
 *
 * NOTE(review): the loop bound treats `vocab_size` as the number of merge
 * rounds, not as the total vocabulary size - confirm this is intended.
 */
void Tokenizer::train() {
    printf("* Training started\n");
    const auto start = performance::startPerf(std::nullopt);
    generateRawTokens();

    std::vector<std::pair<std::pair<int, int>, int> > _merges;
    auto mTokens = tokens;
    for (int i = 0; i < vocab_size; ++i) {
        // With fewer than two tokens there is no adjacent pair left to merge
        // (empty/tiny input, or everything already collapsed). Stop here instead
        // of asking for the maximum of an empty statistics table.
        if (mTokens.size() < 2) break;

        auto stats = getStats(mTokens);
        // Pair selected by utils::getMax (presumably the most frequent one - verify
        // against the helper). New ids are handed out in increasing order, so the
        // first merge receives id `default_vocab_size`.
        auto max = utils::getMax(stats);
        const size_t idx = default_vocab_size + i;
        mTokens = generateTokenWithId(mTokens, max, idx);
        _merges.push_back(std::pair<std::pair<int, int>, int>(max, idx));
        // printf("merging: (%i, %i) -> %lu\n", max.first, max.second, idx);
    }
    performance::endPerf(start);

    merges = _merges;
    makeVocab(_merges);
}
/**
 * Counts how often each pair of adjacent token ids occurs in `ids`.
 *
 * @param ids Token id sequence to scan.
 * @return Map from (left id, right id) to the number of positions where the two
 *         ids appear next to each other. Empty when `ids` holds fewer than two
 *         elements.
 */
std::map<std::pair<int, int>, int> Tokenizer::getStats(const std::vector<int> &ids) const {
    std::map<std::pair<int, int>, int> counted_pairs;
    // Bound the loop with `i + 1 < size` rather than `i < size - 1`: for an empty
    // vector the unsigned subtraction wraps around and the loop would index far
    // past the end of the buffer.
    for (size_t i = 0; i + 1 < ids.size(); ++i) {
        const std::pair<int, int> key{ids[i], ids[i + 1]};
        ++counted_pairs[key];
    }
    return counted_pairs;
}
/**
 * Reads the file named by `file_name` through utils::readFile and appends one
 * token per byte to `tokens`, using the byte's unsigned value as the token id.
 *
 * Existing contents of `tokens` are kept; new ids are appended after them.
 */
void Tokenizer::generateRawTokens() {
    const std::vector<unsigned char> raw_bytes = utils::readFile(this->file_name);
    for (size_t pos = 0; pos < raw_bytes.size(); ++pos) {
        // unsigned char -> int keeps every id in the 0..255 range.
        tokens.push_back(static_cast<int>(raw_bytes[pos]));
    }
}
/**
 * Returns a copy of `ids` in which every non-overlapping occurrence of `pair`
 * (scanning left to right) is replaced by the single id `idx`.
 *
 * @param ids  Token id sequence to rewrite; not modified.
 * @param pair The (left, right) ids that must appear consecutively to be merged.
 * @param idx  Replacement id written in place of each matched pair.
 * @return The rewritten sequence.
 */
std::vector<int> Tokenizer::generateTokenWithId(const std::vector<int> &ids, const std::pair<int, int> &pair,
                                                size_t idx) const {
    std::vector<int> newIds;
    newIds.reserve(ids.size()); // the result is never longer than the input
    const int merged_id = static_cast<int>(idx);

    size_t i = 0;
    while (i < ids.size()) {
        // Only look at ids[i + 1] when it exists; the last element has no right
        // neighbour, and reading past it is undefined behaviour.
        const bool has_next = i + 1 < ids.size();
        if (has_next && ids[i] == pair.first && ids[i + 1] == pair.second) {
            newIds.push_back(merged_id);
            i += 2; // skip both members of the merged pair
        } else {
            newIds.push_back(ids[i]);
            ++i;
        }
    }
    return newIds;
}
/**
 * Fills `vocab` with the byte sequence that every token id stands for.
 *
 * Ids below `default_vocab_size` map to a one-byte sequence holding the id
 * itself. Each merge, processed in the order given, maps its id to the bytes of
 * the left token followed by the bytes of the right token.
 *
 * @param _merges Merge records of the form ((left id, right id), merged id).
 */
void Tokenizer::makeVocab(const std::vector<std::pair<std::pair<int, int>, int> > &_merges) {
    // Base entries: one single-byte sequence per id.
    for (int base_id = 0; base_id < default_vocab_size; ++base_id) {
        auto single_byte = {static_cast<unsigned char>(base_id)};
        vocab[base_id] = single_byte;
    }

    // Merged entries: concatenate the sequences of the two constituent ids.
    for (const auto &[merged_pair, merged_id]: _merges) {
        auto joined = vocab[merged_pair.first];
        const auto &right_bytes = vocab[merged_pair.second];
        joined.insert(joined.end(), right_bytes.begin(), right_bytes.end());
        vocab[merged_id] = joined;
    }
}
/**
 * Converts a sequence of token ids back into text.
 *
 * Looks up each id in `vocab` with `at()` and concatenates the stored byte
 * sequences in order.
 *
 * @param ids Token ids to decode.
 * @return The concatenated bytes as a string.
 */
std::string Tokenizer::decode(const std::vector<int> &ids) const {
    std::string decoded;
    for (size_t pos = 0; pos < ids.size(); ++pos) {
        const auto &bytes = vocab.at(ids[pos]);
        decoded.append(bytes.begin(), bytes.end());
    }
    return decoded;
}
/**
 * Encodes `text` into token ids using the merge table in `merges`.
 *
 * The text is first split into one id per byte (0..255). Then, while at least
 * two tokens remain, the pair chosen by utils::getMinByMerges is replaced by its
 * merged id, until the helper reports that no pair can be merged.
 *
 * Prints the token count before and after encoding to stdout.
 *
 * @param text Input text, treated as a raw byte string.
 * @return The encoded token ids.
 */
std::vector<int> Tokenizer::encode(std::string text) const {
    // Go through unsigned char: plain `char` may be signed, in which case bytes
    // >= 0x80 would become negative ids and never match the 0..255 ids that
    // training derives from unsigned bytes.
    std::vector<int> _tokens;
    _tokens.reserve(text.size());
    for (const char ch: text) {
        _tokens.push_back(static_cast<int>(static_cast<unsigned char>(ch)));
    }
    // %zu is the portable conversion for size_t (%lu is wrong where size_t is
    // not unsigned long).
    printf("\nToken size before encode: %zu", _tokens.size());

    while (_tokens.size() >= 2) {
        auto stats = getStats(_tokens);
        // Pick the next pair to merge. The helper is expected to prefer the pair
        // with the lowest merged id, i.e. the earliest learned merge - verify
        // against utils::getMinByMerges.
        const auto minPair = utils::getMinByMerges(stats, merges);
        if (!minPair) break; // no more merges found. break and return the token list

        const int idx = getIdxOfPair(minPair.value());
        // -1 means the pair is not in the merge table; never splice that sentinel
        // into the output as if it were a token id.
        if (idx < 0) break;
        _tokens = generateTokenWithId(_tokens, minPair.value(), static_cast<size_t>(idx));
    }

    printf("\nToken size after encode: %zu", _tokens.size());
    return _tokens;
}
int Tokenizer::getIdxOfPair(const std::pair<int, int> &pair) const {
int idx = -1; // bad value!
for (const auto &i : merges) {
if(i.first == pair) {
idx = i.second;
break;
}
}
return idx;
}