11
22#include " Bayes.h"
3-
3+ #include < string>
4+ #include < iostream>
5+ #include < sstream>
6+ #include < fstream>
7+ #include < boost/regex.hpp>
8+ #include < boost/algorithm/string.hpp>
9+ #include < tr1/functional>
10+ #include < locale> // std::locale, std::tolower
11+ #include < math.h>
12+ #include < cmath>
413/*
514bool check_positive_negative(review, data){
615 total = 0
@@ -10,22 +19,70 @@ bool check_positive_negative(review, data){
1019 return 1 if total/float(len(review.split())) >= 0.5 else 0
1120}*/
1221
// Table of parsed delimiter-separated text: one inner vector per input line,
// holding that line's fields as strings.
using Rows = std::vector<std::vector<std::string>>;
23+
1324
1425Bayes::Bayes (){
1526
27+ Rows rows;
28+ std::ifstream input (" labeledTrainData.tsv" );
29+ if (!input) {
30+ std::cout << " unable to load file" << std::endl;
31+ }
32+ char const row_delim = ' \n ' ;
33+ char const field_delim = ' \t ' ;
34+ std::string firstrow;
35+ getline (input, firstrow, row_delim);
36+ for (std::string row; getline (input, row, row_delim); ) {
37+ rows.push_back (Rows::value_type ());
38+ std::istringstream ss (row);
39+ for (std::string field; getline (ss, field, field_delim); ) {
40+ rows.back ().push_back (field);
41+ }
42+ }
43+ for (std::vector<std::vector<std::string> >::iterator it = rows.begin (); it !=rows.end (); ++it){
44+ int tag = std::stoi ((*it)[1 ]);
45+ std::string text = (*it)[2 ];
46+
47+ boost::regex regex (" \\ w+" );
48+ boost::sregex_token_iterator iter (text.begin (), text.end (), regex, 0 );
49+ boost::sregex_token_iterator end;
50+
51+ // Tengo que hacer la iteracion una vez antes del for:
52+ std::string pal_ant (*iter);
53+ std::transform (pal_ant.begin (), pal_ant.end (), pal_ant.begin (), ::tolower);
54+ // palabra está en: pal_ant
55+
56+ for ( ; iter != end; ++iter ) {
57+ // std::tr1::hash<std::string> hash_fn;
58+ /* std::transform((*iter).begin(), (*iter).end(), (*iter).begin(), ::tolower);*/
59+ std::string temp (*iter);
60+ std::transform (temp.begin (), temp.end (), temp.begin (), ::tolower);
61+ entrenar (tag, temp);
62+ // hash_palabras.push_back(str_hash % dimensiones);
63+ /* if (bigramas){
64+ std::string gram(pal_ant + " " + temp);
65+ // printf("Bigrama %s", gram.c_str());
66+ std::transform(gram.begin(), gram.end(), gram.begin(), ::tolower);
67+ std::size_t gram_hash = hash_fn(gram);
68+ hash_palabras.push_back(gram_hash % dimensiones);
69+ pal_ant = temp;
70+ }*/
71+ }
72+ }
73+
1674}
1775
1876Bayes::~Bayes (){
1977
2078}
2179
// Bayes::entrenar: records one training observation of `word` under class `tag`
// by incrementing one of the two slots of the word's entry in `data`.
// This span is diff residue: lines marked `-` show the previous version, which
// took a vector of word pointers and looped over it; lines marked `+` show the
// current single-word version.
22- void Bayes::entrenar (int tag, std::vector<std::string*>* oracion){
80+ // void Bayes::entrenar(int tag, std::vector<std::string*>* oracion){
81+ void Bayes::entrenar (int tag, std::string word ){
2382 std::string* palabra;
24-
25- for (unsigned int i = 0 ; i < oracion->size () ; i++){
2683
2784
28- palabra = (*oracion)[i] ;
// `palabra` now points at the by-value parameter, so it is valid only for the
// duration of this call.
85+ palabra = &word ;
2986 // std::cout << palabra << "\n";
3087
3188
// NOTE(review): the lines between here and the closing brace below are elided
// by the hunk boundary. Presumably they create the `data` entry for a word not
// seen before; confirm against the full file.
@@ -36,13 +93,84 @@ void Bayes::entrenar(int tag, std::vector<std::string*>* oracion){
3693
3794 }
3895
// Slot 0 is incremented for tag == 0 and slot 1 for every other tag. The
// previous condition (`tag > 0`) incremented slot 0 for positive tags, so this
// change swaps which slot a given tag lands in.
39- if (tag > 0 ){
96+ if (tag == 0 ){
4097 (*data[*palabra])[0 ]++;
4198 }else {
4299 (*data[*palabra])[1 ]++;
43100 }
44-
45- }
101+
46102}
47103
// Declaration of a free function `evaluar` (not qualified with Bayes::) taking
// a pointer to a vector of pointers to vectors of string pointers. No
// definition is visible in this part of the file. The `-`/`+` pair differs
// only in whitespace.
48- double evaluar (std::vector<std::vector<std::string*>*>* oraciones);
104+ double evaluar (std::vector<std::vector<std::string*>*>* oraciones);
105+
106+ std::vector<long double >* Bayes::Predicciones (){
107+ Rows rows;
108+ std::ifstream input (" testData.tsv" );
109+ if (!input) {
110+ std::cout << " unable to load file" << std::endl;
111+ }
112+ char const row_delim = ' \n ' ;
113+ char const field_delim = ' \t ' ;
114+ std::string firstrow;
115+ getline (input, firstrow, row_delim);
116+ for (std::string row; getline (input, row, row_delim); ) {
117+ rows.push_back (Rows::value_type ());
118+ std::istringstream ss (row);
119+ for (std::string field; getline (ss, field, field_delim); ) {
120+ rows.back ().push_back (field);
121+ }
122+ }
123+ std::vector<long double >* pred = new std::vector<long double >();
124+ for (std::vector<std::vector<std::string> >::iterator it = rows.begin (); it !=rows.end (); ++it){
125+ std::string id = (*it)[0 ];
126+ ids.push_back (id);
127+
128+ std::string text = (*it)[1 ];
129+
130+ boost::regex regex (" \\ w+" );
131+ boost::sregex_token_iterator iter (text.begin (), text.end (), regex, 0 );
132+ boost::sregex_token_iterator end;
133+
134+ // Tengo que hacer la iteracion una vez antes del for:
135+ // std::string pal_ant(*iter);
136+ // std::transform(pal_ant.begin(), pal_ant.end(), pal_ant.begin(), ::tolower);
137+ // palabra está en: pal_ant
138+
139+ long double suma_probas = 0 ;
140+ float cantidad_palabras = 0 ; // es una cantidad, pero no quiero castear después
141+ for ( ; iter != end; ++iter ) {
142+ // std::tr1::hash<std::string> hash_fn;
143+ /* std::transform((*iter).begin(), (*iter).end(), (*iter).begin(), ::tolower);*/
144+ std::string temp (*iter);
145+ std::transform (temp.begin (), temp.end (), temp.begin (), ::tolower);
146+
147+ // long double proba = ((long double) *data[temp] )[1]/( (long double) *data[temp])[0] + (long double) *data[temp])[1]);
148+ auto it = data.find (temp);
149+ // la clave existe
150+ if (it != data.end ()){
151+ long double proba = ((long double )(*data[temp])[1 ])/(((long double )(*data[temp])[0 ]) + ((long double )(*data[temp])[1 ]));
152+ suma_probas += proba;
153+ cantidad_palabras++;
154+ }
155+
156+ // hash_palabras.push_back(str_hash % dimensiones);
157+ /* if (bigramas){
158+ std::string gram(pal_ant + " " + temp);
159+ // printf("Bigrama %s", gram.c_str());
160+ std::transform(gram.begin(), gram.end(), gram.begin(), ::tolower);
161+ std::size_t gram_hash = hash_fn(gram);
162+ hash_palabras.push_back(gram_hash % dimensiones);
163+ pal_ant = temp;
164+ }*/
165+ }
166+ // pred->push_back(suma_probas/cantidad_palabras);
167+ if (cantidad_palabras != 0 ){
168+ pred->push_back (suma_probas/cantidad_palabras);
169+ } else {
170+ puts (" Un error" );
171+ pred->push_back (0.5 ); // no se que está pasando acá
172+ }
173+ }
174+ input.close ();
175+ return pred;
176+ }
0 commit comments