Skip to content

Commit 7defdb4

Browse files
committed
fix: Keep some punctuation;
1 parent ba5b124 commit 7defdb4

File tree

1 file changed

+5
-1
lines changed

1 file changed

+5
-1
lines changed

src/data/processing/utils.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ def get_stopwords():
32 32   rms = ["um", "não", "mais", "muito", "sem", "estou", "sou"]
33 33   for rm in rms:
34 34       del stpwords[stpwords.index(rm)]
35    - for rm in ["?", "!", ","]:
   35 + for rm in ["?", "!", ",", ";", ":", "'", '"']:
36 36       del punkt[punkt.index(rm)]
37 37   return stpwords, punkt
38 38

@@ -203,6 +203,10 @@ def fit(self, phrase):
203 203   phrase = phrase.lower()
204 204   phrase = phrase.replace("?", " ? ")
205 205   phrase = phrase.replace("!", " ! ")
    206 + phrase = phrase.replace("'", " ' ")
    207 + phrase = phrase.replace('"', ' " ')
    208 + phrase = phrase.replace(";", " ; ")
    209 + phrase = phrase.replace(":", " : ")
206 210   # Remove strings padrão existente, como urls
207 211   for o, r in self.RM:
208 212       phrase = re.sub(o, r, phrase, flags=re.MULTILINE)

0 commit comments

Comments
 (0)