|
3 | 3 | import re |
4 | 4 |
|
5 | 5 | import numpy as np |
6 | | -import pdftotext |
| 6 | +from pdftotext import PDF |
7 | 7 |
|
8 | 8 | if __name__ == "__main__": |
9 | 9 | RM = [ |
|
12 | 12 | (r"(http[s]*?:\/\/)+[0-9a-zA-Z.-_\/?=]*\s*", r""), # urls |
13 | 13 | ] |
14 | 14 |
|
15 | | - with codecs.open(f"{os.getcwd()}/data/embedding/livros.txt", "wb", encoding="utf-8") as fh: |
| 15 | + filename = f"{os.getcwd()}/data/embedding/livros.txt" |
| 16 | + |
| 17 | + with codecs.open(filename, "wb", encoding="utf-8") as fh: |
16 | 18 | path = f"{os.getcwd()}/data/corpus/pdf/" |
17 | 19 | for root, dirs, files in os.walk(path): |
18 | | - for filename in files: |
19 | | - with open(f"{path}/{filename}", "rb") as f: |
| 20 | + for book_name in files: |
| 21 | + with open(f"{path}/{book_name}", "rb") as f: |
20 | 22 | sentences = [] |
21 | 23 | try: |
22 | | - pdf = pdftotext.PDF(f) |
| 24 | + pdf = PDF(f) |
23 | 25 | for page in pdf: |
24 | 26 | for s, e in RM: |
25 | 27 | page = re.sub(s, e, page) |
26 | | - for phrase in page.strip().split("."): |
27 | | - phrase = phrase.strip() |
28 | | - if len(phrase.split()) > 5: |
29 | | - sentences += [phrase] |
30 | | - except: |
31 | | - print(filename) |
32 | | - sentences = list(set(filter(None, sentences))) |
| 28 | + for phrase in page.strip().split("."): |
| 29 | + if len(phrase := phrase.strip()) > 10: |
| 30 | + sentences.append(phrase) |
| 31 | + except Exception as e: |
| 32 | + print(book_name, str(e)) |
| 33 | + sentences = list(sorted(set(filter(None, sentences)))) |
33 | 34 | np.savetxt(fh, sentences, fmt="%s") |
0 commit comments