Skip to content

Commit 245bb92

Browse files
committed
fix: Small loading on super and livros;
1 parent 334eb3a commit 245bb92

File tree

2 files changed

+15
-14
lines changed

2 files changed

+15
-14
lines changed

src/data/scraping/embedding/livros_generator.py

Lines changed: 13 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
import re
44

55
import numpy as np
6-
import pdftotext
6+
from pdftotext import PDF
77

88
if __name__ == "__main__":
99
RM = [
@@ -12,22 +12,23 @@
1212
(r"(http[s]*?:\/\/)+[0-9a-zA-Z.-_\/?=]*\s*", r""), # urls
1313
]
1414

15-
with codecs.open(f"{os.getcwd()}/data/embedding/livros.txt", "wb", encoding="utf-8") as fh:
15+
filename = f"{os.getcwd()}/data/embedding/livros.txt"
16+
17+
with codecs.open(filename, "wb", encoding="utf-8") as fh:
1618
path = f"{os.getcwd()}/data/corpus/pdf/"
1719
for root, dirs, files in os.walk(path):
18-
for filename in files:
19-
with open(f"{path}/{filename}", "rb") as f:
20+
for book_name in files:
21+
with open(f"{path}/{book_name}", "rb") as f:
2022
sentences = []
2123
try:
22-
pdf = pdftotext.PDF(f)
24+
pdf = PDF(f)
2325
for page in pdf:
2426
for s, e in RM:
2527
page = re.sub(s, e, page)
26-
for phrase in page.strip().split("."):
27-
phrase = phrase.strip()
28-
if len(phrase.split()) > 5:
29-
sentences += [phrase]
30-
except:
31-
print(filename)
32-
sentences = list(set(filter(None, sentences)))
28+
for phrase in page.strip().split("."):
29+
if len(phrase := phrase.strip()) > 10:
30+
sentences.append(phrase)
31+
except Exception as e:
32+
print(book_name, str(e))
33+
sentences = list(sorted(set(filter(None, sentences))))
3334
np.savetxt(fh, sentences, fmt="%s")

src/data/scraping/embedding/super_extractor.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66

77
from .utils import loader, save_phrases
88

9-
urls = [f"https://super.abril.com.br/superarquivo/{i}/" for i in range(1, 3)]
9+
urls = [f"https://super.abril.com.br/superarquivo/{i}/" for i in range(1, 459)]
1010

1111

1212
async def get_link_content(url):
@@ -43,6 +43,6 @@ async def get_links(url):
4343
links = list(filter(None, chain(*asyncio.run(loader(get_links, urls)))))
4444
print("Links carregados...")
4545
phrases = filter(None, chain(*asyncio.run(loader(get_link_content, links))))
46-
phrases = [pphrase for phrase in phrases if len(pphrase := phrase.strip()) > 10]
46+
phrases = [pphrase for phrase in phrases if len(pphrase := phrase.strip()) > 5]
4747
save_phrases(phrases, "/data/embedding/mundo.txt")
4848
print()

0 commit comments

Comments
 (0)