Skip to content

Commit ee254f8

Browse files
committed
fix/feat: Lots of fixes and improvements;
1 parent 59dd3fa commit ee254f8

File tree

13 files changed

+119
-249
lines changed

13 files changed

+119
-249
lines changed

src/data/scraping/embedding/bula_extractor.py

Lines changed: 3 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,15 @@
1-
import codecs
21
import os
32
import random
43
import time
54
from concurrent.futures import ProcessPoolExecutor
65
from string import ascii_lowercase
76

8-
import numpy as np
97
from selenium import webdriver
108
from selenium.webdriver.chrome.service import Service as ChromeService
119
from selenium.webdriver.common.by import By
1210

11+
from .utils import chunks, save_phrases
12+
1313

1414
def chrome_options():
1515
options = webdriver.ChromeOptions()
@@ -67,27 +67,13 @@ def browser_loader(url):
6767
phrases = filter(None, phrases)
6868
phrases = [pphrase for phrase in phrases if len(pphrase := phrase.strip()) > 10]
6969

70-
sentences = []
71-
try:
72-
with codecs.open(f"{os.getcwd()}/data/embedding/bulas.txt", "rb", encoding="utf-8") as fh:
73-
sentences = fh.readlines()
74-
except:
75-
pass
76-
with codecs.open(f"{os.getcwd()}/data/embedding/bulas.txt", "wb", encoding="utf-8") as fh:
77-
sents = list(set(sentences + phrases))
78-
np.savetxt(fh, sents, fmt="%s")
79-
70+
save_phrases(phrases, "/data/embedding/bulas.txt")
8071
time.sleep(random.randint(1, 3))
8172
time.sleep(random.randint(4, 6))
8273

8374
driver.close()
8475

8576

86-
def chunks(lst, n):
87-
for i in range(0, len(lst), n):
88-
yield lst[i : i + n]
89-
90-
9177
if __name__ == "__main__":
9278
urls = [
9379
f"https://consultaremedios.com.br/bulas/{w}"

src/data/scraping/embedding/compress.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -14,15 +14,15 @@
1414
warnings.filterwarnings("ignore")
1515

1616

17-
def load_sentences(tipo, sentence):
18-
sentence = sentence[0] if tipo == 0 else " ".join(sentence)
19-
sentence = normalizar.fit(sentence)
17+
def load_sentences(type_, sentence):
18+
sentence = sentence[0] if type_ == 0 else " ".join(sentence)
19+
sentence = cleanup.fit(sentence)
2020
if len(sentence) >= 5:
2121
return " ".join(sentence).strip()
2222
return None
2323

2424

25-
def carregar_sentencas(exc, fh, filename):
25+
def load_sentences_files(exc, fh, filename):
2626
if not os.path.exists(filename):
2727
return
2828

@@ -46,7 +46,7 @@ def corpus_nltk(exc, fh, model):
4646

4747
print("Carregando sentenças...")
4848

49-
normalizar = CleanUp(
49+
cleanup = CleanUp(
5050
remove_accentuation=False,
5151
remove_4_comment=False,
5252
remove_numbers=False,
@@ -86,6 +86,6 @@ def corpus_nltk(exc, fh, model):
8686
print("Carregando sentenças dos corpus criados...")
8787
for filename in filenames:
8888
print(f"Carregando sentenças: {filename}")
89-
carregar_sentencas(exc, fh, filename)
89+
load_sentences_files(exc, fh, filename)
9090

9191
print(f"Tempo total da compressao: {round(time.time() - start, 2)}s")

src/data/scraping/embedding/fapesp_extractor.py

Lines changed: 8 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,11 @@
11
import asyncio
2-
import codecs
3-
import os
42
from itertools import chain
53

64
import httpx
7-
import numpy as np
8-
from aiomultiprocess import Pool
95
from bs4 import BeautifulSoup
106

7+
from .utils import chunks, loader, save_phrases
8+
119
main_urls = [
1210
"https://revistapesquisa.fapesp.br/category/impressa/humanidades/",
1311
"https://revistapesquisa.fapesp.br/saude/",
@@ -32,7 +30,7 @@ async def get_link_content(url):
3230
html = BeautifulSoup(r.content, "lxml")
3331
posts = html.findAll("div", {"class": "post-content"})
3432
for post in posts:
35-
phrases += post.get_text().split(".")
33+
phrases += post.get_text().strip().split(".")
3634
except Exception as e:
3735
print(f"2. Erro ao carregar frases: {url}, {str(e)}")
3836
return phrases
@@ -56,27 +54,10 @@ async def get_links(url):
5654
return links
5755

5856

59-
async def loader(func, urls):
60-
async with Pool() as pool:
61-
result = await pool.map(func, urls)
62-
return result
63-
64-
6557
if __name__ == "__main__":
66-
links = filter(None, chain(*asyncio.run(loader(get_links, urls))))
67-
phrases = filter(None, chain(*asyncio.run(loader(get_link_content, links))))
68-
phrases = [pphrase for phrase in phrases if len(pphrase := phrase.strip()) > 10]
69-
70-
try:
71-
sentences = []
72-
with codecs.open(f"{os.getcwd()}/data/embedding/fapesp.txt", "rb", encoding="utf-8") as fh:
73-
sentences = fh.readlines()
74-
sentences = [sent.strip() for sent in sentences]
75-
with codecs.open(f"{os.getcwd()}/data/embedding/fapesp.txt", "wb", encoding="utf-8") as fh:
76-
sents = list(set(sentences + phrases))
77-
np.savetxt(fh, sents, fmt="%s")
78-
except:
79-
with codecs.open(f"{os.getcwd()}/data/embedding/fapesp_sec.txt", "wb", encoding="utf-8") as fh:
80-
sents = list(set(phrases))
81-
np.savetxt(fh, sents, fmt="%s")
58+
for chunked in chunks(urls, 10):
59+
links = filter(None, chain(*asyncio.run(loader(get_links, chunked))))
60+
phrases = filter(None, chain(*asyncio.run(loader(get_link_content, links))))
61+
phrases = [pphrase for phrase in phrases if len(pphrase := phrase.strip()) > 10]
62+
save_phrases(phrases, "/data/embedding/fapesp.txt")
8263
print()

src/data/scraping/embedding/frases_extractor.py

Lines changed: 11 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,11 @@
11
import asyncio
2-
import codecs
3-
import os
42
from itertools import chain
53

64
import httpx
7-
import numpy as np
8-
from aiomultiprocess import Pool
95
from bs4 import BeautifulSoup
106

7+
from .utils import loader, save_phrases
8+
119
main_urls = list(
1210
set(
1311
[
@@ -149,7 +147,9 @@
149147
)
150148
)
151149

152-
urls = main_urls + [f"{url}page/{i}/" if type_ == 0 else f"{url}{i}/" for i in range(2, 55) for url, type_ in main_urls]
150+
urls = [url for url, p in main_urls] + [
151+
f"{url}page/{i}/" if type_ == 0 else f"{url}{i}/" for i in range(2, 55) for url, type_ in main_urls
152+
]
153153

154154

155155
async def get_link_content(url):
@@ -159,34 +159,19 @@ async def get_link_content(url):
159159
r = await client.get(url, timeout=240)
160160
if r.status_code == 200:
161161
html = BeautifulSoup(r.content, "lxml")
162-
posts = html.findAll("p", {"class": "frase"})
162+
if not (posts := html.findAll("p", {"class": "frase"})):
163+
posts = html.findAll("div", {"class": "card"})
163164
for post in posts:
164-
phrases += BeautifulSoup(post.get_text(), "lxml").get_text().replace("\n", " ").split(".")
165+
phrases += (
166+
BeautifulSoup(post.get_text().strip(), "lxml").get_text().strip().replace("\n", " ").split(".")
167+
)
165168
except Exception as e:
166169
print(f"1. Erro ao carregar frases: {url}, {str(e)}")
167170
return phrases
168171

169172

170-
async def loader(func, urls):
171-
async with Pool() as pool:
172-
result = await pool.map(func, urls)
173-
return result
174-
175-
176173
if __name__ == "__main__":
177174
phrases = filter(None, chain(*asyncio.run(loader(get_link_content, urls))))
178175
phrases = list(set([pphrase for phrase in phrases if len(pphrase := phrase.strip()) > 10]))
179-
180-
try:
181-
sentences = []
182-
with codecs.open(f"{os.getcwd()}/data/embedding/frases.txt", "rb", encoding="utf-8") as fh:
183-
sentences = fh.readlines()
184-
sentences = [sent.strip() for sent in sentences]
185-
with codecs.open(f"{os.getcwd()}/data/embedding/frases.txt", "wb", encoding="utf-8") as fh:
186-
sents = list(set(sentences + phrases))
187-
np.savetxt(fh, sents, fmt="%s")
188-
except:
189-
with codecs.open(f"{os.getcwd()}/data/embedding/frases_sec.txt", "wb", encoding="utf-8") as fh:
190-
sents = list(set(phrases))
191-
np.savetxt(fh, sents, fmt="%s")
176+
save_phrases(phrases, "/data/embedding/frases.txt")
192177
print()

src/data/scraping/embedding/g1_extractor.py

Lines changed: 3 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,11 @@
11
import asyncio
2-
import codecs
3-
import os
42
from itertools import chain
53

64
import feedparser
7-
import numpy as np
8-
from aiomultiprocess import Pool
95
from bs4 import BeautifulSoup
106

7+
from .utils import loader, save_phrases
8+
119
rss = [
1210
"http://g1.globo.com/dynamo/brasil/rss2.xml",
1311
"http://g1.globo.com/dynamo/carros/rss2.xml",
@@ -54,12 +52,6 @@ async def get_link_content(url):
5452
return phrases
5553

5654

57-
async def loader(func, urls):
58-
async with Pool() as pool:
59-
result = await pool.map(func, urls)
60-
return result
61-
62-
6355
if __name__ == "__main__":
6456
print("Iniciando G1")
6557
print("-" * 30)
@@ -70,17 +62,5 @@ async def loader(func, urls):
7062
)
7163
)
7264
phrases = [pphrase for phrase in phrases if len(pphrase := phrase.strip()) > 10]
73-
74-
try:
75-
sentences = []
76-
with codecs.open(f"{os.getcwd()}/data/embedding/g1.txt", "rb", encoding="utf-8") as fh:
77-
sentences = fh.readlines()
78-
sentences = [sent.strip() for sent in sentences]
79-
with codecs.open(f"{os.getcwd()}/data/embedding/g1.txt", "wb", encoding="utf-8") as fh:
80-
sents = sorted(list(set(sentences + phrases)))
81-
np.savetxt(fh, sents, fmt="%s")
82-
except:
83-
with codecs.open(f"{os.getcwd()}/data/embedding/g1_sec.txt", "wb", encoding="utf-8") as fh:
84-
sents = sorted(list(set(phrases)))
85-
np.savetxt(fh, sents, fmt="%s")
65+
save_phrases(phrases, "/data/embedding/g1.txt")
8666
print()
Lines changed: 23 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -1,21 +1,25 @@
1-
import time
1+
import asyncio
22
from datetime import datetime
3+
from itertools import chain
4+
from random import randint
35

46
import arrow
57
import httpx
68
from aiomultiprocess import Pool
79
from bs4 import BeautifulSoup
810

11+
from .utils import save_phrases
12+
913

1014
async def get_link_content(url):
1115
phrases = []
1216
try:
13-
time.sleep(1.5)
17+
await asyncio.sleep(randint(1, 3))
1418
async with httpx.AsyncClient() as client:
1519
r = await client.get(url, timeout=240)
1620
if r.status_code == 200:
1721
html = BeautifulSoup(r.content, "lxml")
18-
posts = html.findAll("article", {"class": "hstEntry__content"})
22+
posts = html.findAll("div", {"class": "content"})
1923
for post in posts:
2024
phrases += post.get_text(strip=True).split(".")
2125
except Exception as e:
@@ -26,19 +30,19 @@ async def get_link_content(url):
2630
async def get_links(url):
2731
links = []
2832
try:
33+
await asyncio.sleep(randint(1, 2))
2934
async with httpx.AsyncClient() as client:
30-
r = await client.get(url, timeout=240)
35+
r = await client.get(url, timeout=240, follow_redirects=True)
3136
if r.status_code == 200:
3237
html = BeautifulSoup(r.content, "lxml")
33-
links_ = html.findAll("article", {"class": "block-opacable hstBlock"})
38+
links_ = html.findAll("div", {"class": "card-img"})
3439
for link in links_:
3540
href = link.find("a").get("href")
36-
if "https://history.uol.com.br" not in href:
37-
href = f"https://history.uol.com.br{href}"
41+
if "https://www.canalhistory.com.br" not in href:
42+
href = f"https://www.canalhistory.com.br{href}"
3843
links.append(href)
39-
except Exception:
40-
# print(f"2. Erro ao carregar posts: {url}, {str(e)}")
41-
pass
44+
except Exception as e:
45+
print(f"2. Erro ao carregar links: {url}, {str(e)}")
4246
return links
4347

4448

@@ -53,25 +57,13 @@ async def carregar(func, urls):
5357
print("-" * 30)
5458
start_date = arrow.get(datetime(2020, 1, 1))
5559
links = [
56-
f"https://history.uol.com.br/hoje-na-historia/{start_date.shift(days=d).format('YYYY-MM-DD')}"
57-
for d in range(1, 366)
60+
f"https://history.uol.com.br/hoje-na-historia/{start_date.shift(days=d).format('DD/MM')}" for d in range(1, 366)
5861
]
59-
print(links)
60-
# links = list(filter(None, chain(*asyncio.run(carregar(get_links, links)))))
61-
# print(f"Links carregados... {len(links)}")
62-
# phrases = filter(None, chain(*asyncio.run(carregar(get_link_content, links))))
63-
# phrases = [phrase.strip() for phrase in phrases if len(phrase) > 15 and not phrase.startswith("Imagem:")]
64-
65-
# try:
66-
# sentences = []
67-
# with codecs.open(f"{os.getcwd()}/data/embedding/history.txt", "rb", encoding="utf-8") as fh:
68-
# sentences = fh.readlines()
69-
# sentences = [sent.strip() for sent in sentences]
70-
# with codecs.open(f"{os.getcwd()}/data/embedding/history.txt", "wb", encoding="utf-8") as fh:
71-
# sents = sorted(list(set(sentences + phrases)))
72-
# np.savetxt(fh, sents, fmt="%s")
73-
# except:
74-
# with codecs.open(f"{os.getcwd()}/data/embedding/history_sec.txt", "wb", encoding="utf-8") as fh:
75-
# sents = sorted(list(set(phrases)))
76-
# np.savetxt(fh, sents, fmt="%s")
77-
# print()
62+
links = list(filter(None, chain(*asyncio.run(carregar(get_links, links)))))
63+
print(f"Links carregados... {len(links)}")
64+
phrases = filter(None, chain(*asyncio.run(carregar(get_link_content, links))))
65+
phrases = [
66+
pphrase for phrase in phrases if len(pphrase := phrase.strip()) > 10 and not phrase.startswith("Imagem:")
67+
]
68+
save_phrases(phrases, "/data/embedding/history.txt")
69+
print()

0 commit comments

Comments
 (0)