1- import time
1+ import asyncio
22from datetime import datetime
3+ from itertools import chain
4+ from random import randint
35
46import arrow
57import httpx
68from aiomultiprocess import Pool
79from bs4 import BeautifulSoup
810
11+ from .utils import save_phrases
12+
913
1014async def get_link_content (url ):
1115 phrases = []
1216 try :
13- time .sleep (1.5 )
17+ await asyncio .sleep (randint ( 1 , 3 ) )
1418 async with httpx .AsyncClient () as client :
1519 r = await client .get (url , timeout = 240 )
1620 if r .status_code == 200 :
1721 html = BeautifulSoup (r .content , "lxml" )
18- posts = html .findAll ("article " , {"class" : "hstEntry__content " })
22+ posts = html .findAll ("div " , {"class" : "content " })
1923 for post in posts :
2024 phrases += post .get_text (strip = True ).split ("." )
2125 except Exception as e :
@@ -26,19 +30,19 @@ async def get_link_content(url):
async def get_links(url):
    """Fetch *url* and collect absolute article links from ``div.card-img`` cards.

    Parameters
    ----------
    url : str
        Index page to scrape.

    Returns
    -------
    list[str]
        Absolute article URLs found on the page. On any error the failure is
        logged and whatever was collected so far (possibly ``[]``) is returned.
    """
    links = []
    try:
        # Random 1-2s delay so concurrent workers don't hammer the server.
        await asyncio.sleep(randint(1, 2))
        async with httpx.AsyncClient() as client:
            r = await client.get(url, timeout=240, follow_redirects=True)
            if r.status_code == 200:
                html = BeautifulSoup(r.content, "lxml")
                for card in html.findAll("div", {"class": "card-img"}):
                    anchor = card.find("a")
                    # Cards without an <a href=...> are skipped instead of
                    # raising AttributeError / appending None.
                    if anchor is None:
                        continue
                    href = anchor.get("href")
                    if not href:
                        continue
                    # startswith (not substring containment): only genuinely
                    # relative links get the site prefix prepended.
                    if not href.startswith("https://www.canalhistory.com.br"):
                        href = f"https://www.canalhistory.com.br{href}"
                    links.append(href)
    except Exception as e:
        print(f"2. Erro ao carregar links: {url}, {str(e)}")
    return links
4347
4448
@@ -53,25 +57,13 @@ async def carregar(func, urls):
print("-" * 30)
# Base date only supplies the day-of-year sequence; pages are keyed by DD/MM.
start_date = arrow.get(datetime(2020, 1, 1))
links = [
    f"https://history.uol.com.br/hoje-na-historia/{start_date.shift(days=d).format('DD/MM')}"
    for d in range(1, 366)
]
# Fan out over all day pages, flatten the per-page link lists, drop empties.
links = list(filter(None, chain(*asyncio.run(carregar(get_links, links)))))
print(f"Links carregados... {len(links)}")
# Fetch every article and flatten its sentences into one stream.
phrases = filter(None, chain(*asyncio.run(carregar(get_link_content, links))))
# Keep only meaningful sentences. Both the length check and the "Imagem:"
# filter are applied to the *stripped* text, so sentences with leading
# whitespace (frequent after splitting on ".") are filtered correctly.
phrases = [
    stripped
    for phrase in phrases
    if len(stripped := phrase.strip()) > 10 and not stripped.startswith("Imagem:")
]
save_phrases(phrases, "/data/embedding/history.txt")
print()
0 commit comments