Skip to content

Commit ee254f8

Browse files
committed
fix/feat: Lots of fixes and improvements;
1 parent 59dd3fa commit ee254f8

File tree

13 files changed

+119
-249
lines changed

13 files changed

+119
-249
lines changed

src/data/scraping/embedding/bula_extractor.py

Lines changed: 3 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,15 @@
1-
import codecs
21
import os
32
import random
43
import time
54
from concurrent.futures import ProcessPoolExecutor
65
from string import ascii_lowercase
76

8-
import numpy as np
97
from selenium import webdriver
108
from selenium.webdriver.chrome.service import Service as ChromeService
119
from selenium.webdriver.common.by import By
1210

11+
from .utils import chunks, save_phrases
12+
1313

1414
def chrome_options():
1515
options = webdriver.ChromeOptions()
@@ -67,27 +67,13 @@ def browser_loader(url):
6767
phrases = filter(None, phrases)
6868
phrases = [pphrase for phrase in phrases if len(pphrase := phrase.strip()) > 10]
6969

70-
sentences = []
71-
try:
72-
with codecs.open(f"{os.getcwd()}/data/embedding/bulas.txt", "rb", encoding="utf-8") as fh:
73-
sentences = fh.readlines()
74-
except:
75-
pass
76-
with codecs.open(f"{os.getcwd()}/data/embedding/bulas.txt", "wb", encoding="utf-8") as fh:
77-
sents = list(set(sentences + phrases))
78-
np.savetxt(fh, sents, fmt="%s")
79-
70+
save_phrases(phrases, "/data/embedding/bulas.txt")
8071
time.sleep(random.randint(1, 3))
8172
time.sleep(random.randint(4, 6))
8273

8374
driver.close()
8475

8576

86-
def chunks(lst, n):
87-
for i in range(0, len(lst), n):
88-
yield lst[i : i + n]
89-
90-
9177
if __name__ == "__main__":
9278
urls = [
9379
f"https://consultaremedios.com.br/bulas/{w}"

src/data/scraping/embedding/compress.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -14,15 +14,15 @@
1414
warnings.filterwarnings("ignore")
1515

1616

17-
def load_sentences(tipo, sentence):
18-
sentence = sentence[0] if tipo == 0 else " ".join(sentence)
19-
sentence = normalizar.fit(sentence)
17+
def load_sentences(type_, sentence):
18+
sentence = sentence[0] if type_ == 0 else " ".join(sentence)
19+
sentence = cleanup.fit(sentence)
2020
if len(sentence) >= 5:
2121
return " ".join(sentence).strip()
2222
return None
2323

2424

25-
def carregar_sentencas(exc, fh, filename):
25+
def load_sentences_files(exc, fh, filename):
2626
if not os.path.exists(filename):
2727
return
2828

@@ -46,7 +46,7 @@ def corpus_nltk(exc, fh, model):
4646

4747
print("Carregando sentenças...")
4848

49-
normalizar = CleanUp(
49+
cleanup = CleanUp(
5050
remove_accentuation=False,
5151
remove_4_comment=False,
5252
remove_numbers=False,
@@ -86,6 +86,6 @@ def corpus_nltk(exc, fh, model):
8686
print("Carregando sentenças dos corpus criados...")
8787
for filename in filenames:
8888
print(f"Carregando sentenças: {filename}")
89-
carregar_sentencas(exc, fh, filename)
89+
load_sentences_files(exc, fh, filename)
9090

9191
print(f"Tempo total da compressao: {round(time.time() - start, 2)}s")

src/data/scraping/embedding/fapesp_extractor.py

Lines changed: 8 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,11 @@
11
import asyncio
2-
import codecs
3-
import os
42
from itertools import chain
53

64
import httpx
7-
import numpy as np
8-
from aiomultiprocess import Pool
95
from bs4 import BeautifulSoup
106

7+
from .utils import chunks, loader, save_phrases
8+
119
main_urls = [
1210
"https://revistapesquisa.fapesp.br/category/impressa/humanidades/",
1311
"https://revistapesquisa.fapesp.br/saude/",
@@ -32,7 +30,7 @@ async def get_link_content(url):
3230
html = BeautifulSoup(r.content, "lxml")
3331
posts = html.findAll("div", {"class": "post-content"})
3432
for post in posts:
35-
phrases += post.get_text().split(".")
33+
phrases += post.get_text().strip().split(".")
3634
except Exception as e:
3735
print(f"2. Erro ao carregar frases: {url}, {str(e)}")
3836
return phrases
@@ -56,27 +54,10 @@ async def get_links(url):
5654
return links
5755

5856

59-
async def loader(func, urls):
60-
async with Pool() as pool:
61-
result = await pool.map(func, urls)
62-
return result
63-
64-
6557
if __name__ == "__main__":
66-
links = filter(None, chain(*asyncio.run(loader(get_links, urls))))
67-
phrases = filter(None, chain(*asyncio.run(loader(get_link_content, links))))
68-
phrases = [pphrase for phrase in phrases if len(pphrase := phrase.strip()) > 10]
69-
70-
try:
71-
sentences = []
72-
with codecs.open(f"{os.getcwd()}/data/embedding/fapesp.txt", "rb", encoding="utf-8") as fh:
73-
sentences = fh.readlines()
74-
sentences = [sent.strip() for sent in sentences]
75-
with codecs.open(f"{os.getcwd()}/data/embedding/fapesp.txt", "wb", encoding="utf-8") as fh:
76-
sents = list(set(sentences + phrases))
77-
np.savetxt(fh, sents, fmt="%s")
78-
except:
79-
with codecs.open(f"{os.getcwd()}/data/embedding/fapesp_sec.txt", "wb", encoding="utf-8") as fh:
80-
sents = list(set(phrases))
81-
np.savetxt(fh, sents, fmt="%s")
58+
for chunked in chunks(urls, 10):
59+
links = filter(None, chain(*asyncio.run(loader(get_links, chunked))))
60+
phrases = filter(None, chain(*asyncio.run(loader(get_link_content, links))))
61+
phrases = [pphrase for phrase in phrases if len(pphrase := phrase.strip()) > 10]
62+
save_phrases(phrases, "/data/embedding/fapesp.txt")
8263
print()

src/data/scraping/embedding/frases_extractor.py

Lines changed: 11 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,11 @@
11
import asyncio
2-
import codecs
3-
import os
42
from itertools import chain
53

64
import httpx
7-
import numpy as np
8-
from aiomultiprocess import Pool
95
from bs4 import BeautifulSoup
106

7+
from .utils import loader, save_phrases
8+
119
main_urls = list(
1210
set(
1311
[
@@ -149,7 +147,9 @@
149147
)
150148
)
151149

152-
urls = main_urls + [f"{url}page/{i}/" if type_ == 0 else f"{url}{i}/" for i in range(2, 55) for url, type_ in main_urls]
150+
urls = [url for url, p in main_urls] + [
151+
f"{url}page/{i}/" if type_ == 0 else f"{url}{i}/" for i in range(2, 55) for url, type_ in main_urls
152+
]
153153

154154

155155
async def get_link_content(url):
@@ -159,34 +159,19 @@ async def get_link_content(url):
159159
r = await client.get(url, timeout=240)
160160
if r.status_code == 200:
161161
html = BeautifulSoup(r.content, "lxml")
162-
posts = html.findAll("p", {"class": "frase"})
162+
if not (posts := html.findAll("p", {"class": "frase"})):
163+
posts = html.findAll("div", {"class": "card"})
163164
for post in posts:
164-
phrases += BeautifulSoup(post.get_text(), "lxml").get_text().replace("\n", " ").split(".")
165+
phrases += (
166+
BeautifulSoup(post.get_text().strip(), "lxml").get_text().strip().replace("\n", " ").split(".")
167+
)
165168
except Exception as e:
166169
print(f"1. Erro ao carregar frases: {url}, {str(e)}")
167170
return phrases
168171

169172

170-
async def loader(func, urls):
171-
async with Pool() as pool:
172-
result = await pool.map(func, urls)
173-
return result
174-
175-
176173
if __name__ == "__main__":
177174
phrases = filter(None, chain(*asyncio.run(loader(get_link_content, urls))))
178175
phrases = list(set([pphrase for phrase in phrases if len(pphrase := phrase.strip()) > 10]))
179-
180-
try:
181-
sentences = []
182-
with codecs.open(f"{os.getcwd()}/data/embedding/frases.txt", "rb", encoding="utf-8") as fh:
183-
sentences = fh.readlines()
184-
sentences = [sent.strip() for sent in sentences]
185-
with codecs.open(f"{os.getcwd()}/data/embedding/frases.txt", "wb", encoding="utf-8") as fh:
186-
sents = list(set(sentences + phrases))
187-
np.savetxt(fh, sents, fmt="%s")
188-
except:
189-
with codecs.open(f"{os.getcwd()}/data/embedding/frases_sec.txt", "wb", encoding="utf-8") as fh:
190-
sents = list(set(phrases))
191-
np.savetxt(fh, sents, fmt="%s")
176+
save_phrases(phrases, "/data/embedding/frases.txt")
192177
print()

src/data/scraping/embedding/g1_extractor.py

Lines changed: 3 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,11 @@
11
import asyncio
2-
import codecs
3-
import os
42
from itertools import chain
53

64
import feedparser
7-
import numpy as np
8-
from aiomultiprocess import Pool
95
from bs4 import BeautifulSoup
106

7+
from .utils import loader, save_phrases
8+
119
rss = [
1210
"http://g1.globo.com/dynamo/brasil/rss2.xml",
1311
"http://g1.globo.com/dynamo/carros/rss2.xml",
@@ -54,12 +52,6 @@ async def get_link_content(url):
5452
return phrases
5553

5654

57-
async def loader(func, urls):
58-
async with Pool() as pool:
59-
result = await pool.map(func, urls)
60-
return result
61-
62-
6355
if __name__ == "__main__":
6456
print("Iniciando G1")
6557
print("-" * 30)
@@ -70,17 +62,5 @@ async def loader(func, urls):
7062
)
7163
)
7264
phrases = [pphrase for phrase in phrases if len(pphrase := phrase.strip()) > 10]
73-
74-
try:
75-
sentences = []
76-
with codecs.open(f"{os.getcwd()}/data/embedding/g1.txt", "rb", encoding="utf-8") as fh:
77-
sentences = fh.readlines()
78-
sentences = [sent.strip() for sent in sentences]
79-
with codecs.open(f"{os.getcwd()}/data/embedding/g1.txt", "wb", encoding="utf-8") as fh:
80-
sents = sorted(list(set(sentences + phrases)))
81-
np.savetxt(fh, sents, fmt="%s")
82-
except:
83-
with codecs.open(f"{os.getcwd()}/data/embedding/g1_sec.txt", "wb", encoding="utf-8") as fh:
84-
sents = sorted(list(set(phrases)))
85-
np.savetxt(fh, sents, fmt="%s")
65+
save_phrases(phrases, "/data/embedding/g1.txt")
8666
print()
Lines changed: 23 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -1,21 +1,25 @@
1-
import time
1+
import asyncio
22
from datetime import datetime
3+
from itertools import chain
4+
from random import randint
35

46
import arrow
57
import httpx
68
from aiomultiprocess import Pool
79
from bs4 import BeautifulSoup
810

11+
from .utils import save_phrases
12+
913

1014
async def get_link_content(url):
1115
phrases = []
1216
try:
13-
time.sleep(1.5)
17+
await asyncio.sleep(randint(1, 3))
1418
async with httpx.AsyncClient() as client:
1519
r = await client.get(url, timeout=240)
1620
if r.status_code == 200:
1721
html = BeautifulSoup(r.content, "lxml")
18-
posts = html.findAll("article", {"class": "hstEntry__content"})
22+
posts = html.findAll("div", {"class": "content"})
1923
for post in posts:
2024
phrases += post.get_text(strip=True).split(".")
2125
except Exception as e:
@@ -26,19 +30,19 @@ async def get_link_content(url):
2630
async def get_links(url):
2731
links = []
2832
try:
33+
await asyncio.sleep(randint(1, 2))
2934
async with httpx.AsyncClient() as client:
30-
r = await client.get(url, timeout=240)
35+
r = await client.get(url, timeout=240, follow_redirects=True)
3136
if r.status_code == 200:
3237
html = BeautifulSoup(r.content, "lxml")
33-
links_ = html.findAll("article", {"class": "block-opacable hstBlock"})
38+
links_ = html.findAll("div", {"class": "card-img"})
3439
for link in links_:
3540
href = link.find("a").get("href")
36-
if "https://history.uol.com.br" not in href:
37-
href = f"https://history.uol.com.br{href}"
41+
if "https://www.canalhistory.com.br" not in href:
42+
href = f"https://www.canalhistory.com.br{href}"
3843
links.append(href)
39-
except Exception:
40-
# print(f"2. Erro ao carregar posts: {url}, {str(e)}")
41-
pass
44+
except Exception as e:
45+
print(f"2. Erro ao carregar links: {url}, {str(e)}")
4246
return links
4347

4448

@@ -53,25 +57,13 @@ async def carregar(func, urls):
5357
print("-" * 30)
5458
start_date = arrow.get(datetime(2020, 1, 1))
5559
links = [
56-
f"https://history.uol.com.br/hoje-na-historia/{start_date.shift(days=d).format('YYYY-MM-DD')}"
57-
for d in range(1, 366)
60+
f"https://history.uol.com.br/hoje-na-historia/{start_date.shift(days=d).format('DD/MM')}" for d in range(1, 366)
5861
]
59-
print(links)
60-
# links = list(filter(None, chain(*asyncio.run(carregar(get_links, links)))))
61-
# print(f"Links carregados... {len(links)}")
62-
# phrases = filter(None, chain(*asyncio.run(carregar(get_link_content, links))))
63-
# phrases = [phrase.strip() for phrase in phrases if len(phrase) > 15 and not phrase.startswith("Imagem:")]
64-
65-
# try:
66-
# sentences = []
67-
# with codecs.open(f"{os.getcwd()}/data/embedding/history.txt", "rb", encoding="utf-8") as fh:
68-
# sentences = fh.readlines()
69-
# sentences = [sent.strip() for sent in sentences]
70-
# with codecs.open(f"{os.getcwd()}/data/embedding/history.txt", "wb", encoding="utf-8") as fh:
71-
# sents = sorted(list(set(sentences + phrases)))
72-
# np.savetxt(fh, sents, fmt="%s")
73-
# except:
74-
# with codecs.open(f"{os.getcwd()}/data/embedding/history_sec.txt", "wb", encoding="utf-8") as fh:
75-
# sents = sorted(list(set(phrases)))
76-
# np.savetxt(fh, sents, fmt="%s")
77-
# print()
62+
links = list(filter(None, chain(*asyncio.run(carregar(get_links, links)))))
63+
print(f"Links carregados... {len(links)}")
64+
phrases = filter(None, chain(*asyncio.run(carregar(get_link_content, links))))
65+
phrases = [
66+
pphrase for phrase in phrases if len(pphrase := phrase.strip()) > 10 and not phrase.startswith("Imagem:")
67+
]
68+
save_phrases(phrases, "/data/embedding/history.txt")
69+
print()

0 commit comments

Comments
 (0)