Skip to content

Commit 721fa05

Browse files
authored
Update V1.5.2
Эксперимент с Ё-омографами. Переделка механизма скачивания моделей. Обновление моделей.
1 parent 1227bd4 commit 721fa05

File tree

6 files changed

+104
-26
lines changed

6 files changed

+104
-26
lines changed

README.md

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -13,15 +13,15 @@ RUAccent - это библиотека для автоматической ра
1313
```
1414
## Параметры работы
1515

16-
load(omograph_model_size='big', use_dictionary=False, custom_dict={}, custom_homographs={}
16+
load(omograph_model_size='big', use_dictionary=False, custom_dict={}, custom_homographs={}, load_yo_homographs_model=False)
1717

1818

19-
- На данный момент доступны две модели: **big** (рекомендуется к использованию) и **small**.
20-
- Модель **big** имеет 178 миллионов параметров, а **small** 10 миллионов
21-
- Переменная **use_dict** отвечает за загрузку всего словаря (требуется больше ОЗУ), иначе все ударения расставляет нейросеть.
19+
- На данный момент доступны три модели: **big** (рекомендуется к использованию), **medium** и **small**.
20+
- Модель **big** имеет 178 миллионов параметров, **medium** 85 миллионов, а **small** 42 миллиона
21+
- Переменная **use_dictionary** отвечает за загрузку всего словаря (требуется больше ОЗУ), иначе все ударения расставляет нейросеть.
2222
- Переменная **custom_homographs** отвечает за добавление своих омографов. Формат такой: `{'слово-омограф': ['вариант ударения 1', 'вариант ударения 2']}`.
2323
- Функция **custom_dict** отвечает за добавление своих вариантов ударений в словарь. Формат такой: `{'слово': 'сл+ово с удар+ением'}`
24-
24+
- Также вы можете протестировать **beta-функцию** разрешения Ё-омографов, установив `load_yo_homographs_model=True` в `load()`, а также `accentizer.process_all(text, process_yo_omographs=True)` или `accentizer.process_yo(text, process_yo_omographs=True)`.
2525

2626

2727
## Пример использования

ruaccent/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
"""Russian accentizer"""
22

3-
__version__ = "1.5.1"
3+
__version__ = "1.5.2"
44

55

66
from .ruaccent import RUAccent

ruaccent/omograph_model.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,6 @@ def softmax(self, x):
1818
def classify(self, text, hypotheses):
1919
hypotheses_probs = []
2020
text = re.sub(r'\s+(?=(?:[,.?!:;…]))', r'', text)
21-
2221
for h in hypotheses:
2322
inputs = self.tokenizer(text, h, return_tensors="np")
2423
inputs = {k: v.astype(np.int64) for k, v in inputs.items()}

ruaccent/ruaccent.py

Lines changed: 68 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1,51 +1,80 @@
11
import json
22
import pathlib
3-
from huggingface_hub import snapshot_download
3+
from huggingface_hub import HfFileSystem, hf_hub_download
44
import os
55
from os.path import join as join_path
66
from .omograph_model import OmographModel
77
from .accent_model import AccentModel
8+
from .yo_omograph_model import YomographModel
89
from .text_split import split_by_sentences
910
import re
1011

1112

1213
class RUAccent:
1314
def __init__(self, workdir=None):
1415
self.omograph_model = OmographModel()
16+
self.yo_omograph_model = YomographModel()
1517
self.accent_model = AccentModel()
18+
self.fs = HfFileSystem()
19+
self.omograph_models_paths = {'big': '/nn/nn_omograph/big', 'medium': '/nn/nn_omograph/medium', 'small': '/nn/nn_omograph/small'}
20+
self.accentuator_paths = ['/nn/nn_accent', '/dictionary']
21+
self.yo_omograph_path = ['/nn/nn_yo_omograph']
1622
if not workdir:
1723
self.workdir = str(pathlib.Path(__file__).resolve().parent)
1824
else:
1925
self.workdir = workdir
2026

27+
2128
def load(
2229
self,
2330
omograph_model_size="big",
2431
use_dictionary=False,
2532
custom_dict={},
2633
custom_homographs={},
34+
load_yo_homographs_model=False,
2735
repo="TeraTTS/accentuator",
2836
):
2937

38+
self.load_yo_homographs_model = load_yo_homographs_model
3039
self.custom_dict = custom_dict
3140
self.accents = {}
3241
if not os.path.exists(
3342
join_path(self.workdir, "dictionary")
34-
) or not os.path.exists(join_path(self.workdir, "nn")):
35-
snapshot_download(
36-
repo_id=repo,
37-
ignore_patterns=["*.md", "*.gitattributes"],
38-
local_dir=self.workdir,
39-
local_dir_use_symlinks=False,
40-
)
43+
):
44+
for path in self.accentuator_paths:
45+
files = self.fs.ls(repo + path)
46+
for file in files:
47+
hf_hub_download(repo_id=repo, local_dir_use_symlinks=False, local_dir=self.workdir, filename=file['name'].replace(repo+'/', ''))
48+
49+
if not os.path.exists(join_path(self.workdir, "nn")):
50+
os.mkdir(join_path(self.workdir, "nn"))
51+
52+
if not os.path.exists(join_path(self.workdir, "nn", "nn_omograph", omograph_model_size)):
53+
model_path = self.omograph_models_paths.get(omograph_model_size, None)
54+
if model_path:
55+
files = self.fs.ls(repo + model_path)
56+
for file in files:
57+
hf_hub_download(repo_id=repo, local_dir_use_symlinks=False, local_dir=self.workdir, filename=file['name'].replace(repo+'/', ''))
58+
else:
59+
raise FileNotFoundError
60+
4161
self.omographs = json.load(
4262
open(join_path(self.workdir, "dictionary/omographs.json"), encoding='utf-8')
4363
)
44-
#self.yo_omographs = json.load(
45-
# open(join_path(self.workdir, "dictionary/yo_omographs.json"), encoding='utf-8')
46-
#)
47-
#self.omographs.update(self.yo_omographs)
4864
self.omographs.update(custom_homographs)
65+
66+
if load_yo_homographs_model:
67+
if not os.path.exists(join_path(self.workdir, "nn", "nn_yo_omograph")):
68+
for path in self.yo_omograph_path:
69+
files = self.fs.ls(repo + path)
70+
for file in files:
71+
hf_hub_download(repo_id=repo, local_dir_use_symlinks=False, local_dir=self.workdir, filename=file['name'].replace(repo+'/', ''))
72+
73+
self.yo_omographs = json.load(
74+
open(join_path(self.workdir, "dictionary/yo_omographs.json"), encoding='utf-8')
75+
)
76+
self.yo_omograph_model.load(join_path(self.workdir, "nn/nn_yo_omograph/"))
77+
4978
self.yo_words = json.load(
5079
open(join_path(self.workdir, "dictionary/yo_words.json"), encoding='utf-8')
5180
)
@@ -57,16 +86,13 @@ def load(
5786

5887
self.accents.update(self.custom_dict)
5988

60-
if omograph_model_size not in ["small", "big"]:
61-
raise NotImplementedError
62-
6389
self.omograph_model.load(
6490
join_path(self.workdir, f"nn/nn_omograph/{omograph_model_size}/")
65-
6691
)
6792
self.accent_model.load(join_path(self.workdir, "nn/nn_accent/"))
6893

6994

95+
7096
def split_by_words(self, string):
7197
result = re.findall(r"\w*(?:\+\w+)*|[^\w\s]+", string.lower())
7298
return [res for res in result if res]
@@ -115,6 +141,26 @@ def _process_omographs(self, text):
115141
splitted_text[omograph["position"]] = cls
116142
return splitted_text
117143

144+
def _process_yo_omographs(self, text):
145+
splitted_text = text
146+
147+
founded_omographs = []
148+
for i, word in enumerate(splitted_text):
149+
variants = self.yo_omographs.get(word)
150+
if variants:
151+
founded_omographs.append(
152+
{"word": word, "variants": variants, "position": i}
153+
)
154+
for omograph in founded_omographs:
155+
splitted_text[
156+
omograph["position"]
157+
] = f"<w>{splitted_text[omograph['position']]}</w>"
158+
cls = self.yo_omograph_model.classify(
159+
" ".join(splitted_text), omograph["variants"]
160+
)
161+
splitted_text[omograph["position"]] = cls
162+
return splitted_text
163+
118164
def _process_accent(self, text):
119165
splitted_text = text
120166

@@ -126,23 +172,27 @@ def _process_accent(self, text):
126172
splitted_text[i] = stressed_word
127173
return splitted_text
128174

129-
def process_yo(self, text):
175+
def process_yo(self, text, process_yo_omographs=False):
130176
sentences = split_by_sentences(text)
131177
outputs = []
132178
for sentence in sentences:
133179
text = self.split_by_words(sentence)
134180
processed_text = self._process_yo(text)
181+
if process_yo_omographs:
182+
processed_text = self._process_yo_omographs(processed_text)
135183
processed_text = " ".join(processed_text)
136184
processed_text = self.delete_spaces_before_punc(processed_text)
137185
outputs.append(processed_text)
138186
return " ".join(outputs)
139187

140-
def process_all(self, text):
188+
def process_all(self, text, process_yo_omographs=False):
141189
sentences = split_by_sentences(text)
142190
outputs = []
143191
for sentence in sentences:
144192
text = self.split_by_words(sentence)
145193
processed_text = self._process_yo(text)
194+
if process_yo_omographs:
195+
processed_text = self._process_yo_omographs(processed_text)
146196
processed_text = self._process_omographs(processed_text)
147197
processed_text = self._process_accent(processed_text)
148198
processed_text = " ".join(processed_text)

ruaccent/yo_omograph_model.py

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
import numpy as np
2+
from onnxruntime import InferenceSession
3+
from transformers import AutoTokenizer
4+
import re
5+
6+
class YomographModel:
    """Picks the most plausible yo-homograph variant for a sentence.

    Scores each candidate hypothesis against the text with an ONNX
    sequence-pair classifier and returns the highest-scoring hypothesis.
    Call ``load()`` before ``classify()``.
    """

    def __init__(self):
        # Tokenizer and session are attached lazily by load().
        pass

    def load(self, path):
        """Load the tokenizer and the ONNX inference session from *path*."""
        self.tokenizer = AutoTokenizer.from_pretrained(path)
        self.session = InferenceSession(f"{path}/model.onnx", providers=['CPUExecutionProvider'])

    def softmax(self, x):
        """Numerically stable softmax over all elements of *x*."""
        shifted = np.exp(x - np.max(x))
        return shifted / shifted.sum()

    def classify(self, text, hypotheses):
        """Return the hypothesis from *hypotheses* scored highest for *text*.

        The text/hypothesis pair is fed to the model; the probability of
        label index 1 is used as the score for that hypothesis.
        """
        # Normalize spacing: drop whitespace that precedes punctuation.
        text = re.sub(r'\s+(?=(?:[,.?!:;…]))', r'', text)
        scores = []
        for hypothesis in hypotheses:
            encoded = self.tokenizer(text, hypothesis, return_tensors="np")
            encoded = {name: tensor.astype(np.int64) for name, tensor in encoded.items()}
            logits = self.session.run(None, encoded)[0]
            probs = self.softmax(logits)
            # probs has one row per input pair; take P(label == 1) of the first.
            scores.append(float(probs[0][1]))
        best = scores.index(max(scores))
        return hypotheses[best]

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
setup(
44
name='ruaccent',
5-
version='1.5.1',
5+
version='1.5.2',
66
author='Denis Petrov',
77
author_email='arduino4b@gmail.com',
88
description='A Russian text accentuation tool',

0 commit comments

Comments
 (0)