1.2.0 Update

Den4ikAI · web-flow · commit 34ba472dd0b2 · 2023-08-31T14:46:28.000+08:00
1. Переход на ORT (onnxruntime)
2. Обновлена модель расстановки ударений в неизвестных словах
3. Расширен набор обучающих данных 1.2M -> 3.3M
4. Добавлена возможность отключить словарь
diff --git a/README.md b/README.md
@@ -11,7 +11,7 @@ RUAccent - это библиотека для автоматической ра
 
 RUAccent предоставляет следующие методы:
 
-- `load(omograph_model_size='medium', dict_load_startup=False)`: Загрузка моделей и словарей. На данные момент доступны две модели: medium    (рекомендуется к использованию) и small. Переменная dict_load_startup отвечает за загрузку всего словаря (требуется больше ОЗУ), либо во время работы для необходимых слов (экономит ОЗУ, но требует быстрые ЖД и работает медленее)
+- `load(omograph_model_size='medium', dict_load_startup=False, disable_accent_dict=False)`: Загрузка моделей и словарей. На данный момент доступны две модели: medium (рекомендуется к использованию) и small. Переменная dict_load_startup отвечает за загрузку всего словаря (требуется больше ОЗУ), либо во время работы для необходимых слов (экономит ОЗУ, но требует быстрые ЖД и работает медленнее). Переменная disable_accent_dict отключает использование словаря (все ударения расставляет нейросеть). Данная функция экономит ОЗУ, по скорости работы сопоставима со всем словарём в ОЗУ.
 
 - `process_all(text)`: Обрабатывает текст всем сразу (ёфикация, расстановка ударений и расстановка ударений в словах-омографах)
 
@@ -24,13 +24,13 @@ RUAccent предоставляет следующие методы:
 from ruaccent import RUAccent
 
 accentizer = RUAccent()
-accentizer.load(omograph_model_size='medium', dict_load_startup=False)
+accentizer.load(omograph_model_size='medium', dict_load_startup=False, disable_accent_dict=False)
 
 text = 'на двери висит замок'
-print(accentizer.process_all(text)) # на двер+и вис+ит зам+ок
+print(accentizer.process_all(text))
 
 text = 'ежик нашел в лесу ягоды'
-print(accentizer.process_yo(text)) # ёжик нашел в лесу ягоды
+print(accentizer.process_yo(text))
 ```
 
 
diff --git a/ruaccent/accent_model.py b/ruaccent/accent_model.py
@@ -1,27 +1,37 @@
-import torch
+import numpy as np
+import json
+from onnxruntime import InferenceSession
 from .char_tokenizer import CharTokenizer
-from transformers import AutoModelForTokenClassification
 
 class AccentModel:
     def __init__(self) -> None:
-        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+        pass
+
     def load(self, path):
-        self.model = AutoModelForTokenClassification.from_pretrained(path).to(self.device)
+        self.session = InferenceSession(f"{path}/model.onnx", providers=["CPUExecutionProvider"])
+
+        with open(f"{path}/config.json", "r") as f:
+            self.id2label = json.load(f)["id2label"]
         self.tokenizer = CharTokenizer.from_pretrained(path)
-    
-    def render_stress(self, word, token_classes):
-        if 'STRESS' in token_classes:
-            index = token_classes.index('STRESS')
-            word = list(word)
-            word[index-1] = '+' + word[index-1]
-            return ''.join(word)
-        else:
-            return word
-    
+        self.tokenizer.model_input_names = ["input_ids", "attention_mask"]
+
+    def render_stress(self, text, pred):
+        text = list(text)
+        i = 0
+        for chunk in pred:
+            if chunk != "NO":
+                text[i - 1] = "+" + text[i - 1]
+            i += 1
+        text = "".join(text)
+        return text
+
     def put_accent(self, word):
-        inputs = self.tokenizer(word, return_tensors="pt").to(self.device)
-        with torch.no_grad():
-            logits = self.model(**inputs).logits
-            predictions = torch.argmax(logits, dim=2)
-            predicted_token_class = [self.model.config.id2label[t.item()] for t in predictions[0]]
-        return self.render_stress(word, predicted_token_class)
+        inputs = self.tokenizer(word, return_tensors="np")
+        inputs = {k: v.astype(np.int64) for k, v in inputs.items()}
+        outputs = self.session.run(None, inputs)
+        output_names = {output_key.name: idx for idx, output_key in enumerate(self.session.get_outputs())}
+        logits = outputs[output_names["logits"]]
+        labels = np.argmax(logits, axis=-1)[0]
+        labels = [self.id2label[str(label)] for label in labels]
+        stressed_word = self.render_stress(word, labels)
+        return stressed_word
diff --git a/ruaccent/omograph_model.py b/ruaccent/omograph_model.py
@@ -1,21 +1,29 @@
-from transformers import AutoModelForSequenceClassification, AutoTokenizer
-import torch
+import numpy as np
+from onnxruntime import InferenceSession
+from transformers import AutoTokenizer
 
 class OmographModel:
-    def __init__(self) -> None:
-        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-        
+    def __init__(self):
+        pass
+
     def load(self, path):
-        self.nli_model = AutoModelForSequenceClassification.from_pretrained(path, torch_dtype=torch.bfloat16).to(self.device)
         self.tokenizer = AutoTokenizer.from_pretrained(path)
-        
+        self.session = InferenceSession(f"{path}/model.onnx", providers=['CPUExecutionProvider'])
+
+    def softmax(self, x):
+        e_x = np.exp(x - np.max(x))
+        return e_x / e_x.sum()
+
     def classify(self, text, hypotheses):
-        encodings = self.tokenizer.batch_encode_plus([(text, hyp) for hyp in hypotheses], return_tensors='pt', padding=True)
-        input_ids = encodings['input_ids'].to(self.device)
-        with torch.no_grad():
-            logits = self.nli_model(input_ids)[0]
-            entail_contradiction_logits = logits[:,[0,2]]
-            probs = entail_contradiction_logits.softmax(dim=1)
-            prob_label_is_true = [float(p[1]) for p in probs]
+        hypotheses_probs = []
+        for h in hypotheses:
+            inputs = self.tokenizer(text, h, return_tensors="np")
+            inputs = {k: v.astype(np.int64) for k, v in inputs.items()}
+            outputs = self.session.run(None, inputs)[0]
+            entail_contradiction_logits = outputs[:, [0, 2]]
+            probs = self.softmax(entail_contradiction_logits)
+            prob_label_is_true = [float(p[1]) for p in probs][0]
+            hypotheses_probs.append(prob_label_is_true)
+        return hypotheses[hypotheses_probs.index(max(hypotheses_probs))]
+
 
-        return hypotheses[prob_label_is_true.index(max(prob_label_is_true))]
diff --git a/setup.py b/setup.py
@@ -2,7 +2,7 @@
 
 setup(
     name='ruaccent',
-    version='1.0.0',
+    version='1.2.0',
     author='Denis Petrov',
     author_email='arduino4b@gmail.com',
     description='A Russian text accentuation tool',
@@ -11,9 +11,10 @@
     packages=find_packages(),
     install_requires=[
         'huggingface_hub',
-        'torch==1.13.1',
+        'onnxruntime',
         'transformers',
-        'sentencepiece'
+        'sentencepiece',
+        'numpy'
     ],
     classifiers=[
         'Development Status :: 5 - Production/Stable',