andreysm
diff --git a/‎README.md‎
Lines changed: 7 additions & 9 deletions b/‎README.md‎
Lines changed: 7 additions & 9 deletions
diff --git a/‎ruaccent/__init__.py‎
Lines changed: 6 additions & 0 deletions b/‎ruaccent/__init__.py‎
Lines changed: 6 additions & 0 deletions
diff --git a/‎ruaccent/accent_model.py‎
Lines changed: 45 additions & 0 deletions b/‎ruaccent/accent_model.py‎
Lines changed: 45 additions & 0 deletions
diff --git a/‎ruaccent/char_tokenizer.py‎
Lines changed: 106 additions & 0 deletions b/‎ruaccent/char_tokenizer.py‎
Lines changed: 106 additions & 0 deletions
diff --git a/‎ruaccent/omograph_model.py‎
Lines changed: 90 additions & 0 deletions b/‎ruaccent/omograph_model.py‎
Lines changed: 90 additions & 0 deletions
@@ -16,26 +16,24 @@ RUAccent - это библиотека для автоматической ра
    ```
 ## Параметры работы
 
-    load(omograph_model_size='big_poetry', use_dictionary=True, custom_dict={})
+    load(omograph_model_size='turbo', use_dictionary=True, custom_dict={}, device="CPU", workdir=None)
 
- - На данный момент доступно 6 моделей. **big** (рекомендуется к использованию), **medium** и **small**. Рекомендуются к использованию модели версии **poetry**. Их названия **big_poetry**, **medium_poetry**, **small_poetry**.
- - Модель **big** имеет 178 миллионов параметров, **medium** 85 миллионов, а **small** 12 миллионов
+ - На данный момент доступны 4 модели: **turbo**, **big_poetry**, **medium_poetry**, **small_poetry**.
  - Переменная **use_dictionary** отвечает за загрузку всего словаря (требуется больше ОЗУ), иначе все ударения расставляет нейросеть. 
  - Параметр **custom_dict** отвечает за добавление своих вариантов ударений в словарь. Формат такой: `{'слово': 'сл+ово с удар+ением'}`
-
-    **Для работы требуется 5 гигабайт ОЗУ**
+ - Параметр **device** задаёт устройство для вычислений: `CPU` или `CUDA`. **Для работы с CUDA требуется установить onnxruntime-gpu и CUDA.**
+ - Параметр **workdir** — строка с путём к каталогу, в который скачиваются модели.
+    
+    **Для стабильной работы требуется минимум 3 гигабайта ОЗУ**
 ## Пример использования
 ```python
 from ruaccent import RUAccent
 
 accentizer = RUAccent()
-accentizer.load(omograph_model_size='big_poetry', use_dictionary=True)
+accentizer.load(omograph_model_size='turbo', use_dictionary=True)
 
 text = 'на двери висит замок.'
 print(accentizer.process_all(text))
-
-text = 'ежик нашел в лесу ягоды.'
-print(accentizer.process_yo(text))
 ```
 
 Файлы моделей и словарей располагаются по [ссылке](https://huggingface.co/ruaccent/accentuator). Мы будем признательны фидбеку на [telegram аккаунт](https://t.me/chckdskeasfsd)
@@ -0,0 +1,6 @@
+"""Russian accentizer"""
+
+__version__ = "1.5.6.1"
+
+
+from .ruaccent import RUAccent
@@ -0,0 +1,45 @@
+import numpy as np
+import json
+from onnxruntime import InferenceSession
+from .char_tokenizer import CharTokenizer
+
+def softmax(x):
+    """Return the softmax of ``x`` taken along its last axis.
+
+    The global maximum of ``x`` is subtracted before exponentiating so that
+    ``np.exp`` does not overflow; the shift cancels out in the ratio.
+    """
+    shifted = x - np.max(x)
+    exponentials = np.exp(shifted)
+    normalizer = exponentials.sum(axis=-1, keepdims=True)
+    return exponentials / normalizer
+
+class AccentModel:
+    """Per-character stress predictor backed by an ONNX model.
+
+    ``load`` must be called before ``put_accent``: it creates the ONNX
+    session, the id-to-label map and the tokenizer the other methods use.
+    """
+
+    # Minimum score a stress label needs before a "+" is inserted.
+    STRESS_THRESHOLD = 0.55
+
+    def __init__(self) -> None:
+        pass
+
+    def load(self, path, device="CPU"):
+        """Load the model, label map and tokenizer found under ``path``.
+
+        Args:
+            path: Directory containing ``model.onnx``, ``config.json`` and the
+                files ``CharTokenizer.from_pretrained`` reads.
+            device: ``"CUDA"`` selects ``CUDAExecutionProvider``; any other
+                value selects ``CPUExecutionProvider``.
+        """
+        provider = "CUDAExecutionProvider" if device == "CUDA" else "CPUExecutionProvider"
+        self.session = InferenceSession(f"{path}/model.onnx", providers=[provider])
+
+        # Explicit encoding so parsing does not depend on the platform default.
+        with open(f"{path}/config.json", "r", encoding="utf-8") as f:
+            self.id2label = json.load(f)["id2label"]
+        self.tokenizer = CharTokenizer.from_pretrained(path)
+
+    def render_stress(self, text, pred):
+        """Insert ``+`` before every character predicted as stressed.
+
+        Args:
+            text: The word the predictions refer to.
+            pred: Sequence of ``{"label": str, "score": float}`` dicts.
+                Prediction ``i`` is applied to character ``i - 1``.
+
+        Returns:
+            ``text`` with a ``+`` in front of each character whose label is
+            neither ``"NO"`` nor ``"STRESS_SECONDARY"`` and whose score is at
+            least ``STRESS_THRESHOLD``.
+        """
+        chars = list(text)
+        for position, chunk in enumerate(pred):
+            label = chunk["label"]
+            is_stressed = (
+                label != "NO"
+                and label != "STRESS_SECONDARY"
+                and chunk["score"] >= self.STRESS_THRESHOLD
+            )
+            if not is_stressed:
+                continue
+            char_index = position - 1
+            # Predictions outside the word (the first position and anything
+            # past the last character) have no character to mark. Without this
+            # guard index -1 would wrap around to the last character and a
+            # trailing position would raise IndexError.
+            if 0 <= char_index < len(chars):
+                chars[char_index] = "+" + chars[char_index]
+        return "".join(chars)
+
+    def put_accent(self, word):
+        """Run the model on ``word`` and return it with stress marks inserted.
+
+        Args:
+            word: The word to accentuate.
+
+        Returns:
+            The word as rendered by ``render_stress``.
+        """
+        inputs = self.tokenizer(word, return_tensors="np")
+        inputs = {k: v.astype(np.int64) for k, v in inputs.items()}
+        outputs = self.session.run(None, inputs)
+        # Look the "logits" output up by name rather than assuming its position.
+        output_names = {output_key.name: idx for idx, output_key in enumerate(self.session.get_outputs())}
+        logits = outputs[output_names["logits"]]
+        probabilities = softmax(logits)
+        # [0] selects the first (only) item of the batch.
+        scores = np.max(probabilities, axis=-1)[0]
+        labels = np.argmax(logits, axis=-1)[0]
+        pred_with_scores = [{'label': self.id2label[str(label)], 'score': float(score)}
+                            for label, score in zip(labels, scores)]
+
+        stressed_word = self.render_stress(word, pred_with_scores)
+
+        return stressed_word
@@ -0,0 +1,106 @@
+import os
+from typing import Optional, Tuple, List
+from collections import OrderedDict
+
+from transformers import PreTrainedTokenizer
+
+
+def load_vocab(vocab_file):
+    """Read a one-token-per-line vocabulary file into an ordered mapping.
+
+    Each line, with its trailing newline removed, becomes a key whose value is
+    the zero-based line number.
+    """
+    with open(vocab_file, "r", encoding="utf-8") as reader:
+        lines = reader.readlines()
+    return OrderedDict(
+        (line.rstrip("\n"), line_number) for line_number, line in enumerate(lines)
+    )
+
+
+class CharTokenizer(PreTrainedTokenizer):
+    """Tokenizer that treats every character of the input as one token.
+
+    The vocabulary comes from a text file with one token per line (see
+    ``load_vocab``); a token's id is its zero-based line number.
+    """
+
+    vocab_files_names = {"vocab_file": "vocab.txt"}
+
+    def __init__(
+        self,
+        vocab_file=None,
+        pad_token="[pad]",
+        unk_token="[unk]",
+        bos_token="[bos]",
+        eos_token="[eos]",
+        do_lower_case=False,
+        *args,
+        **kwargs
+    ):
+        """Load the vocabulary and register the special tokens.
+
+        Args:
+            vocab_file: Path to the vocabulary file passed to ``load_vocab``.
+            pad_token, unk_token, bos_token, eos_token: Special token strings
+                forwarded to the base class.
+            do_lower_case: When true, text and tokens are lower-cased before
+                lookup.
+            *args: Accepted for signature compatibility; not used.
+            **kwargs: Forwarded to ``PreTrainedTokenizer.__init__``.
+        """
+        self.vocab = load_vocab(vocab_file)
+        # Reverse map used by _convert_id_to_token. It was never created
+        # before, so every id -> token conversion raised AttributeError.
+        self.ids_to_tokens = OrderedDict(
+            (index, token) for token, index in self.vocab.items()
+        )
+        # Set before the base initializer runs, in case it already calls the
+        # conversion hooks below, which read this attribute.
+        self.do_lower_case = do_lower_case
+        super().__init__(
+            pad_token=pad_token,
+            unk_token=unk_token,
+            bos_token=bos_token,
+            eos_token=eos_token,
+            do_lower_case=do_lower_case,
+            **kwargs
+        )
+        self.do_lower_case = do_lower_case
+
+    @property
+    def vocab_size(self):
+        """Number of entries in the vocabulary."""
+        return len(self.vocab)
+
+    def get_vocab(self):
+        """Return a plain ``dict`` copy of the token -> id mapping."""
+        return dict(self.vocab)
+
+    def _convert_token_to_id(self, token):
+        """Map a token to its id, falling back to the id of ``unk_token``."""
+        if self.do_lower_case:
+            token = token.lower()
+        return self.vocab.get(token, self.vocab[self.unk_token])
+
+    def _convert_id_to_token(self, index):
+        """Map an id to its token, falling back to ``unk_token`` for unknown ids."""
+        return self.ids_to_tokens.get(index, self.unk_token)
+
+    def _tokenize(self, text):
+        """Split ``text`` into single characters (lower-cased if configured)."""
+        if self.do_lower_case:
+            text = text.lower()
+        return list(text)
+
+    def convert_tokens_to_string(self, tokens):
+        """Concatenate tokens back into one string."""
+        return "".join(tokens)
+
+    def build_inputs_with_special_tokens(
+        self,
+        token_ids_0: List[int],
+        token_ids_1: Optional[List[int]] = None
+    ) -> List[int]:
+        """Wrap ``token_ids_0`` as ``[bos] + ids + [eos]``.
+
+        ``token_ids_1`` is accepted for interface compatibility and ignored.
+        """
+        bos = [self.bos_token_id]
+        eos = [self.eos_token_id]
+        return bos + token_ids_0 + eos
+
+    def get_special_tokens_mask(
+         self,
+         token_ids_0: List[int],
+         token_ids_1: Optional[List[int]] = None
+    ) -> List[int]:
+        """Return 1 for the bos/eos positions and 0 for every id in between.
+
+        ``token_ids_1`` is ignored.
+        """
+        return [1] + ([0] * len(token_ids_0)) + [1]
+
+    def create_token_type_ids_from_sequences(
+        self,
+        token_ids_0: List[int],
+        token_ids_1: Optional[List[int]] = None
+    ) -> List[int]:
+        """Return all-zero type ids covering the sequence plus bos and eos.
+
+        ``token_ids_1`` is ignored.
+        """
+        return (len(token_ids_0) + 2) * [0]
+
+    def save_vocabulary(
+        self,
+        save_directory: str,
+        filename_prefix: Optional[str] = None
+    ) -> Tuple[str]:
+        """Write the vocabulary to ``save_directory``, one token per line.
+
+        Args:
+            save_directory: Existing directory to write into.
+            filename_prefix: Optional prefix, joined to the file name with "-".
+
+        Returns:
+            A one-element tuple with the path of the written file.
+
+        Raises:
+            AssertionError: If ``save_directory`` is not a directory, or the
+                vocabulary ids are not exactly 0..N-1.
+        """
+        assert os.path.isdir(save_directory)
+        vocab_file = os.path.join(
+            save_directory,
+            (filename_prefix + "-" if filename_prefix else "") +
+            self.vocab_files_names["vocab_file"]
+        )
+        index = 0
+        with open(vocab_file, "w", encoding="utf-8") as writer:
+            # Tokens are written in id order so that line number == id on reload.
+            for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]):
+                assert index == token_index
+                writer.write(token + "\n")
+                index += 1
+        return (vocab_file,)
@@ -0,0 +1,90 @@
+import numpy as np
+from onnxruntime import InferenceSession
+from transformers import AutoTokenizer
+import re
+
+
+class OmographModel:
+    """Chooses between stress variants of a homograph using an ONNX model.
+
+    ``load`` must be called before ``classify``: it creates the ONNX session
+    and the tokenizer used there.
+    """
+
+    def __init__(self):
+        # Words whose variants are split into groups of three rather than two
+        # by group_words.
+        self.special_words = ['балчуга', 'вертела', 'волоки', 'волоку', 'воронью', 'выбродите', 'вывозите', 'выносите', 'выноситесь', 'выходите', 'железы', 'начала', 'округа', 'перепела', 'развитая', 'развитого', 'развитое', 'развитой', 'развитом', 'развитому', 'развитою', 'развитую', 'развитые', 'развитым', 'развитыми', 'развитых', 'сторожа', 'сторожи', 'сторожу', 'удало', 'начался', 'началась', 'началось', 'бутиках', 'ожила', 'создало', 'коротки', 'проклята', 'роженица', 'роженицы', 'рожениц', 'роженице', 'роженицам', 'роженицу', 'роженицей', 'роженицею', 'роженицами', 'роженицах', 'пристава', 'приставов', 'приставам', 'приставами', 'приставах', 'пережитое', 'пережитого', 'пережитые', 'пережитых', 'пережитому', 'пережитым', 'пережитыми', 'пережитом', 'нипоняла']
+
+
+    def load(self, path, device="CPU"):
+        """Create the ONNX session and tokenizer from the files under ``path``.
+
+        Args:
+            path: Directory containing ``model.onnx`` and the tokenizer files.
+            device: ``"CUDA"`` selects ``CUDAExecutionProvider``; any other
+                value selects ``CPUExecutionProvider``.
+        """
+        self.session = InferenceSession(f"{path}/model.onnx", providers=["CUDAExecutionProvider" if device == "CUDA" else "CPUExecutionProvider"])
+        self.tokenizer = AutoTokenizer.from_pretrained(path)
+
+    def softmax(self, x):
+        """Return the softmax of ``x`` along its last axis.
+
+        Each row is normalized on its own, so a batch of logits yields one
+        probability distribution per row. For a single row the result equals
+        normalizing over the whole array.
+        """
+        x = np.asarray(x)
+        # Subtract the per-row maximum to keep np.exp from overflowing.
+        e_x = np.exp(x - np.max(x, axis=-1, keepdims=True))
+        return e_x / e_x.sum(axis=-1, keepdims=True)
+
+    def group_words(self, words):
+        """Group stress variants that spell the same word.
+
+        Words are keyed by their text with ``+`` removed, keeping first-seen
+        order. A group with more than three variants is split into chunks of
+        three when the word is in ``special_words``, otherwise into chunks of
+        two when its size is even. All other groups are kept whole.
+
+        Args:
+            words: Stress-marked variants, e.g. ``['з+амок', 'зам+ок']``.
+
+        Returns:
+            A list of lists of variants.
+        """
+        groups = {}
+        for word in words:
+            groups.setdefault(word.replace('+', ''), []).append(word)
+
+        # Set built once per call for constant-time membership tests.
+        special = set(self.special_words)
+        result = []
+        for key, group in groups.items():
+            size = len(group)
+            if size > 3 and key in special:
+                result.extend(group[i:i+3] for i in range(0, size, 3))
+            elif size > 3 and size % 2 == 0:
+                result.extend(group[i:i+2] for i in range(0, size, 2))
+            else:
+                result.append(group)
+
+        return result
+
+    def transfer_grouping(self, grouped_list, target_list):
+        """Slice ``target_list`` into consecutive chunks sized like ``grouped_list``.
+
+        Args:
+            grouped_list: List of groups whose lengths define the chunk sizes.
+            target_list: Flat list to cut up, in the same order.
+
+        Returns:
+            A list of slices of ``target_list``, one per group.
+        """
+        new_grouped_list = []
+        start_index = 0
+        for group in grouped_list:
+            end_index = start_index + len(group)
+            new_grouped_list.append(target_list[start_index:end_index])
+            start_index = end_index
+        return new_grouped_list
+
+    def classify(self, texts, hypotheses):
+        """Pick the best stress variant for each homograph occurrence.
+
+        Args:
+            texts: Context sentences, parallel to ``hypotheses``.
+            hypotheses: Stress-marked candidate variants.
+
+        Returns:
+            The chosen variants: one per group from ``group_words`` when the
+            number of hypotheses is odd, otherwise one per consecutive pair.
+            An empty list when there are no hypotheses.
+        """
+        if not hypotheses:
+            # Nothing to score; avoid running the tokenizer/model on an empty batch.
+            return []
+
+        # Remove whitespace that precedes punctuation.
+        preprocessed_texts = [re.sub(r'\s+(?=(?:[,.?!:;…]))', r'', text) for text in texts]
+        if len(hypotheses) % 2 != 0:
+            # Odd count: the input cannot be consumed as pairs, so score each
+            # hypothesis on its own and pick the best one per group.
+            outs = []
+            grouped_h = self.group_words(hypotheses)
+            grouped_t = self.transfer_grouping(grouped_h, preprocessed_texts)
+            for h, t in zip(grouped_h, grouped_t):
+                probs = []
+                for hp in h:
+                    # The first text of the group is used for all its hypotheses.
+                    inputs = self.tokenizer(t[0], hp, max_length=512, truncation=True, return_tensors="np")
+                    inputs = {k: v.astype(np.int64) for k, v in inputs.items()}
+                    outputs = self.softmax(self.session.run(None, inputs)[0])
+                    # Column 1 of the first row is read as the score.
+                    probs.append(float(outputs[0][1]))
+                outs.append(h[probs.index(max(probs))])
+            return outs
+
+        # Even count: score everything in one batch, then decide pair by pair.
+        inputs = self.tokenizer(preprocessed_texts, hypotheses, return_tensors="np", padding=True, truncation=True, max_length=512)
+        inputs = {k: v.astype(np.int64) for k, v in inputs.items()}
+
+        outputs = self.softmax(self.session.run(None, inputs)[0])
+        hypotheses_probs = [float(outputs[i][1]) for i in range(len(texts))]
+
+        hypothesis_pairs = [(hypotheses[i], hypotheses[i+1]) for i in range(0, len(hypotheses), 2)]
+        prob_pairs = [(hypotheses_probs[i], hypotheses_probs[i+1]) for i in range(0, len(hypotheses_probs), 2)]
+        # On a tie the first variant of the pair wins (index() returns the first max).
+        return [pair[pair_probs.index(max(pair_probs))]
+                for pair, pair_probs in zip(hypothesis_pairs, prob_pairs)]