BUG FIX

Den4ikAI · web-flow · commit 704bd30788e0 · 2024-10-24T00:00:58.000+08:00
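Исправлен баг с перемешиванием слов на некоторых примерах
(EN: Fixed a bug where words were shuffled on some inputs. group_words used to pool hypotheses in a dict keyed by the unstressed form, so non-adjacent repeats of the same homograph were merged into one group and the group order no longer matched the input order; it now groups only consecutive hypotheses that share the same unstressed form.)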
diff --git a/ruaccent/__init__.py b/ruaccent/__init__.py
@@ -1,6 +1,6 @@
 """Russian accentizer"""
 
-__version__ = "1.5.8.1"
+__version__ = "1.5.8.3"
 
 
 from .ruaccent import RUAccent
diff --git a/ruaccent/omograph_model.py b/ruaccent/omograph_model.py
@@ -18,25 +18,40 @@ def softmax(self, x):
         return e_x / e_x.sum()
 
     def group_words(self, words):
-        groups = {}
-        for word in words:
-            parts = word.replace('+', '')
-            key = parts
-            group = groups.setdefault(key, [])
-            group.append(word)
-    
+        if not words:
+            return []
+            
         result = []
-        for group in groups.values():
-            has_special_word = any(word.replace('+', '') in self.special_words for word in group)
-            if has_special_word and len(group) > 3:
-                subgroups = [group[i:i+3] for i in range(0, len(group), 3)]
-                result.extend(subgroups)
-            elif len(group) > 3 and len(group) % 2 == 0:
-                subgroups = [group[i:i+2] for i in range(0, len(group), 2)]
-                result.extend(subgroups)
+        current_group = [words[0]]
+        current_base = words[0].replace('+', '')
+        
+        for word in words[1:]:
+            base_word = word.replace('+', '')
+            
+            if base_word == current_base:
+                current_group.append(word)
             else:
-                result.append(group)
-    
+                if current_base in self.special_words and len(current_group) > 3:
+                    subgroups = [current_group[i:i+3] for i in range(0, len(current_group), 3)]
+                    result.extend(subgroups)
+                elif len(current_group) > 3 and len(current_group) % 2 == 0:
+                    subgroups = [current_group[i:i+2] for i in range(0, len(current_group), 2)]
+                    result.extend(subgroups)
+                else:
+                    result.append(current_group)
+                
+                current_group = [word]
+                current_base = base_word
+        
+        if current_base in self.special_words and len(current_group) > 3:
+            subgroups = [current_group[i:i+3] for i in range(0, len(current_group), 3)]
+            result.extend(subgroups)
+        elif len(current_group) > 3 and len(current_group) % 2 == 0:
+            subgroups = [current_group[i:i+2] for i in range(0, len(current_group), 2)]
+            result.extend(subgroups)
+        else:
+            result.append(current_group)
+        
         return result
         
     def transfer_grouping(self, grouped_list, target_list):
@@ -57,6 +72,8 @@ def classify(self, texts, hypotheses, num_hypotheses):
             #print("NO_BATCH")
             outs = []
             grouped_h = self.group_words(hypotheses)
+            #print(grouped_h)
+            #print(hypotheses)
             grouped_t = self.transfer_grouping(grouped_h, preprocessed_texts)
             for h, t in zip(grouped_h, grouped_t):
                 probs = []
diff --git a/ruaccent/ruaccent.py b/ruaccent/ruaccent.py
@@ -89,6 +89,7 @@ def load(
         self.omographs = json.load(
             gzip.open(join_path(self.workdir, "dictionary","omographs.json.gz"))
         )
+        self.omographs.update({"коса": ["к+оса", "кос+а"]})
         self.omographs.update(custom_homographs)
         self.omograph_model.load(join_path(self.workdir, self.omograph_models_paths[omograph_model_size][1:]), device=device)
 
@@ -192,11 +193,9 @@ def _process_omographs(self, text):
                     texts_batch.append(self.delete_spaces_before_punc(" ".join(t.copy())))
                 t[position] = t_back
             cls_batch = self.omograph_model.classify(texts_batch, hypotheses_batch, num_hypotheses)
-    
             for cls_index, omograph in enumerate(founded_omographs):
                 position = omograph["position"]
                 splitted_text[position] = cls_batch[cls_index]
-    
         return splitted_text
 
 
diff --git a/setup.py b/setup.py
@@ -6,7 +6,7 @@
 
 setup(
     name='ruaccent',
-    version='1.5.8',
+    version='1.5.8.3',
     author='Denis Petrov',
     author_email='arduino4b@gmail.com',
     description='A Russian text accentuation tool',