|
8 | 8 |
|
9 | 9 | from tests import get_tests_data_path |
10 | 10 | from TTS.tts.configs.shared_configs import BaseDatasetConfig, BaseTTSConfig |
11 | | -from TTS.tts.datasets import load_tts_samples |
| 11 | +from TTS.tts.datasets import add_formatter, load_tts_samples |
12 | 12 | from TTS.tts.datasets.dataset import TTSDataset |
13 | 13 | from TTS.tts.utils.text.tokenizer import TTSTokenizer |
14 | 14 | from TTS.utils.audio import AudioProcessor |
@@ -251,3 +251,36 @@ def check_conditions(idx, linear_input, mel_input, stop_target, mel_lengths): |
251 | 251 | # check batch zero-frame conditions (zero-frame disabled) |
252 | 252 | # assert (linear_input * stop_target.unsqueeze(2)).sum() == 0 |
253 | 253 | # assert (mel_input * stop_target.unsqueeze(2)).sum() == 0 |
| 254 | + |
| 255 | + |
def test_custom_formatted_dataset_with_loader():
    """Register two user-defined formatters and load samples through both.

    Checks that ``add_formatter`` makes custom formatters resolvable by name
    from ``BaseDatasetConfig.formatter`` and that ``load_tts_samples`` merges
    the two dataset configs and applies the requested eval split
    (``eval_split_size=0.2`` over the combined samples).
    """

    def custom_formatter(path, metafile, **kwargs):
        # Parse a pipe-separated metadata file ("<audio_path>|<text>") into the
        # list-of-dicts sample format expected by load_tts_samples.
        # Explicit encoding so the parse does not depend on the platform default.
        with open(os.path.join(path, metafile), encoding="utf-8") as f:
            lines = f.readlines()
        items = []
        for line in lines:
            file_path, text = line.split("|", 1)
            items.append({"text": text, "audio_file": file_path, "root_path": path, "speaker_name": "test"})
        return items

    def custom_formatter2(path, *args, **kwargs):
        # Same as custom_formatter, but appends a ".wav" extension to every
        # audio file path. Plain loop instead of a side-effect comprehension.
        items = custom_formatter(path, *args, **kwargs)
        for item in items:
            item["audio_file"] = f"{item['audio_file']}.wav"
        return items

    add_formatter("custom_formatter1", custom_formatter)
    add_formatter("custom_formatter2", custom_formatter2)
    dataset1 = BaseDatasetConfig(
        formatter="custom_formatter1",
        meta_file_train="metadata.csv",
        path=c.data_path,
    )
    dataset2 = BaseDatasetConfig(
        formatter="custom_formatter2",
        meta_file_train="metadata.csv",
        path=c.data_path,
    )
    dataset_configs = [dataset1, dataset2]
    train_samples, eval_samples = load_tts_samples(dataset_configs, eval_split=True, eval_split_size=0.2)
    # Expected counts for the test fixture's metadata.csv loaded through both
    # formatters with a 0.2 eval split.
    assert len(train_samples) == 14
    assert len(eval_samples) == 2
0 commit comments