idiap
diff --git a/‎.github/workflows/docker.yaml‎
Lines changed: 10 additions & 10 deletions b/‎.github/workflows/docker.yaml‎
Lines changed: 10 additions & 10 deletions
diff --git a/‎TTS/tts/datasets/__init__.py‎
Lines changed: 5 additions & 8 deletions b/‎TTS/tts/datasets/__init__.py‎
Lines changed: 5 additions & 8 deletions
diff --git a/‎TTS/tts/datasets/formatters.py‎
Lines changed: 60 additions & 0 deletions b/‎TTS/tts/datasets/formatters.py‎
Lines changed: 60 additions & 0 deletions
diff --git a/‎TTS/tts/layers/bark/inference_funcs.py‎
Lines changed: 7 additions & 6 deletions b/‎TTS/tts/layers/bark/inference_funcs.py‎
Lines changed: 7 additions & 6 deletions
diff --git a/‎TTS/tts/models/bark.py‎
Lines changed: 50 additions & 1 deletion b/‎TTS/tts/models/bark.py‎
Lines changed: 50 additions & 1 deletion
@@ -37,7 +37,7 @@ jobs:
     steps:
       - uses: actions/checkout@v4
       - name: Log in to the Container registry
-        uses: docker/login-action@v1
+        uses: docker/login-action@v3
         with:
           registry: ghcr.io
           username: ${{ github.actor }}
@@ -66,14 +66,14 @@ jobs:
             fi
             tags="${base}:${VERSION},${base}:latest,${base}:${{ github.sha }}"
           fi
-          echo "::set-output name=tags::${tags}"
+          echo "tags=${tags}" >> $GITHUB_OUTPUT
       - name: Set up QEMU
-        uses: docker/setup-qemu-action@v1
+        uses: docker/setup-qemu-action@v3
       - name: Set up Docker Buildx
         id: buildx
-        uses: docker/setup-buildx-action@v1
+        uses: docker/setup-buildx-action@v3
       - name: Build and push
-        uses: docker/build-push-action@v2
+        uses: docker/build-push-action@v6
         with:
           context: .
           platforms: linux/${{ matrix.arch }}
@@ -91,7 +91,7 @@ jobs:
     steps:
       - uses: actions/checkout@v4
       - name: Log in to the Container registry
-        uses: docker/login-action@v1
+        uses: docker/login-action@v3
         with:
           registry: ghcr.io
           username: ${{ github.actor }}
@@ -120,14 +120,14 @@ jobs:
             fi
             tags="${base}:${VERSION},${base}:latest,${base}:${{ github.sha }}"
           fi
-          echo "::set-output name=tags::${tags}"
+          echo "tags=${tags}" >> $GITHUB_OUTPUT
       - name: Set up QEMU
-        uses: docker/setup-qemu-action@v1
+        uses: docker/setup-qemu-action@v3
       - name: Set up Docker Buildx
         id: buildx
-        uses: docker/setup-buildx-action@v1
+        uses: docker/setup-buildx-action@v3
       - name: Build and push
-        uses: docker/build-push-action@v2
+        uses: docker/build-push-action@v6
         with:
           context: .
           file: dockerfiles/Dockerfile.dev
 
@@ -9,7 +9,7 @@
 import numpy as np
 
 from TTS.tts.datasets.dataset import *
-from TTS.tts.datasets.formatters import *
+from TTS.tts.datasets.formatters import _FORMATTER_REGISTRY, Formatter, register_formatter
 
 logger = logging.getLogger(__name__)
 
@@ -162,15 +162,12 @@ def load_attention_mask_meta_data(metafile_path):
     return meta_data
 
 
-def _get_formatter_by_name(name):
+def _get_formatter_by_name(name: str) -> Formatter:
     """Returns the respective preprocessing function."""
-    thismodule = sys.modules[__name__]
-    if not hasattr(thismodule, name.lower()):
-        msg = (
-            f"{name} formatter not found. If it is a custom formatter, pass the function to load_tts_samples() instead."
-        )
+    # The name is lower-cased before it is looked up in the registry; a name
+    # that is not registered raises ValueError.
+    if name.lower() not in _FORMATTER_REGISTRY:
+        msg = f"{name} formatter not found. If it is a custom formatter, make sure to call register_formatter() first."
         raise ValueError(msg)
-    return getattr(thismodule, name.lower())
+    return _FORMATTER_REGISTRY[name.lower()]
 
 
 def find_unique_chars(data_samples):
 
@@ -5,11 +5,39 @@
 import xml.etree.ElementTree as ET
 from glob import glob
 from pathlib import Path
+from typing import Any, Protocol
 
 from tqdm import tqdm
 
 logger = logging.getLogger(__name__)
 
+
+class Formatter(Protocol):
+    """Structural type describing the call signature of a formatter function.
+
+    A formatter is called with ``root_path``, ``meta_file``, an optional list
+    of ``ignored_speakers`` and arbitrary extra keyword arguments, and returns
+    a list of dictionaries (presumably one per dataset sample -- confirm
+    against the concrete formatters).
+    """
+
+    def __call__(
+        self,
+        root_path: str | os.PathLike[Any],
+        meta_file: str | os.PathLike[Any],
+        ignored_speakers: list[str] | None,
+        **kwargs,
+    ) -> list[dict[str, Any]]: ...
+
+
+_FORMATTER_REGISTRY: dict[str, Formatter] = {}
+
+
+def register_formatter(name: str, formatter: Formatter) -> None:
+    """Store a formatter function in the registry under a lower-cased name.
+
+    Args:
+        name: Name of the formatter; it is lower-cased before being stored.
+        formatter: Formatter function to associate with the name.
+
+    Raises:
+        ValueError: If the lower-cased name is already present in the registry.
+    """
+    key = name.lower()
+    if key not in _FORMATTER_REGISTRY:
+        _FORMATTER_REGISTRY[key] = formatter
+        return
+    # Existing entries are never overwritten; the caller gets an error instead.
+    msg = f"Formatter {name} already exists."
+    raise ValueError(msg)
+
+
 ########################
 # DATASETS
 ########################
@@ -659,3 +687,35 @@ def bel_tts_formatter(root_path, meta_file, **kwargs):  # pylint: disable=unused
             text = cols[1]
             items.append({"text": text, "audio_file": wav_file, "speaker_name": speaker_name, "root_path": root_path})
     return items
+
+
+### Registrations
+# Each formatter function below is added to the registry under a name equal
+# to its function name. register_formatter() raises ValueError if the same
+# (lower-cased) name is registered twice.
+register_formatter("cml_tts", cml_tts)
+register_formatter("coqui", coqui)
+register_formatter("tweb", tweb)
+register_formatter("mozilla", mozilla)
+register_formatter("mozilla_de", mozilla_de)
+register_formatter("mailabs", mailabs)
+register_formatter("ljspeech", ljspeech)
+register_formatter("ljspeech_test", ljspeech_test)
+register_formatter("thorsten", thorsten)
+register_formatter("sam_accenture", sam_accenture)
+register_formatter("ruslan", ruslan)
+register_formatter("css10", css10)
+register_formatter("nancy", nancy)
+register_formatter("common_voice", common_voice)
+register_formatter("libri_tts", libri_tts)
+register_formatter("custom_turkish", custom_turkish)
+register_formatter("brspeech", brspeech)
+register_formatter("vctk", vctk)
+register_formatter("vctk_old", vctk_old)
+register_formatter("synpaflex", synpaflex)
+register_formatter("open_bible", open_bible)
+register_formatter("mls", mls)
+register_formatter("voxceleb2", voxceleb2)
+register_formatter("voxceleb1", voxceleb1)
+register_formatter("emotion", emotion)
+register_formatter("baker", baker)
+register_formatter("kokoro", kokoro)
+register_formatter("kss", kss)
+register_formatter("bel_tts_formatter", bel_tts_formatter)
@@ -76,7 +76,10 @@ def generate_text_semantic(
         )
     else:
         semantic_history = None
-    encoded_text = torch.LongTensor(_tokenize(model.tokenizer, text)) + model.config.TEXT_ENCODING_OFFSET
+    encoded_text = (
+        torch.tensor(_tokenize(model.tokenizer, text), device=model.device, dtype=torch.long)
+        + model.config.TEXT_ENCODING_OFFSET
+    )
     if len(encoded_text) > 256:
         p = (len(encoded_text) - 256) / len(encoded_text) * 100
         logger.warning("warning, text too long, lopping of last %.1f%%", p)
@@ -99,11 +102,9 @@ def generate_text_semantic(
         )
     else:
         semantic_history = torch.full((256,), model.config.SEMANTIC_PAD_TOKEN, dtype=torch.int64)
-    x = (
-        torch.cat([encoded_text, semantic_history, torch.tensor([model.config.SEMANTIC_INFER_TOKEN])])
-        .unsqueeze(0)
-        .to(model.device)
-    )
+    x = torch.cat(
+        [encoded_text, semantic_history, torch.tensor([model.config.SEMANTIC_INFER_TOKEN], device=model.device)]
+    ).unsqueeze(0)
     assert x.shape[1] == 256 + 256 + 1
 
     n_tot_steps = 768
 
@@ -1,9 +1,11 @@
+import logging
 import os
 import warnings
 from dataclasses import dataclass
 from pathlib import Path
 from typing import Any
 
+import numpy as np
 import torch
 import torchaudio
 from coqpit import Coqpit
@@ -25,7 +27,14 @@
 from TTS.tts.layers.bark.model import GPT
 from TTS.tts.layers.bark.model_fine import FineGPT
 from TTS.tts.models.base_tts import BaseTTS
-from TTS.utils.generic_utils import warn_synthesize_config_deprecated, warn_synthesize_speaker_id_deprecated
+from TTS.utils.generic_utils import (
+    is_pytorch_at_least_2_4,
+    slugify,
+    warn_synthesize_config_deprecated,
+    warn_synthesize_speaker_id_deprecated,
+)
+
+logger = logging.getLogger(__name__)
 
 
 @dataclass
@@ -209,6 +218,46 @@ def _clone_voice(
         metadata = {"name": self.config["model"]}
         return voice, metadata
 
+    def get_voices(self, voice_dir: str | os.PathLike[Any]) -> dict[str, Path]:
+        """List the voice files found directly inside a directory.
+
+        Args:
+            voice_dir: Directory to search for voices.
+
+        Returns:
+            Dictionary mapping a speaker ID (the file name without its suffix)
+            to the path of its voice file.
+        """
+        # Bark overrides the base method so that the npz files included with
+        # the original model are accepted in addition to pth files.
+        voices: dict[str, Path] = {}
+        for candidate in Path(voice_dir).iterdir():
+            if candidate.suffix == ".npz" or candidate.suffix == ".pth":
+                voices[candidate.stem] = candidate
+        return voices
+
+    def load_voice_file(
+        self,
+        speaker_id: str,
+        voice_dir: str | os.PathLike[Any],
+    ) -> dict[str, Any]:
+        """Load the voice for the given speaker.
+
+        Args:
+            speaker_id:
+                Speaker ID to load.
+            voice_dir:
+                Directory where to look for the voice.
+
+        Returns:
+            The loaded voice. For an npz file this is a dictionary mapping each
+            stored array name to a tensor; for a pth file it is whatever
+            ``torch.load`` returns.
+
+        Raises:
+            FileNotFoundError: If ``get_voices(voice_dir)`` has no entry for
+                ``speaker_id``.
+        """
+        # For Bark we overwrite the base method to also allow loading the npz
+        # files included with the original model.
+        voices = self.get_voices(voice_dir)
+        if speaker_id not in voices:
+            msg = f"Voice file `{slugify(speaker_id)}.pth` or .npz for speaker `{speaker_id}` not found in: {voice_dir}"
+            raise FileNotFoundError(msg)
+        voice_path = voices[speaker_id]
+        if voice_path.suffix == ".npz":
+            # np.load returns an NpzFile that keeps the archive open and reads
+            # arrays on access; the context manager closes the file handle once
+            # every array has been copied into a tensor.
+            with np.load(voice_path) as np_voice:
+                voice = {key: torch.tensor(np_voice[key]) for key in np_voice.files}
+        else:
+            voice = torch.load(voice_path, map_location="cpu", weights_only=is_pytorch_at_least_2_4())
+        logger.info("Loaded voice `%s` from: %s", speaker_id, voice_path)
+        return voice
+
     def synthesize(
         self,
         text: str,