diff --git a/API-README.md b/API-README.md
new file mode 100644
index 0000000..c841961
--- /dev/null
+++ b/API-README.md
@@ -0,0 +1,36 @@
+# WotR-API-TextToSpeechMod
+By [lvaskys](https://github.com/lvaskys)
+
+This README is for the API implementation that adds the ability to use a backend API for TTS instead of the Windows TTS engine. Currently, [Auralis](https://github.com/astramind-ai/Auralis) (based on xttsv2) and [Kokoro-FastAPI](https://github.com/remsky/Kokoro-FastAPI) are supported. I believe an NVIDIA GPU is required for both, but see their documentation for more information.
+
+## How to use
+
+This mod's main features are configured in a `settings.json` file that lives in the base mod folder. Comments are included to help guide your configuration. Of note is `speech_impl` which defines the implementation of the speech service to use, either `AuralisSpeech` or `KokoroSpeech` (this new implementation), or `WindowsSpeech` or `AppleSpeech` for the original implementation.
+
+The API service must be up and running for the mod to work. See the documentation for the API service you are using for more information on how to set it up. I used WSL to run Auralis; it may also work natively on Windows now, but I'm not sure. For Kokoro, I used the [docker-run](https://github.com/remsky/Kokoro-FastAPI?tab=readme-ov-file#get-started) instructions. I think Docker on Windows may require WSL to properly share the GPU with the container, so you may need to install WSL either way.
+
+Make sure the endpoint matches. If you are keeping the `settings.json` file as is, then for Auralis:
+```
+auralis.openai --host 127.0.0.1 --port 8000 --model AstraMindAI/xttsv2 --gpt_model AstraMindAI/xtts2-gpt --max_concurrency 4 --vllm_logging_level warn
+```
+or Kokoro:
+```
+docker run --gpus all -p 8000:8880 ghcr.io/remsky/kokoro-fastapi-gpu:v0.2.2
+```
+
+**Note: in order to use Auralis, you must provide a wav file for the server to use for one-shot voice cloning.** Currently, this file is expected to live in your base game directory, not your mod directory, although perhaps that can be fixed in the future. An example file you can use is [female_01.wav](samples/female_01.wav).
+
+## Other new features
+This supports cancelling playback with the controller cancel/B/Circle button. Specifically, it will cancel the current sentence or two-sentence chunk being played and continue with the next sentence. This allows for a kind of "fast-forward" effect if you don't feel like listening to the entire dialogue but still want to hear later portions, for example when your reading outpaces the speaker.
+
+## Multilingual Support
+This should theoretically be multi-lingual, but is untested. Both XTTS and Kokoro support multiple languages.
+
+## Linux/WINE Support
+I believe some people have wanted a version of this mod for Linux. I have not tested this on Linux/WINE, but Auralis or Kokoro should theoretically work.
+
+## Limitations/Broken Features
+These new features only support one speaker at the moment. Both voiced and narrator content will be spoken with the chosen voice. I may fully implement male/female/narrator as it was in the original mod, or maybe even character-specific voices. But as it stands for now, this is a good initial release and works fine for my own needs. However, the Windows/Apple implementations should work as intended.
+
+## Motivation and thoughts
+Windows natural TTS voices are pretty good, but lack proper cadence and emotion. Also, I did not want to pay for an API, but rather have it run locally. XTTS is excellent in that regard, and seems to pick up on cues without even feeding it any additional information. The sound quality is poorer, however, and it is a good bit slower, but still responsive enough for my needs. Kokoro is another TTS I heard about, and I decided to add it as another alternative. It's super fast, many times faster than realtime, and the quality is excellent. The cadence and emotion aren't super, though, and seem rather similar to Windows natural voices.
\ No newline at end of file
diff --git a/SpeechMod/Configuration/Settings/JsonSettingsSerializer.cs b/SpeechMod/Configuration/Settings/JsonSettingsSerializer.cs
new file mode 100644
index 0000000..080f38a
--- /dev/null
+++ b/SpeechMod/Configuration/Settings/JsonSettingsSerializer.cs
@@ -0,0 +1,82 @@
+using System;
+using System.IO;
+using System.Text;
+using Newtonsoft.Json;
+using Newtonsoft.Json.Linq;
+using Newtonsoft.Json.Serialization;
+
+namespace SpeechMod.Configuration.Settings
+{
+    /// <summary>
+    /// Handles serialization and deserialization of <see cref="JsonSettings"/> to and from JSON files.
+    /// </summary>
+    public static class JsonSettingsSerializer
+    {
+        private static readonly JsonSerializerSettings SerializerSettings = new JsonSerializerSettings
+        {
+            Formatting = Formatting.Indented,
+            ContractResolver = new CamelCasePropertyNamesContractResolver(),
+            NullValueHandling = NullValueHandling.Include,
+        };
+
+        /// <summary>
+        /// Saves a <see cref="JsonSettings"/> object to the specified JSON file.
+        /// Only an already existing file is overwritten; a missing file is never created.
+        /// Isn't used right now, but might be in the future.
+        /// </summary>
+        /// <param name="settings">The settings object to serialize</param>
+        /// <param name="filePath">Path to save the JSON file</param>
+        /// <returns>True if successful, false otherwise</returns>
+        public static bool SaveSettings(JsonSettings settings, string filePath)
+        {
+            // Serializing null would replace the file content with the literal "null".
+            if (settings == null)
+            {
+                Main.Logger?.Error("Failed to save settings: no settings object was provided");
+                return false;
+            }
+
+            // File.Exists also returns false for a null or empty path.
+            if (!File.Exists(filePath))
+            {
+                return false;
+            }
+
+            try
+            {
+                string json = JsonConvert.SerializeObject(settings, SerializerSettings);
+                File.WriteAllText(filePath, json, Encoding.UTF8);
+                Main.Logger?.Log($"Settings saved to {filePath}");
+                return true;
+            }
+            catch (Exception ex)
+            {
+                Main.Logger?.Error($"Failed to save settings: {ex.Message}");
+                return false;
+            }
+        }
+
+        /// <summary>
+        /// Loads <see cref="JsonSettings"/> from the specified JSON file.
+        /// </summary>
+        /// <param name="filePath">Path to the JSON file; when null or missing on disk, default settings are returned</param>
+        /// <returns>Loaded settings, or default settings if the file doesn't exist, is empty or is invalid. Never null.</returns>
+        public static JsonSettings LoadSettings(string filePath = null)
+        {
+            Main.Logger?.Log("Loading JSON settings...");
+
+            try
+            {
+                if (!File.Exists(filePath))
+                {
+                    // The file is not written here: SaveSettings only overwrites existing files.
+                    Main.Logger?.Log($"Settings file not found at {filePath}, using defaults");
+                    return new JsonSettings();
+                }
+
+                string json = File.ReadAllText(filePath, Encoding.UTF8);
+
+                // The generic overload is required: the non-generic one returns object.
+                JsonSettings settings = JsonConvert.DeserializeObject<JsonSettings>(json, SerializerSettings);
+
+                // An empty file or a literal "null" deserializes to null.
+                if (settings == null)
+                {
+                    Main.Logger?.Error($"Settings file at {filePath} is empty, using defaults");
+                    return new JsonSettings();
+                }
+
+                // A section explicitly set to null in the JSON would otherwise replace the default instance.
+                settings.auralis_settings ??= new AuralisJsonSettings();
+                settings.kokoro_settings ??= new KokoroJsonSettings();
+
+                Main.Logger?.Log($"Settings loaded from {filePath}");
+                return settings;
+            }
+            catch (Exception ex)
+            {
+                Main.Logger?.Error($"Failed to load settings: {ex.Message}");
+                return new JsonSettings(); // Return default settings on error
+            }
+        }
+    }
+}
diff --git a/SpeechMod/JsonSettings.cs b/SpeechMod/JsonSettings.cs
new file mode 100644
index 0000000..06841a2
--- /dev/null
+++ b/SpeechMod/JsonSettings.cs
@@ -0,0 +1,57 @@
+namespace SpeechMod;
+
+/// <summary>
+/// Root object of the settings.json configuration.
+/// The initializers below are defaults that will normally be overwritten
+/// upon deserialization from settings.json.
+/// </summary>
+public class JsonSettings
+{
+    /// <summary>
+    /// Speech implementation to use: AuralisSpeech, KokoroSpeech, AppleSpeech or WindowsSpeech.
+    /// </summary>
+    public string speech_impl = "AuralisSpeech";
+
+    /// <summary>URL of the TTS API endpoint.</summary>
+    public string endpoint = "http://127.0.0.1:8000/v1/audio/speech";
+
+    // Idea: possibly add a setting for the audio file download location, e.g.
+    // public string audio_file_download_location = Path.GetTempPath();
+
+    /// <summary>Auralis-specific settings.</summary>
+    public AuralisJsonSettings auralis_settings = new();
+
+    /// <summary>Kokoro-specific settings.</summary>
+    public KokoroJsonSettings kokoro_settings = new();
+}
+
+public class AuralisJsonSettings // Auralis-specific settings; the initializers are defaults, normally overwritten on deserialization from settings.json
+{
+ public string path_to_voice_one_shot = "female_01.wav"; // wav file used for one-shot voice cloning (per the README, expected in the base game directory)
+ public string response_format = "wav"; // NOTE(review): presumably the audio format requested from the API — confirm in AuralisSpeech
+ public float speed = 1.0f;
+ public string model = "xttsv2";
+ public bool enhance_speech = true; // NOTE(review): this and the fields below look like generation options passed through to Auralis — confirm against AuralisSpeech and the Auralis docs
+ public bool sound_norm_refs = false;
+ public int max_ref_length = 60;
+ public int gpt_cond_len = 30;
+ public int gpt_cond_chunk_len = 4;
+ public float temperature = 0.75f;
+ public float top_p = 0.85f;
+ public int top_k = 50;
+ public float repetition_penalty = 5.0f;
+ public float length_penalty = 1.0f;
+ public bool do_sample = true;
+ public string language = "auto";
+
+}
+
+public class KokoroJsonSettings // Kokoro-specific settings; the initializers are defaults, normally overwritten on deserialization from settings.json
+{
+ public string voice = "af_heart"; // NOTE(review): presumably a Kokoro voice id — confirm against Kokoro-FastAPI
+ public string model = "kokoro";
+ public float speed = 1.0f;
+ public string lang_code = "a"; // NOTE(review): the meaning of "a" is not visible here — confirm against Kokoro-FastAPI
+ public string response_format = "wav";
+
+ // Normalization options (NOTE(review): presumably forwarded to the Kokoro request — confirm in KokoroSpeech)
+ public bool normalize = true;
+ public bool unit_normalization = false;
+ public bool url_normalization = true;
+ public bool email_normalization = true;
+ public bool optional_pluralization_normalization = true;
+}
diff --git a/SpeechMod/Main.cs b/SpeechMod/Main.cs
index e5cdbfa..d2d3aaf 100644
--- a/SpeechMod/Main.cs
+++ b/SpeechMod/Main.cs
@@ -1,10 +1,13 @@
using HarmonyLib;
+using Rewired;
using SpeechMod.Configuration;
+using SpeechMod.Configuration.Settings;
using SpeechMod.Keybinds;
using SpeechMod.Unity;
using SpeechMod.Voice;
using System;
using System.Collections.Generic;
+using System.IO;
using System.Linq;
using System.Reflection;
using TMPro;
@@ -20,6 +23,7 @@ public static class Main
{
public static UnityModManager.ModEntry.ModLogger Logger;
public static Settings Settings;
+ public static JsonSettings JsonSettings;
public static bool Enabled;
public static string[] FontStyleNames = Enum.GetNames(typeof(FontStyles));
@@ -45,6 +49,8 @@ private static bool Load(UnityModManager.ModEntry modEntry)
Logger = modEntry.Logger;
+ JsonSettings = JsonSettingsSerializer.LoadSettings(Path.Combine(modEntry.Path, "settings.json"));
+
if (!SetSpeech())
return false;
@@ -69,11 +75,26 @@ private static bool Load(UnityModManager.ModEntry modEntry)
PhoneticDictionary.LoadDictionary();
+        // ReInput.players.AllPlayers is indexed as: 0 = System, 1 = MainPlayer.
+        // Index 1 only exists when the list holds at least two players, so the
+        // guard must require a count of 2; a count of 1 would make the indexer throw.
+        if (ReInput.players.allPlayerCount >= 2)
+        {
+            Rewired.Player mainPlayer = ReInput.players.AllPlayers[1];
+
+            // Run doButtonWork whenever the "Decline" action has just been pressed.
+            mainPlayer.AddInputEventDelegate(doButtonWork, UpdateLoopType.Update, InputActionEventType.ButtonJustPressed, "Decline");
+        }
+
Debug.Log("Pathfinder: Wrath of the Righteous Speech Mod Initialized!");
m_Loaded = true;
return true;
}
+    /// <summary>
+    /// Rewired input callback registered in Load for the "Decline" action.
+    /// Interrupts current speech and plays the next phrase (if any).
+    /// </summary>
+    /// <param name="data">Event data supplied by Rewired; not used.</param>
+    public static void doButtonWork(InputActionEventData data)
+    {
+        // OnToggle sets Speech to null when the mod is disabled while this delegate
+        // stays registered, so a button press must not dereference a null Speech.
+        Speech?.NextPhrase();
+    }
+
private static void SetUpSettings()
{
if (ModConfigurationManager.Instance.GroupedSettings.TryGetValue("main", out _))
@@ -132,19 +153,57 @@ private static bool SetAvailableVoices()
return true;
}
+ // TODO clean up UMM configuration to better show what speech implementation is being used
+ // and what can be changed in-game. I prefer the json way, anyway, so I'm not sure how much
+ // I will actually change this
private static bool SetSpeech()
{
+ // Dispose of existing speech instance if it exists
+ if (Speech is IDisposable disposableSpeech)
+ {
+ disposableSpeech.Dispose();
+ }
+
+ // keep the setting of uielements/config section the same for now (until maybe I change it)
+ // but use the json config for the speech implementation instantiation
+ try {
+ var className = JsonSettings.speech_impl;
+
+ Logger.Log("Setting speech impl...." + className);
+
+ Assembly assembly = Assembly.GetExecutingAssembly();
+ Type type = assembly.GetTypes()
+ .FirstOrDefault(t => t.Name.Equals(className, StringComparison.Ordinal));
+
+ if (type == null)
+ {
+ throw new ArgumentException($"Class '{className}' not found in the current assembly.");
+ }
+
+ Speech = (ISpeech) Activator.CreateInstance(type);
+ }
+ catch (Exception e)
+ {
+ Logger.Critical($"Failed to instantiate speech implementation: {JsonSettings.speech_impl}");
+ Logger.Critical(e.ToString());
+ return false;
+ }
+
switch (Application.platform)
{
case RuntimePlatform.OSXPlayer:
- Speech = new AppleSpeech();
+ //Speech = new AppleSpeech();
SpeechExtensions.AddUiElements(Constants.APPLE_VOICE_NAME);
break;
case RuntimePlatform.WindowsPlayer:
- Speech = new WindowsSpeech();
+ //Speech = new WindowsSpeech();
+ //Speech = new AuralisSpeech();
+ //Speech = new KokoroSpeech();
SpeechExtensions.AddUiElements(Constants.WINDOWS_VOICE_NAME);
break;
default:
+ // I'm not sure if this will ever run, as the Linux version does not exist.
+ // Those running Linux use wine of some sort, which I believe would still show as Windows
Logger.Critical($"SpeechMod is not supported on {Application.platform}!");
return false;
}
@@ -154,6 +213,11 @@ private static bool SetSpeech()
private static bool OnToggle(UnityModManager.ModEntry modEntry, bool value)
{
+        // Toggle handler: value is the requested enabled state; returning false refuses the toggle.
+        if (!value && Speech is IDisposable disposableSpeech)
+        {
+            // Release the speech implementation's resources while the mod is disabled.
+            disposableSpeech.Dispose();
+            Speech = null;
+        }
+        else if (value && Speech == null && !SetSpeech())
+        {
+            // Speech was disposed by an earlier toggle-off; without re-creating it the mod
+            // would be enabled with a null Speech. Refuse to enable if that fails.
+            // NOTE(review): SetSpeech also calls SpeechExtensions.AddUiElements — confirm it is safe to call again.
+            return false;
+        }
Enabled = value;
return true;
}
diff --git a/SpeechMod/SpeechMod.csproj b/SpeechMod/SpeechMod.csproj
index 44be776..1337b4b 100644
--- a/SpeechMod/SpeechMod.csproj
+++ b/SpeechMod/SpeechMod.csproj
@@ -25,10 +25,14 @@
+
+
+
+