FluidInference
diff --git a/‎Documentation/Benchmarks.md‎
Lines changed: 52 additions & 0 deletions b/‎Documentation/Benchmarks.md‎
Lines changed: 52 additions & 0 deletions
diff --git a/‎Sources/FluidAudio/ModelNames.swift‎
Lines changed: 21 additions & 0 deletions b/‎Sources/FluidAudio/ModelNames.swift‎
Lines changed: 21 additions & 0 deletions
diff --git a/‎Sources/FluidAudio/TTS/G2P/MultilingualG2PError.swift‎
Lines changed: 19 additions & 0 deletions b/‎Sources/FluidAudio/TTS/G2P/MultilingualG2PError.swift‎
Lines changed: 19 additions & 0 deletions
diff --git a/‎Sources/FluidAudio/TTS/G2P/MultilingualG2PLanguage.swift‎
Lines changed: 53 additions & 0 deletions b/‎Sources/FluidAudio/TTS/G2P/MultilingualG2PLanguage.swift‎
Lines changed: 53 additions & 0 deletions
diff --git a/‎Sources/FluidAudio/TTS/G2P/MultilingualG2PModel.swift‎
Lines changed: 174 additions & 0 deletions b/‎Sources/FluidAudio/TTS/G2P/MultilingualG2PModel.swift‎
Lines changed: 174 additions & 0 deletions
diff --git a/‎Sources/FluidAudio/TTS/Kokoro/Assets/Lexicon/G2PModel.swift‎
Lines changed: 1 addition & 1 deletion b/‎Sources/FluidAudio/TTS/Kokoro/Assets/Lexicon/G2PModel.swift‎
Lines changed: 1 addition & 1 deletion
@@ -615,3 +615,55 @@ TS3003a          41.8     36.8      0.7      4.3 4/4           125.7
 AVERAGE          31.7     21.5      0.5      9.7         -    126.7
 ======================================================================
 ```
+
+## Multilingual G2P (Grapheme-to-Phoneme)
+
+CharsiuG2P ByT5 encoder-decoder model converted to CoreML for multilingual grapheme-to-phoneme conversion. Used by Kokoro TTS for non-English phonemization.
+
+Model: [FluidInference/charsiu-g2p-byt5-coreml](https://huggingface.co/FluidInference/charsiu-g2p-byt5-coreml)
+
+Hardware: Apple M2, 2022, macOS 26
+
+### CharsiuG2P Test Set (500 words/language)
+
+```bash
+swift run -c release fluidaudiocli g2p-benchmark --data-dir /path/to/CharsiuG2P/data/test
+```
+
+| Language | PER | WER | ms/word |
+|---|---|---|---|
+| Spanish | 0.1% | 0.8% | 32.6 |
+| French | 0.8% | 2.0% | 26.5 |
+| Italian | 2.8% | 20.0% | 20.9 |
+| Hindi | 4.5% | 21.4% | 45.4 |
+| Japanese | 10.5% | 23.8% | 31.7 |
+| Portuguese (BR) | 8.9% | 43.2% | 24.0 |
+| British English | 13.6% | 29.4% | 34.0 |
+| American English | 19.0% | 38.8% | 28.2 |
+| Chinese | 86.2%* | 95.0%* | 53.9 |
+| **Average** | **16.3%** | **30.5%** | **33.0** |
+
+*\*Chinese PER is inflated due to tone notation mismatch between model output and reference data (tone contour marks vs model format), not a model accuracy issue.*
+
+- **PER** (Phoneme Error Rate): Character-level Levenshtein distance / reference length, stress marks stripped
+- **WER** (Word Error Rate): Fraction of words with any phoneme error
+
+### Compute Unit Comparison
+
+Both the English BART G2P and multilingual ByT5 G2P models run fastest on CPU-only due to GPU/ANE dispatch overhead on small autoregressive decoder steps.
+
+**Multilingual G2P (ByT5)**
+
+| Compute Units | ms/word |
+|---|---|
+| cpuOnly | **38.7** |
+| cpuAndGPU | 94.7 |
+| all (ANE+GPU+CPU) | 95.2 |
+
+**English G2P (BART)**
+
+| Compute Units | ms/word |
+|---|---|
+| cpuOnly | **13.0** |
+| all (ANE+GPU+CPU) | 17.3 |
+| cpuAndGPU | 23.4 |
@@ -15,6 +15,7 @@ public enum Repo: String, CaseIterable {
     case pocketTts = "FluidInference/pocket-tts-coreml"
     case qwen3Asr = "FluidInference/qwen3-asr-0.6b-coreml/f32"
     case qwen3AsrInt8 = "FluidInference/qwen3-asr-0.6b-coreml/int8"
+    case multilingualG2p = "FluidInference/charsiu-g2p-byt5-coreml"
 
     /// Repository slug (without owner)
     public var name: String {
@@ -45,6 +46,8 @@ public enum Repo: String, CaseIterable {
             return "qwen3-asr-0.6b-coreml/f32"
         case .qwen3AsrInt8:
             return "qwen3-asr-0.6b-coreml/int8"
+        case .multilingualG2p:
+            return "charsiu-g2p-byt5-coreml"
         }
     }
 
@@ -95,6 +98,8 @@ public enum Repo: String, CaseIterable {
             return "sortformer"
         case .pocketTts:
             return "pocket-tts"
+        case .multilingualG2p:
+            return "charsiu-g2p-byt5"
         default:
             return name
         }
@@ -339,6 +344,20 @@ public enum ModelNames {
         ]
     }
 
+    /// Multilingual G2P (CharsiuG2P ByT5) model names
+    public enum MultilingualG2P {
+        public static let encoder = "MultilingualG2PEncoder"
+        public static let decoder = "MultilingualG2PDecoder"
+
+        public static let encoderFile = encoder + ".mlmodelc"
+        public static let decoderFile = decoder + ".mlmodelc"
+
+        public static let requiredModels: Set<String> = [
+            encoderFile,
+            decoderFile,
+        ]
+    }
+
     /// G2P (grapheme-to-phoneme) model names
     public enum G2P {
         public static let encoder = "G2PEncoder"
@@ -436,6 +455,8 @@ public enum ModelNames {
             return ModelNames.Sortformer.requiredModels
         case .qwen3Asr, .qwen3AsrInt8:
             return ModelNames.Qwen3ASR.requiredModelsFull
+        case .multilingualG2p:
+            return ModelNames.MultilingualG2P.requiredModels
         }
     }
 }
@@ -0,0 +1,19 @@
+import Foundation
+
+/// Errors raised by ``MultilingualG2PModel``.
+public enum MultilingualG2PError: Error, LocalizedError {
+    case modelLoadFailed(String)
+    case encoderPredictionFailed
+    case decoderPredictionFailed
+
+    public var errorDescription: String? {
+        switch self {
+        case .modelLoadFailed(let detail):
+            return "Failed to load multilingual G2P CoreML model: \(detail)"
+        case .encoderPredictionFailed:
+            return "Multilingual G2P encoder prediction failed."
+        case .decoderPredictionFailed:
+            return "Multilingual G2P decoder prediction failed."
+        }
+    }
+}
@@ -0,0 +1,53 @@
+import Foundation
+
+/// Languages supported by the CharsiuG2P ByT5 multilingual model,
+/// mapped to Kokoro voice prefixes.
+public enum MultilingualG2PLanguage: String, CaseIterable, Sendable {
+    case americanEnglish = "eng-us"
+    case britishEnglish = "eng-uk"
+    case spanish = "spa"
+    case french = "fra"
+    case hindi = "hin"
+    case italian = "ita"
+    case japanese = "jpn"
+    case brazilianPortuguese = "por-bz"
+    case mandarinChinese = "cmn"
+
+    /// The CharsiuG2P language code used in the model input prefix.
+    public var charsiuCode: String { rawValue }
+
+    /// The formatted prefix prepended to input words (e.g. `"<eng-us>: "`).
+    public var prefix: String { "<\(charsiuCode)>: " }
+
+    /// Infer the language from a Kokoro voice identifier.
+    ///
+    /// Kokoro voices use a two-character prefix indicating language and gender
+    /// (e.g. `"af_heart"` → American English female). Returns `nil` for
+    /// unrecognized prefixes.
+    public static func fromKokoroVoice(_ voiceId: String) -> MultilingualG2PLanguage? {
+        guard voiceId.count >= 2 else { return nil }
+        let prefix = String(voiceId.prefix(2))
+        switch prefix {
+        case "af", "am":
+            return .americanEnglish
+        case "bf", "bm":
+            return .britishEnglish
+        case "ef", "em":
+            return .spanish
+        case "ff", "fm":
+            return .french
+        case "hf", "hm":
+            return .hindi
+        case "if", "im":
+            return .italian
+        case "jf", "jm":
+            return .japanese
+        case "pf", "pm":
+            return .brazilianPortuguese
+        case "zf", "zm":
+            return .mandarinChinese
+        default:
+            return nil
+        }
+    }
+}
@@ -0,0 +1,174 @@
+import CoreML
+import Foundation
+
+/// Thread-safe CoreML-based multilingual grapheme-to-phoneme converter.
+///
+/// Uses the CharsiuG2P ByT5 encoder-decoder model to convert words in multiple
+/// languages to IPA phonemes. The model uses byte-level tokenization (no vocab
+/// file required).
+public actor MultilingualG2PModel {
+
+    public static let shared = MultilingualG2PModel()
+
+    private let logger = AppLogger(subsystem: "com.fluidaudio.tts", category: "MultilingualG2PModel")
+
+    // ByT5 special token IDs
+    private let padTokenId: Int32 = 0
+    private let eosTokenId: Int32 = 1
+
+    // Byte offset: byte value b maps to token ID b + 3
+    private let byteOffset: Int32 = 3
+
+    private let maxDecodeSteps = 128
+
+    // CoreML models (lazy-loaded)
+    private var encoder: MLModel?
+    private var decoder: MLModel?
+
+    private init() {}
+
+    /// Convert a word to IPA phonemes using the multilingual G2P model.
+    ///
+    /// - Parameters:
+    ///   - word: The word to convert.
+    ///   - language: The target language for phonemization.
+    /// - Returns: An array of IPA phoneme strings, or `nil` if the model is
+    ///   unavailable (e.g. in CI).
+    public func phonemize(word: String, language: MultilingualG2PLanguage) throws -> [String]? {
+        do {
+            try loadIfNeeded()
+        } catch {
+            if ProcessInfo.processInfo.environment["CI"] != nil {
+                logger.warning(
+                    "Multilingual G2P unavailable in CI, returning nil for word: \(word)")
+                return nil
+            }
+            throw error
+        }
+
+        guard let encoder, let decoder else { return nil }
+
+        // Build input: "<lang-code>: word" encoded as UTF-8 bytes → token IDs
+        let inputText = "\(language.prefix)\(word)"
+        let inputBytes = Array(inputText.utf8)
+        let inputIds = inputBytes.map { Int32($0) + byteOffset }
+
+        let encLen = inputIds.count
+
+        // Encoder input arrays
+        let encoderInputIds = try MLMultiArray(shape: [1, NSNumber(value: encLen)], dataType: .int32)
+        let attentionMask = try MLMultiArray(shape: [1, NSNumber(value: encLen)], dataType: .int32)
+        for i in 0..<encLen {
+            encoderInputIds[[0, i] as [NSNumber]] = NSNumber(value: inputIds[i])
+            attentionMask[[0, i] as [NSNumber]] = NSNumber(value: Int32(1))
+        }
+
+        // Run encoder
+        let encoderProvider = try MLDictionaryFeatureProvider(
+            dictionary: [
+                "input_ids": MLFeatureValue(multiArray: encoderInputIds),
+                "attention_mask": MLFeatureValue(multiArray: attentionMask),
+            ]
+        )
+        guard let encoderOutput = try? encoder.prediction(from: encoderProvider),
+            let encoderHidden = encoderOutput.featureValue(for: "last_hidden_state")?.multiArrayValue
+        else {
+            throw MultilingualG2PError.encoderPredictionFailed
+        }
+
+        // Greedy autoregressive decode
+        var outputTokens: [Int32] = []
+        var decoderIds: [Int32] = [padTokenId]  // decoder start token
+
+        for _ in 0..<maxDecodeSteps {
+            let decLen = decoderIds.count
+
+            let decInput = try MLMultiArray(
+                shape: [1, NSNumber(value: decLen)], dataType: .int32)
+            for i in 0..<decLen {
+                decInput[[0, i] as [NSNumber]] = NSNumber(value: decoderIds[i])
+            }
+
+            let decoderProvider = try MLDictionaryFeatureProvider(
+                dictionary: [
+                    "decoder_input_ids": MLFeatureValue(multiArray: decInput),
+                    "encoder_hidden_states": MLFeatureValue(multiArray: encoderHidden),
+                    "encoder_attention_mask": MLFeatureValue(multiArray: attentionMask),
+                ]
+            )
+
+            guard let decoderOutput = try? decoder.prediction(from: decoderProvider),
+                let logits = decoderOutput.featureValue(for: "logits")?.multiArrayValue
+            else {
+                throw MultilingualG2PError.decoderPredictionFailed
+            }
+
+            // Argmax over last position
+            let vocabSize = logits.shape.last!.intValue
+            let lastPos = decLen - 1
+            var bestId: Int32 = 0
+            var bestVal: Float = -.infinity
+            for v in 0..<vocabSize {
+                let val = logits[[0, lastPos, v] as [NSNumber]].floatValue
+                if val > bestVal {
+                    bestVal = val
+                    bestId = Int32(v)
+                }
+            }
+
+            if bestId == eosTokenId { break }
+
+            outputTokens.append(bestId)
+            decoderIds = [padTokenId] + outputTokens
+        }
+
+        // Decode token IDs back to UTF-8 string
+        let outputBytes = outputTokens.compactMap { tokenId -> UInt8? in
+            let byteVal = tokenId - byteOffset
+            guard byteVal >= 0, byteVal <= 255 else { return nil }
+            return UInt8(byteVal)
+        }
+
+        guard let ipaString = String(bytes: outputBytes, encoding: .utf8), !ipaString.isEmpty else {
+            return nil
+        }
+
+        // Split IPA string into individual phoneme characters
+        return ipaString.map { String($0) }.filter { !$0.trimmingCharacters(in: .whitespaces).isEmpty }
+    }
+
+    /// Verifies that CoreML models can be loaded.
+    public func ensureModelsAvailable() throws {
+        try loadIfNeeded()
+    }
+
+    // MARK: - Private
+
+    private func loadIfNeeded() throws {
+        if encoder != nil && decoder != nil { return }
+
+        let modelsDir = try TtsModels.cacheDirectoryURL()
+            .appendingPathComponent("Models")
+            .appendingPathComponent(Repo.multilingualG2p.folderName)
+
+        let encoderURL = modelsDir.appendingPathComponent(ModelNames.MultilingualG2P.encoderFile)
+        guard FileManager.default.fileExists(atPath: encoderURL.path) else {
+            throw MultilingualG2PError.modelLoadFailed(
+                "\(ModelNames.MultilingualG2P.encoderFile) not found at \(encoderURL.path)")
+        }
+
+        let decoderURL = modelsDir.appendingPathComponent(ModelNames.MultilingualG2P.decoderFile)
+        guard FileManager.default.fileExists(atPath: decoderURL.path) else {
+            throw MultilingualG2PError.modelLoadFailed(
+                "\(ModelNames.MultilingualG2P.decoderFile) not found at \(decoderURL.path)")
+        }
+
+        let config = MLModelConfiguration()
+        config.computeUnits = .cpuOnly
+
+        encoder = try MLModel(contentsOf: encoderURL, configuration: config)
+        decoder = try MLModel(contentsOf: decoderURL, configuration: config)
+
+        logger.info("Loaded multilingual G2P CoreML models from \(modelsDir.path)")
+    }
+}
@@ -214,7 +214,7 @@ actor G2PModel {
         }
 
         let config = MLModelConfiguration()
-        config.computeUnits = .cpuAndGPU
+        config.computeUnits = .cpuOnly
 
         encoder = try MLModel(contentsOf: encoderURL, configuration: config)
         decoder = try MLModel(contentsOf: decoderURL, configuration: config)
Original file line number	Diff line number	Diff line change
`@@ -214,7 +214,7 @@ actor G2PModel {`
`214`	`214`	`}`
`215`	`215`
`216`	`216`	`let config = MLModelConfiguration()`
`217`		`- config.computeUnits = .cpuAndGPU`
	`217`	`+ config.computeUnits = .cpuOnly`
`218`	`218`
`219`	`219`	`encoder = try MLModel(contentsOf: encoderURL, configuration: config)`
`220`	`220`	`decoder = try MLModel(contentsOf: decoderURL, configuration: config)`