Skip to content

Commit 9368acc

Browse files
Alex-Weng and claude committed
Use Kokoro v1 models and fix audio trimming via pred_dur
Switches to v1 models on all platforms to avoid source_noise issues in v2. Fixes audio endpoint trimming by computing length from pred_dur output. Changes: - ModelNames.swift: Use v1 models (.mlmodelc) on all platforms instead of v2 (_v2.mlmodelc) - KokoroSynthesizer.swift: Compute audio length from pred_dur (frames * 600) instead of broken audio_length_samples Results: - "Hello world" → 1.5s (was 5s) - "This is a test of kokoro" → 2.35s (was 5s) - Proper trimming without cutting off speech Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
1 parent 06fc2ab commit 9368acc

File tree

2 files changed

+14
-23
lines changed

2 files changed

+14
-23
lines changed

Sources/FluidAudio/ModelNames.swift

Lines changed: 1 addition & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -535,23 +535,13 @@ public enum ModelNames {
535535

536536
/// Underlying model bundle filename.
537537
public var fileName: String {
538-
#if os(iOS)
539-
// Use v1 models on iOS - v2 fp16 models cause warm-up hangs
538+
// Use v1 models on all platforms - v2 has source_noise issues
540539
switch self {
541540
case .fiveSecond:
542541
return "kokoro_21_5s.mlmodelc"
543542
case .fifteenSecond:
544543
return "kokoro_21_15s.mlmodelc"
545544
}
546-
#else
547-
// Use v2 models on macOS - fp16 ANE optimization (1.67x faster)
548-
switch self {
549-
case .fiveSecond:
550-
return "kokoro_21_5s_v2.mlmodelc"
551-
case .fifteenSecond:
552-
return "kokoro_21_15s_v2.mlmodelc"
553-
}
554-
#endif
555545
}
556546

557547
/// Approximate maximum duration in seconds handled by the variant.

Sources/FluidAudio/TTS/Kokoro/Pipeline/Synthesize/KokoroSynthesizer.swift

Lines changed: 13 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -411,20 +411,21 @@ public struct KokoroSynthesizer {
411411
throw TTSError.processingFailed("Failed to extract 'audio' output. Features: \(names)")
412412
}
413413

414-
// Optional: trim to audio_length_samples if provided
414+
// Compute audio length from pred_dur (model's audio_length_samples output is broken)
415415
var effectiveCount = audioArrayUnwrapped.count
416-
if let lenFV = output.featureValue(for: "audio_length_samples") {
417-
var n: Int = 0
418-
if let lenArray = lenFV.multiArrayValue, lenArray.count > 0 {
419-
n = lenArray[0].intValue
420-
} else if lenFV.type == .int64 {
421-
n = Int(lenFV.int64Value)
422-
} else if lenFV.type == .double {
423-
n = Int(lenFV.doubleValue)
416+
417+
if let predDurArray = output.featureValue(for: "pred_dur")?.multiArrayValue {
418+
// Sum pred_dur to get total frames
419+
var totalFrames: Float = 0.0
420+
let predDurPtr = predDurArray.dataPointer.bindMemory(to: Float.self, capacity: predDurArray.count)
421+
for i in 0..<predDurArray.count {
422+
totalFrames += predDurPtr[i]
424423
}
425-
n = max(0, n)
426-
if n > 0 && n <= audioArrayUnwrapped.count {
427-
effectiveCount = n
424+
425+
// Convert frames to samples: frames * 600 samples/frame
426+
let predictedSamples = Int(round(totalFrames * 600.0))
427+
if predictedSamples > 0 {
428+
effectiveCount = min(predictedSamples, audioArrayUnwrapped.count)
428429
}
429430
}
430431

0 commit comments

Comments (0)