From 9368accab4a380ae29e279115da869a8a3c2b432 Mon Sep 17 00:00:00 2001 From: Alex-Wengg Date: Fri, 27 Mar 2026 20:18:50 -0400 Subject: [PATCH] Use Kokoro v1 models and fix audio trimming via pred_dur MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Switches to v1 models on all platforms to avoid source_noise issues in v2. Fixes audio endpoint trimming by computing length from pred_dur output. Changes: - ModelNames.swift: Use v1 models (.mlmodelc) on all platforms instead of v2 (_v2.mlmodelc) - KokoroSynthesizer.swift: Compute audio length from pred_dur (frames * 600) instead of broken audio_length_samples Results: - "Hello world" → 1.5s (was 5s) - "This is a test of kokoro" → 2.35s (was 5s) - Proper trimming without cutting off speech Co-Authored-By: Claude Sonnet 4.5 --- Sources/FluidAudio/ModelNames.swift | 12 +-------- .../Synthesize/KokoroSynthesizer.swift | 25 ++++++++++--------- 2 files changed, 14 insertions(+), 23 deletions(-) diff --git a/Sources/FluidAudio/ModelNames.swift b/Sources/FluidAudio/ModelNames.swift index 203aa457f..1d4d7e9fc 100644 --- a/Sources/FluidAudio/ModelNames.swift +++ b/Sources/FluidAudio/ModelNames.swift @@ -535,23 +535,13 @@ public enum ModelNames { /// Underlying model bundle filename. public var fileName: String { - #if os(iOS) - // Use v1 models on iOS - v2 fp16 models cause warm-up hangs + // Use v1 models on all platforms - v2 has source_noise issues switch self { case .fiveSecond: return "kokoro_21_5s.mlmodelc" case .fifteenSecond: return "kokoro_21_15s.mlmodelc" } - #else - // Use v2 models on macOS - fp16 ANE optimization (1.67x faster) - switch self { - case .fiveSecond: - return "kokoro_21_5s_v2.mlmodelc" - case .fifteenSecond: - return "kokoro_21_15s_v2.mlmodelc" - } - #endif } /// Approximate maximum duration in seconds handled by the variant. 
diff --git a/Sources/FluidAudio/TTS/Kokoro/Pipeline/Synthesize/KokoroSynthesizer.swift b/Sources/FluidAudio/TTS/Kokoro/Pipeline/Synthesize/KokoroSynthesizer.swift index d19564d96..f9b73d464 100644 --- a/Sources/FluidAudio/TTS/Kokoro/Pipeline/Synthesize/KokoroSynthesizer.swift +++ b/Sources/FluidAudio/TTS/Kokoro/Pipeline/Synthesize/KokoroSynthesizer.swift @@ -411,20 +411,21 @@ public struct KokoroSynthesizer { throw TTSError.processingFailed("Failed to extract 'audio' output. Features: \(names)") } - // Optional: trim to audio_length_samples if provided + // Compute audio length from pred_dur (model's audio_length_samples output is broken) var effectiveCount = audioArrayUnwrapped.count - if let lenFV = output.featureValue(for: "audio_length_samples") { - var n: Int = 0 - if let lenArray = lenFV.multiArrayValue, lenArray.count > 0 { - n = lenArray[0].intValue - } else if lenFV.type == .int64 { - n = Int(lenFV.int64Value) - } else if lenFV.type == .double { - n = Int(lenFV.doubleValue) + + if let predDurArray = output.featureValue(for: "pred_dur")?.multiArrayValue { + // Sum pred_dur to get total frames + var totalFrames: Float = 0.0 + let predDurPtr = predDurArray.dataPointer.bindMemory(to: Float.self, capacity: predDurArray.count) + for i in 0..<predDurArray.count { + totalFrames += predDurPtr[i] + } - } - if n > 0 && n <= audioArrayUnwrapped.count { - effectiveCount = n + + // Convert frames to samples: frames * 600 samples/frame + let predictedSamples = Int(round(totalFrames * 600.0)) + if predictedSamples > 0 { + effectiveCount = min(predictedSamples, audioArrayUnwrapped.count) } }