diff --git a/Sources/FluidAudio/ModelNames.swift b/Sources/FluidAudio/ModelNames.swift
index 203aa457f..1d4d7e9fc 100644
--- a/Sources/FluidAudio/ModelNames.swift
+++ b/Sources/FluidAudio/ModelNames.swift
@@ -535,23 +535,13 @@ public enum ModelNames {

         /// Underlying model bundle filename.
         public var fileName: String {
-            #if os(iOS)
-            // Use v1 models on iOS - v2 fp16 models cause warm-up hangs
+            // Use v1 models on all platforms - v2 has source_noise issues
            switch self {
            case .fiveSecond:
                return "kokoro_21_5s.mlmodelc"
            case .fifteenSecond:
                return "kokoro_21_15s.mlmodelc"
            }
-            #else
-            // Use v2 models on macOS - fp16 ANE optimization (1.67x faster)
-            switch self {
-            case .fiveSecond:
-                return "kokoro_21_5s_v2.mlmodelc"
-            case .fifteenSecond:
-                return "kokoro_21_15s_v2.mlmodelc"
-            }
-            #endif
        }

        /// Approximate maximum duration in seconds handled by the variant.
diff --git a/Sources/FluidAudio/TTS/Kokoro/Pipeline/Synthesize/KokoroSynthesizer.swift b/Sources/FluidAudio/TTS/Kokoro/Pipeline/Synthesize/KokoroSynthesizer.swift
index d19564d96..f9b73d464 100644
--- a/Sources/FluidAudio/TTS/Kokoro/Pipeline/Synthesize/KokoroSynthesizer.swift
+++ b/Sources/FluidAudio/TTS/Kokoro/Pipeline/Synthesize/KokoroSynthesizer.swift
@@ -411,20 +411,21 @@ public struct KokoroSynthesizer {
            throw TTSError.processingFailed("Failed to extract 'audio' output. 
Features: \(names)")
        }

-        // Optional: trim to audio_length_samples if provided
+        // Compute audio length from pred_dur (model's audio_length_samples output is broken)
        var effectiveCount = audioArrayUnwrapped.count
-        if let lenFV = output.featureValue(for: "audio_length_samples") {
-            var n: Int = 0
-            if let lenArray = lenFV.multiArrayValue, lenArray.count > 0 {
-                n = lenArray[0].intValue
-            } else if lenFV.type == .int64 {
-                n = Int(lenFV.int64Value)
-            } else if lenFV.type == .double {
-                n = Int(lenFV.doubleValue)
+
+        if let predDurArray = output.featureValue(for: "pred_dur")?.multiArrayValue {
+            // Sum pred_dur to get total frames
+            var totalFrames: Float = 0.0
+            let predDurPtr = predDurArray.dataPointer.bindMemory(to: Float.self, capacity: predDurArray.count)
+            for i in 0..<predDurArray.count {
+                totalFrames += predDurPtr[i]
            }
-            if n > 0 && n <= audioArrayUnwrapped.count {
-                effectiveCount = n
+
+            // Convert frames to samples: frames * 600 samples/frame
+            let predictedSamples = Int(round(totalFrames * 600.0))
+            if predictedSamples > 0 {
+                effectiveCount = min(predictedSamples, audioArrayUnwrapped.count)
            }
        }