Skip to content

Commit 9368acc

Browse files
Alex-Weng and claude committed
Use Kokoro v1 models and fix audio trimming via pred_dur
Switches to v1 models on all platforms to avoid source_noise issues in v2. Fixes audio endpoint trimming by computing length from pred_dur output. Changes: - ModelNames.swift: Use v1 models (.mlmodelc) on all platforms instead of v2 (_v2.mlmodelc) - KokoroSynthesizer.swift: Compute audio length from pred_dur (frames * 600) instead of broken audio_length_samples Results: - "Hello world" → 1.5s (was 5s) - "This is a test of kokoro" → 2.35s (was 5s) - Proper trimming without cutting off speech Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
1 parent 06fc2ab commit 9368acc

File tree

2 files changed

+14
-23
lines changed

2 files changed

+14
-23
lines changed

Sources/FluidAudio/ModelNames.swift

Lines changed: 1 addition & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -535,23 +535,13 @@ public enum ModelNames {
535535

536536
/// Underlying model bundle filename.
537537
public var fileName: String {
538-
#if os(iOS)
539-
// Use v1 models on iOS - v2 fp16 models cause warm-up hangs
538+
// Use v1 models on all platforms - v2 has source_noise issues
540539
switch self {
541540
case .fiveSecond:
542541
return "kokoro_21_5s.mlmodelc"
543542
case .fifteenSecond:
544543
return "kokoro_21_15s.mlmodelc"
545544
}
546-
#else
547-
// Use v2 models on macOS - fp16 ANE optimization (1.67x faster)
548-
switch self {
549-
case .fiveSecond:
550-
return "kokoro_21_5s_v2.mlmodelc"
551-
case .fifteenSecond:
552-
return "kokoro_21_15s_v2.mlmodelc"
553-
}
554-
#endif
555545
}
556546

557547
/// Approximate maximum duration in seconds handled by the variant.

Sources/FluidAudio/TTS/Kokoro/Pipeline/Synthesize/KokoroSynthesizer.swift

Lines changed: 13 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -411,20 +411,21 @@ public struct KokoroSynthesizer {
411411
throw TTSError.processingFailed("Failed to extract 'audio' output. Features: \(names)")
412412
}
413413

414-
// Optional: trim to audio_length_samples if provided
414+
// Compute audio length from pred_dur (model's audio_length_samples output is broken)
415415
var effectiveCount = audioArrayUnwrapped.count
416-
if let lenFV = output.featureValue(for: "audio_length_samples") {
417-
var n: Int = 0
418-
if let lenArray = lenFV.multiArrayValue, lenArray.count > 0 {
419-
n = lenArray[0].intValue
420-
} else if lenFV.type == .int64 {
421-
n = Int(lenFV.int64Value)
422-
} else if lenFV.type == .double {
423-
n = Int(lenFV.doubleValue)
416+
417+
if let predDurArray = output.featureValue(for: "pred_dur")?.multiArrayValue {
418+
// Sum pred_dur to get total frames
419+
var totalFrames: Float = 0.0
420+
let predDurPtr = predDurArray.dataPointer.bindMemory(to: Float.self, capacity: predDurArray.count)
421+
for i in 0..<predDurArray.count {
422+
totalFrames += predDurPtr[i]
424423
}
425-
n = max(0, n)
426-
if n > 0 && n <= audioArrayUnwrapped.count {
427-
effectiveCount = n
424+
425+
// Convert frames to samples: frames * 600 samples/frame
426+
let predictedSamples = Int(round(totalFrames * 600.0))
427+
if predictedSamples > 0 {
428+
effectiveCount = min(predictedSamples, audioArrayUnwrapped.count)
428429
}
429430
}
430431

0 commit comments

Comments (0)