Merged · Changes from 1 commit (21 commits in the PR):
9600bb8  feat: Qwen3-TTS CoreML conversion pipeline  [Alex-Wengg, Jan 28, 2026]
b0942e5  feat: add best-of-n sampling for improved prosody  [Alex-Wengg, Jan 28, 2026]
f784c7d  fix: use sampling for code_predictor to prevent muffled audio  [Alex-Wengg, Jan 28, 2026]
9f69b33  fix: include RMS in audio scoring to select louder, clearer audio  [Alex-Wengg, Jan 28, 2026]
2108b85  fix: eliminate double code_predictor calls to reduce background noise  [Alex-Wengg, Jan 28, 2026]
e3ee51c  feat: PocketTTS pure CoreML pipeline — zero PyTorch dependency  [Alex-Wengg, Jan 28, 2026]
5ca5880  chore: remove redundant debug, test, and old conversion scripts  [Alex-Wengg, Jan 28, 2026]
b44c071  docs: add trial log and conversion guide for PocketTTS CoreML  [Alex-Wengg, Jan 28, 2026]
7f63055  feat: add PocketTTS Python package, conversion scripts, and docs  [Alex-Wengg, Jan 29, 2026]
7d7f370  refactor: use spectral similarity instead of Resemblyzer  [Alex-Wengg, Feb 4, 2026]
87fe41d  feat: add Qwen3-TTS v9/v10 conversion, bilingual testing, and RAM mea…  [Alex-Wengg, Feb 5, 2026]
3d87e9a  docs: add issue documentation for Qwen3-TTS conversion and integration  [Alex-Wengg, Feb 5, 2026]
4fd954e  chore: remove debug, intermediate, and measurement scripts  [Alex-Wengg, Feb 5, 2026]
d8760e1  refactor: organize scripts into convert/, explore/, test/ subfolders  [Alex-Wengg, Feb 5, 2026]
a5c8e04  docs: add Swift integration and performance issue documentation  [Alex-Wengg, Feb 5, 2026]
c6cfa40  Merge main into feature/qwen3-tts-coreml (accept main for pocket_tts …  [Alex-Wengg, Feb 5, 2026]
b47105f  chore: remove unrelated pocket_tts files from PR  [Alex-Wengg, Feb 5, 2026]
4c69dd2  docs: add debugging methodology guide for CoreML model conversion  [Alex-Wengg, Feb 5, 2026]
2a5a173  fix: correct EOS token ID and add uv.lock for Qwen3 TTS  [Alex-Wengg, Mar 21, 2026]
1e77d03  revert root .gitignore to match main  [Alex-Wengg, Mar 21, 2026]
a4765ff  Merge remote-tracking branch 'origin/main' into feature/qwen3-tts-coreml  [Alex-Wengg, Mar 21, 2026]
refactor: use spectral similarity instead of Resemblyzer
- Mel spectrogram cosine similarity for voice timbre
- MFCC similarity for voice characteristics
- No neural network dependency (just librosa/scipy)
- Simpler, faster, more interpretable
Alex-Wengg committed Feb 4, 2026
commit 7d7f370017792552191042823b9292a68acf167d
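
In essence, the new scoring reduces to the following (a condensed sketch of the full script added in this diff; the function names match the file, and `ref`/`syn` stand for the loaded reference and synthesized audio):

    # Mean mel spectrum and mean/std MFCCs serve as compact voice signatures
    mel_sim      = cosine_similarity(compute_mel_spectrogram(ref).mean(axis=1),
                                     compute_mel_spectrogram(syn).mean(axis=1))
    mfcc_sim     = cosine_similarity(compute_mfcc(ref).mean(axis=1),
                                     compute_mfcc(syn).mean(axis=1))
    mfcc_std_sim = cosine_similarity(compute_mfcc(ref).std(axis=1),
                                     compute_mfcc(syn).std(axis=1))
    # Weighted combination, thresholded into Excellent/Good/Fair/Poor
    combined = 0.4 * mel_sim + 0.4 * mfcc_sim + 0.2 * mfcc_std_sim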
models/tts/pocket_tts/coreml/voice_cloning/evaluate_voice.py  (296 additions, 0 deletions)
@@ -0,0 +1,296 @@
#!/usr/bin/env python3
"""Evaluate voice cloning quality using spectral similarity.

Compares a reference voice sample with synthesized TTS output using
mel-spectrogram cosine similarity - no neural network required.

Requirements:
pip install librosa numpy scipy

Usage:
python evaluate_voice.py reference.wav synthesized.wav
python evaluate_voice.py reference.wav synthesized.wav --plot
"""
import argparse
import logging
import sys
from pathlib import Path

import numpy as np

logging.basicConfig(level=logging.INFO, format='%(message)s')
logger = logging.getLogger(__name__)

SAMPLE_RATE = 24000 # PocketTTS native sample rate


def load_audio(path: Path) -> np.ndarray:
"""Load audio and resample to target sample rate."""
try:
import librosa
audio, _ = librosa.load(str(path), sr=SAMPLE_RATE, mono=True)
return audio
except ImportError:
from scipy.io import wavfile
from scipy import signal
sr, audio = wavfile.read(str(path))
if audio.dtype == np.int16:
audio = audio.astype(np.float32) / 32768.0
elif audio.dtype == np.int32:
audio = audio.astype(np.float32) / 2147483648.0
if len(audio.shape) > 1:
audio = audio.mean(axis=1)
if sr != SAMPLE_RATE:
num_samples = int(len(audio) * SAMPLE_RATE / sr)
audio = signal.resample(audio, num_samples)
return audio.astype(np.float32)


def compute_mel_spectrogram(audio: np.ndarray, n_mels: int = 80, n_fft: int = 1024,
                            hop_length: int = 256) -> np.ndarray:
    """Compute a mel spectrogram in dB."""
    try:
        import librosa
        mel = librosa.feature.melspectrogram(
            y=audio, sr=SAMPLE_RATE, n_mels=n_mels,
            n_fft=n_fft, hop_length=hop_length
        )
        return librosa.power_to_db(mel, ref=np.max)
    except ImportError:
        # Fallback using scipy
        from scipy import signal

        # Simple STFT
        _, _, Sxx = signal.spectrogram(audio, fs=SAMPLE_RATE, nperseg=n_fft,
                                       noverlap=n_fft - hop_length)
        # Approximate mel scaling: crude rectangular, linearly spaced bands
        # rather than a true mel-warped filterbank
        mel_basis = np.zeros((n_mels, Sxx.shape[0]))
        for i in range(n_mels):
            center = int(Sxx.shape[0] * (i + 1) / (n_mels + 1))
            width = max(1, Sxx.shape[0] // (n_mels * 2))
            mel_basis[i, max(0, center - width):min(Sxx.shape[0], center + width)] = 1
        mel_basis = mel_basis / (mel_basis.sum(axis=1, keepdims=True) + 1e-8)
        mel = np.dot(mel_basis, Sxx)
        return 10 * np.log10(mel + 1e-10)


def compute_mfcc(audio: np.ndarray, n_mfcc: int = 13) -> np.ndarray:
"""Compute MFCCs."""
try:
import librosa
return librosa.feature.mfcc(y=audio, sr=SAMPLE_RATE, n_mfcc=n_mfcc)
except ImportError:
mel = compute_mel_spectrogram(audio)
from scipy.fftpack import dct
return dct(mel, type=2, axis=0, norm='ortho')[:n_mfcc]


def cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
"""Compute cosine similarity between two vectors."""
a_flat = a.flatten()
b_flat = b.flatten()
# Truncate to same length
min_len = min(len(a_flat), len(b_flat))
a_flat = a_flat[:min_len]
b_flat = b_flat[:min_len]

norm_a = np.linalg.norm(a_flat)
norm_b = np.linalg.norm(b_flat)
if norm_a == 0 or norm_b == 0:
return 0.0
return float(np.dot(a_flat, b_flat) / (norm_a * norm_b))


def compute_spectral_similarity(ref_audio: np.ndarray, syn_audio: np.ndarray) -> dict:
"""Compute spectral similarity metrics."""
# Compute mel spectrograms
ref_mel = compute_mel_spectrogram(ref_audio)
syn_mel = compute_mel_spectrogram(syn_audio)

# Compute mean mel vectors (voice timbre signature)
ref_mel_mean = ref_mel.mean(axis=1)
syn_mel_mean = syn_mel.mean(axis=1)
mel_similarity = cosine_similarity(ref_mel_mean, syn_mel_mean)

# Compute MFCCs
ref_mfcc = compute_mfcc(ref_audio)
syn_mfcc = compute_mfcc(syn_audio)

# MFCC mean (captures voice characteristics)
ref_mfcc_mean = ref_mfcc.mean(axis=1)
syn_mfcc_mean = syn_mfcc.mean(axis=1)
mfcc_similarity = cosine_similarity(ref_mfcc_mean, syn_mfcc_mean)

# MFCC std (captures dynamics)
ref_mfcc_std = ref_mfcc.std(axis=1)
syn_mfcc_std = syn_mfcc.std(axis=1)
mfcc_std_similarity = cosine_similarity(ref_mfcc_std, syn_mfcc_std)

return {
'mel_similarity': mel_similarity,
'mfcc_similarity': mfcc_similarity,
'mfcc_std_similarity': mfcc_std_similarity,
}


def evaluate_voice_cloning(
    reference_path: Path,
    synthesized_path: Path,
    plot: bool = False
) -> dict:
    """Evaluate voice cloning quality using spectral similarity."""
    logger.info(f"Reference: {reference_path}")
    logger.info(f"Synthesized: {synthesized_path}")
    logger.info("")

    # Load audio
    ref_audio = load_audio(reference_path)
    syn_audio = load_audio(synthesized_path)

    logger.info(f"Reference duration: {len(ref_audio) / SAMPLE_RATE:.2f}s")
    logger.info(f"Synthesized duration: {len(syn_audio) / SAMPLE_RATE:.2f}s")
    logger.info("")

    # Compute spectral similarity
    logger.info("Computing spectral similarity...")
    metrics = compute_spectral_similarity(ref_audio, syn_audio)

    # Combined score (weighted average)
    combined = (
        0.4 * metrics['mel_similarity'] +
        0.4 * metrics['mfcc_similarity'] +
        0.2 * metrics['mfcc_std_similarity']
    )
    metrics['combined_similarity'] = combined

    logger.info("")
    logger.info(f"  Mel Similarity:      {metrics['mel_similarity']:.4f}")
    logger.info(f"  MFCC Similarity:     {metrics['mfcc_similarity']:.4f}")
    logger.info(f"  MFCC Std Similarity: {metrics['mfcc_std_similarity']:.4f}")
    logger.info(f"  Combined Score:      {combined:.4f}")

    # Quality interpretation
    if combined >= 0.90:
        quality = "Excellent"
    elif combined >= 0.80:
        quality = "Good"
    elif combined >= 0.70:
        quality = "Fair"
    else:
        quality = "Poor"

    metrics['quality'] = quality
    logger.info(f"  Quality: {quality}")

    # Plot if requested
    if plot:
        plot_spectrograms(ref_audio, syn_audio, reference_path.stem, synthesized_path.stem)

    return metrics


def plot_spectrograms(ref_audio: np.ndarray, syn_audio: np.ndarray,
                      ref_name: str, syn_name: str):
    """Visualize mel spectrograms."""
    try:
        import matplotlib.pyplot as plt
    except ImportError:
        logger.warning("matplotlib not installed, skipping plot")
        return

    ref_mel = compute_mel_spectrogram(ref_audio)
    syn_mel = compute_mel_spectrogram(syn_audio)

    fig, axes = plt.subplots(2, 2, figsize=(14, 8))

    # Reference mel spectrogram
    im0 = axes[0, 0].imshow(ref_mel, aspect='auto', origin='lower', cmap='magma')
    axes[0, 0].set_title(f'Reference: {ref_name}')
    axes[0, 0].set_ylabel('Mel bin')
    plt.colorbar(im0, ax=axes[0, 0], format='%+2.0f dB')

    # Synthesized mel spectrogram
    im1 = axes[0, 1].imshow(syn_mel, aspect='auto', origin='lower', cmap='magma')
    axes[0, 1].set_title(f'Synthesized: {syn_name}')
    axes[0, 1].set_ylabel('Mel bin')
    plt.colorbar(im1, ax=axes[0, 1], format='%+2.0f dB')

    # Mean mel comparison
    ref_mel_mean = ref_mel.mean(axis=1)
    syn_mel_mean = syn_mel.mean(axis=1)
    axes[1, 0].plot(ref_mel_mean, label='Reference', alpha=0.8)
    axes[1, 0].plot(syn_mel_mean, label='Synthesized', alpha=0.8)
    axes[1, 0].set_xlabel('Mel bin')
    axes[1, 0].set_ylabel('Mean energy (dB)')
    axes[1, 0].set_title('Mean Mel Spectrum (Voice Timbre)')
    axes[1, 0].legend()
    axes[1, 0].grid(True, alpha=0.3)

    # MFCC comparison
    ref_mfcc = compute_mfcc(ref_audio).mean(axis=1)
    syn_mfcc = compute_mfcc(syn_audio).mean(axis=1)
    x = np.arange(len(ref_mfcc))
    width = 0.35
    axes[1, 1].bar(x - width / 2, ref_mfcc, width, label='Reference', alpha=0.8)
    axes[1, 1].bar(x + width / 2, syn_mfcc, width, label='Synthesized', alpha=0.8)
    axes[1, 1].set_xlabel('MFCC coefficient')
    axes[1, 1].set_ylabel('Value')
    axes[1, 1].set_title('Mean MFCCs')
    axes[1, 1].legend()
    axes[1, 1].grid(True, alpha=0.3)

    plt.tight_layout()
    plt.savefig('spectral_comparison.png', dpi=150)
    logger.info("\nSaved comparison plot to: spectral_comparison.png")
    plt.show()


def main():
    parser = argparse.ArgumentParser(
        description="Evaluate voice cloning using spectral similarity",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Spectral Similarity Thresholds:
  0.90+  Excellent - Very close spectral match
  0.80+  Good      - Similar voice characteristics
  0.70+  Fair      - Some similarity
  <0.70  Poor      - Different spectral characteristics

Metrics:
  - Mel Similarity: Cosine similarity of mean mel spectrum (timbre)
  - MFCC Similarity: Cosine similarity of mean MFCCs (voice characteristics)
  - MFCC Std Similarity: Similarity of MFCC dynamics

Requirements:
  pip install librosa numpy
  # Or minimal: pip install scipy numpy

Examples:
  python evaluate_voice.py original_speaker.wav tts_output.wav
  python evaluate_voice.py reference.wav synthesized.wav --plot
"""
    )
    parser.add_argument("reference", type=Path, help="Reference voice audio file")
    parser.add_argument("synthesized", type=Path, help="Synthesized TTS audio file")
    parser.add_argument("--plot", action="store_true", help="Show spectrogram comparison plots")
    parser.add_argument("--json", action="store_true", help="Output metrics as JSON")

    args = parser.parse_args()

    if not args.reference.exists():
        logger.error(f"Reference file not found: {args.reference}")
        sys.exit(1)
    if not args.synthesized.exists():
        logger.error(f"Synthesized file not found: {args.synthesized}")
        sys.exit(1)

    metrics = evaluate_voice_cloning(args.reference, args.synthesized, plot=args.plot)

    if args.json:
        import json
        print(json.dumps(metrics, indent=2))


if __name__ == "__main__":
    main()
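
With --json, the metrics dict built above is printed as JSON. Given the keys the script sets, the output has this shape (the numbers are illustrative placeholders, not real measurements):

    {
      "mel_similarity": 0.93,
      "mfcc_similarity": 0.88,
      "mfcc_std_similarity": 0.81,
      "combined_similarity": 0.886,
      "quality": "Good"
    }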