Skip to content

Commit 937bbb3

Browse files
committed
feat(audio): normalize all inputs to 16k mono s16 + 5s gate & reduce
silence timeout to 10s;
1 parent 0748230 commit 937bbb3

File tree

5 files changed

+222
-5
lines changed

5 files changed

+222
-5
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,3 +44,4 @@ notarized-build-*
4444
claudedocs/
4545

4646
agent-docs/
47+
.build

src-tauri/src/audio/mod.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ pub mod level_meter;
33
pub mod recorder;
44
pub mod resampler;
55
pub mod silence_detector;
6+
pub mod normalizer;
67

78
#[cfg(test)]
89
mod converter_tests;

src-tauri/src/audio/normalizer.rs

Lines changed: 154 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,154 @@
1+
use hound::{SampleFormat, WavReader, WavSpec, WavWriter};
2+
use rand::Rng;
3+
use crate::audio::resampler::resample_to_16khz;
4+
use std::fs;
5+
use std::path::{Path, PathBuf};
6+
7+
const TARGET_RATE: u32 = 16_000;
8+
const TARGET_CHANNELS: u16 = 1;
9+
const TARGET_BITS: u16 = 16;
10+
const TARGET_PEAK: f32 = 0.8; // ~ -1.9 dBFS
11+
const SILENCE_RMS_THRESHOLD: f32 = 1e-4; // ~ -80 dBFS
12+
13+
/// Normalize any WAV (our recorder output) to Whisper contract:
14+
/// WAV PCM S16LE, mono, 16 kHz, peak-normalized with light dither.
15+
pub fn normalize_to_whisper_wav(input_wav: &Path, out_dir: &Path) -> Result<PathBuf, String> {
16+
if !input_wav.exists() {
17+
return Err(format!("Input WAV does not exist: {:?}", input_wav));
18+
}
19+
20+
fs::create_dir_all(out_dir).map_err(|e| format!("Failed to create out_dir: {}", e))?;
21+
22+
// Open source wav (expect our recorder: PCM 16-bit interleaved)
23+
let mut reader = WavReader::open(input_wav)
24+
.map_err(|e| format!("Failed to open WAV: {}", e))?;
25+
let spec = reader.spec();
26+
27+
if spec.sample_format != SampleFormat::Int || spec.bits_per_sample != 16 {
28+
// For now we handle our own 16-bit files; other formats can be extended later.
29+
// Avoid surprising runtime errors by surfacing a clear message.
30+
log::warn!(
31+
"Normalizer expected 16-bit PCM. Got {:?} {}-bit; proceeding best-effort.",
32+
spec.sample_format, spec.bits_per_sample
33+
);
34+
}
35+
36+
let channels = spec.channels.max(1);
37+
let sample_rate = spec.sample_rate.max(1);
38+
39+
// Read samples as i16 → f32 [-1,1]
40+
let samples_i16: Vec<i16> = reader
41+
.samples::<i16>()
42+
.collect::<Result<Vec<_>, _>>()
43+
.map_err(|e| format!("Failed to read samples: {}", e))?;
44+
if samples_i16.is_empty() {
45+
return Err("WAV contains no samples".to_string());
46+
}
47+
48+
let mut samples_f32: Vec<f32> = samples_i16
49+
.iter()
50+
.map(|&s| s as f32 / i16::MAX as f32)
51+
.collect();
52+
53+
// If multi-channel, compute per-channel RMS and ignore near-silent channels.
54+
let mono: Vec<f32> = if channels == 1 {
55+
samples_f32
56+
} else {
57+
downmix_equal_power_ignore_silent(&samples_f32, channels as usize)
58+
};
59+
60+
// Resample to 16 kHz using our high-quality rubato resampler
61+
let resampled = if sample_rate != TARGET_RATE {
62+
resample_to_16khz(&mono, sample_rate)?
63+
} else {
64+
mono
65+
};
66+
67+
// Peak normalize to TARGET_PEAK with a soft clamp
68+
let peak = resampled
69+
.iter()
70+
.fold(0.0f32, |m, &x| m.max(x.abs()));
71+
let gain = if peak > 0.0 { (TARGET_PEAK / peak).min(10.0) } else { 1.0 };
72+
let mut normalized: Vec<f32> = if (gain - 1.0).abs() > 1e-3 {
73+
resampled.iter().map(|&x| (x * gain).clamp(-1.0, 1.0)).collect()
74+
} else {
75+
resampled
76+
};
77+
78+
// Quantize to i16 with TPDF dither
79+
let mut rng = rand::thread_rng();
80+
let mut pcm_i16 = Vec::with_capacity(normalized.len());
81+
for &x in &normalized {
82+
// TPDF dither: add two independent uniform(-0.5,0.5) LSBs
83+
let dither = (rng.gen::<f32>() - 0.5) + (rng.gen::<f32>() - 0.5);
84+
let y = (x * i16::MAX as f32 + dither).clamp(i16::MIN as f32, i16::MAX as f32);
85+
pcm_i16.push(y as i16);
86+
}
87+
88+
// Write final WAV
89+
let ts = chrono::Local::now().format("%Y%m%d_%H%M%S");
90+
let out_path = out_dir.join(format!("normalized_{}.wav", ts));
91+
let out_spec = WavSpec {
92+
channels: TARGET_CHANNELS,
93+
sample_rate: TARGET_RATE,
94+
bits_per_sample: TARGET_BITS,
95+
sample_format: SampleFormat::Int,
96+
};
97+
let mut writer =
98+
WavWriter::create(&out_path, out_spec).map_err(|e| format!("WAV create failed: {}", e))?;
99+
for s in pcm_i16 {
100+
writer
101+
.write_sample(s)
102+
.map_err(|e| format!("WAV write failed: {}", e))?;
103+
}
104+
writer.finalize().map_err(|e| format!("WAV finalize failed: {}", e))?;
105+
106+
Ok(out_path)
107+
}
108+
109+
/// Downmix interleaved multi-channel audio to mono with an equal-power gain.
///
/// `input` holds interleaved samples (`channels` samples per frame); trailing
/// samples that do not form a complete frame are ignored. Channels whose RMS
/// over the whole buffer is at or below `SILENCE_RMS_THRESHOLD` are left out of
/// the mix; if every channel is that quiet, all channels are used instead.
/// The sum of the active channels is scaled by `sqrt(1 / active_count)`.
///
/// When the mix exceeds full scale (which happens when the active channels are
/// strongly correlated), the whole signal is scaled down so its peak is 1.0
/// instead of hard-clipping individual samples, which would distort the
/// waveform. Output samples are always within `[-1.0, 1.0]`.
///
/// Returns an empty vector when `channels` is 0 or `input` contains no
/// complete frame.
fn downmix_equal_power_ignore_silent(input: &[f32], channels: usize) -> Vec<f32> {
    if channels == 0 {
        return Vec::new();
    }
    let frames = input.len() / channels;
    if frames == 0 {
        return Vec::new();
    }

    // Sum of squares per channel, used for the per-channel RMS below.
    let mut sumsq = vec![0.0f32; channels];
    for frame in input.chunks_exact(channels) {
        for (acc, &s) in sumsq.iter_mut().zip(frame) {
            *acc += s * s;
        }
    }

    // Channels loud enough to contribute to the mix.
    let mut active: Vec<usize> = sumsq
        .iter()
        .enumerate()
        .filter(|(_, &sq)| (sq / frames as f32).sqrt() > SILENCE_RMS_THRESHOLD)
        .map(|(ch, _)| ch)
        .collect();
    if active.is_empty() {
        // If all channels are silent by threshold, use all channels to avoid empty output.
        active = (0..channels).collect();
    }

    let gain = (1.0f32 / active.len() as f32).sqrt();

    let mut out: Vec<f32> = input
        .chunks_exact(channels)
        .map(|frame| active.iter().map(|&ch| frame[ch]).sum::<f32>() * gain)
        .collect();

    // Scale the whole signal down if the mix overshoots full scale; the final
    // clamp only guards against rounding error and non-finite input.
    let peak = out.iter().fold(0.0f32, |m, &x| m.max(x.abs()));
    let scale = if peak > 1.0 && peak.is_finite() {
        1.0 / peak
    } else {
        1.0
    };
    for x in &mut out {
        *x = (*x * scale).clamp(-1.0, 1.0);
    }
    out
}

src-tauri/src/audio/recorder.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -104,7 +104,7 @@ impl AudioRecorder {
104104
let (audio_level_tx, audio_level_rx) = mpsc::channel::<f64>();
105105

106106
// Silence detection config for VAD
107-
let silence_duration = Duration::from_secs(60); // 60 seconds of silence
107+
let silence_duration = Duration::from_secs(10); // 10 seconds of continuous silence
108108

109109
// Spawn recording thread
110110
let thread_handle = thread::spawn(move || -> Result<String, String> {

src-tauri/src/commands/audio.rs

Lines changed: 65 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -694,9 +694,62 @@ pub async fn stop_recording(
694694
}
695695
};
696696

697-
// === Audio validation now handled by transcriber ===
698-
// Transcriber will check duration and format during processing
699-
log_with_context(log::Level::Debug, "Proceeding directly to transcription", &[
697+
// Normalize captured audio to Whisper contract (WAV PCM s16, mono, 16k)
698+
let parent_dir = audio_path
699+
.parent()
700+
.map(|p| p.to_path_buf())
701+
.unwrap_or_else(|| std::path::Path::new(".").to_path_buf());
702+
703+
let normalized_path = match crate::audio::normalizer::normalize_to_whisper_wav(&audio_path, &parent_dir) {
704+
Ok(p) => p,
705+
Err(e) => {
706+
log::error!("Audio normalization failed: {}", e);
707+
update_recording_state(&app, RecordingState::Error, Some("Audio normalization failed".to_string()));
708+
// Attempt cleanup
709+
let _ = std::fs::remove_file(&audio_path);
710+
return Err("Audio normalization failed".to_string());
711+
}
712+
};
713+
714+
// Remove raw capture after successful normalization
715+
if let Err(e) = std::fs::remove_file(&audio_path) {
716+
log::debug!("Failed to remove raw audio: {}", e);
717+
}
718+
719+
// Duration gate (min 5s) using normalized file
720+
let too_short = (|| -> Result<bool, String> {
721+
let reader = hound::WavReader::open(&normalized_path)
722+
.map_err(|e| format!("Failed to open normalized wav: {}", e))?;
723+
let spec = reader.spec();
724+
let frames = reader.duration() / spec.channels as u32; // mono expected
725+
let duration = frames as f32 / spec.sample_rate as f32;
726+
log_with_context(log::Level::Info, "NORMALIZED_AUDIO", &[
727+
("path", &format!("{:?}", normalized_path).as_str()),
728+
("sample_rate", &spec.sample_rate.to_string().as_str()),
729+
("channels", &spec.channels.to_string().as_str()),
730+
("bits", &spec.bits_per_sample.to_string().as_str()),
731+
("duration_s", &format!("{:.2}", duration).as_str()),
732+
]);
733+
Ok(duration < 5.0)
734+
})();
735+
736+
if let Ok(true) = too_short {
737+
// Emit friendly feedback and stop here
738+
let _ = emit_to_window(&app, "pill", "recording-too-short", "Recording shorter than 5 seconds");
739+
if let Err(e) = std::fs::remove_file(&normalized_path) {
740+
log::debug!("Failed to remove short normalized audio: {}", e);
741+
}
742+
// Hide pill and return to Idle
743+
if let Err(e) = crate::commands::window::hide_pill_widget(app.clone()).await {
744+
log::error!("Failed to hide pill window: {}", e);
745+
}
746+
update_recording_state(&app, RecordingState::Idle, None);
747+
return Ok("".to_string());
748+
}
749+
750+
// Proceed to transcription with normalized file
751+
let audio_path = normalized_path;
752+
log_with_context(log::Level::Debug, "Proceeding to transcription", &[
700753
("audio_path", &format!("{:?}", audio_path).as_str()),
701754
("stage", "pre_transcription")
702755
]);
@@ -1356,6 +1409,10 @@ pub async fn transcribe_audio_file(
13561409
let wav_path = crate::audio::converter::convert_to_wav(audio_path, &recordings_dir)?;
13571410
let is_converted = wav_path != audio_path;
13581411

1412+
// Normalize imported audio to Whisper contract (WAV PCM s16, mono, 16k)
1413+
let normalized_path = crate::audio::normalizer::normalize_to_whisper_wav(&wav_path, &recordings_dir)
1414+
.map_err(|e| format!("Audio normalization failed: {}", e))?;
1415+
13591416
// Get model path
13601417
let whisper_manager = app.state::<AsyncRwLock<WhisperManager>>();
13611418
let model_path = whisper_manager
@@ -1395,7 +1452,7 @@ pub async fn transcribe_audio_file(
13951452
};
13961453

13971454
let text = transcriber.transcribe_with_translation(
1398-
&wav_path,
1455+
&normalized_path,
13991456
Some(&language),
14001457
translate_to_english,
14011458
)?;
@@ -1408,6 +1465,10 @@ pub async fn transcribe_audio_file(
14081465
log::debug!("Cleaned up temporary converted WAV file");
14091466
}
14101467
}
1468+
// Clean up normalized WAV
1469+
if let Err(e) = std::fs::remove_file(&normalized_path) {
1470+
log::warn!("Failed to remove normalized WAV file: {}", e);
1471+
}
14111472

14121473
Ok(text)
14131474
}

0 commit comments

Comments
 (0)