feat(audio): normalize all inputs to 16k mono s16 + 5s gate & reduce

moinulmoin · moinulmoin · commit 937bbb3620ce · 2025-10-09T14:17:06.000+06:00
silence timeout to 10s;
diff --git a/.gitignore b/.gitignore
@@ -44,3 +44,4 @@ notarized-build-*
 claudedocs/
 
 agent-docs/
+.build
diff --git a/src-tauri/src/audio/mod.rs b/src-tauri/src/audio/mod.rs
@@ -3,6 +3,7 @@ pub mod level_meter;
 pub mod recorder;
 pub mod resampler;
 pub mod silence_detector;
+pub mod normalizer;
 
 #[cfg(test)]
 mod converter_tests;
diff --git a/src-tauri/src/audio/normalizer.rs b/src-tauri/src/audio/normalizer.rs
@@ -0,0 +1,154 @@
+use hound::{SampleFormat, WavReader, WavSpec, WavWriter};
+use rand::Rng;
+use crate::audio::resampler::resample_to_16khz;
+use std::fs;
+use std::path::{Path, PathBuf};
+
+/// Sample rate, in Hz, written to the normalized output WAV; input at any
+/// other rate is resampled before writing.
+const TARGET_RATE: u32 = 16_000;
+/// Channel count written to the normalized output WAV (mono).
+const TARGET_CHANNELS: u16 = 1;
+/// Bit depth written to the normalized output WAV (signed integer PCM).
+const TARGET_BITS: u16 = 16;
+/// Linear peak amplitude the signal is scaled towards during normalization.
+const TARGET_PEAK: f32 = 0.8; // ~ -1.9 dBFS
+/// Per-channel RMS level (linear, full scale = 1.0); channels at or below it
+/// are treated as silent and left out of the mono downmix.
+const SILENCE_RMS_THRESHOLD: f32 = 1e-4; // ~ -80 dBFS
+
+/// Normalize a WAV file to the Whisper input contract:
+/// WAV PCM S16LE, mono, 16 kHz, peak-normalized with TPDF dither.
+///
+/// Processing steps, in order:
+/// 1. decode every sample to `f32` (integer PCM of any bit depth hound can
+///    read, or 32-bit float);
+/// 2. downmix multi-channel audio to mono, ignoring near-silent channels;
+/// 3. resample to `TARGET_RATE` when the source rate differs;
+/// 4. scale so the peak sits at `TARGET_PEAK` (gain capped at 10x) and
+///    hard-clamp to `[-1, 1]`;
+/// 5. add TPDF dither, round to the nearest 16-bit value and write the file.
+///
+/// The output is written to `out_dir` (created if missing) as
+/// `normalized_<local timestamp>.wav`; the input file is left untouched.
+///
+/// # Errors
+///
+/// Returns a human-readable message when the input does not exist, cannot be
+/// opened or decoded, contains no complete audio frame, cannot be resampled,
+/// or when the output file cannot be created, written or finalized.
+pub fn normalize_to_whisper_wav(input_wav: &Path, out_dir: &Path) -> Result<PathBuf, String> {
+    if !input_wav.exists() {
+        return Err(format!("Input WAV does not exist: {:?}", input_wav));
+    }
+
+    fs::create_dir_all(out_dir).map_err(|e| format!("Failed to create out_dir: {}", e))?;
+
+    // Open source wav (our recorder writes PCM 16-bit interleaved).
+    let mut reader = WavReader::open(input_wav)
+        .map_err(|e| format!("Failed to open WAV: {}", e))?;
+    let spec = reader.spec();
+
+    if spec.sample_format != SampleFormat::Int || spec.bits_per_sample != 16 {
+        // Not what the recorder produces; decoded below according to its real format.
+        log::warn!(
+            "Normalizer expected 16-bit PCM. Got {:?} {}-bit; proceeding best-effort.",
+            spec.sample_format, spec.bits_per_sample
+        );
+    }
+
+    let channels = spec.channels.max(1);
+    let sample_rate = spec.sample_rate.max(1);
+
+    // Decode to f32. Reading everything as i16 would make hound reject float
+    // and >16-bit files, so pick the reader type from the actual format.
+    let samples_f32: Vec<f32> = if spec.sample_format == SampleFormat::Float {
+        reader
+            .samples::<f32>()
+            .collect::<Result<Vec<_>, _>>()
+            .map_err(|e| format!("Failed to read samples: {}", e))?
+    } else {
+        // Largest positive value for this bit depth (32767 for 16-bit input).
+        let bits = u32::from(spec.bits_per_sample).clamp(2, 32);
+        let full_scale = ((1u64 << (bits - 1)) - 1) as f32;
+        reader
+            .samples::<i32>()
+            .map(|sample| sample.map(|v| v as f32 / full_scale))
+            .collect::<Result<Vec<_>, _>>()
+            .map_err(|e| format!("Failed to read samples: {}", e))?
+    };
+    if samples_f32.is_empty() {
+        return Err("WAV contains no samples".to_string());
+    }
+
+    // If multi-channel, downmix while ignoring near-silent channels.
+    let mono: Vec<f32> = if channels == 1 {
+        samples_f32
+    } else {
+        downmix_equal_power_ignore_silent(&samples_f32, channels as usize)
+    };
+    if mono.is_empty() {
+        // Fewer samples than channels: there is not a single complete frame.
+        return Err("WAV contains no samples".to_string());
+    }
+
+    // Resample to 16 kHz using our high-quality rubato resampler
+    let resampled = if sample_rate != TARGET_RATE {
+        resample_to_16khz(&mono, sample_rate)?
+    } else {
+        mono
+    };
+
+    // Peak normalize to TARGET_PEAK; the gain is capped so near-silent input
+    // is not blown up, and the result is hard-clamped to [-1, 1].
+    let peak = resampled
+        .iter()
+        .fold(0.0f32, |m, &x| m.max(x.abs()));
+    let gain = if peak > 0.0 { (TARGET_PEAK / peak).min(10.0) } else { 1.0 };
+    let normalized: Vec<f32> = if (gain - 1.0).abs() > 1e-3 {
+        resampled.iter().map(|&x| (x * gain).clamp(-1.0, 1.0)).collect()
+    } else {
+        resampled
+    };
+
+    // Millisecond resolution keeps two normalizations started within the same
+    // second from silently overwriting each other's output.
+    let ts = chrono::Local::now().format("%Y%m%d_%H%M%S_%3f");
+    let out_path = out_dir.join(format!("normalized_{}.wav", ts));
+    let out_spec = WavSpec {
+        channels: TARGET_CHANNELS,
+        sample_rate: TARGET_RATE,
+        bits_per_sample: TARGET_BITS,
+        sample_format: SampleFormat::Int,
+    };
+    let mut writer =
+        WavWriter::create(&out_path, out_spec).map_err(|e| format!("WAV create failed: {}", e))?;
+
+    // Quantize to i16 with TPDF dither and write the samples out.
+    let mut rng = rand::thread_rng();
+    for &x in &normalized {
+        // TPDF dither: add two independent uniform(-0.5,0.5) LSBs
+        let dither = (rng.gen::<f32>() - 0.5) + (rng.gen::<f32>() - 0.5);
+        // Round to nearest: a bare `as i16` truncates toward zero, which
+        // widens the bin around zero and biases the quantizer.
+        let y = (x * i16::MAX as f32 + dither)
+            .round()
+            .clamp(i16::MIN as f32, i16::MAX as f32);
+        writer
+            .write_sample(y as i16)
+            .map_err(|e| format!("WAV write failed: {}", e))?;
+    }
+    writer.finalize().map_err(|e| format!("WAV finalize failed: {}", e))?;
+
+    Ok(out_path)
+}
+
+/// Downmix interleaved multi-channel audio to mono.
+///
+/// `input` holds interleaved samples with `channels` samples per frame; a
+/// trailing partial frame is ignored. Channels whose RMS over the whole buffer
+/// is at or below `SILENCE_RMS_THRESHOLD` are left out of the mix; if every
+/// channel is that quiet, all of them are mixed. The summed channels are
+/// scaled by `sqrt(1 / n)` (n = number of mixed channels) and each output
+/// sample is clamped to `[-1, 1]`.
+///
+/// Returns one sample per complete frame, or an empty vector when `channels`
+/// is zero or `input` holds no complete frame.
+///
+/// NOTE(review): with `n` identical channels the scaled sum is `sqrt(n)` times
+/// a single channel, so loud correlated input can reach the clamp.
+fn downmix_equal_power_ignore_silent(input: &[f32], channels: usize) -> Vec<f32> {
+    if channels == 0 {
+        return Vec::new();
+    }
+    let frame_count = input.len() / channels;
+    if frame_count == 0 {
+        return Vec::new();
+    }
+
+    // Sum of squares per channel, accumulated frame by frame.
+    let mut energy = vec![0.0f32; channels];
+    for frame in input.chunks_exact(channels) {
+        for (total, &sample) in energy.iter_mut().zip(frame) {
+            *total += sample * sample;
+        }
+    }
+
+    // Indices of the channels whose RMS is above the silence threshold.
+    let mut live: Vec<usize> = energy
+        .iter()
+        .enumerate()
+        .filter_map(|(idx, &total)| {
+            let level = (total / frame_count as f32).sqrt();
+            (level > SILENCE_RMS_THRESHOLD).then_some(idx)
+        })
+        .collect();
+    if live.is_empty() {
+        // Everything is below the threshold: mix all channels rather than none.
+        live.extend(0..channels);
+    }
+
+    let scale = (live.len() as f32).recip().sqrt();
+
+    input
+        .chunks_exact(channels)
+        .map(|frame| {
+            let mixed = live.iter().fold(0.0f32, |acc, &ch| acc + frame[ch]);
+            (mixed * scale).clamp(-1.0, 1.0)
+        })
+        .collect()
+}
diff --git a/src-tauri/src/audio/recorder.rs b/src-tauri/src/audio/recorder.rs
@@ -104,7 +104,7 @@ impl AudioRecorder {
         let (audio_level_tx, audio_level_rx) = mpsc::channel::<f64>();
 
         // Silence detection config for VAD
-        let silence_duration = Duration::from_secs(60); // 60 seconds of silence
+        let silence_duration = Duration::from_secs(10); // 10 seconds of continuous silence
 
         // Spawn recording thread
         let thread_handle = thread::spawn(move || -> Result<String, String> {
diff --git a/src-tauri/src/commands/audio.rs b/src-tauri/src/commands/audio.rs
@@ -694,9 +694,62 @@ pub async fn stop_recording(
         }
     };
 
-    // === Audio validation now handled by transcriber ===
-    // Transcriber will check duration and format during processing
-    log_with_context(log::Level::Debug, "Proceeding directly to transcription", &[
+    // Normalize captured audio to Whisper contract (WAV PCM s16, mono, 16k)
+    let parent_dir = audio_path
+        .parent()
+        .map(|p| p.to_path_buf())
+        .unwrap_or_else(|| std::path::Path::new(".").to_path_buf());
+
+    let normalized_path = match crate::audio::normalizer::normalize_to_whisper_wav(&audio_path, &parent_dir) {
+        Ok(p) => p,
+        Err(e) => {
+            log::error!("Audio normalization failed: {}", e);
+            update_recording_state(&app, RecordingState::Error, Some("Audio normalization failed".to_string()));
+            // Attempt cleanup
+            let _ = std::fs::remove_file(&audio_path);
+            return Err("Audio normalization failed".to_string());
+        }
+    };
+
+    // Remove raw capture after successful normalization
+    if let Err(e) = std::fs::remove_file(&audio_path) {
+        log::debug!("Failed to remove raw audio: {}", e);
+    }
+
+    // Duration gate (min 5s) using normalized file
+    let too_short = (|| -> Result<bool, String> {
+        let reader = hound::WavReader::open(&normalized_path)
+            .map_err(|e| format!("Failed to open normalized wav: {}", e))?;
+        let spec = reader.spec();
+        let frames = reader.duration() / spec.channels as u32; // mono expected
+        let duration = frames as f32 / spec.sample_rate as f32;
+        log_with_context(log::Level::Info, "NORMALIZED_AUDIO", &[
+            ("path", &format!("{:?}", normalized_path).as_str()),
+            ("sample_rate", &spec.sample_rate.to_string().as_str()),
+            ("channels", &spec.channels.to_string().as_str()),
+            ("bits", &spec.bits_per_sample.to_string().as_str()),
+            ("duration_s", &format!("{:.2}", duration).as_str()),
+        ]);
+        Ok(duration < 5.0)
+    })();
+
+    if let Ok(true) = too_short {
+        // Emit friendly feedback and stop here
+        let _ = emit_to_window(&app, "pill", "recording-too-short", "Recording shorter than 5 seconds");
+        if let Err(e) = std::fs::remove_file(&normalized_path) {
+            log::debug!("Failed to remove short normalized audio: {}", e);
+        }
+        // Hide pill and return to Idle
+        if let Err(e) = crate::commands::window::hide_pill_widget(app.clone()).await {
+            log::error!("Failed to hide pill window: {}", e);
+        }
+        update_recording_state(&app, RecordingState::Idle, None);
+        return Ok("".to_string());
+    }
+
+    // Proceed to transcription with normalized file
+    let audio_path = normalized_path;
+    log_with_context(log::Level::Debug, "Proceeding to transcription", &[
         ("audio_path", &format!("{:?}", audio_path).as_str()),
         ("stage", "pre_transcription")
     ]);
@@ -1356,6 +1409,10 @@ pub async fn transcribe_audio_file(
     let wav_path = crate::audio::converter::convert_to_wav(audio_path, &recordings_dir)?;
     let is_converted = wav_path != audio_path;
 
+    // Normalize imported audio to Whisper contract (WAV PCM s16, mono, 16k)
+    let normalized_path = crate::audio::normalizer::normalize_to_whisper_wav(&wav_path, &recordings_dir)
+        .map_err(|e| format!("Audio normalization failed: {}", e))?;
+
     // Get model path
     let whisper_manager = app.state::<AsyncRwLock<WhisperManager>>();
     let model_path = whisper_manager
@@ -1395,7 +1452,7 @@ pub async fn transcribe_audio_file(
     };
 
     let text = transcriber.transcribe_with_translation(
-        &wav_path,
+        &normalized_path,
         Some(&language),
         translate_to_english,
     )?;
@@ -1408,6 +1465,10 @@ pub async fn transcribe_audio_file(
             log::debug!("Cleaned up temporary converted WAV file");
         }
     }
+    // Clean up normalized WAV
+    if let Err(e) = std::fs::remove_file(&normalized_path) {
+        log::warn!("Failed to remove normalized WAV file: {}", e);
+    }
 
     Ok(text)
 }

Original file line number	Diff line number	Diff line change
`@@ -44,3 +44,4 @@ notarized-build-*`
`44`	`44`	`claudedocs/`
`45`	`45`
`46`	`46`	`agent-docs/`
	`47`	`+.build`