diff --git a/projects/llm_framework/main_melotts/src/main.cpp b/projects/llm_framework/main_melotts/src/main.cpp index 4c25df80..63f8f9a5 100644 --- a/projects/llm_framework/main_melotts/src/main.cpp +++ b/projects/llm_framework/main_melotts/src/main.cpp @@ -243,7 +243,6 @@ class llm_task { try { std::vector wav_pcm_data; if (msg_str.empty()) { - SLOGI("empty"); if (out_callback_) { std::string output = wav_pcm_data.empty() ? std::string() : std::string((char *)wav_pcm_data.data(), @@ -253,7 +252,6 @@ class llm_task { return false; } - // Convert text to phonemes and tones std::vector phones_bef, tones_bef; lexicon_->convert(msg_str, phones_bef, tones_bef); auto phones = intersperse(phones_bef, 0); @@ -261,7 +259,6 @@ class llm_task { int phone_len = phones.size(); std::vector langids(phone_len, 3); - // Run the encoder to generate hidden representations auto encoder_output = encoder_->Run(phones, tones, langids, g_matrix, mode_config_.noise_scale, mode_config_.noise_scale_w, mode_config_.get_length_scale(), mode_config_.sdp_ratio); @@ -270,7 +267,6 @@ class llm_task { auto zp_info = encoder_output.at(0).GetTensorTypeAndShapeInfo(); auto zp_shape = zp_info.GetShape(); - // Calculate decoder parameters int zp_size = decoder_->GetInputSize(0) / sizeof(float); int dec_len = zp_size / zp_shape[1]; int audio_slice_len = decoder_->GetOutputSize(0) / sizeof(float); @@ -283,12 +279,10 @@ class llm_task { int dec_slice_num = static_cast(std::ceil(static_cast(zp_shape[2]) / static_cast(effective_frames))); - // SOLA parameters setup - const int sola_buffer_frame = pad_frames * samples_per_frame; // Overlap buffer length - const int sola_search_frame = pad_frames * samples_per_frame; // Search window length - const int block_frame = (dec_len - 2 * pad_frames) * samples_per_frame; // Effective block length + const int sola_buffer_frame = pad_frames * samples_per_frame; + const int sola_search_frame = pad_frames * samples_per_frame; + const int block_frame = (dec_len - 2 * pad_frames) * samples_per_frame; - // Create fade-in/fade-out windows for smooth transitions std::vector fade_in_window(sola_buffer_frame); std::vector fade_out_window(sola_buffer_frame); @@ -297,46 +291,35 @@ class llm_task { fade_out_window[i] = 1.0f - fade_in_window[i]; } - // Initialize SOLA buffer std::vector sola_buffer(sola_buffer_frame, 0.0f); bool first_frame = true; std::vector pcmlist; - // Main decoding loop - process each slice for (int i = 0; i < dec_slice_num; i++) { - // Calculate start position for current batch input int input_start = i * effective_frames; - // Consider forward padding, but ensure non-negative if (i > 0) { input_start -= pad_frames; } input_start = std::max(0, input_start); - // Actual input length int actual_len = std::min(dec_len, static_cast(zp_shape[2] - input_start)); - // Calculate effective output range (frame level) int output_start_frame, output_end_frame; if (i == 0) { - // First frame: skip padding at beginning output_start_frame = 0; output_end_frame = effective_frames - 1; } else if (i == dec_slice_num - 1) { - // Last frame: calculate from current segment start output_start_frame = i * effective_frames; - // Last frame extends to encoder's maximum output length - output_end_frame = static_cast(zp_shape[2]) - 1; + output_end_frame = static_cast(zp_shape[2]) - 1; } else { - // Middle frames: standard calculation output_start_frame = i * effective_frames; output_end_frame = (i + 1) * effective_frames - 1; } - // Prepare decoder input, initialize all to zero + std::vector zp(zp_size, 0); - // Copy data to decoder input for (int n = 0; n < zp_shape[1]; n++) { int copy_size = std::min(actual_len, static_cast(zp_shape[2] - input_start)); if (copy_size > 0) { @@ -345,54 +328,37 @@ class llm_task { } } - // Run decoder std::vector decoder_output(audio_slice_len); decoder_->SetInput(zp.data(), 0); decoder_->SetInput(g_matrix.data(), 1); if (0 != decoder_->Run()) { - SLOGI("Inference #%d: decoding failed", i + 1); throw std::string("decoder_ RunSync error"); } decoder_->GetOutput(decoder_output.data(), 0); - // === SOLA Processing Logic === if (first_frame) { - // Special handling for first frame - should not skip initial content - // First frame starts directly from decoder output without skipping - int audio_start = 0; // Start from beginning, don't skip pad_frames - - // Calculate data length for first frame - // First frame should preserve complete decoder output, only reserving sola_buffer_frame at the end - // for next frame alignment - int audio_len = decoder_output.size() - sola_buffer_frame; - - // Boundary check - audio_len = std::max(0, audio_len); // Ensure non-negative + int audio_start = 0; + int audio_len = decoder_output.size() - sola_buffer_frame; + audio_len = std::max(0, audio_len); - // Add first frame data if (audio_len > 0) { pcmlist.insert(pcmlist.end(), decoder_output.begin() + audio_start, decoder_output.begin() + audio_start + audio_len); } - // Save sola_buffer_frame length from the end to SOLA buffer for next frame alignment int buffer_start = audio_len; - // Ensure sufficient data is available for copying if (buffer_start + sola_buffer_frame <= decoder_output.size()) { std::copy(decoder_output.begin() + buffer_start, decoder_output.begin() + buffer_start + sola_buffer_frame, sola_buffer.begin()); } else { - // Possible case: first frame data is shorter than sola_buffer_frame int available = static_cast(decoder_output.size() - buffer_start); if (available > 0) { std::copy(decoder_output.begin() + buffer_start, decoder_output.end(), sola_buffer.begin()); - // Fill with zeros std::fill(sola_buffer.begin() + available, sola_buffer.end(), 0.0f); } else { - // Completely insufficient data, fill all with zeros std::fill(sola_buffer.begin(), sola_buffer.end(), 0.0f); } } @@ -400,15 +366,12 @@ class llm_task { first_frame = false; } else { - // Non-first frame: SOLA alignment required int audio_start = pad_frames * samples_per_frame; - // 1. Prepare search window - beginning portion of current frame std::vector search_window(sola_buffer_frame + sola_search_frame); std::copy(decoder_output.begin() + audio_start, decoder_output.begin() + audio_start + search_window.size(), search_window.begin()); - // 2. Find best alignment point (calculate cross-correlation) int best_offset = 0; float best_correlation = -1.0; @@ -421,7 +384,6 @@ class llm_task { energy += search_window[j + offset] * search_window[j + offset]; } - // Normalize correlation (avoid division by zero) float normalized_correlation = (energy > 1e-8) ? correlation / std::sqrt(energy) : 0.0f; if (normalized_correlation > best_correlation) { @@ -430,30 +392,25 @@ class llm_task { } } - // 3. Apply alignment offset + int aligned_start = audio_start + best_offset; - // 4. Smooth transition processing (crossfade in alignment region) std::vector crossfade_region(sola_buffer_frame); for (int j = 0; j < sola_buffer_frame; j++) { - // Apply fade-in/fade-out window functions crossfade_region[j] = decoder_output[aligned_start + j] * fade_in_window[j] + sola_buffer[j] * fade_out_window[j]; } - // 5. Add crossfade region to output pcmlist.insert(pcmlist.end(), crossfade_region.begin(), crossfade_region.end()); int remaining_start = aligned_start + sola_buffer_frame; if (i == dec_slice_num - 1) { int total_expected_samples = audio_len * samples_per_frame / 512; - - int processed_samples = static_cast(pcmlist.size()); - - int remaining_needed = total_expected_samples - processed_samples; - remaining_needed = std::max(0, remaining_needed); + int processed_samples = static_cast(pcmlist.size()); + int remaining_needed = total_expected_samples - processed_samples; + remaining_needed = std::max(0, remaining_needed); int remaining_len = std::min(remaining_needed, static_cast(decoder_output.size() - remaining_start)); @@ -465,7 +422,6 @@ class llm_task { } else { int remaining_len = (dec_len - 2 * pad_frames) * samples_per_frame - sola_buffer_frame; - remaining_len = std::min(remaining_len, static_cast(decoder_output.size() - remaining_start)); @@ -495,7 +451,7 @@ class llm_task { pcmlist.resize(audio_len); } - // Post-processing: resample and convert to int16 + double src_ratio = static_cast(mode_config_.audio_rate) / static_cast(mode_config_.mode_rate); std::vector tmp_pcm((pcmlist.size() * src_ratio + 1)); @@ -503,12 +459,11 @@ class llm_task { resample_audio(pcmlist.data(), pcmlist.size(), tmp_pcm.data(), &len, src_ratio); - // Convert to 16-bit PCM + wav_pcm_data.reserve(len); std::transform(tmp_pcm.begin(), tmp_pcm.begin() + len, std::back_inserter(wav_pcm_data), [](const auto val) { return static_cast(val * INT16_MAX); }); - // Call the output callback function with the result if (out_callback_) { out_callback_( std::string(reinterpret_cast(wav_pcm_data.data()), wav_pcm_data.size() * sizeof(int16_t)), @@ -516,10 +471,8 @@ class llm_task { } } catch (const std::exception &e) { - SLOGI("TTS processing exception: %s", e.what()); return true; } catch (...) { - SLOGI("TTS processing encountered an unknown exception"); return true; } return false; @@ -932,4 +885,4 @@ int main(int argc, char *argv[]) } llm.llm_firework_exit(); return 0; -} \ No newline at end of file +}