From 3f49e519cbea7909279923006bf2357b83a0443a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mateusz=20S=C5=82uszniak?= Date: Thu, 28 May 2026 14:21:45 +0200 Subject: [PATCH] fix(tts): add streamFlush() API and drain Kokoro buffer on streamStop MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The streaming loop used a timer-based force-flush after `kStreamMaxSkippedIterations` idle iterations. It tried to extract zero characters whenever the buffer held content without an end-of-sentence character — the counter reset on every iteration and `streamStop(false)` hung forever (#1153). The same threshold also fired mid-token for LLM-style streaming, partitioning sentences before they finished. Replace the threshold with caller-driven flushing: - New `streamFlush()` API. Caller signals "drain what's currently buffered, EOS or not" — typically after the last `streamInsert` of an utterance, so that trailing un-terminated content plays out without ending the stream. - `streamStop(false)` now drains automatically (equivalent to `streamFlush()` plus auto-stop on empty buffer), so it's guaranteed to return even if the trailing tail has no EOS. - LLM-style callers feeding partial tokens typically should not call `streamFlush()` — model punctuation drives natural EOS partitioning, and the residual tail is drained by `streamStop(false)` when generation completes. The skipped-iteration force-flush and its tuning constant `kStreamMaxSkippedIterations` are removed. Fixes #1153. 
--- apps/speech/screens/TextToSpeechLLMScreen.tsx | 3 +++ .../useTextToSpeech.md | 2 +- .../TextToSpeechModule.md | 2 +- .../host_objects/ModelHostObject.h | 3 +++ .../models/text_to_speech/kokoro/Kokoro.cpp | 27 +++++++++++++------ .../models/text_to_speech/kokoro/Kokoro.h | 21 +++++++++++++-- .../models/text_to_speech/kokoro/Params.h | 6 ----- .../useTextToSpeech.ts | 7 +++++ .../TextToSpeechModule.ts | 17 ++++++++++-- .../react-native-executorch/src/types/tts.ts | 26 ++++++++++++++++-- 10 files changed, 92 insertions(+), 22 deletions(-) diff --git a/apps/speech/screens/TextToSpeechLLMScreen.tsx b/apps/speech/screens/TextToSpeechLLMScreen.tsx index b3a3dad913..b90bcad577 100644 --- a/apps/speech/screens/TextToSpeechLLMScreen.tsx +++ b/apps/speech/screens/TextToSpeechLLMScreen.tsx @@ -140,6 +140,9 @@ export const TextToSpeechLLMScreen = ({ onBack }: TextToSpeechLLMProps) => { } catch (e) { console.error('Generation failed:', e); } finally { + // LLM finished — partition any trailing un-terminated tail so it gets + // synthesized before the stream closes. + tts.streamFlush(); tts.streamStop(false); await ttsPromise; diff --git a/docs/docs/03-hooks/01-natural-language-processing/useTextToSpeech.md b/docs/docs/03-hooks/01-natural-language-processing/useTextToSpeech.md index fe0dd3b2ee..9598ab75a6 100644 --- a/docs/docs/03-hooks/01-natural-language-processing/useTextToSpeech.md +++ b/docs/docs/03-hooks/01-natural-language-processing/useTextToSpeech.md @@ -85,7 +85,7 @@ The module provides two ways to generate speech using either raw text or pre-gen 1. [**`forward({ text, speed, phonemize })`**](../../06-api-reference/interfaces/TextToSpeechType.md#forward): Generates the complete audio waveform at once. Returns a promise resolving to a `Float32Array`. 2. [**`stream({ speed, phonemize, stopAutomatically, onNext, ... 
})`**](../../06-api-reference/interfaces/TextToSpeechType.md#stream): An async generator-like functionality (managed via callbacks like `onNext`) that yields chunks of audio as they are computed. - This is ideal for reducing the "time to first audio" for long sentences. You can also dynamically insert text during the generation process using `streamInsert(text)` and stop it with `streamStop(instant)`. + This is ideal for reducing the "time to first audio" for long sentences. You can also dynamically insert text during the generation process using `streamInsert(text)`, force-partition trailing content without an end-of-sentence character via `streamFlush()`, and stop the stream with `streamStop(instant)`. :::tip Recommendation In most cases, the **`stream()`** method is recommended over `forward()`. It significantly reduces latency by allowing audio playback to begin as soon as the first chunk is synthesized, rather than waiting for the entire text to be processed. diff --git a/docs/docs/04-typescript-api/01-natural-language-processing/TextToSpeechModule.md b/docs/docs/04-typescript-api/01-natural-language-processing/TextToSpeechModule.md index daf5cb735b..f18abe1029 100644 --- a/docs/docs/04-typescript-api/01-natural-language-processing/TextToSpeechModule.md +++ b/docs/docs/04-typescript-api/01-natural-language-processing/TextToSpeechModule.md @@ -49,7 +49,7 @@ The module provides a way to generate speech using either raw text or pre-genera 1. [**`forward(text, speed, phonemize)`**](../../06-api-reference/classes/TextToSpeechModule.md#forward): Generates the complete audio waveform at once. Returns a promise resolving to a `Float32Array`. - `phonemize` defaults to `true`. When set to `false`, the input is expected to be a string of IPA phonemes. -2. [**`stream({ speed, phonemize, stopAutomatically, ... })`**](../../06-api-reference/classes/TextToSpeechModule.md#stream): An async generator that yields chunks of audio as they are computed. 
This is ideal for reducing the "time to first audio" for long sentences. In contrast to `forward`, it enables inserting text chunks dynamically into processing buffer with [**`streamInsert(text)`**](../../06-api-reference/classes/TextToSpeechModule.md#streaminsert) and allows stopping generation early with [**`streamStop(instant)`**](../../06-api-reference/classes/TextToSpeechModule.md#streamstop). +2. [**`stream({ speed, phonemize, stopAutomatically, ... })`**](../../06-api-reference/classes/TextToSpeechModule.md#stream): An async generator that yields chunks of audio as they are computed. This is ideal for reducing the "time to first audio" for long sentences. In contrast to `forward`, it enables inserting text chunks dynamically into the processing buffer with [**`streamInsert(text)`**](../../06-api-reference/classes/TextToSpeechModule.md#streaminsert), force-partitioning trailing un-terminated content via [**`streamFlush()`**](../../06-api-reference/classes/TextToSpeechModule.md#streamflush), and stopping generation early with [**`streamStop(instant)`**](../../06-api-reference/classes/TextToSpeechModule.md#streamstop). 
### Using Phonemes diff --git a/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h b/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h index cb8313598f..cb631e5fba 100644 --- a/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h +++ b/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h @@ -231,6 +231,9 @@ template class ModelHostObject : public JsiHostObject { addFunctions(JSI_EXPORT_FUNCTION( ModelHostObject, synchronousHostFunction<&Model::streamInsert>, "streamInsert")); + addFunctions(JSI_EXPORT_FUNCTION( + ModelHostObject, synchronousHostFunction<&Model::streamFlush>, + "streamFlush")); } if constexpr (meta::HasGenerateFromString) { diff --git a/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Kokoro.cpp b/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Kokoro.cpp index ae11767f69..aa8486c276 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Kokoro.cpp +++ b/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Kokoro.cpp @@ -174,6 +174,7 @@ void Kokoro::stream(std::shared_ptr callback, float speed, isStreaming_ = true; stopOnEmptyBuffer_ = stopOnEmptyBuffer; + flushPending_ = false; // The outer streaming loop is responsible for handling the input buffer. // The extracted text is then passed to the inner loop, which performs a @@ -205,15 +206,21 @@ void Kokoro::stream(std::shared_ptr callback, float speed, ? std::distance(eosIt, inputTextBuffer_.rend()) : 0; - // To maximize the quality of the speech, we try to avoid processing - // chunks which end in the middle of a sentence. 
- if (chunkSize > 0 || - streamSkippedIterations >= params::kStreamMaxSkippedIterations) { + // Default behavior: hold back partial content until an EOS arrives, so + // we don't synthesize mid-sentence (relevant for LLM token streaming). + // When the caller signals via `streamFlush()` / `streamStop(false)` + // that they want the tail drained, take the entire searchable window + // instead. The flush flag stays set until the buffer empties, so a + // multi-chunk drain progresses across iterations. + if (chunkSize == 0 && flushPending_.load()) { + chunkSize = searchLimit; + } + if (chunkSize > 0) { input = inputTextBuffer_.substr(0, chunkSize); inputTextBuffer_.erase(0, chunkSize); - streamSkippedIterations = 0; - } else { - streamSkippedIterations++; + if (inputTextBuffer_.empty()) { + flushPending_ = false; + } } } @@ -317,7 +324,7 @@ void Kokoro::stream(std::shared_ptr callback, float speed, std::scoped_lock lock(inputTextBufferMutex_); inputTextBuffer_.clear(); isStreaming_ = false; - streamSkippedIterations = 0; + flushPending_ = false; } } @@ -410,10 +417,14 @@ void Kokoro::streamInsert(std::u32string chunk) noexcept { inputTextBuffer_.append(chunk); } +void Kokoro::streamFlush() noexcept { flushPending_ = true; } + void Kokoro::streamStop(bool instant) noexcept { if (instant) { isStreaming_ = false; } else { + // Ensure trailing un-terminated content is drained before the loop exits. 
+ flushPending_ = true; stopOnEmptyBuffer_ = true; } } diff --git a/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Kokoro.h b/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Kokoro.h index adf736bd28..ef9fa432b6 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Kokoro.h +++ b/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Kokoro.h @@ -60,11 +60,24 @@ class Kokoro { */ void streamInsert(std::u32string chunk) noexcept; + /** + * Requests the streaming loop to partition and synthesize whatever is + * currently buffered, even if no end-of-sentence character is present. + * + * Use after the last `streamInsert` of an utterance when you want trailing + * un-terminated content to play out without stopping the stream. LLM-style + * callers feeding partial tokens typically should not call this — the + * model's punctuation drives natural EOS partitioning. + */ + void streamFlush() noexcept; + /** * Signals the streaming process to stop. * * @param instant If true, stops immediately, discarding remaining buffered - * text. If false, finishes processing the current buffer before stopping. + * text. If false, drains the current buffer (force-flushing any trailing + * un-terminated content) before stopping — equivalent to `streamFlush()` + * followed by an automatic stop once the buffer empties. */ void streamStop(bool instant) noexcept; @@ -100,7 +113,11 @@ class Kokoro { // --- Streaming control State --- std::atomic isStreaming_{false}; std::atomic stopOnEmptyBuffer_{true}; - int32_t streamSkippedIterations = 0; + // Set by `streamFlush()` or `streamStop(false)`. While true, the stream + // loop force-extracts the entire searchable window even when no EOS is + // present. Cleared once the buffer drains so subsequent inserts go back to + // EOS-aligned chunking. 
+ std::atomic flushPending_{false}; }; } // namespace models::text_to_speech::kokoro diff --git a/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Params.h b/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Params.h index 5f4e7cfe2b..5f61287f02 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Params.h +++ b/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Params.h @@ -20,12 +20,6 @@ namespace rnexecutorch::models::text_to_speech::kokoro::params { */ inline constexpr size_t kMaxTextSize = 2048; -/** - * A number of skipped streaming iterations after which we process the remaining - * input no matter how it looks like. - */ -inline constexpr int32_t kStreamMaxSkippedIterations = 3; - /** * A size of pause (in miliseconds) applied after each streaming iteration. */ diff --git a/packages/react-native-executorch/src/hooks/natural_language_processing/useTextToSpeech.ts b/packages/react-native-executorch/src/hooks/natural_language_processing/useTextToSpeech.ts index 53ca98b0b1..21576eb0f3 100644 --- a/packages/react-native-executorch/src/hooks/natural_language_processing/useTextToSpeech.ts +++ b/packages/react-native-executorch/src/hooks/natural_language_processing/useTextToSpeech.ts @@ -141,6 +141,12 @@ export const useTextToSpeech = ( [moduleInstance] ); + const streamFlush = useCallback(() => { + if (moduleInstance) { + moduleInstance.streamFlush(); + } + }, [moduleInstance]); + const streamStop = useCallback( (instant: boolean = true) => { if (moduleInstance) { @@ -157,6 +163,7 @@ export const useTextToSpeech = ( forward, stream, streamInsert, + streamFlush, streamStop, downloadProgress, }; diff --git a/packages/react-native-executorch/src/modules/natural_language_processing/TextToSpeechModule.ts b/packages/react-native-executorch/src/modules/natural_language_processing/TextToSpeechModule.ts index 6b6695f1f7..df1aaa01c8 
100644 --- a/packages/react-native-executorch/src/modules/natural_language_processing/TextToSpeechModule.ts +++ b/packages/react-native-executorch/src/modules/natural_language_processing/TextToSpeechModule.ts @@ -184,17 +184,30 @@ export class TextToSpeechModule { } /** - * Inserts new content (text or IPA phonemes) into the buffer to be processed in streaming mode. + * Inserts new content (text or IPA phonemes) into the buffer to be processed + * in streaming mode. Trailing un-terminated content sits in the buffer + * until {@link TextToSpeechModule.streamFlush} or `streamStop(false)` + * releases it. * @param input - The text or phoneme fragment to append to the streaming buffer. */ public streamInsert(input: string): void { this.nativeModule.streamInsert(input); } + /** + * Force-partitions whatever is currently buffered, even without an + * end-of-sentence character. Call after the final `streamInsert` of an + * utterance to play out the trailing tail without ending the stream. + */ + public streamFlush(): void { + this.nativeModule.streamFlush(); + } + /** * Stops the streaming process if there is any ongoing. * @param instant - If true, stops the streaming as soon as possible. Otherwise - * allows the module to complete processing for the remains of the buffer. + * drains the current buffer (force-flushing any trailing + * un-terminated content) before stopping. */ public streamStop(instant: boolean = true): void { this.nativeModule.streamStop(instant); diff --git a/packages/react-native-executorch/src/types/tts.ts b/packages/react-native-executorch/src/types/tts.ts index a2dbd1905f..811d7d27c2 100644 --- a/packages/react-native-executorch/src/types/tts.ts +++ b/packages/react-native-executorch/src/types/tts.ts @@ -135,14 +135,36 @@ export interface TextToSpeechType { stream: (input: TextToSpeechStreamingInput) => Promise; /** - * Inserts new text chunk into the buffer to be processed in streaming mode. 
+ * Inserts a new text chunk into the buffer to be processed in streaming mode. + * + * Chunks accumulate until an end-of-sentence character (`.?!;…`) appears in + * the buffer, at which point they're partitioned and synthesized. If the + * caller stops feeding before an EOS arrives, the trailing tail will sit in + * the buffer until `streamFlush()` or `streamStop(false)` is called. + * @param textChunk - Text (or IPA phonemes) to append to the streaming buffer. */ streamInsert: (textChunk: string) => void; + /** + * Requests the streaming session to partition and synthesize whatever is + * currently buffered, even if no end-of-sentence character is present. + * + * Call after the final `streamInsert` of an utterance when you want + * trailing un-terminated content to play out without ending the stream. + * LLM-style callers feeding partial tokens typically should not call this — + * model punctuation drives natural EOS partitioning, and the residual tail + * is drained by `streamStop(false)` when generation completes. + */ + streamFlush: () => void; + /** * Interrupts and stops the currently active audio generation stream. * @param instant If true, stops the streaming as soon as possible. Otherwise - * allows the module to complete processing for the remains of the buffer. + * drains the current buffer (force-flushing any trailing + * un-terminated content) before stopping — equivalent to + * calling {@link TextToSpeechType.streamFlush} followed by an + * automatic stop once the buffer empties, so the stream + * ends even if the tail has no EOS. */ streamStop: (instant?: boolean) => void; }