From 3f49e519cbea7909279923006bf2357b83a0443a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mateusz=20S=C5=82uszniak?= <mateusz.sluszniak@swmansion.com>
Date: Thu, 28 May 2026 14:21:45 +0200
Subject: [PATCH] fix(tts): add streamFlush() API and drain Kokoro buffer on
 streamStop
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The streaming loop used a timer-based force-flush after
`kStreamMaxSkippedIterations` idle iterations. When it fired on a buffer
holding content without an end-of-sentence character, it extracted zero
characters and merely reset the skip counter, so the tail was never
consumed and `streamStop(false)` hung forever (#1153). The same
threshold also fired mid-token for LLM-style streaming, partitioning
sentences before they finished.

Replace the threshold with caller-driven flushing:

- New `streamFlush()` API. Caller signals "drain what's currently
  buffered, EOS or not" — typically after the last `streamInsert` of an
  utterance, when the trailing un-terminated content should play out
  while the stream stays open.
- `streamStop(false)` now drains automatically (equivalent to
  `streamFlush()` plus auto-stop on empty buffer), so it's guaranteed to
  return even if the trailing tail has no EOS.
- LLM-style callers feeding partial tokens typically don't need to call
  `streamFlush()` — model punctuation drives natural EOS partitioning,
  and the residual tail is drained by `streamStop(false)` when
  generation completes.

The skipped-iteration force-flush and its tuning constant
`kStreamMaxSkippedIterations` are removed.

Fixes #1153.
---
 apps/speech/screens/TextToSpeechLLMScreen.tsx |  3 +++
 .../useTextToSpeech.md                        |  2 +-
 .../TextToSpeechModule.md                     |  2 +-
 .../host_objects/ModelHostObject.h            |  3 +++
 .../models/text_to_speech/kokoro/Kokoro.cpp   | 27 +++++++++++++------
 .../models/text_to_speech/kokoro/Kokoro.h     | 21 +++++++++++++--
 .../models/text_to_speech/kokoro/Params.h     |  6 -----
 .../useTextToSpeech.ts                        |  7 +++++
 .../TextToSpeechModule.ts                     | 17 ++++++++++--
 .../react-native-executorch/src/types/tts.ts  | 26 ++++++++++++++++--
 10 files changed, 92 insertions(+), 22 deletions(-)

diff --git a/apps/speech/screens/TextToSpeechLLMScreen.tsx b/apps/speech/screens/TextToSpeechLLMScreen.tsx
index b3a3dad913..b90bcad577 100644
--- a/apps/speech/screens/TextToSpeechLLMScreen.tsx
+++ b/apps/speech/screens/TextToSpeechLLMScreen.tsx
@@ -140,6 +140,9 @@ export const TextToSpeechLLMScreen = ({ onBack }: TextToSpeechLLMProps) => {
     } catch (e) {
       console.error('Generation failed:', e);
     } finally {
+      // LLM finished — partition any trailing un-terminated tail so it gets
+      // synthesized before the stream closes.
+      tts.streamFlush();
       tts.streamStop(false);
       await ttsPromise;
 
diff --git a/docs/docs/03-hooks/01-natural-language-processing/useTextToSpeech.md b/docs/docs/03-hooks/01-natural-language-processing/useTextToSpeech.md
index fe0dd3b2ee..9598ab75a6 100644
--- a/docs/docs/03-hooks/01-natural-language-processing/useTextToSpeech.md
+++ b/docs/docs/03-hooks/01-natural-language-processing/useTextToSpeech.md
@@ -85,7 +85,7 @@ The module provides two ways to generate speech using either raw text or pre-gen
 
 1.  [**`forward({ text, speed, phonemize })`**](../../06-api-reference/interfaces/TextToSpeechType.md#forward): Generates the complete audio waveform at once. Returns a promise resolving to a `Float32Array`.
 2.  [**`stream({ speed, phonemize, stopAutomatically, onNext, ... })`**](../../06-api-reference/interfaces/TextToSpeechType.md#stream): An async generator-like functionality (managed via callbacks like `onNext`) that yields chunks of audio as they are computed.
-    This is ideal for reducing the "time to first audio" for long sentences. You can also dynamically insert text during the generation process using `streamInsert(text)` and stop it with `streamStop(instant)`.
+    This is ideal for reducing the "time to first audio" for long sentences. You can also dynamically insert text during the generation process using `streamInsert(text)`, force-partition trailing content without an end-of-sentence character via `streamFlush()`, and stop the stream with `streamStop(instant)`.
 
 :::tip Recommendation
 In most cases, the **`stream()`** method is recommended over `forward()`. It significantly reduces latency by allowing audio playback to begin as soon as the first chunk is synthesized, rather than waiting for the entire text to be processed.
diff --git a/docs/docs/04-typescript-api/01-natural-language-processing/TextToSpeechModule.md b/docs/docs/04-typescript-api/01-natural-language-processing/TextToSpeechModule.md
index daf5cb735b..f18abe1029 100644
--- a/docs/docs/04-typescript-api/01-natural-language-processing/TextToSpeechModule.md
+++ b/docs/docs/04-typescript-api/01-natural-language-processing/TextToSpeechModule.md
@@ -49,7 +49,7 @@ The module provides a way to generate speech using either raw text or pre-genera
 
 1.  [**`forward(text, speed, phonemize)`**](../../06-api-reference/classes/TextToSpeechModule.md#forward): Generates the complete audio waveform at once. Returns a promise resolving to a `Float32Array`.
     - `phonemize` defaults to `true`. When set to `false`, the input is expected to be a string of IPA phonemes.
-2.  [**`stream({ speed, phonemize, stopAutomatically, ... })`**](../../06-api-reference/classes/TextToSpeechModule.md#stream): An async generator that yields chunks of audio as they are computed. This is ideal for reducing the "time to first audio" for long sentences. In contrast to `forward`, it enables inserting text chunks dynamically into processing buffer with [**`streamInsert(text)`**](../../06-api-reference/classes/TextToSpeechModule.md#streaminsert) and allows stopping generation early with [**`streamStop(instant)`**](../../06-api-reference/classes/TextToSpeechModule.md#streamstop).
+2.  [**`stream({ speed, phonemize, stopAutomatically, ... })`**](../../06-api-reference/classes/TextToSpeechModule.md#stream): An async generator that yields chunks of audio as they are computed. This is ideal for reducing the "time to first audio" for long sentences. In contrast to `forward`, it enables inserting text chunks dynamically into the processing buffer with [**`streamInsert(text)`**](../../06-api-reference/classes/TextToSpeechModule.md#streaminsert), force-partitioning trailing un-terminated content via [**`streamFlush()`**](../../06-api-reference/classes/TextToSpeechModule.md#streamflush), and stopping generation early with [**`streamStop(instant)`**](../../06-api-reference/classes/TextToSpeechModule.md#streamstop).
 
 ### Using Phonemes
 
diff --git a/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h b/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h
index cb8313598f..cb631e5fba 100644
--- a/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h
+++ b/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h
@@ -231,6 +231,9 @@ template <typename Model> class ModelHostObject : public JsiHostObject {
       addFunctions(JSI_EXPORT_FUNCTION(
           ModelHostObject<Model>, synchronousHostFunction<&Model::streamInsert>,
           "streamInsert"));
+      addFunctions(JSI_EXPORT_FUNCTION(
+          ModelHostObject<Model>, synchronousHostFunction<&Model::streamFlush>,
+          "streamFlush"));
     }
 
     if constexpr (meta::HasGenerateFromString<Model>) {
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Kokoro.cpp b/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Kokoro.cpp
index ae11767f69..aa8486c276 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Kokoro.cpp
+++ b/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Kokoro.cpp
@@ -174,6 +174,7 @@ void Kokoro::stream(std::shared_ptr<jsi::Function> callback, float speed,
 
   isStreaming_ = true;
   stopOnEmptyBuffer_ = stopOnEmptyBuffer;
+  flushPending_ = false;
 
   // The outer streaming loop is responsible for handling the input buffer.
   // The extracted text is then passed to the inner loop, which performs a
@@ -205,15 +206,21 @@ void Kokoro::stream(std::shared_ptr<jsi::Function> callback, float speed,
                              ? std::distance(eosIt, inputTextBuffer_.rend())
                              : 0;
 
-      // To maximize the quality of the speech, we try to avoid processing
-      // chunks which end in the middle of a sentence.
-      if (chunkSize > 0 ||
-          streamSkippedIterations >= params::kStreamMaxSkippedIterations) {
+      // Default behavior: hold back partial content until an EOS arrives, so
+      // we don't synthesize mid-sentence (relevant for LLM token streaming).
+      // When the caller signals via `streamFlush()` / `streamStop(false)`
+      // that they want the tail drained, take the entire searchable window
+      // instead. The flush flag stays set until the buffer empties, so a
+      // multi-chunk drain progresses across iterations.
+      if (chunkSize == 0 && flushPending_.load()) {
+        chunkSize = searchLimit;
+      }
+      if (chunkSize > 0) {
         input = inputTextBuffer_.substr(0, chunkSize);
         inputTextBuffer_.erase(0, chunkSize);
-        streamSkippedIterations = 0;
-      } else {
-        streamSkippedIterations++;
+        if (inputTextBuffer_.empty()) {
+          flushPending_ = false;
+        }
       }
     }
 
@@ -317,7 +324,7 @@ void Kokoro::stream(std::shared_ptr<jsi::Function> callback, float speed,
     std::scoped_lock<std::mutex> lock(inputTextBufferMutex_);
     inputTextBuffer_.clear();
     isStreaming_ = false;
-    streamSkippedIterations = 0;
+    flushPending_ = false;
   }
 }
 
@@ -410,10 +417,14 @@ void Kokoro::streamInsert(std::u32string chunk) noexcept {
   inputTextBuffer_.append(chunk);
 }
 
+void Kokoro::streamFlush() noexcept { flushPending_ = true; }
+
 void Kokoro::streamStop(bool instant) noexcept {
   if (instant) {
     isStreaming_ = false;
   } else {
+    // Ensure trailing un-terminated content is drained before the loop exits.
+    flushPending_ = true;
     stopOnEmptyBuffer_ = true;
   }
 }
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Kokoro.h b/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Kokoro.h
index adf736bd28..ef9fa432b6 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Kokoro.h
+++ b/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Kokoro.h
@@ -60,11 +60,24 @@ class Kokoro {
    */
   void streamInsert(std::u32string chunk) noexcept;
 
+  /**
+   * Requests the streaming loop to partition and synthesize whatever is
+   * currently buffered, even if no end-of-sentence character is present.
+   *
+   * Use after the last `streamInsert` of an utterance when you want trailing
+   * un-terminated content to play out without stopping the stream. LLM-style
+   * callers feeding partial tokens typically should not call this — the
+   * model's punctuation drives natural EOS partitioning.
+   */
+  void streamFlush() noexcept;
+
   /**
    * Signals the streaming process to stop.
    *
    * @param instant If true, stops immediately, discarding remaining buffered
-   * text. If false, finishes processing the current buffer before stopping.
+   * text. If false, drains the current buffer (force-flushing any trailing
+   * un-terminated content) before stopping — equivalent to `streamFlush()`
+   * followed by an automatic stop once the buffer empties.
    */
   void streamStop(bool instant) noexcept;
 
@@ -100,7 +113,11 @@ class Kokoro {
   // --- Streaming control State ---
   std::atomic<bool> isStreaming_{false};
   std::atomic<bool> stopOnEmptyBuffer_{true};
-  int32_t streamSkippedIterations = 0;
+  // Set by `streamFlush()` or `streamStop(false)`. While true, the stream
+  // loop force-extracts the entire searchable window even when no EOS is
+  // present. Cleared once the buffer drains so subsequent inserts go back to
+  // EOS-aligned chunking.
+  std::atomic<bool> flushPending_{false};
 };
 } // namespace models::text_to_speech::kokoro
 
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Params.h b/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Params.h
index 5f4e7cfe2b..5f61287f02 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Params.h
+++ b/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Params.h
@@ -20,12 +20,6 @@ namespace rnexecutorch::models::text_to_speech::kokoro::params {
  */
 inline constexpr size_t kMaxTextSize = 2048;
 
-/**
- * A number of skipped streaming iterations after which we process the remaining
- * input no matter how it looks like.
- */
-inline constexpr int32_t kStreamMaxSkippedIterations = 3;
-
 /**
  * A size of pause (in miliseconds) applied after each streaming iteration.
  */
diff --git a/packages/react-native-executorch/src/hooks/natural_language_processing/useTextToSpeech.ts b/packages/react-native-executorch/src/hooks/natural_language_processing/useTextToSpeech.ts
index 53ca98b0b1..21576eb0f3 100644
--- a/packages/react-native-executorch/src/hooks/natural_language_processing/useTextToSpeech.ts
+++ b/packages/react-native-executorch/src/hooks/natural_language_processing/useTextToSpeech.ts
@@ -141,6 +141,12 @@ export const useTextToSpeech = (
     [moduleInstance]
   );
 
+  const streamFlush = useCallback(() => {
+    if (moduleInstance) {
+      moduleInstance.streamFlush();
+    }
+  }, [moduleInstance]);
+
   const streamStop = useCallback(
     (instant: boolean = true) => {
       if (moduleInstance) {
@@ -157,6 +163,7 @@ export const useTextToSpeech = (
     forward,
     stream,
     streamInsert,
+    streamFlush,
     streamStop,
     downloadProgress,
   };
diff --git a/packages/react-native-executorch/src/modules/natural_language_processing/TextToSpeechModule.ts b/packages/react-native-executorch/src/modules/natural_language_processing/TextToSpeechModule.ts
index 6b6695f1f7..df1aaa01c8 100644
--- a/packages/react-native-executorch/src/modules/natural_language_processing/TextToSpeechModule.ts
+++ b/packages/react-native-executorch/src/modules/natural_language_processing/TextToSpeechModule.ts
@@ -184,17 +184,30 @@ export class TextToSpeechModule {
   }
 
   /**
-   * Inserts new content (text or IPA phonemes) into the buffer to be processed in streaming mode.
+   * Inserts new content (text or IPA phonemes) into the buffer to be processed
+   * in streaming mode. Trailing un-terminated content sits in the buffer
+   * until {@link TextToSpeechModule.streamFlush} or `streamStop(false)`
+   * releases it.
    * @param input - The text or phoneme fragment to append to the streaming buffer.
    */
   public streamInsert(input: string): void {
     this.nativeModule.streamInsert(input);
   }
 
+  /**
+   * Force-partitions whatever is currently buffered, even without an
+   * end-of-sentence character. Call after the final `streamInsert` of an
+   * utterance to play out the trailing tail without ending the stream.
+   */
+  public streamFlush(): void {
+    this.nativeModule.streamFlush();
+  }
+
   /**
    * Stops the streaming process if there is any ongoing.
    * @param instant - If true, stops the streaming as soon as possible. Otherwise
-   *                  allows the module to complete processing for the remains of the buffer.
+   *                  drains the current buffer (force-flushing any trailing
+   *                  un-terminated content) before stopping.
    */
   public streamStop(instant: boolean = true): void {
     this.nativeModule.streamStop(instant);
diff --git a/packages/react-native-executorch/src/types/tts.ts b/packages/react-native-executorch/src/types/tts.ts
index a2dbd1905f..811d7d27c2 100644
--- a/packages/react-native-executorch/src/types/tts.ts
+++ b/packages/react-native-executorch/src/types/tts.ts
@@ -135,14 +135,36 @@ export interface TextToSpeechType {
   stream: (input: TextToSpeechStreamingInput) => Promise<void>;
 
   /**
-   * Inserts new text chunk into the buffer to be processed in streaming mode.
+   * Inserts a new text chunk into the buffer to be processed in streaming mode.
+   *
+   * Chunks accumulate until an end-of-sentence character (`.?!;…`) appears in
+   * the buffer, at which point they're partitioned and synthesized. If the
+   * caller stops feeding before an EOS arrives, the trailing tail will sit in
+   * the buffer until `streamFlush()` or `streamStop(false)` is called.
+   * @param textChunk - Text (or IPA phonemes) to append to the streaming buffer.
    */
   streamInsert: (textChunk: string) => void;
 
+  /**
+   * Requests the streaming session to partition and synthesize whatever is
+   * currently buffered, even if no end-of-sentence character is present.
+   *
+   * Call after the final `streamInsert` of an utterance when you want
+   * trailing un-terminated content to play out without ending the stream.
+   * LLM-style callers feeding partial tokens typically should not call this —
+   * model punctuation drives natural EOS partitioning, and the residual tail
+   * is drained by `streamStop(false)` when generation completes.
+   */
+  streamFlush: () => void;
+
   /**
    * Interrupts and stops the currently active audio generation stream.
    * @param instant If true, stops the streaming as soon as possible. Otherwise
-   *                allows the module to complete processing for the remains of the buffer.
+   *                drains the current buffer (force-flushing any trailing
+   *                un-terminated content) before stopping — equivalent to
+   *                calling {@link TextToSpeechType.streamFlush} followed by an
+   *                automatic stop once the buffer empties, so this call
+   *                always returns.
    */
   streamStop: (instant?: boolean) => void;
 }