Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 10 additions & 9 deletions livekit-agents/livekit/agents/voice/audio_recognition.py
Original file line number Diff line number Diff line change
Expand Up @@ -877,10 +877,6 @@ async def _on_stt_event(self, ev: stt.SpeechEvent) -> None:

if not self._vad or self._last_speaking_time is None:
# vad disabled, use stt timestamp
# TODO: this would screw up transcription latency metrics
# but we'll live with it for now.
# the correct way is to ensure STT fires SpeechEventType.END_OF_SPEECH
# and using that timestamp for _last_speaking_time
self._last_speaking_time = time.time()

if self._vad_base_turn_detection or self._user_turn_committed:
Expand Down Expand Up @@ -955,7 +951,9 @@ async def _on_stt_event(self, ev: stt.SpeechEvent) -> None:
)
self._audio_interim_transcript = ev.alternatives[0].text

elif ev.type == stt.SpeechEventType.END_OF_SPEECH and self._turn_detection_mode == "stt":
elif ev.type == stt.SpeechEventType.END_OF_SPEECH and (
self._turn_detection_mode == "stt" or self._vad is None
):
with trace.use_span(self._ensure_user_turn_span()):
self._hooks.on_end_of_speech(None)

Expand All @@ -972,14 +970,17 @@ async def _on_stt_event(self, ev: stt.SpeechEvent) -> None:
self.update_vad(self._vad)

self._speaking = False
self._user_turn_committed = True
if not self._vad or self._last_speaking_time is None:
self._last_speaking_time = time.time()

chat_ctx = self._hooks.retrieve_chat_ctx().copy()
self._run_eou_detection(chat_ctx)
if self._turn_detection_mode == "stt":
self._user_turn_committed = True
chat_ctx = self._hooks.retrieve_chat_ctx().copy()
self._run_eou_detection(chat_ctx)
Comment on lines 970 to +979
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🔴 The _stt_end_of_speech_received flag is never implemented, so the new tests fail and the feature does not work

The tests in TestSttSpeechEndTiming assert on _stt_end_of_speech_received (lines 237, 255, 272) and expect that _last_speaking_time set by END_OF_SPEECH is preserved when FINAL_TRANSCRIPT arrives. However, the production code never declares, initializes, or sets this flag. The git history shows commit 05075f06c added the full implementation (flag in __init__, setting it in END_OF_SPEECH/START_OF_SPEECH handlers, and guarding _last_speaking_time in FINAL_TRANSCRIPT), but the final commit b0164a98c reverted all of that while keeping the tests.

Consequences:

  1. Test test_stt_eos_timestamp_is_preserved_for_final_transcript_without_external_vad will fail: asserts _stt_end_of_speech_received is True but the flag stays False (as set by the test itself, never modified by production code); asserts _last_speaking_time == stt_eos_time but the FINAL_TRANSCRIPT handler at audio_recognition.py:878 unconditionally overwrites it when not self._vad.
  2. The intended behavior (preserving END_OF_SPEECH timestamp for latency metrics when no external VAD is configured) is not implemented — _last_speaking_time is always overwritten by transcript arrival time.

(Refers to lines 954-979)

Prompt for agents
The PR is missing the _stt_end_of_speech_received flag that the tests depend on. The intermediate commit 05075f06c had the complete implementation but the final commit b0164a98c reverted it. To fix:

1. In AudioRecognition.__init__ (around line 162), add: self._stt_end_of_speech_received = False
2. In clear_user_turn() (around line 679), add: self._stt_end_of_speech_received = False
3. In the END_OF_SPEECH handler (after line 955), add: self._stt_end_of_speech_received = True
4. In the START_OF_SPEECH handler (after line 982), add: self._stt_end_of_speech_received = False
5. In the FINAL_TRANSCRIPT handler (line 878), change the condition from:
   if not self._vad or self._last_speaking_time is None:
   to:
   if self._last_speaking_time is None or (not self._vad and not self._stt_end_of_speech_received):
6. Similarly in the PREFLIGHT_TRANSCRIPT handler (line 931), apply the same condition change.
7. In _bounce_eou_task cleanup (around line 1194), add: self._stt_end_of_speech_received = False

Refer to the intermediate commit 05075f06c for the complete implementation that matches the tests.
Open in Devin Review

Was this helpful? React with 👍 or 👎 to provide feedback.


elif ev.type == stt.SpeechEventType.START_OF_SPEECH and self._turn_detection_mode == "stt":
elif ev.type == stt.SpeechEventType.START_OF_SPEECH and (
self._turn_detection_mode == "stt" or self._vad is None
):
# If the plugin provided a server onset timestamp, use it;
# otherwise fall back to message arrival time.
if self._speech_start_time is None:
Expand Down
103 changes: 103 additions & 0 deletions tests/test_speech_start_time_persistence.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@

import pytest

from livekit.agents import stt
from livekit.agents.vad import VADEvent, VADEventType
from livekit.agents.voice.audio_recognition import AudioRecognition

Expand Down Expand Up @@ -169,3 +170,105 @@ async def test_speech_start_time_updates_per_burst(self):
# _speech_start_time should now reflect the second burst's start, not the first
assert audio_recognition._speech_start_time is not None
assert audio_recognition._speech_start_time > first_burst_speech_start


class TestSttSpeechEndTiming:
    """Check how ``_last_speaking_time`` is set from STT events, with and without an external VAD.

    Every test drives ``AudioRecognition._on_stt_event`` directly while
    ``time.time`` (as looked up from the audio_recognition module) is patched
    to return a fixed sequence of values, so the timestamps are deterministic.
    """

    # Patch target for the clock as seen from the module under test.
    _CLOCK = "livekit.agents.voice.audio_recognition.time.time"

    def _create_audio_recognition(self, *, vad: object | None = None) -> AudioRecognition:
        """Return an ``AudioRecognition`` whose state is filled in by hand.

        The instance is created through ``__new__`` and then given only the
        attributes listed below; hooks, session, STT and the span helper are
        ``MagicMock`` stand-ins.

        Args:
            vad: Value stored in ``_vad``. ``None`` (the default) models the
                "no external VAD" configuration.
        """
        with patch.object(AudioRecognition, "__init__", lambda self, *args, **kwargs: None):
            recognition = AudioRecognition.__new__(AudioRecognition)

        session = MagicMock()
        session.amd = None

        initial_state = {
            # turn detection / pipeline wiring
            "_turn_detection_mode": None,
            "_turn_detector": None,
            "_vad": vad,
            "_stt": MagicMock(),
            "_stt_request_ids": [],
            "_interruption_enabled": False,
            "_user_turn_committed": False,
            "_end_of_turn_task": None,
            # speech timing state
            "_speech_start_time": None,
            "_vad_speech_started": False,
            "_speaking": False,
            "_last_speaking_time": None,
            "_last_final_transcript_time": None,
            "_stt_end_of_speech_received": False,
            # transcript accumulation
            "_last_language": None,
            "_audio_transcript": "",
            "_audio_interim_transcript": "",
            "_audio_preflight_transcript": "",
            "_final_transcript_confidence": [],
            "_final_transcript_received": asyncio.Event(),
            "_vad_base_turn_detection": False,
            # collaborators
            "_hooks": MagicMock(),
            "_session": session,
            "_ensure_user_turn_span": MagicMock(),
        }
        for attr_name, attr_value in initial_state.items():
            setattr(recognition, attr_name, attr_value)

        return recognition

    @staticmethod
    def _stt_event(type_: stt.SpeechEventType, text: str = "hello") -> stt.SpeechEvent:
        """Build a ``SpeechEvent`` of ``type_`` with one English alternative containing ``text``."""
        alternative = stt.SpeechData(text=text, language="en")
        return stt.SpeechEvent(type=type_, alternatives=[alternative])

    @pytest.mark.asyncio
    async def test_stt_eos_timestamp_is_preserved_for_final_transcript_without_external_vad(self):
        """END_OF_SPEECH time must still be in place after the later FINAL_TRANSCRIPT."""
        recognition = self._create_audio_recognition()
        end_of_speech = self._stt_event(stt.SpeechEventType.END_OF_SPEECH)
        final_transcript = self._stt_event(stt.SpeechEventType.FINAL_TRANSCRIPT)

        with patch(self._CLOCK, side_effect=[10.0, 10.5, 10.6, 10.7]):
            await recognition._on_stt_event(end_of_speech)
            # Snapshot what END_OF_SPEECH recorded before the transcript arrives.
            eos_timestamp = recognition._last_speaking_time

            await recognition._on_stt_event(final_transcript)

        assert recognition._stt_end_of_speech_received is True
        assert eos_timestamp == 10.0
        assert recognition._last_final_transcript_time == 10.6
        assert recognition._last_speaking_time == eos_timestamp

    @pytest.mark.asyncio
    async def test_final_transcript_falls_back_when_stt_eos_was_not_received(self):
        """With no VAD and no prior END_OF_SPEECH, FINAL_TRANSCRIPT replaces the old time."""
        recognition = self._create_audio_recognition()
        recognition._last_speaking_time = 10.0
        final_transcript = self._stt_event(stt.SpeechEventType.FINAL_TRANSCRIPT)

        with patch(self._CLOCK, side_effect=[20.0, 20.1, 20.2]):
            await recognition._on_stt_event(final_transcript)

        assert recognition._stt_end_of_speech_received is False
        assert recognition._last_final_transcript_time == 20.1
        assert recognition._last_speaking_time == 20.2

    @pytest.mark.asyncio
    async def test_external_vad_timestamp_is_not_overwritten_by_final_transcript(self):
        """With a VAD object set, FINAL_TRANSCRIPT leaves the existing time untouched."""
        recognition = self._create_audio_recognition(vad=object())
        recognition._last_speaking_time = 10.0
        final_transcript = self._stt_event(stt.SpeechEventType.FINAL_TRANSCRIPT)

        with patch(self._CLOCK, side_effect=[20.0, 20.1]):
            await recognition._on_stt_event(final_transcript)

        assert recognition._stt_end_of_speech_received is False
        assert recognition._last_final_transcript_time == 20.1
        assert recognition._last_speaking_time == 10.0
Loading