Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
e1a6f03
feat(tests): add eou feou test for voice transcription
sam-s10s Feb 4, 2026
9b1730c
feat: update and add new audio samples for tests
sam-s10s Feb 4, 2026
0873abb
Squashed commit of the following:
sam-s10s Feb 18, 2026
b6cb0e6
feat: add audio timestamp to ForceEndOfUtterance message
sam-s10s Feb 18, 2026
1567b28
enhance: add tracking of audio bytes sent and timestamps
sam-s10s Feb 18, 2026
f708c65
feat: add timestamp to force_end_of_utterance method
sam-s10s Feb 18, 2026
e9ea4d8
Merge branch 'main' into fix/feou
sam-s10s Feb 18, 2026
35318b2
revert change not meant for this MR
sam-s10s Feb 18, 2026
656ab23
test: add new test for audio timestamp accuracy
sam-s10s Feb 18, 2026
ba2d382
fix: adjust tolerance to zero in timestamp test
sam-s10s Feb 18, 2026
38432c1
refactor: update test for varied audio formats
sam-s10s Feb 18, 2026
21c09c5
updated test URL
sam-s10s Feb 18, 2026
f1a94de
enhance: add custom listeners for improved test logging
sam-s10s Feb 18, 2026
5a5245e
feat: enable additional audio test cases in test_17_eou_feou
sam-s10s Feb 22, 2026
33e693f
refactor: enhance end of turn penalty logic
sam-s10s Feb 23, 2026
301bcf4
refactor: enhance end of turn penalty logic
sam-s10s Feb 23, 2026
d9de589
Add Penalty when Smart Turn hasn't been run (#86)
LArmstrongDev Feb 25, 2026
3375c3d
Merge branch 'fix/smart-turn' of https://github.com/speechmatics/spee…
sam-s10s Feb 25, 2026
7a52b3f
test: add `test_no_feou_fix` for FEOU disabled
sam-s10s Mar 2, 2026
1443b33
feat: integrate config validation and improve presets
sam-s10s Mar 2, 2026
386f37b
fix: enforce use of forced end of utterance
sam-s10s Mar 3, 2026
155fceb
refactor: simplify EOU and VAD logic, improve readability
sam-s10s Mar 3, 2026
0b28473
refactor: remove forced end-of-utterance config from tests
sam-s10s Mar 3, 2026
31aa3ac
remove: Delete outdated conditional validation for 'use_forced_eou' i…
sam-s10s Mar 3, 2026
ca0f22f
fix: handle forced EOU more securely in turn management
sam-s10s Mar 3, 2026
95dda05
manually set FEOU to be disabled for the tests.
sam-s10s Mar 3, 2026
5ecc473
remove `ws_headers` as part of a different PR
sam-s10s Mar 3, 2026
e30cc5e
fix: correct logic for end of utterance handling
sam-s10s Mar 3, 2026
cd7de39
`refactor: extract config setup and ensure client disconnect`
sam-s10s Mar 3, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 28 additions & 2 deletions sdk/rt/speechmatics/rt/_async_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from ._exceptions import TimeoutError
from ._exceptions import TranscriptionError
from ._logging import get_logger
from ._models import AudioEncoding
from ._models import AudioEventsConfig
from ._models import AudioFormat
from ._models import ClientMessageType
Expand Down Expand Up @@ -97,6 +98,8 @@ def __init__(
self.on(ServerMessageType.WARNING, self._on_warning)
self.on(ServerMessageType.AUDIO_ADDED, self._on_audio_added)

self._audio_format = AudioFormat(encoding=AudioEncoding.PCM_S16LE, sample_rate=44100, chunk_size=4096)

self._logger.debug("AsyncClient initialized (request_id=%s)", self._session.request_id)

async def start_session(
Expand Down Expand Up @@ -133,6 +136,9 @@ async def start_session(
... await client.start_session()
... await client.send_audio(frame)
"""
if audio_format is not None:
self._audio_format = audio_format

await self._start_recognition_session(
transcription_config=transcription_config,
audio_format=audio_format,
Expand Down Expand Up @@ -161,16 +167,24 @@ async def stop_session(self) -> None:
await self._session_done_evt.wait() # Wait for end of transcript event to indicate we can stop listening
await self.close()

async def force_end_of_utterance(self, timestamp: Optional[float] = None) -> None:
    """
    This method sends a ForceEndOfUtterance message to the server to signal
    the end of an utterance. Forcing end of utterance will cause the final
    transcript to be sent to the client early.

    Takes an optional timestamp parameter to specify a marker for the engine
    to use for timing of the end of the utterance. If not provided, the timestamp
    will be calculated based on the cumulative audio sent to the server.

    Args:
        timestamp: Optional timestamp for the request.

    Raises:
        ConnectionError: If the WebSocket connection fails.
        TranscriptionError: If the server reports an error during teardown.
        TimeoutError: If the connection or teardown times out.
        ValueError: If the audio format does not have an encoding set.

    Examples:
        Basic streaming:
            >>> async with AsyncClient(api_key="key") as client:
            ...     await client.send_audio(frame)
            ...     await client.force_end_of_utterance()
    """
    # Default the marker to the seconds of audio streamed so far, so the
    # forced end-of-utterance aligns with what the server has received.
    if timestamp is None:
        timestamp = self.audio_seconds_sent

    await self.send_message({"message": ClientMessageType.FORCE_END_OF_UTTERANCE, "timestamp": timestamp})

@property
def audio_seconds_sent(self) -> float:
    """Number of audio seconds sent to the server.

    Derived from the cumulative byte count divided by the byte rate
    of the configured audio format.

    Raises:
        ValueError: If the audio format does not have an encoding set.
    """
    fmt = self._audio_format
    bytes_per_second = fmt.sample_rate * fmt.bytes_per_sample
    return self._audio_bytes_sent / bytes_per_second

async def transcribe(
self,
Expand Down
7 changes: 7 additions & 0 deletions sdk/rt/speechmatics/rt/_base_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ def __init__(self, transport: Transport) -> None:
self._recv_task: Optional[asyncio.Task[None]] = None
self._closed_evt = asyncio.Event()
self._eos_sent = False
self._audio_bytes_sent = 0
self._seq_no = 0

self._logger = get_logger("speechmatics.rt.base_client")
Expand Down Expand Up @@ -122,11 +123,17 @@ async def send_audio(self, payload: bytes) -> None:

try:
await self._transport.send_message(payload)
self._audio_bytes_sent += len(payload)
self._seq_no += 1
except Exception:
self._closed_evt.set()
raise

@property
def audio_bytes_sent(self) -> int:
    """Number of audio bytes sent to the server.

    Incremented by ``send_audio`` after each successful transport send.
    """
    return self._audio_bytes_sent

async def send_message(self, message: dict[str, Any]) -> None:
"""
Send a message through the WebSocket.
Expand Down
23 changes: 23 additions & 0 deletions sdk/rt/speechmatics/rt/_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -183,6 +183,29 @@ class AudioFormat:
sample_rate: int = 44100
chunk_size: int = 4096

# Byte width of a single sample for each supported raw encoding.
_BYTES_PER_SAMPLE = {
    AudioEncoding.PCM_F32LE: 4,
    AudioEncoding.PCM_S16LE: 2,
    AudioEncoding.MULAW: 1,
}

@property
def bytes_per_sample(self) -> int:
    """Number of bytes per audio sample based on encoding.

    Raises:
        ValueError: If encoding is None (file type) or unrecognized.
    """
    if self.encoding is None:
        raise ValueError(
            "Cannot determine bytes per sample for file-type audio format. "
            "Set an explicit encoding on AudioFormat."
        )
    # Use .get rather than try/except KeyError so an unknown encoding
    # raises a plain ValueError without a chained KeyError traceback (B904).
    bytes_per_sample = self._BYTES_PER_SAMPLE.get(self.encoding)
    if bytes_per_sample is None:
        raise ValueError(f"Unknown encoding: {self.encoding}")
    return bytes_per_sample

def to_dict(self) -> dict[str, Any]:
"""
Convert audio format to dictionary.
Expand Down
52 changes: 26 additions & 26 deletions sdk/voice/speechmatics/voice/_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -176,6 +176,10 @@ def __init__(
preset_config = VoiceAgentConfigPreset.load(preset)
config = VoiceAgentConfigPreset._merge_configs(preset_config, config)

# Validate the final config (deferred to allow overlay/preset merging first)
if config is not None:
config.validate_config()

# Process the config
self._config, self._transcription_config, self._audio_format = self._prepare_config(config)

Expand Down Expand Up @@ -310,20 +314,16 @@ def __init__(
self._turn_handler: TurnTaskProcessor = TurnTaskProcessor(name="turn_handler", done_callback=self.finalize)
self._eot_calculation_task: Optional[asyncio.Task] = None

# Uses fixed EndOfUtterance message from STT
self._uses_fixed_eou: bool = (
self._eou_mode == EndOfUtteranceMode.FIXED
and not self._silero_detector
and not self._config.end_of_turn_config.use_forced_eou
)
# Uses fixed EndOfUtterance message from STT
self._listen_to_eou_messages: bool = self._eou_mode == EndOfUtteranceMode.FIXED and not self._silero_detector

# Uses ForceEndOfUtterance message
self._uses_forced_eou: bool = not self._uses_fixed_eou
# Forced end of utterance handling
self._forced_eou_active: bool = False
self._last_forced_eou_latency: float = 0.0
self._disable_feou_for_testing: bool = False

# Emit EOT prediction (uses _uses_forced_eou)
self._uses_eot_prediction: bool = self._eou_mode not in [
# Emit EOT prediction
self._emit_eot_predictions: bool = self._eou_mode not in [
EndOfUtteranceMode.FIXED,
EndOfUtteranceMode.EXTERNAL,
]
Expand Down Expand Up @@ -447,9 +447,7 @@ def _prepare_config(
)

# Fixed end of Utterance
if bool(
config.end_of_utterance_mode == EndOfUtteranceMode.FIXED and not config.end_of_turn_config.use_forced_eou
):
if config.end_of_utterance_mode == EndOfUtteranceMode.FIXED:
transcription_config.conversation_config = ConversationConfig(
end_of_utterance_silence_trigger=config.end_of_utterance_silence_trigger,
)
Expand Down Expand Up @@ -717,14 +715,11 @@ def update_diarization_config(self, config: SpeakerFocusConfig) -> None:
# PUBLIC UTTERANCE / TURN MANAGEMENT
# ============================================================================

def finalize(self, end_of_turn: bool = False) -> None:
def finalize(self) -> None:
"""Finalize segments.

This function will emit segments in the buffer without any further checks
on the contents of the segments.

Args:
end_of_turn: Whether to emit an end of turn message.
"""

# Clear smart turn cutoff
Expand All @@ -738,7 +733,7 @@ async def emit() -> None:
"""Wait for EndOfUtterance if needed, then emit segments."""

# Forced end of utterance message (only when no speaker is detected)
if self._config.end_of_turn_config.use_forced_eou:
if not self._disable_feou_for_testing:
await self._await_forced_eou()

# Check if the turn has changed
Expand All @@ -749,7 +744,7 @@ async def emit() -> None:
self._stt_message_queue.put_nowait(lambda: self._emit_segments(finalize=True, is_eou=True))

# Call async task (only if not already waiting for forced EOU)
if not (self._config.end_of_turn_config.use_forced_eou and self._forced_eou_active):
if not self._forced_eou_active:
asyncio.create_task(emit())

# ============================================================================
Expand Down Expand Up @@ -789,7 +784,7 @@ def _evt_on_final_transcript(message: dict[str, Any]) -> None:
self._stt_message_queue.put_nowait(lambda: self._handle_transcript(message, is_final=True))

# End of Utterance (FIXED mode only)
if self._uses_fixed_eou:
if self._listen_to_eou_messages:

@self.on(ServerMessageType.END_OF_UTTERANCE) # type: ignore[misc]
def _evt_on_end_of_utterance(message: dict[str, Any]) -> None:
Expand Down Expand Up @@ -1216,7 +1211,7 @@ async def _process_speech_fragments(self, change_filter: Optional[list[Annotatio
return

# Turn prediction
if self._uses_eot_prediction and self._uses_forced_eou and not self._forced_eou_active:
if self._emit_eot_predictions and not self._forced_eou_active and not self._disable_feou_for_testing:

async def fn() -> None:
ttl = await self._calculate_finalize_delay()
Expand Down Expand Up @@ -1526,6 +1521,12 @@ async def _calculate_finalize_delay(
# Smart Turn enabled
if self._smart_turn_detector:
annotation.add(AnnotationFlags.SMART_TURN_ACTIVE)
# If Smart Turn hasn't returned a result yet but is enabled, add NO_SIGNAL annotation.
# This covers the case where the TTL fires before VAD triggers Smart Turn inference.
if not annotation.has(AnnotationFlags.SMART_TURN_TRUE) and not annotation.has(
AnnotationFlags.SMART_TURN_FALSE
):
annotation.add(AnnotationFlags.SMART_TURN_NO_SIGNAL)
else:
annotation.add(AnnotationFlags.SMART_TURN_INACTIVE)

Expand All @@ -1551,8 +1552,7 @@ async def _calculate_finalize_delay(
delay = round(self._config.end_of_utterance_silence_trigger * multiplier, 3)

# Trim off the most recent forced EOU delay if we're in forced EOU mode
if self._uses_forced_eou:
delay -= self._last_forced_eou_latency
delay -= self._last_forced_eou_latency

# Clamp to max delay and adjust for TTFB
clamped_delay = min(delay, self._config.end_of_utterance_max_delay)
Expand Down Expand Up @@ -1850,7 +1850,7 @@ def _handle_silero_vad_result(self, result: SileroVADResult) -> None:
annotation.add(AnnotationFlags.VAD_STARTED)

# If speech has ended, we need to predict the end of turn
if result.speech_ended and self._uses_eot_prediction:
if result.speech_ended and self._emit_eot_predictions:
"""VAD-based end of turn prediction."""

# Set cutoff to prevent late transcripts from cancelling finalization
Expand Down Expand Up @@ -1878,7 +1878,7 @@ async def _handle_speaker_started(self, speaker: Optional[str], event_time: floa
await self._emit_start_of_turn(event_time)

# Update the turn handler
if self._uses_forced_eou:
if not self._disable_feou_for_testing:
self._turn_handler.reset()

# Emit the event
Expand All @@ -1902,7 +1902,7 @@ async def _handle_speaker_stopped(self, speaker: Optional[str], event_time: floa
self._last_speak_end_latency = self._total_time - event_time

# Turn prediction
if self._uses_eot_prediction and not self._forced_eou_active:
if self._emit_eot_predictions and not self._forced_eou_active:

async def fn() -> None:
ttl = await self._eot_prediction(event_time, speaker)
Expand Down
Loading