Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
e1a6f03
feat(tests): add eou feou test for voice transcription
sam-s10s Feb 4, 2026
9b1730c
feat: update and add new audio samples for tests
sam-s10s Feb 4, 2026
0873abb
Squashed commit of the following:
sam-s10s Feb 18, 2026
b6cb0e6
feat: add audio timestamp to ForceEndOfUtterance message
sam-s10s Feb 18, 2026
1567b28
enhance: add tracking of audio bytes sent and timestamps
sam-s10s Feb 18, 2026
f708c65
feat: add timestamp to force_end_of_utterance method
sam-s10s Feb 18, 2026
e9ea4d8
Merge branch 'main' into fix/feou
sam-s10s Feb 18, 2026
35318b2
revert change not meant for this MR
sam-s10s Feb 18, 2026
656ab23
test: add new test for audio timestamp accuracy
sam-s10s Feb 18, 2026
ba2d382
fix: adjust tolerance to zero in timestamp test
sam-s10s Feb 18, 2026
38432c1
refactor: update test for varied audio formats
sam-s10s Feb 18, 2026
21c09c5
updated test URL
sam-s10s Feb 18, 2026
f1a94de
enhance: add custom listeners for improved test logging
sam-s10s Feb 18, 2026
5a5245e
feat: enable additional audio test cases in test_17_eou_feou
sam-s10s Feb 22, 2026
33e693f
refactor: enhance end of turn penalty logic
sam-s10s Feb 23, 2026
301bcf4
refactor: enhance end of turn penalty logic
sam-s10s Feb 23, 2026
d9de589
Add Penalty when Smart Turn hasn't been run (#86)
LArmstrongDev Feb 25, 2026
3375c3d
Merge branch 'fix/smart-turn' of https://github.com/speechmatics/spee…
sam-s10s Feb 25, 2026
7a52b3f
test: add `test_no_feou_fix` for FEOU disabled
sam-s10s Mar 2, 2026
1443b33
feat: integrate config validation and improve presets
sam-s10s Mar 2, 2026
386f37b
fix: enforce use of forced end of utterance
sam-s10s Mar 3, 2026
155fceb
refactor: simplify EOU and VAD logic, improve readability
sam-s10s Mar 3, 2026
0b28473
refactor: remove forced end-of-utterance config from tests
sam-s10s Mar 3, 2026
31aa3ac
remove: Delete outdated conditional validation for 'use_forced_eou' i…
sam-s10s Mar 3, 2026
ca0f22f
fix: handle forced EOU more securely in turn management
sam-s10s Mar 3, 2026
95dda05
manually set FEOU to be disabled for the tests.
sam-s10s Mar 3, 2026
5ecc473
remove `ws_headers` as part of a different PR
sam-s10s Mar 3, 2026
e30cc5e
fix: correct logic for end of utterance handling
sam-s10s Mar 3, 2026
cd7de39
`refactor: extract config setup and ensure client disconnect`
sam-s10s Mar 3, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 28 additions & 2 deletions sdk/rt/speechmatics/rt/_async_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from ._exceptions import TimeoutError
from ._exceptions import TranscriptionError
from ._logging import get_logger
from ._models import AudioEncoding
from ._models import AudioEventsConfig
from ._models import AudioFormat
from ._models import ClientMessageType
Expand Down Expand Up @@ -97,6 +98,8 @@ def __init__(
self.on(ServerMessageType.WARNING, self._on_warning)
self.on(ServerMessageType.AUDIO_ADDED, self._on_audio_added)

self._audio_format = AudioFormat(encoding=AudioEncoding.PCM_S16LE, sample_rate=44100, chunk_size=4096)

self._logger.debug("AsyncClient initialized (request_id=%s)", self._session.request_id)

async def start_session(
Expand Down Expand Up @@ -133,6 +136,9 @@ async def start_session(
... await client.start_session()
... await client.send_audio(frame)
"""
if audio_format is not None:
self._audio_format = audio_format

await self._start_recognition_session(
transcription_config=transcription_config,
audio_format=audio_format,
Expand Down Expand Up @@ -161,16 +167,24 @@ async def stop_session(self) -> None:
await self._session_done_evt.wait() # Wait for end of transcript event to indicate we can stop listening
await self.close()

async def force_end_of_utterance(self, timestamp: Optional[float] = None) -> None:
    """
    This method sends a ForceEndOfUtterance message to the server to signal
    the end of an utterance. Forcing end of utterance will cause the final
    transcript to be sent to the client early.

    Takes an optional timestamp parameter to specify a marker for the engine
    to use for timing of the end of the utterance. If not provided, the timestamp
    will be calculated based on the cumulative audio sent to the server.

    Args:
        timestamp: Optional timestamp for the request.

    Raises:
        ConnectionError: If the WebSocket connection fails.
        TranscriptionError: If the server reports an error during teardown.
        TimeoutError: If the connection or teardown times out.
        ValueError: If the audio format does not have an encoding set.

    Examples:
        Basic streaming:
            >>> async with AsyncClient(api_key="key") as client:
            ...     await client.send_audio(frame)
            ...     await client.force_end_of_utterance()
    """
    # Default the marker to the seconds of audio streamed so far, so the
    # forced end-of-utterance aligns with what the server has received.
    if timestamp is None:
        timestamp = self.audio_seconds_sent

    await self.send_message({"message": ClientMessageType.FORCE_END_OF_UTTERANCE, "timestamp": timestamp})

@property
def audio_seconds_sent(self) -> float:
    """Number of audio seconds sent to the server.

    Derived from the cumulative byte count divided by the byte rate
    of the configured audio format.

    Raises:
        ValueError: If the audio format does not have an encoding set.
    """
    fmt = self._audio_format
    bytes_per_second = fmt.sample_rate * fmt.bytes_per_sample
    return self._audio_bytes_sent / bytes_per_second

async def transcribe(
self,
Expand Down
7 changes: 7 additions & 0 deletions sdk/rt/speechmatics/rt/_base_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ def __init__(self, transport: Transport) -> None:
self._recv_task: Optional[asyncio.Task[None]] = None
self._closed_evt = asyncio.Event()
self._eos_sent = False
self._audio_bytes_sent = 0
self._seq_no = 0

self._logger = get_logger("speechmatics.rt.base_client")
Expand Down Expand Up @@ -122,11 +123,17 @@ async def send_audio(self, payload: bytes) -> None:

try:
await self._transport.send_message(payload)
self._audio_bytes_sent += len(payload)
self._seq_no += 1
except Exception:
self._closed_evt.set()
raise

@property
def audio_bytes_sent(self) -> int:
    """Number of audio bytes sent to the server.

    Incremented by ``send_audio`` after each successful transport send.
    """
    return self._audio_bytes_sent

async def send_message(self, message: dict[str, Any]) -> None:
"""
Send a message through the WebSocket.
Expand Down
23 changes: 23 additions & 0 deletions sdk/rt/speechmatics/rt/_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -183,6 +183,29 @@ class AudioFormat:
sample_rate: int = 44100
chunk_size: int = 4096

# Byte width of a single sample for each supported raw encoding.
_BYTES_PER_SAMPLE = {
    AudioEncoding.PCM_F32LE: 4,
    AudioEncoding.PCM_S16LE: 2,
    AudioEncoding.MULAW: 1,
}

@property
def bytes_per_sample(self) -> int:
    """Number of bytes per audio sample based on encoding.

    Raises:
        ValueError: If encoding is None (file type) or unrecognized.
    """
    if self.encoding is None:
        raise ValueError(
            "Cannot determine bytes per sample for file-type audio format. "
            "Set an explicit encoding on AudioFormat."
        )
    # Use .get rather than try/except KeyError so an unknown encoding
    # raises a plain ValueError without a chained KeyError traceback (B904).
    bytes_per_sample = self._BYTES_PER_SAMPLE.get(self.encoding)
    if bytes_per_sample is None:
        raise ValueError(f"Unknown encoding: {self.encoding}")
    return bytes_per_sample

def to_dict(self) -> dict[str, Any]:
"""
Convert audio format to dictionary.
Expand Down
52 changes: 26 additions & 26 deletions sdk/voice/speechmatics/voice/_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -176,6 +176,10 @@ def __init__(
preset_config = VoiceAgentConfigPreset.load(preset)
config = VoiceAgentConfigPreset._merge_configs(preset_config, config)

# Validate the final config (deferred to allow overlay/preset merging first)
if config is not None:
config.validate_config()

# Process the config
self._config, self._transcription_config, self._audio_format = self._prepare_config(config)

Expand Down Expand Up @@ -310,20 +314,16 @@ def __init__(
self._turn_handler: TurnTaskProcessor = TurnTaskProcessor(name="turn_handler", done_callback=self.finalize)
self._eot_calculation_task: Optional[asyncio.Task] = None

# Uses fixed EndOfUtterance message from STT
self._uses_fixed_eou: bool = (
self._eou_mode == EndOfUtteranceMode.FIXED
and not self._silero_detector
and not self._config.end_of_turn_config.use_forced_eou
)
# Uses fixed EndOfUtterance message from STT
self._listen_to_eou_messages: bool = self._eou_mode == EndOfUtteranceMode.FIXED and not self._silero_detector

# Uses ForceEndOfUtterance message
self._uses_forced_eou: bool = not self._uses_fixed_eou
# Forced end of utterance handling
self._forced_eou_active: bool = False
self._last_forced_eou_latency: float = 0.0
self._disable_feou_for_testing: bool = False

# Emit EOT prediction (uses _uses_forced_eou)
self._uses_eot_prediction: bool = self._eou_mode not in [
# Emit EOT prediction
self._emit_eot_predictions: bool = self._eou_mode not in [
EndOfUtteranceMode.FIXED,
EndOfUtteranceMode.EXTERNAL,
]
Expand Down Expand Up @@ -447,9 +447,7 @@ def _prepare_config(
)

# Fixed end of Utterance
if bool(
config.end_of_utterance_mode == EndOfUtteranceMode.FIXED and not config.end_of_turn_config.use_forced_eou
):
if config.end_of_utterance_mode == EndOfUtteranceMode.FIXED:
transcription_config.conversation_config = ConversationConfig(
end_of_utterance_silence_trigger=config.end_of_utterance_silence_trigger,
)
Expand Down Expand Up @@ -717,14 +715,11 @@ def update_diarization_config(self, config: SpeakerFocusConfig) -> None:
# PUBLIC UTTERANCE / TURN MANAGEMENT
# ============================================================================

def finalize(self, end_of_turn: bool = False) -> None:
def finalize(self) -> None:
"""Finalize segments.

This function will emit segments in the buffer without any further checks
on the contents of the segments.

Args:
end_of_turn: Whether to emit an end of turn message.
"""

# Clear smart turn cutoff
Expand All @@ -738,7 +733,7 @@ async def emit() -> None:
"""Wait for EndOfUtterance if needed, then emit segments."""

# Forced end of utterance message (only when no speaker is detected)
if self._config.end_of_turn_config.use_forced_eou:
if not self._disable_feou_for_testing:
await self._await_forced_eou()

# Check if the turn has changed
Expand All @@ -749,7 +744,7 @@ async def emit() -> None:
self._stt_message_queue.put_nowait(lambda: self._emit_segments(finalize=True, is_eou=True))

# Call async task (only if not already waiting for forced EOU)
if not (self._config.end_of_turn_config.use_forced_eou and self._forced_eou_active):
if not self._forced_eou_active:
asyncio.create_task(emit())

# ============================================================================
Expand Down Expand Up @@ -789,7 +784,7 @@ def _evt_on_final_transcript(message: dict[str, Any]) -> None:
self._stt_message_queue.put_nowait(lambda: self._handle_transcript(message, is_final=True))

# End of Utterance (FIXED mode only)
if self._uses_fixed_eou:
if self._listen_to_eou_messages:

@self.on(ServerMessageType.END_OF_UTTERANCE) # type: ignore[misc]
def _evt_on_end_of_utterance(message: dict[str, Any]) -> None:
Expand Down Expand Up @@ -1216,7 +1211,7 @@ async def _process_speech_fragments(self, change_filter: Optional[list[Annotatio
return

# Turn prediction
if self._uses_eot_prediction and self._uses_forced_eou and not self._forced_eou_active:
if self._emit_eot_predictions and not self._forced_eou_active and not self._disable_feou_for_testing:

async def fn() -> None:
ttl = await self._calculate_finalize_delay()
Expand Down Expand Up @@ -1526,6 +1521,12 @@ async def _calculate_finalize_delay(
# Smart Turn enabled
if self._smart_turn_detector:
annotation.add(AnnotationFlags.SMART_TURN_ACTIVE)
# If Smart Turn hasn't returned a result yet but is enabled, add NO_SIGNAL annotation.
# This covers the case where the TTL fires before VAD triggers Smart Turn inference.
if not annotation.has(AnnotationFlags.SMART_TURN_TRUE) and not annotation.has(
AnnotationFlags.SMART_TURN_FALSE
):
annotation.add(AnnotationFlags.SMART_TURN_NO_SIGNAL)
else:
annotation.add(AnnotationFlags.SMART_TURN_INACTIVE)

Expand All @@ -1551,8 +1552,7 @@ async def _calculate_finalize_delay(
delay = round(self._config.end_of_utterance_silence_trigger * multiplier, 3)

# Trim off the most recent forced EOU delay if we're in forced EOU mode
if self._uses_forced_eou:
delay -= self._last_forced_eou_latency
delay -= self._last_forced_eou_latency

# Clamp to max delay and adjust for TTFB
clamped_delay = min(delay, self._config.end_of_utterance_max_delay)
Expand Down Expand Up @@ -1850,7 +1850,7 @@ def _handle_silero_vad_result(self, result: SileroVADResult) -> None:
annotation.add(AnnotationFlags.VAD_STARTED)

# If speech has ended, we need to predict the end of turn
if result.speech_ended and self._uses_eot_prediction:
if result.speech_ended and self._emit_eot_predictions:
"""VAD-based end of turn prediction."""

# Set cutoff to prevent late transcripts from cancelling finalization
Expand Down Expand Up @@ -1878,7 +1878,7 @@ async def _handle_speaker_started(self, speaker: Optional[str], event_time: floa
await self._emit_start_of_turn(event_time)

# Update the turn handler
if self._uses_forced_eou:
if not self._disable_feou_for_testing:
self._turn_handler.reset()

# Emit the event
Expand All @@ -1902,7 +1902,7 @@ async def _handle_speaker_stopped(self, speaker: Optional[str], event_time: floa
self._last_speak_end_latency = self._total_time - event_time

# Turn prediction
if self._uses_eot_prediction and not self._forced_eou_active:
if self._emit_eot_predictions and not self._forced_eou_active:

async def fn() -> None:
ttl = await self._eot_prediction(event_time, speaker)
Expand Down
Loading