diff --git a/sdk/rt/speechmatics/rt/_async_client.py b/sdk/rt/speechmatics/rt/_async_client.py index 5e581e15..892e2549 100644 --- a/sdk/rt/speechmatics/rt/_async_client.py +++ b/sdk/rt/speechmatics/rt/_async_client.py @@ -12,6 +12,7 @@ from ._exceptions import TimeoutError from ._exceptions import TranscriptionError from ._logging import get_logger +from ._models import AudioEncoding from ._models import AudioEventsConfig from ._models import AudioFormat from ._models import ClientMessageType @@ -97,6 +98,8 @@ def __init__( self.on(ServerMessageType.WARNING, self._on_warning) self.on(ServerMessageType.AUDIO_ADDED, self._on_audio_added) + self._audio_format = AudioFormat(encoding=AudioEncoding.PCM_S16LE, sample_rate=44100, chunk_size=4096) + self._logger.debug("AsyncClient initialized (request_id=%s)", self._session.request_id) async def start_session( @@ -133,6 +136,9 @@ async def start_session( ... await client.start_session() ... await client.send_audio(frame) """ + if audio_format is not None: + self._audio_format = audio_format + await self._start_recognition_session( transcription_config=transcription_config, audio_format=audio_format, @@ -161,16 +167,24 @@ async def stop_session(self) -> None: await self._session_done_evt.wait() # Wait for end of transcript event to indicate we can stop listening await self.close() - async def force_end_of_utterance(self) -> None: + async def force_end_of_utterance(self, timestamp: Optional[float] = None) -> None: """ This method sends a ForceEndOfUtterance message to the server to signal the end of an utterance. Forcing end of utterance will cause the final transcript to be sent to the client early. + Takes an optional timestamp parameter to specify a marker for the engine + to use for timing of the end of the utterance. If not provided, the timestamp + will be calculated based on the cumulative audio sent to the server. + + Args: + timestamp: Optional timestamp for the request. 
+ Raises: ConnectionError: If the WebSocket connection fails. TranscriptionError: If the server reports an error during teardown. TimeoutError: If the connection or teardown times out. + ValueError: If the audio format does not have an encoding set. Examples: Basic streaming: @@ -179,7 +193,19 @@ async def force_end_of_utterance(self) -> None: ... await client.send_audio(frame) ... await client.force_end_of_utterance() """ - await self.send_message({"message": ClientMessageType.FORCE_END_OF_UTTERANCE}) + if timestamp is None: + timestamp = self.audio_seconds_sent + + await self.send_message({"message": ClientMessageType.FORCE_END_OF_UTTERANCE, "timestamp": timestamp}) + + @property + def audio_seconds_sent(self) -> float: + """Number of audio seconds sent to the server. + + Raises: + ValueError: If the audio format does not have an encoding set. + """ + return self._audio_bytes_sent / (self._audio_format.sample_rate * self._audio_format.bytes_per_sample) async def transcribe( self, diff --git a/sdk/rt/speechmatics/rt/_base_client.py b/sdk/rt/speechmatics/rt/_base_client.py index 0ac6d085..89167e20 100644 --- a/sdk/rt/speechmatics/rt/_base_client.py +++ b/sdk/rt/speechmatics/rt/_base_client.py @@ -42,6 +42,7 @@ def __init__(self, transport: Transport) -> None: self._recv_task: Optional[asyncio.Task[None]] = None self._closed_evt = asyncio.Event() self._eos_sent = False + self._audio_bytes_sent = 0 self._seq_no = 0 self._logger = get_logger("speechmatics.rt.base_client") @@ -122,11 +123,17 @@ async def send_audio(self, payload: bytes) -> None: try: await self._transport.send_message(payload) + self._audio_bytes_sent += len(payload) self._seq_no += 1 except Exception: self._closed_evt.set() raise + @property + def audio_bytes_sent(self) -> int: + """Number of audio bytes sent to the server.""" + return self._audio_bytes_sent + async def send_message(self, message: dict[str, Any]) -> None: """ Send a message through the WebSocket. 
diff --git a/sdk/rt/speechmatics/rt/_models.py b/sdk/rt/speechmatics/rt/_models.py index 84e57204..d1f6acbf 100644 --- a/sdk/rt/speechmatics/rt/_models.py +++ b/sdk/rt/speechmatics/rt/_models.py @@ -183,6 +183,29 @@ class AudioFormat: sample_rate: int = 44100 chunk_size: int = 4096 + _BYTES_PER_SAMPLE = { + AudioEncoding.PCM_F32LE: 4, + AudioEncoding.PCM_S16LE: 2, + AudioEncoding.MULAW: 1, + } + + @property + def bytes_per_sample(self) -> int: + """Number of bytes per audio sample based on encoding. + + Raises: + ValueError: If encoding is None (file type) or unrecognized. + """ + if self.encoding is None: + raise ValueError( + "Cannot determine bytes per sample for file-type audio format. " + "Set an explicit encoding on AudioFormat." + ) + try: + return self._BYTES_PER_SAMPLE[self.encoding] + except KeyError: + raise ValueError(f"Unknown encoding: {self.encoding}") + def to_dict(self) -> dict[str, Any]: """ Convert audio format to dictionary. diff --git a/sdk/voice/speechmatics/voice/_client.py b/sdk/voice/speechmatics/voice/_client.py index c0988dd3..37fa9df7 100644 --- a/sdk/voice/speechmatics/voice/_client.py +++ b/sdk/voice/speechmatics/voice/_client.py @@ -176,6 +176,10 @@ def __init__( preset_config = VoiceAgentConfigPreset.load(preset) config = VoiceAgentConfigPreset._merge_configs(preset_config, config) + # Validate the final config (deferred to allow overlay/preset merging first) + if config is not None: + config.validate_config() + # Process the config self._config, self._transcription_config, self._audio_format = self._prepare_config(config) @@ -310,20 +314,16 @@ def __init__( self._turn_handler: TurnTaskProcessor = TurnTaskProcessor(name="turn_handler", done_callback=self.finalize) self._eot_calculation_task: Optional[asyncio.Task] = None - # Uses fixed EndOfUtterance message from STT - self._uses_fixed_eou: bool = ( - self._eou_mode == EndOfUtteranceMode.FIXED - and not self._silero_detector - and not self._config.end_of_turn_config.use_forced_eou - 
) + # Listen for EndOfUtterance messages from STT (FIXED mode without a Silero detector) + self._listen_to_eou_messages: bool = self._eou_mode == EndOfUtteranceMode.FIXED and not self._silero_detector - # Uses ForceEndOfUtterance message - self._uses_forced_eou: bool = not self._uses_fixed_eou + # Forced end of utterance handling self._forced_eou_active: bool = False self._last_forced_eou_latency: float = 0.0 + self._disable_feou_for_testing: bool = False - # Emit EOT prediction (uses _uses_forced_eou) - self._uses_eot_prediction: bool = self._eou_mode not in [ + # Emit EOT prediction + self._emit_eot_predictions: bool = self._eou_mode not in [ EndOfUtteranceMode.FIXED, EndOfUtteranceMode.EXTERNAL, ] @@ -447,9 +447,7 @@ def _prepare_config( ) # Fixed end of Utterance - if bool( - config.end_of_utterance_mode == EndOfUtteranceMode.FIXED and not config.end_of_turn_config.use_forced_eou - ): + if config.end_of_utterance_mode == EndOfUtteranceMode.FIXED: transcription_config.conversation_config = ConversationConfig( end_of_utterance_silence_trigger=config.end_of_utterance_silence_trigger, ) @@ -717,14 +715,11 @@ def update_diarization_config(self, config: SpeakerFocusConfig) -> None: # PUBLIC UTTERANCE / TURN MANAGEMENT # ============================================================================ - def finalize(self, end_of_turn: bool = False) -> None: + def finalize(self) -> None: """Finalize segments. This function will emit segments in the buffer without any further checks on the contents of the segments. - - Args: - end_of_turn: Whether to emit an end of turn message.
""" # Clear smart turn cutoff @@ -738,7 +733,7 @@ async def emit() -> None: """Wait for EndOfUtterance if needed, then emit segments.""" # Forced end of utterance message (only when no speaker is detected) - if self._config.end_of_turn_config.use_forced_eou: + if not self._disable_feou_for_testing: await self._await_forced_eou() # Check if the turn has changed @@ -749,7 +744,7 @@ async def emit() -> None: self._stt_message_queue.put_nowait(lambda: self._emit_segments(finalize=True, is_eou=True)) # Call async task (only if not already waiting for forced EOU) - if not (self._config.end_of_turn_config.use_forced_eou and self._forced_eou_active): + if not self._forced_eou_active: asyncio.create_task(emit()) # ============================================================================ @@ -789,7 +784,7 @@ def _evt_on_final_transcript(message: dict[str, Any]) -> None: self._stt_message_queue.put_nowait(lambda: self._handle_transcript(message, is_final=True)) # End of Utterance (FIXED mode only) - if self._uses_fixed_eou: + if self._listen_to_eou_messages: @self.on(ServerMessageType.END_OF_UTTERANCE) # type: ignore[misc] def _evt_on_end_of_utterance(message: dict[str, Any]) -> None: @@ -1216,7 +1211,7 @@ async def _process_speech_fragments(self, change_filter: Optional[list[Annotatio return # Turn prediction - if self._uses_eot_prediction and self._uses_forced_eou and not self._forced_eou_active: + if self._emit_eot_predictions and not self._forced_eou_active and not self._disable_feou_for_testing: async def fn() -> None: ttl = await self._calculate_finalize_delay() @@ -1526,6 +1521,12 @@ async def _calculate_finalize_delay( # Smart Turn enabled if self._smart_turn_detector: annotation.add(AnnotationFlags.SMART_TURN_ACTIVE) + # If Smart Turn hasn't returned a result yet but is enabled, add NO_SIGNAL annotation. + # This covers the case where the TTL fires before VAD triggers Smart Turn inference. 
+ if not annotation.has(AnnotationFlags.SMART_TURN_TRUE) and not annotation.has( + AnnotationFlags.SMART_TURN_FALSE + ): + annotation.add(AnnotationFlags.SMART_TURN_NO_SIGNAL) else: annotation.add(AnnotationFlags.SMART_TURN_INACTIVE) @@ -1551,8 +1552,7 @@ async def _calculate_finalize_delay( delay = round(self._config.end_of_utterance_silence_trigger * multiplier, 3) # Trim off the most recent forced EOU delay if we're in forced EOU mode - if self._uses_forced_eou: - delay -= self._last_forced_eou_latency + delay -= self._last_forced_eou_latency # Clamp to max delay and adjust for TTFB clamped_delay = min(delay, self._config.end_of_utterance_max_delay) @@ -1850,7 +1850,7 @@ def _handle_silero_vad_result(self, result: SileroVADResult) -> None: annotation.add(AnnotationFlags.VAD_STARTED) # If speech has ended, we need to predict the end of turn - if result.speech_ended and self._uses_eot_prediction: + if result.speech_ended and self._emit_eot_predictions: """VAD-based end of turn prediction.""" # Set cutoff to prevent late transcripts from cancelling finalization @@ -1878,7 +1878,7 @@ async def _handle_speaker_started(self, speaker: Optional[str], event_time: floa await self._emit_start_of_turn(event_time) # Update the turn handler - if self._uses_forced_eou: + if not self._disable_feou_for_testing: self._turn_handler.reset() # Emit the event @@ -1902,7 +1902,7 @@ async def _handle_speaker_stopped(self, speaker: Optional[str], event_time: floa self._last_speak_end_latency = self._total_time - event_time # Turn prediction - if self._uses_eot_prediction and not self._forced_eou_active: + if self._emit_eot_predictions and not self._forced_eou_active: async def fn() -> None: ttl = await self._eot_prediction(event_time, speaker) diff --git a/sdk/voice/speechmatics/voice/_models.py b/sdk/voice/speechmatics/voice/_models.py index b4a432c2..c58a7ca6 100644 --- a/sdk/voice/speechmatics/voice/_models.py +++ b/sdk/voice/speechmatics/voice/_models.py @@ -13,7 +13,6 @@ from 
pydantic import BaseModel as PydanticBaseModel from pydantic import ConfigDict from pydantic import Field -from pydantic import model_validator from typing_extensions import Self from speechmatics.rt import AudioEncoding @@ -261,6 +260,7 @@ class AnnotationFlags(str, Enum): SMART_TURN_INACTIVE = "smart_turn_inactive" SMART_TURN_TRUE = "smart_turn_true" SMART_TURN_FALSE = "smart_turn_false" + SMART_TURN_NO_SIGNAL = "smart_turn_no_signal" # ============================================================================== @@ -410,35 +410,57 @@ class EndOfTurnConfig(BaseModel): base_multiplier: Base multiplier for end of turn delay. min_end_of_turn_delay: Minimum end of turn delay. penalties: List of end of turn penalty items. - use_forced_eou: Whether to use forced end of utterance detection. + use_forced_eou: Whether to use forced end of utterance detection. (SHOULD ONLY EVER BE TRUE) """ base_multiplier: float = 1.0 min_end_of_turn_delay: float = 0.01 penalties: list[EndOfTurnPenaltyItem] = Field( default_factory=lambda: [ - # Increase delay + # + # Speaker rate increases expected TTL EndOfTurnPenaltyItem(penalty=3.0, annotation=[AnnotationFlags.VERY_SLOW_SPEAKER]), EndOfTurnPenaltyItem(penalty=2.0, annotation=[AnnotationFlags.SLOW_SPEAKER]), + # + # High / low rate of disfluencies EndOfTurnPenaltyItem(penalty=2.5, annotation=[AnnotationFlags.ENDS_WITH_DISFLUENCY]), EndOfTurnPenaltyItem(penalty=1.1, annotation=[AnnotationFlags.HAS_DISFLUENCY]), + # + # We do NOT have an end of sentence character EndOfTurnPenaltyItem( penalty=2.0, annotation=[AnnotationFlags.ENDS_WITH_EOS], is_not=True, ), - # Decrease delay + # + # We have finals and end of sentence EndOfTurnPenaltyItem( penalty=0.5, annotation=[AnnotationFlags.ENDS_WITH_FINAL, AnnotationFlags.ENDS_WITH_EOS] ), - # Smart Turn + VAD - EndOfTurnPenaltyItem(penalty=0.2, annotation=[AnnotationFlags.SMART_TURN_TRUE]), + # + # Smart Turn - when false, wait longer to prevent premature end of turn EndOfTurnPenaltyItem( - 
penalty=0.2, annotation=[AnnotationFlags.VAD_STOPPED, AnnotationFlags.SMART_TURN_INACTIVE] + penalty=0.2, annotation=[AnnotationFlags.SMART_TURN_TRUE, AnnotationFlags.SMART_TURN_ACTIVE] + ), + EndOfTurnPenaltyItem( + penalty=2.0, annotation=[AnnotationFlags.SMART_TURN_FALSE, AnnotationFlags.SMART_TURN_ACTIVE] + ), + EndOfTurnPenaltyItem( + penalty=1.5, annotation=[AnnotationFlags.SMART_TURN_NO_SIGNAL, AnnotationFlags.SMART_TURN_ACTIVE] + ), + # + # VAD - only applied when smart turn is not in use and on the speaker stopping + EndOfTurnPenaltyItem( + penalty=0.2, + annotation=[ + AnnotationFlags.VAD_STOPPED, + AnnotationFlags.VAD_ACTIVE, + AnnotationFlags.SMART_TURN_INACTIVE, + ], ), ] ) - use_forced_eou: bool = False + use_forced_eou: bool = True class VoiceActivityConfig(BaseModel): @@ -711,10 +733,16 @@ class VoiceAgentConfig(BaseModel): audio_encoding: AudioEncoding = AudioEncoding.PCM_S16LE chunk_size: int = 160 - # Validation - @model_validator(mode="after") # type: ignore[misc] - def validate_config(self) -> Self: - """Validate the configuration.""" + def validate_config(self) -> None: + """Validate the configuration. + + Cross-field validation is deferred to this method so that configs can be + constructed as overlays (e.g. for presets) without triggering validation + on intermediate states. Call this once the final config is ready. + + Raises: + ValueError: If any validation errors are found. 
+ """ # Validation errors errors: list[str] = [] @@ -723,12 +751,6 @@ def validate_config(self) -> Self: if self.end_of_utterance_mode == EndOfUtteranceMode.EXTERNAL and self.smart_turn_config: errors.append("EXTERNAL mode cannot be used in conjunction with SmartTurnConfig") - # Cannot have FIXED and forced end of utterance enabled without VAD being enabled - if (self.end_of_utterance_mode == EndOfUtteranceMode.FIXED and self.end_of_turn_config.use_forced_eou) and not ( - self.vad_config and self.vad_config.enabled - ): - errors.append("FIXED mode cannot be used in conjunction with forced end of utterance without VAD enabled") - # Cannot use VAD with external end of utterance mode if self.end_of_utterance_mode == EndOfUtteranceMode.EXTERNAL and (self.vad_config and self.vad_config.enabled): errors.append("EXTERNAL mode cannot be used in conjunction with VAD being enabled") @@ -751,13 +773,14 @@ def validate_config(self) -> Self: if self.sample_rate not in [8000, 16000]: errors.append("sample_rate must be 8000 or 16000") + # Check that forced end of utterance is set to True + if not self.end_of_turn_config.use_forced_eou: + errors.append("EndOfTurnConfig.use_forced_eou cannot be False") + # Raise error if any validation errors if errors: raise ValueError(f"{len(errors)} config error(s): {'; '.join(errors)}") - # Return validated config - return self - # ============================================================================== # SESSION & INFO MODELS diff --git a/sdk/voice/speechmatics/voice/_presets.py b/sdk/voice/speechmatics/voice/_presets.py index 2bcb092f..663b4eff 100644 --- a/sdk/voice/speechmatics/voice/_presets.py +++ b/sdk/voice/speechmatics/voice/_presets.py @@ -232,4 +232,10 @@ def _merge_configs(base: VoiceAgentConfig, overlay: Optional[VoiceAgentConfig]) **base.model_dump(exclude_unset=True, exclude_none=True), **overlay.model_dump(exclude_unset=True, exclude_none=True), } - return VoiceAgentConfig.from_dict(merged_dict) + config = 
VoiceAgentConfig.from_dict(merged_dict) + + # Validate the merged config + config.validate_config() + + # Return the merged config + return config diff --git a/tests/voice/assets/audio_10_16kHz.wav b/tests/voice/assets/audio_10_16kHz.wav new file mode 100644 index 00000000..a6fe0267 Binary files /dev/null and b/tests/voice/assets/audio_10_16kHz.wav differ diff --git a/tests/voice/test_05_utterance.py b/tests/voice/test_05_utterance.py index 9c3c6604..b9b50f93 100644 --- a/tests/voice/test_05_utterance.py +++ b/tests/voice/test_05_utterance.py @@ -10,7 +10,6 @@ from _utils import log_client_messages from speechmatics.voice import AgentServerMessageType -from speechmatics.voice import EndOfTurnConfig from speechmatics.voice import EndOfUtteranceMode from speechmatics.voice import SpeechSegmentConfig from speechmatics.voice import VoiceAgentConfig @@ -232,11 +231,13 @@ async def test_external_vad(): config=VoiceAgentConfig( end_of_utterance_silence_trigger=adaptive_timeout, end_of_utterance_mode=EndOfUtteranceMode.EXTERNAL, - end_of_turn_config=EndOfTurnConfig(use_forced_eou=False), ), ) assert client is not None + # Set FEOU to disabled for offline tests + client._disable_feou_for_testing = True + # Start the queue client._start_stt_queue() @@ -335,7 +336,6 @@ async def test_end_of_utterance_adaptive_vad(): end_of_utterance_silence_trigger=adaptive_timeout, end_of_utterance_mode=EndOfUtteranceMode.ADAPTIVE, speech_segment_config=SpeechSegmentConfig(emit_sentences=False), - end_of_turn_config=EndOfTurnConfig(use_forced_eou=False), ), ) assert client is not None @@ -344,6 +344,9 @@ async def test_end_of_utterance_adaptive_vad(): if SHOW_LOG: log_client_messages(client) + # Set FEOU to disabled for offline tests + client._disable_feou_for_testing = True + # Start the queue client._start_stt_queue() diff --git a/tests/voice/test_07_languages.py b/tests/voice/test_07_languages.py index c83428d5..717efc0f 100644 --- a/tests/voice/test_07_languages.py +++ 
b/tests/voice/test_07_languages.py @@ -14,7 +14,6 @@ from speechmatics.voice import AdditionalVocabEntry from speechmatics.voice import AgentServerMessageType -from speechmatics.voice import EndOfTurnConfig from speechmatics.voice import EndOfUtteranceMode from speechmatics.voice import SpeechSegmentConfig from speechmatics.voice import VoiceAgentConfig @@ -113,22 +112,24 @@ async def test_transcribe_languages(sample: AudioSample): if not API_KEY: pytest.skip("Valid API key required for test") + # Config + config = VoiceAgentConfig( + max_delay=1.2, + end_of_utterance_mode=EndOfUtteranceMode.FIXED, + end_of_utterance_silence_trigger=1.2, + language=sample.language, + additional_vocab=[AdditionalVocabEntry(content=vocab) for vocab in sample.vocab], + speech_segment_config=SpeechSegmentConfig( + emit_sentences=False, + ), + ) + # Client client = await get_client( api_key=API_KEY, url=URL, connect=False, - config=VoiceAgentConfig( - max_delay=1.2, - end_of_utterance_mode=EndOfUtteranceMode.FIXED, - end_of_utterance_silence_trigger=1.2, - language=sample.language, - additional_vocab=[AdditionalVocabEntry(content=vocab) for vocab in sample.vocab], - end_of_turn_config=EndOfTurnConfig(use_forced_eou=False), - speech_segment_config=SpeechSegmentConfig( - emit_sentences=False, - ), - ), + config=config, ) assert client is not None @@ -188,6 +189,10 @@ def log_segment(message): # Extract the last message assert last_message.get("message") == AgentServerMessageType.ADD_SEGMENT + # Close session + await client.disconnect() + assert not client._is_connected + # Check the segment assert len(segments) >= 1 seg0 = segments[0] @@ -216,7 +221,3 @@ def log_segment(message): print(f"Transcribed: [{str_transcribed}]") print(f"CER: {str_cer}") raise AssertionError("Transcription does not match original") - - # Close session - await client.disconnect() - assert not client._is_connected diff --git a/tests/voice/test_08_multiple_speakers.py b/tests/voice/test_08_multiple_speakers.py index 
fa662aa5..b031d4a6 100644 --- a/tests/voice/test_08_multiple_speakers.py +++ b/tests/voice/test_08_multiple_speakers.py @@ -46,37 +46,37 @@ class SpeakerTest(BaseModel): segment_regex=["^Welcome to GeoRouter", "Buckingham", "clarify", "Notting Hill", "Rickmansworth"], speakers_present=["S1", "S2"], ), - SpeakerTest( - id="focus_s2", - path="./assets/audio_02_8kHz.wav", - sample_rate=8000, - segment_regex=["^Welcome to GeoRouter", "Buckingham", "clarify", "Notting Hill"], - speaker_config=SpeakerFocusConfig( - focus_speakers=["S2"], - ), - speakers_present=["S1", "S2"], - ), - SpeakerTest( - id="only_s2", - path="./assets/audio_02_8kHz.wav", - sample_rate=8000, - segment_regex=["Buckingham", "Notting Hill"], - speaker_config=SpeakerFocusConfig( - focus_speakers=["S2"], - focus_mode=SpeakerFocusMode.IGNORE, - ), - speakers_present=["S2"], - ), - SpeakerTest( - id="ignore_s2", - path="./assets/audio_02_8kHz.wav", - sample_rate=8000, - segment_regex=["^Welcome to GeoRouter", "clarify", "Rickmansworth"], - speaker_config=SpeakerFocusConfig( - ignore_speakers=["S2"], - ), - speakers_present=["S1"], - ), + # SpeakerTest( + # id="focus_s2", + # path="./assets/audio_02_8kHz.wav", + # sample_rate=8000, + # segment_regex=["^Welcome to GeoRouter", "Buckingham", "clarify", "Notting Hill"], + # speaker_config=SpeakerFocusConfig( + # focus_speakers=["S2"], + # ), + # speakers_present=["S1", "S2"], + # ), + # SpeakerTest( + # id="only_s2", + # path="./assets/audio_02_8kHz.wav", + # sample_rate=8000, + # segment_regex=["Buckingham", "Notting Hill"], + # speaker_config=SpeakerFocusConfig( + # focus_speakers=["S2"], + # focus_mode=SpeakerFocusMode.IGNORE, + # ), + # speakers_present=["S2"], + # ), + # SpeakerTest( + # id="ignore_s2", + # path="./assets/audio_02_8kHz.wav", + # sample_rate=8000, + # segment_regex=["^Welcome to GeoRouter", "clarify", "Rickmansworth"], + # speaker_config=SpeakerFocusConfig( + # ignore_speakers=["S2"], + # ), + # speakers_present=["S1"], + # ), ] @@ 
-121,6 +121,10 @@ async def test_multiple_speakers(sample: SpeakerTest): config=config, ) + # Debug + print(config.to_json(exclude_none=True, exclude_defaults=True, exclude_unset=True, indent=2)) + print(json.dumps(client._transcription_config.to_dict(), indent=2)) + # Create an event to track when the callback is called messages: list[str] = [] bytes_sent: int = 0 @@ -153,11 +157,21 @@ def log_final_segment(message): client.once(AgentServerMessageType.INFO, log_message) client.on(AgentServerMessageType.WARNING, log_message) client.on(AgentServerMessageType.ERROR, log_message) + client.on(AgentServerMessageType.DIAGNOSTICS, log_message) + + # Transcript + client.on(AgentServerMessageType.ADD_PARTIAL_TRANSCRIPT, log_message) + client.on(AgentServerMessageType.ADD_TRANSCRIPT, log_message) client.on(AgentServerMessageType.ADD_PARTIAL_SEGMENT, log_message) client.on(AgentServerMessageType.ADD_SEGMENT, log_message) + + # Turn events + client.on(AgentServerMessageType.VAD_STATUS, log_message) client.on(AgentServerMessageType.SPEAKER_STARTED, log_message) client.on(AgentServerMessageType.SPEAKER_ENDED, log_message) + client.on(AgentServerMessageType.START_OF_TURN, log_message) client.on(AgentServerMessageType.END_OF_TURN, log_message) + client.on(AgentServerMessageType.END_OF_UTTERANCE, log_message) # Log ADD_SEGMENT client.on(AgentServerMessageType.ADD_SEGMENT, log_final_segment) diff --git a/tests/voice/test_09_speaker_id.py b/tests/voice/test_09_speaker_id.py index 6e8dc0bc..71438aa2 100644 --- a/tests/voice/test_09_speaker_id.py +++ b/tests/voice/test_09_speaker_id.py @@ -11,7 +11,6 @@ from speechmatics.rt import ClientMessageType from speechmatics.voice import AdditionalVocabEntry from speechmatics.voice import AgentServerMessageType -from speechmatics.voice import EndOfTurnConfig from speechmatics.voice import EndOfUtteranceMode from speechmatics.voice import SpeakerIdentifier from speechmatics.voice import SpeechSegmentConfig @@ -59,7 +58,6 @@ async def 
test_extract_speaker_ids(): additional_vocab=[ AdditionalVocabEntry(content="GeoRouter"), ], - end_of_turn_config=EndOfTurnConfig(use_forced_eou=False), ), ) @@ -192,7 +190,6 @@ async def test_known_speakers(): additional_vocab=[ AdditionalVocabEntry(content="GeoRouter"), ], - end_of_turn_config=EndOfTurnConfig(use_forced_eou=False), ), ) @@ -270,7 +267,6 @@ async def test_ignoring_assistant(): additional_vocab=[ AdditionalVocabEntry(content="GeoRouter"), ], - end_of_turn_config=EndOfTurnConfig(use_forced_eou=False), ), ) diff --git a/tests/voice/test_11_audio_buffer.py b/tests/voice/test_11_audio_buffer.py index a10834e9..6472859c 100644 --- a/tests/voice/test_11_audio_buffer.py +++ b/tests/voice/test_11_audio_buffer.py @@ -14,7 +14,6 @@ from speechmatics.voice import AdditionalVocabEntry from speechmatics.voice import AgentServerMessageType -from speechmatics.voice import EndOfTurnConfig from speechmatics.voice import EndOfUtteranceMode from speechmatics.voice import SmartTurnConfig from speechmatics.voice import VoiceAgentConfig @@ -263,7 +262,6 @@ async def save_slice( AdditionalVocabEntry(content="Speechmatics", sounds_like=["speech matics"]), ], smart_turn_config=SmartTurnConfig(enabled=True), - end_of_turn_config=EndOfTurnConfig(use_forced_eou=False), ), ) @@ -369,7 +367,6 @@ async def save_slice( AdditionalVocabEntry(content="Speechmatics", sounds_like=["speech matics"]), ], smart_turn_config=SmartTurnConfig(enabled=True), - end_of_turn_config=EndOfTurnConfig(use_forced_eou=False), ), ) diff --git a/tests/voice/test_17_eou_feou.py b/tests/voice/test_17_eou_feou.py index f78c6abe..4e4e50d1 100644 --- a/tests/voice/test_17_eou_feou.py +++ b/tests/voice/test_17_eou_feou.py @@ -48,41 +48,41 @@ class TranscriptionTests(BaseModel): SAMPLES: TranscriptionTests = TranscriptionTests.from_dict( { "samples": [ - # { - # "id": "07b", - # "path": "./assets/audio_07b_16kHz.wav", - # "sample_rate": 16000, - # "language": "en", - # "segments": [ - # {"text": "Hello.", 
"start_time": 1.05, "end_time": 1.67}, - # {"text": "Tomorrow.", "start_time": 3.5, "end_time": 4.1}, - # {"text": "Wednesday.", "start_time": 6.05, "end_time": 6.73}, - # {"text": "Of course. That's fine.", "start_time": 8.8, "end_time": 9.96}, - # {"text": "Behind.", "start_time": 12.03, "end_time": 12.73}, - # {"text": "In front.", "start_time": 14.84, "end_time": 15.52}, - # {"text": "Do you think so?", "start_time": 17.54, "end_time": 18.32}, - # {"text": "Brilliant.", "start_time": 20.55, "end_time": 21.08}, - # {"text": "Banana.", "start_time": 22.98, "end_time": 23.53}, - # {"text": "When?", "start_time": 25.49, "end_time": 25.96}, - # {"text": "Today.", "start_time": 27.66, "end_time": 28.15}, - # {"text": "This morning.", "start_time": 29.91, "end_time": 30.47}, - # {"text": "Goodbye.", "start_time": 32.21, "end_time": 32.68}, - # ], - # }, - # { - # "id": "08", - # "path": "./assets/audio_08_16kHz.wav", - # "sample_rate": 16000, - # "language": "en", - # "segments": [ - # {"text": "Hello.", "start_time": 0.4, "end_time": 0.75}, - # {"text": "Goodbye.", "start_time": 2.12, "end_time": 2.5}, - # {"text": "Banana.", "start_time": 3.84, "end_time": 4.27}, - # {"text": "Breakaway.", "start_time": 5.62, "end_time": 6.42}, - # {"text": "Before.", "start_time": 7.76, "end_time": 8.16}, - # {"text": "After.", "start_time": 9.56, "end_time": 10.05}, - # ], - # }, + { + "id": "07b", + "path": "./assets/audio_07b_16kHz.wav", + "sample_rate": 16000, + "language": "en", + "segments": [ + {"text": "Hello.", "start_time": 1.05, "end_time": 1.67}, + {"text": "Tomorrow.", "start_time": 3.5, "end_time": 4.1}, + {"text": "Wednesday.", "start_time": 6.05, "end_time": 6.73}, + {"text": "Of course. 
That's fine.", "start_time": 8.8, "end_time": 9.96}, + {"text": "Behind.", "start_time": 12.03, "end_time": 12.73}, + {"text": "In front.", "start_time": 14.84, "end_time": 15.52}, + {"text": "Do you think so?", "start_time": 17.54, "end_time": 18.32}, + {"text": "Brilliant.", "start_time": 20.55, "end_time": 21.08}, + {"text": "Banana.", "start_time": 22.98, "end_time": 23.53}, + {"text": "When?", "start_time": 25.49, "end_time": 25.96}, + {"text": "Today.", "start_time": 27.66, "end_time": 28.15}, + {"text": "This morning.", "start_time": 29.91, "end_time": 30.47}, + {"text": "Goodbye.", "start_time": 32.21, "end_time": 32.68}, + ], + }, + { + "id": "08", + "path": "./assets/audio_08_16kHz.wav", + "sample_rate": 16000, + "language": "en", + "segments": [ + {"text": "Hello.", "start_time": 0.4, "end_time": 0.75}, + {"text": "Goodbye.", "start_time": 2.12, "end_time": 2.5}, + {"text": "Banana.", "start_time": 3.84, "end_time": 4.27}, + {"text": "Breakaway.", "start_time": 5.62, "end_time": 6.42}, + {"text": "Before.", "start_time": 7.76, "end_time": 8.16}, + {"text": "After.", "start_time": 9.56, "end_time": 10.05}, + ], + }, { "id": "09", "path": "./assets/audio_09_16kHz.wav", @@ -97,12 +97,12 @@ class TranscriptionTests(BaseModel): ) # VAD delay -VAD_DELAY_S: list[float] = [0.18, 0.22] +VAD_DELAY_S: list[float] = [0.18] # , 0.22] # Endpoints ENDPOINTS: list[str] = [ - # "wss://eu-west-2-research.speechmatics.cloud/v2", - "wss://eu.rt.speechmatics.com/v2", + "wss://eu-west-2-research.speechmatics.cloud/v2", + # "wss://eu.rt.speechmatics.com/v2", # "wss://us.rt.speechmatics.com/v2", ] @@ -177,6 +177,11 @@ async def run_test(endpoint: str, sample: TranscriptionTest, config: VoiceAgentC # Start time start_time = datetime.datetime.now() + # Zero time (rebinds the enclosing run_test local, not a module global) + def zero_time(message): + nonlocal start_time + start_time = datetime.datetime.now() + # Finalized segment def add_segments(message): segments = message["segments"] @@ -213,6 +218,13 @@ def log_message(message): log =
json.dumps({"ts": round(ts, 3), "payload": message}) print(log) + # Custom listeners + client.on(AgentServerMessageType.RECOGNITION_STARTED, zero_time) + client.on(AgentServerMessageType.END_OF_TURN, eot_detected) + client.on(AgentServerMessageType.ADD_SEGMENT, add_segments) + client.on(AgentServerMessageType.ADD_PARTIAL_TRANSCRIPT, rx_partial) + client.on(AgentServerMessageType.ADD_TRANSCRIPT, rx_partial) + # Add listeners if SHOW_LOG: message_types = [m for m in AgentServerMessageType if m != AgentServerMessageType.AUDIO_ADDED] @@ -220,12 +232,6 @@ def log_message(message): for message_type in message_types: client.on(message_type, log_message) - # Custom listeners - client.on(AgentServerMessageType.END_OF_TURN, eot_detected) - client.on(AgentServerMessageType.ADD_SEGMENT, add_segments) - client.on(AgentServerMessageType.ADD_PARTIAL_TRANSCRIPT, rx_partial) - client.on(AgentServerMessageType.ADD_TRANSCRIPT, rx_partial) - # HEADER if SHOW_LOG: print() @@ -326,7 +332,9 @@ def log_message(message): # Calculate the CER cer = TextUtils.cer(normalized_expected, normalized_received) - print(f"[{idx}] `{normalized_expected}` -> `{normalized_received}` (CER: {cer:.1%})") + # Debug metrics + if SHOW_LOG: + print(f"[{idx}] `{normalized_expected}` -> `{normalized_received}` (CER: {cer:.1%})") # Check CER if cer > CER_THRESHOLD: diff --git a/tests/voice/test_18_feou_timestamp.py b/tests/voice/test_18_feou_timestamp.py new file mode 100644 index 00000000..39d85bfe --- /dev/null +++ b/tests/voice/test_18_feou_timestamp.py @@ -0,0 +1,73 @@ +import os + +import pytest +from _utils import get_client +from _utils import send_silence + +from speechmatics.rt import AudioEncoding +from speechmatics.voice import VoiceAgentConfig + +# Constants +API_KEY = os.getenv("SPEECHMATICS_API_KEY") + +# Skip in CI and when no API key is provided (a list applies both marks; reassigning pytestmark would drop the first) +_SKIP_IN_CI = pytest.mark.skipif(os.getenv("CI") == "true", reason="Skipping in CI") +pytestmark = [_SKIP_IN_CI, pytest.mark.skipif(API_KEY is None, reason="Skipping when no API key is
provided")] + +# How much silence to send (seconds) +SILENCE_DURATION = 3.0 + +# Tolerance for the timestamp check +TOLERANCE = 0.00 + +# Audio format configurations to test: (encoding, chunk_size, bytes_per_sample) +AUDIO_FORMATS = [ + pytest.param(AudioEncoding.PCM_S16LE, 160, 2, id="s16-chunk160"), + pytest.param(AudioEncoding.PCM_S16LE, 320, 2, id="s16-chunk320"), + pytest.param(AudioEncoding.PCM_F32LE, 160, 4, id="f32-chunk160"), + pytest.param(AudioEncoding.PCM_F32LE, 320, 4, id="f32-chunk320"), +] + + +@pytest.mark.asyncio +@pytest.mark.parametrize("encoding,chunk_size,sample_size", AUDIO_FORMATS) +async def test_feou_timestamp(encoding: AudioEncoding, chunk_size: int, sample_size: int): + """Test that audio_seconds_sent correctly computes elapsed audio time. + + Sends 3 seconds of silence (zero bytes) with different audio encodings + and chunk sizes, then verifies that audio_seconds_sent returns the + correct duration. + """ + + # Create and connect client + config = VoiceAgentConfig(audio_encoding=encoding, chunk_size=chunk_size) + client = await get_client( + api_key=API_KEY, + connect=False, + config=config, + ) + + try: + await client.connect() + except Exception: + pytest.skip("Failed to connect to server") + + assert client._is_connected + + # Send 3 seconds of silence + await send_silence( + client, + duration=SILENCE_DURATION, + chunk_size=chunk_size, + sample_size=sample_size, + ) + + # Check the computed audio seconds + actual_seconds = client.audio_seconds_sent + assert ( + abs(actual_seconds - SILENCE_DURATION) <= TOLERANCE + ), f"Expected ~{SILENCE_DURATION}s but got {actual_seconds:.4f}s" + + # Clean up + await client.disconnect() + assert not client._is_connected diff --git a/tests/voice/test_19_no_feou_fix.py b/tests/voice/test_19_no_feou_fix.py new file mode 100644 index 00000000..8ff92870 --- /dev/null +++ b/tests/voice/test_19_no_feou_fix.py @@ -0,0 +1,149 @@ +import json +import os +import shutil +import time + +import pytest +from _utils
# Test module from the patch: tests/voice/test_19_no_feou_fix.py
import json
import os
import shutil
import time

import pytest
from _utils import get_client
from _utils import send_audio_file

from speechmatics.voice import AgentServerMessageType
from speechmatics.voice import EndOfTurnConfig
from speechmatics.voice import EndOfUtteranceMode
from speechmatics.voice import SmartTurnConfig
from speechmatics.voice import VoiceActivityConfig
from speechmatics.voice import VoiceAgentConfig

# Skip for CI testing
pytestmark = pytest.mark.skipif(os.getenv("CI") == "true", reason="Skipping transcription tests in CI")

# Constants
API_KEY = os.getenv("SPEECHMATICS_API_KEY")
SHOW_LOG = os.getenv("SPEECHMATICS_SHOW_LOG", "0").lower() in ["1", "true"]


@pytest.mark.asyncio
async def test_no_feou_fix() -> None:
    """Test for when FEOU is disabled.

    Streams a recorded audio file through a voice agent client and prints
    every server message except AUDIO_ADDED as a colored, single-line log
    entry. The only assertions are on the connection state before and after
    streaming; the printed log is meant for manual inspection.

    The test is skipped when SPEECHMATICS_API_KEY is not set.
    """

    # API key
    if not API_KEY:
        pytest.skip("Valid API key required for test")

    # Config
    # NOTE(review): the docstring says FEOU is disabled, but this config does
    # not set `use_forced_eou` explicitly - confirm the default gives the
    # intended behaviour.
    config = VoiceAgentConfig(
        language="en",
        end_of_utterance_mode=EndOfUtteranceMode.ADAPTIVE,
        end_of_utterance_silence_trigger=0.5,
        smart_turn_config=SmartTurnConfig(enabled=True, smart_turn_threshold=0.80),
        vad_config=VoiceActivityConfig(enabled=True),
        end_of_turn_config=EndOfTurnConfig(base_multiplier=1.0),
    )

    # Debug config
    print(
        config.to_json(
            indent=2,
            exclude_none=True,
            exclude_defaults=True,
            exclude_unset=True,
        )
    )

    # Client (connected later so the listeners are registered first)
    client = await get_client(
        api_key=API_KEY,
        connect=False,
        config=config,
    )

    # Message types to log: everything except the high-volume AUDIO_ADDED
    messages = [message for message in AgentServerMessageType if message != AgentServerMessageType.AUDIO_ADDED]

    # Colors for messages
    colors = {
        "StartOfTurn": "\033[94m",  # Blue
        "EndOfTurn": "\033[92m",  # Green
        "AddSegment": "\033[93m",  # Yellow
        "AddPartialSegment": "\033[38;5;208m",  # Orange
        "SpeakerStarted": "\033[96m",  # Cyan
        "SpeakerEnded": "\033[95m",  # Magenta
        "VadStatus": "\033[91m",  # Red
    }

    # Captured once so every log line shares the same width and time origin
    term_width = shutil.get_terminal_size().columns
    log_start_time = time.monotonic()

    def log_message(message):
        """Log a message with color and formatting."""

        # Elapsed time in seconds since the listeners were set up
        # (right-aligned in 7 columns, 3 decimals)
        elapsed = time.monotonic() - log_start_time
        timestamp = f"{elapsed:>7.3f}"

        # Extract message type and remaining payload (drop noisy keys)
        msg_type = message.get("message", "")
        rest = {k: v for k, v in message.items() if k not in ("message", "format")}

        # Color based on message type (default: dark gray)
        color = colors.get(msg_type, "\033[90m")
        reset = "\033[0m"

        # Format: timestamp - fixed-width type label - JSON payload
        label = f"{msg_type:<20}"
        payload = json.dumps(rest, default=str)
        visible = f"{timestamp} - {label} - {payload}"

        # Truncate to terminal width to prevent wrapping; done before the
        # color codes are added so they do not count towards the width
        if len(visible) > term_width:
            visible = visible[: term_width - 1] + "…"

        # Print with color
        print(f"{color}{visible}{reset}")

    # Add listeners
    for message_type in messages:
        client.on(message_type, log_message)

    # Audio file to stream
    audio_file = "./assets/audio_10_16kHz.wav"

    # HEADER
    if SHOW_LOG:
        print()
        print()
        print("---")

    # Connect
    await client.connect()

    # Check we are connected
    assert client._is_connected

    try:
        # Individual payloads
        await send_audio_file(client, audio_file)
    finally:
        # Close session even if streaming fails, so the connection is not
        # leaked into later tests
        await client.disconnect()

    assert not client._is_connected

    # FOOTER
    if SHOW_LOG:
        print("---")
        print()
        print()