Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@

import asyncio
import os
import re
from dataclasses import dataclass, replace
from typing import Literal

Expand Down Expand Up @@ -44,6 +45,23 @@
48000: "raw-48khz-16bit-mono-pcm",
}

_VOICE_LOCALE_RE = re.compile(r"^([a-z]{2}-[A-Z]{2})-")


def _voice_locale(voice: str) -> str | None:
match = _VOICE_LOCALE_RE.match(voice)
return match.group(1) if match else None


def _should_wrap_with_lang(voice: str, language: str | None) -> bool:
if not language:
return False

voice_locale = _voice_locale(voice)
return bool(
voice_locale and "MultilingualNeural" in voice and language.lower() != voice_locale.lower()
)


@dataclass
class ProsodyConfig:
Expand Down Expand Up @@ -249,11 +267,15 @@ def __init__(self, *, tts: TTS, input_text: str, conn_options: APIConnectOptions

def _build_ssml(self) -> str:
lang = self._opts.language or "en-US"
voice_locale = _voice_locale(self._opts.voice)
wrap_with_lang = _should_wrap_with_lang(self._opts.voice, self._opts.language)
root_lang = voice_locale if wrap_with_lang and voice_locale else lang

ssml = (
f'<speak version="1.0" '
f'xmlns="http://www.w3.org/2001/10/synthesis" '
f'xmlns:mstts="http://www.w3.org/2001/mstts" '
f'xml:lang="{lang}">'
f'xml:lang="{root_lang}">'
)
ssml += f'<voice name="{self._opts.voice}">'

Expand All @@ -264,6 +286,9 @@ def _build_ssml(self) -> str:
degree = f' styledegree="{self._opts.style.degree}"' if self._opts.style.degree else ""
ssml += f'<mstts:express-as style="{self._opts.style.style}"{degree}>'

if wrap_with_lang:
ssml += f'<lang xml:lang="{lang}">'

if is_given(self._opts.prosody):
p = self._opts.prosody

Expand All @@ -274,6 +299,9 @@ def _build_ssml(self) -> str:
else:
ssml += self.input_text

if wrap_with_lang:
ssml += "</lang>"

if is_given(self._opts.style):
ssml += "</mstts:express-as>"

Expand Down
77 changes: 77 additions & 0 deletions tests/test_plugin_azure_tts.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
from dataclasses import replace

from livekit.plugins import azure
from livekit.plugins.azure import tts as azure_tts


def _build_ssml(*, voice: str, language: str | None = None, text: str = "Merhaba") -> str:
    """Return the SSML that a ``ChunkedStream`` builds for the given voice/language/text.

    A real ``azure.TTS`` is constructed (with dummy credentials) only to obtain its
    options; the stream itself is allocated without running ``__init__`` and gets
    just the two attributes assigned below.
    """
    synthesizer = azure.TTS(
        speech_key="test-key",
        speech_region="westus",
        voice=voice,
        language=language,
    )

    # Bypass ChunkedStream.__init__ entirely; only populate what is set here.
    chunked = object.__new__(azure_tts.ChunkedStream)
    chunked._opts = replace(synthesizer._opts)  # copy of the options dataclass
    chunked._input_text = text

    return chunked._build_ssml()


def test_azure_tts_wraps_cross_locale_multilingual_voice_with_lang_tag():
    """A multilingual voice asked to speak another locale wraps the text in ``<lang>``."""
    rendered = _build_ssml(voice="en-US-JennyMultilingualNeural", language="tr-TR")

    expected_fragments = (
        '<speak version="1.0"',
        'xml:lang="en-US"',
        '<voice name="en-US-JennyMultilingualNeural">',
        '<lang xml:lang="tr-TR">Merhaba</lang>',
    )
    for fragment in expected_fragments:
        assert fragment in rendered


def test_azure_tts_does_not_wrap_same_locale_multilingual_voice():
    """A multilingual voice used with its own locale emits no ``<lang>`` wrapper."""
    rendered = _build_ssml(voice="en-US-JennyMultilingualNeural", language="en-US")

    expected_fragments = (
        '<speak version="1.0"',
        'xml:lang="en-US"',
        '<voice name="en-US-JennyMultilingualNeural">Merhaba</voice>',
    )
    for fragment in expected_fragments:
        assert fragment in rendered

    assert "<lang xml:lang=" not in rendered


def test_azure_tts_does_not_wrap_non_multilingual_voice():
    """A non-multilingual voice emits no ``<lang>`` wrapper and keeps the given root language."""
    rendered = _build_ssml(voice="tr-TR-EmelNeural", language="tr-TR")

    expected_fragments = (
        '<speak version="1.0"',
        'xml:lang="tr-TR"',
        '<voice name="tr-TR-EmelNeural">Merhaba</voice>',
    )
    for fragment in expected_fragments:
        assert fragment in rendered

    assert "<lang xml:lang=" not in rendered


def test_azure_tts_keeps_default_language_without_explicit_language():
    """With no language given, the root ``xml:lang`` is ``en-US`` and nothing is wrapped."""
    rendered = _build_ssml(voice="tr-TR-EmelNeural", language=None)

    expected_fragments = (
        '<speak version="1.0"',
        'xml:lang="en-US"',
        '<voice name="tr-TR-EmelNeural">Merhaba</voice>',
    )
    for fragment in expected_fragments:
        assert fragment in rendered

    assert "<lang xml:lang=" not in rendered


def test_azure_tts_does_not_wrap_multilingual_voice_without_explicit_language():
    """A multilingual voice with no language given is not wrapped in ``<lang>``."""
    rendered = _build_ssml(voice="fr-FR-VivienneMultilingualNeural", language=None)

    expected_fragments = (
        '<speak version="1.0"',
        'xml:lang="en-US"',
        '<voice name="fr-FR-VivienneMultilingualNeural">Merhaba</voice>',
    )
    for fragment in expected_fragments:
        assert fragment in rendered

    assert "<lang xml:lang=" not in rendered
Loading