Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 9 additions & 2 deletions nemo/collections/common/tokenizers/text_to_speech/ipa_lexicon.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@

# fmt: off

SUPPORTED_LOCALES = ["en-US", "de-DE", "es-ES", "it-IT", "fr-FR", "vi-VN", "ja-JP", "hi-IN"]
# Locales with IPA character-set / punctuation support; "pt-BR" (Brazilian
# Portuguese) is the newly added entry. Space after the comma restored for
# PEP 8 consistency with the rest of the list.
SUPPORTED_LOCALES = ["en-US", "de-DE", "es-ES", "it-IT", "fr-FR", "vi-VN", "ja-JP", "hi-IN", "pt-BR"]

DEFAULT_PUNCTUATION = (
',', '.', '!', '?', '-',
Expand Down Expand Up @@ -106,7 +106,7 @@
'ॅ', 'ॉ', 'ँ', 'ं', 'ः', '्', '़', 'ॊ', 'ॢ', 'ॣ', 'ॆ',
# Danda (period)
'।',
),
)
}

IPA_CHARACTER_SETS = {
Expand Down Expand Up @@ -347,5 +347,12 @@ def get_ipa_punctuation_list(locale):
'・',
]
)
elif locale == "hi-IN":
punct_set.update(
[
'।',
'॥',
]
)
punct_list = sorted(list(punct_set))
return punct_list
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
"any_locale_word_tokenize",
"english_word_tokenize",
"LATIN_CHARS_ALL",
"INDIC_CHARS_ALL",
"normalize_unicode_text",
"japanese_text_preprocessing",
]
Expand All @@ -52,11 +53,24 @@
LATIN_ALPHABET_BASIC = "A-Za-z"
ACCENTED_CHARS = "À-ÖØ-öø-ÿ"
LATIN_CHARS_ALL = f"{LATIN_ALPHABET_BASIC}{ACCENTED_CHARS}"

# Indic characters based on https://www.unicode.org/charts/
# Hindi, Marathi, Nepali, Sanskrit https://en.wikipedia.org/wiki/Devanagari_(Unicode_block)
DEVANAGARI_CHARS = (
r'\u0900-\u0963\u0966-\u097F' # excluding danda (U+0964), double danda (U+0965) so they are treated as punctuation
)
BENGALI_CHARS = r'\u0980-\u09FF' # Bengali, Assamese
TAMIL_CHARS = r'\u0B80-\u0BFF' # Tamil
TELUGU_CHARS = r'\u0C00-\u0C7F' # Telugu
KANNADA_CHARS = r'\u0C80-\u0CFF' # Kannada
GUJARATI_CHARS = r'\u0A80-\u0AFF' # Gujarati
INDIC_CHARS_ALL = f"{DEVANAGARI_CHARS}{BENGALI_CHARS}{TAMIL_CHARS}{TELUGU_CHARS}{KANNADA_CHARS}{GUJARATI_CHARS}"

_WORDS_RE_EN = re.compile(
fr"([{LATIN_ALPHABET_BASIC}]+(?:[{LATIN_ALPHABET_BASIC}\-']*[{LATIN_ALPHABET_BASIC}]+)*)|(\|[^|]*\|)|([^{LATIN_ALPHABET_BASIC}|]+)"
)
_WORDS_RE_ANY_LOCALE = re.compile(
fr"([{LATIN_CHARS_ALL}]+(?:[{LATIN_CHARS_ALL}\-']*[{LATIN_CHARS_ALL}]+)*)|(\|[^|]*\|)|([^{LATIN_CHARS_ALL}|]+)"
fr"([{LATIN_CHARS_ALL}{INDIC_CHARS_ALL}]+(?:[{LATIN_CHARS_ALL}{INDIC_CHARS_ALL}\-']*[{LATIN_CHARS_ALL}{INDIC_CHARS_ALL}]+)*)|(\|[^|]*\|)|([^{LATIN_CHARS_ALL}{INDIC_CHARS_ALL}|]+)"
)


Expand Down
28 changes: 22 additions & 6 deletions nemo/collections/tts/g2p/models/i18n_ipa.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@

from nemo.collections.common.tokenizers.text_to_speech.ipa_lexicon import validate_locale
from nemo.collections.common.tokenizers.text_to_speech.tokenizer_utils import (
INDIC_CHARS_ALL,
LATIN_CHARS_ALL,
any_locale_word_tokenize,
english_word_tokenize,
Expand All @@ -29,18 +30,22 @@
from nemo.collections.tts.g2p.utils import GRAPHEME_CASE_MIXED, GRAPHEME_CASE_UPPER, set_grapheme_case
from nemo.utils import logging

# Compiled regex pattern for Indic scripts (used in dictionary parsing).
# Matches when the string starts with a character from any supported Indic
# Unicode block (INDIC_CHARS_ALL); callers test it via .match() on a single
# character, so the leading '^' is redundant but harmless.
_INDIC_PATTERN = re.compile(f'^[{INDIC_CHARS_ALL}]')


class IpaG2p(BaseG2p):
# fmt: off
STRESS_SYMBOLS = ["ˈ", "ˌ"]
# Regex for roman characters, accented characters, and locale-agnostic numbers/digits
CHAR_REGEX = re.compile(fr"[{LATIN_CHARS_ALL}\d]")
PUNCT_REGEX = re.compile(fr"[^{LATIN_CHARS_ALL}\d]")
CHAR_REGEX = re.compile(fr"[{LATIN_CHARS_ALL}{INDIC_CHARS_ALL}\d]")
PUNCT_REGEX = re.compile(fr"[^{LATIN_CHARS_ALL}{INDIC_CHARS_ALL}\d]")
# fmt: on

def __init__(
self,
phoneme_dict: Union[str, pathlib.Path, Dict[str, List[List[str]]]],
# phoneme_dict: Union[str, pathlib.Path, Dict[str, List[List[str]]]],
phoneme_dict: Union[str, pathlib.Path, List[Union[str, pathlib.Path]], Dict[str, List[List[str]]]],
locale: str = "en-US",
apply_to_oov_word: Optional[Callable[[str], str]] = None,
ignore_ambiguous_words: bool = True,
Expand Down Expand Up @@ -154,10 +159,10 @@ def __init__(

@staticmethod
def _parse_phoneme_dict(
phoneme_dict: Union[str, pathlib.Path, Dict[str, List[List[str]]]]
phoneme_dict: Union[str, pathlib.Path, List[Union[str, pathlib.Path]], Dict[str, List[List[str]]]]
) -> Dict[str, List[List[str]]]:
"""
parse an input IPA dictionary and save it as a dict object.
parse an input IPA dictionary (or multiple) and save it as a dict object.

Args:
phoneme_dict (Union[str, pathlib.Path, dict]): Path to file in CMUdict format or an IPA dict object with
Expand All @@ -167,6 +172,14 @@ def _parse_phoneme_dict(

Returns: a dict object (Dict[str, List[List[str]]]).
"""
if isinstance(phoneme_dict, list):
merged = defaultdict(list)
for path in phoneme_dict:
parsed = IpaG2p._parse_phoneme_dict(path)
for word, prons in parsed.items():
merged[word].extend(prons)
return merged

if isinstance(phoneme_dict, str) or isinstance(phoneme_dict, pathlib.Path):
# load the dictionary file where there may exist a digit suffix after a word, e.g. "Word(2)", which
# represents the pronunciation variant of that word.
Expand All @@ -190,6 +203,7 @@ def _parse_phoneme_dict(
or 'À' <= line[0] <= 'Ö'
or 'Ø' <= line[0] <= 'ö'
or 'ø' <= line[0] <= 'ÿ'
or _INDIC_PATTERN.match(line[0])
or line[0] == "'"
):
parts = line.strip().split(maxsplit=1)
Expand Down Expand Up @@ -217,7 +231,9 @@ def _parse_phoneme_dict(

return phoneme_dict_obj

def replace_dict(self, phoneme_dict: Union[str, pathlib.Path, Dict[str, List[List[str]]]]):
def replace_dict(
self, phoneme_dict: Union[str, pathlib.Path, List[Union[str, pathlib.Path]], Dict[str, List[List[str]]]]
):
"""
Replace model's phoneme dictionary with a custom one
"""
Expand Down
Loading
Loading