Skip to content

Commit 23b2422

Browse files
author
Jason Roche
committed
Add Kannada (kn-IN) G2P support for TTS
- Add KannadaG2p class with hybrid dictionary + rule-based IPA conversion
- Add Kannada grapheme and IPA character sets to ipa_lexicon.py
- Add kn-IN locale support with punctuation handling
- Include lexicon with 4264 Kannada words
- Add test script with assertions for validation

The G2P module handles:
- All Kannada vowels, consonants, matras (dependent vowels)
- Virama (halant), anusvara, visarga
- Anusvara place assimilation based on following consonant

Signed-off-by: Jason Roche <jas.tech23@gmail.com>
1 parent 8e2905c commit 23b2422

4 files changed

Lines changed: 4955 additions & 1 deletion

File tree

nemo/collections/common/tokenizers/text_to_speech/ipa_lexicon.py

Lines changed: 58 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -15,7 +15,7 @@
1515

1616
# fmt: off
1717

18-
SUPPORTED_LOCALES = ["en-US", "de-DE", "es-ES", "it-IT", "fr-FR", "vi-VN", "ja-JP", "hi-IN"]
18+
SUPPORTED_LOCALES = ["en-US", "de-DE", "es-ES", "it-IT", "fr-FR", "vi-VN", "ja-JP", "hi-IN", "kn-IN"]
1919

2020
DEFAULT_PUNCTUATION = (
2121
',', '.', '!', '?', '-',
@@ -107,6 +107,31 @@
107107
# Danda (period)
108108
'।',
109109
),
110+
"kn-IN": (
111+
# Independent Vowels (Swaras)
112+
'ಅ', 'ಆ', 'ಇ', 'ಈ', 'ಉ', 'ಊ', 'ಋ', 'ೠ', 'ಌ', 'ೡ',
113+
'ಎ', 'ಏ', 'ಐ', 'ಒ', 'ಓ', 'ಔ',
114+
# Consonants (Vyanjanas)
115+
# Velar
116+
'ಕ', 'ಖ', 'ಗ', 'ಘ', 'ಙ',
117+
# Palatal
118+
'ಚ', 'ಛ', 'ಜ', 'ಝ', 'ಞ',
119+
# Retroflex
120+
'ಟ', 'ಠ', 'ಡ', 'ಢ', 'ಣ',
121+
# Dental
122+
'ತ', 'ಥ', 'ದ', 'ಧ', 'ನ',
123+
# Labial
124+
'ಪ', 'ಫ', 'ಬ', 'ಭ', 'ಮ',
125+
# Approximants and others
126+
'ಯ', 'ರ', 'ಱ', 'ಲ', 'ಳ', 'ೞ', 'ವ', 'ಶ', 'ಷ', 'ಸ', 'ಹ',
127+
# Dependent Vowel Signs (Matras)
128+
'ಾ', 'ಿ', 'ೀ', 'ು', 'ೂ', 'ೃ', 'ೄ', 'ೆ', 'ೇ', 'ೈ', 'ೊ', 'ೋ', 'ೌ',
129+
# Various Signs
130+
'ಂ', # Anusvara
131+
'ಃ', # Visarga
132+
'್', # Virama (Halant)
133+
'ಽ', # Avagraha
134+
),
110135
}
111136

112137
IPA_CHARACTER_SETS = {
@@ -183,6 +208,30 @@
183208
'ɡ', 'ɣ', 'ɪ', 'ɭ', 'ɲ', 'ɳ', 'ɾ', 'ʂ', 'ʃ', 'ʈ',
184209
'ʊ', 'ʋ', 'ʌ', 'ʰ', 'ː', '̃', '̩', 'χ',
185210
),
211+
# Kannada IPA phoneme set (split form - all modifiers as separate tokens)
212+
"kn-IN": (
213+
# Vowels (monophthongs) - base forms only
214+
'a', 'i', 'u', 'e', 'o',
215+
'ə', # schwa (inherent vowel, sometimes realized)
216+
'ɯ', # close back unrounded vowel (for vocalic R: ಋ, ೃ)
217+
# Consonants - Stops (base forms only)
218+
'k', 'g', # Velar
219+
'ʈ', 'ɖ', # Retroflex
220+
't', 'd', # Dental
221+
'p', 'b', # Labial
222+
# Nasals
223+
'ŋ', 'ɲ', 'ɳ', 'n', 'm',
224+
# Approximants
225+
'j', 'ʋ', 'w',
226+
# Liquids
227+
'r', 'ɾ', 'l', 'ɭ', 'ɻ',
228+
# Fricatives/Affricates (ʃ, ʒ used in affricates tʃ, dʒ)
229+
'ʃ', 'ʒ', 'ʂ', 's', 'h',
230+
# Modifiers (separate tokens, like Hindi/Japanese)
231+
'ʰ', # Aspiration marker
232+
'ː', # Length marker
233+
'̃', # Nasalization (combining tilde)
234+
),
186235
}
187236

188237
GRAPHEME_CHARACTER_CASES = ["upper", "lower", "mixed"]
@@ -347,5 +396,13 @@ def get_ipa_punctuation_list(locale):
347396
'・',
348397
]
349398
)
399+
elif locale == "kn-IN":
400+
# Kannada punctuation
401+
punct_set.update(
402+
[
403+
'।', # Devanagari Danda (single)
404+
'॥', # Devanagari Double Danda
405+
]
406+
)
350407
punct_list = sorted(list(punct_set))
351408
return punct_list

0 commit comments

Comments
 (0)