-
Notifications
You must be signed in to change notification settings - Fork 4
Expand file tree
/
Copy pathlanguage_detector.py
More file actions
116 lines (95 loc) · 3.26 KB
/
language_detector.py
File metadata and controls
116 lines (95 loc) · 3.26 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
"""Language detection utility for multilingual support.
Detects Estonian, Russian, and English based on character patterns and common words.
"""
import re
import unicodedata
from typing import Literal

from loguru import logger
# Closed set of language codes handled by this module:
# "et" (Estonian), "ru" (Russian) and "en" (English).
LanguageCode = Literal["et", "ru", "en"]
# Only the first characters of the input are inspected; enough for a reliable guess.
_SAMPLE_CHARS = 500
# Share of recognised letters that must be Cyrillic before the text counts as Russian.
_CYRILLIC_RATIO_THRESHOLD = 0.25
# Texts with fewer words than this need one marker word; longer texts need two.
_SHORT_TEXT_WORDS = 10

# Russian Cyrillic letters; "ё"/"Ё" lie outside the а-я / А-Я ranges and are listed explicitly.
_CYRILLIC_RE = re.compile(r"[а-яА-ЯёЁ]")
# Every letter the detector recognises: basic Latin, Russian Cyrillic, Estonian extras.
_KNOWN_ALPHA_RE = re.compile(r"[a-zA-Zа-яА-ЯёЁõäöüšžÕÄÖÜŠŽ]")
_ESTONIAN_CHAR_RE = re.compile(r"[õäöüšž]", re.IGNORECASE)
_WORD_RE = re.compile(r"\b\w+\b")

# Common Estonian words used as a fallback when no Estonian-specific letter is present.
# "võib" and "või" contain "õ", so the character check normally catches them first.
# NOTE(review): "see" is also an everyday English word, so a short English sentence
# containing it reaches the single-marker threshold and is reported as Estonian.
# Consider dropping or down-weighting it.
_ESTONIAN_MARKERS = frozenset(
    {
        "kuidas",
        "miks",
        "kus",
        "millal",
        "kes",
        "võib",
        "olen",
        "oled",
        "see",
        "seda",
        "jah",
        "või",
        "ning",
        "siis",
        "veel",
        "aga",
        "kuid",
        "nii",
        "nagu",
        "oli",
        "mis",
    }
)


def detect_language(text: str) -> LanguageCode:
    """
    Detect language from input text.

    Detection Strategy:
        1. Check for Cyrillic characters (Russian)
        2. Check for Estonian-specific characters
        3. Check for Estonian common words
        4. Default to English

    Only the first 500 characters of the stripped text are analysed. The sample
    is normalised to Unicode NFC first, so letters typed or stored in decomposed
    form (for example "o" followed by a combining tilde) are recognised the same
    way as their precomposed equivalents ("õ").

    Args:
        text: Input text to analyze

    Returns:
        Language code: 'et' (Estonian), 'ru' (Russian), 'en' (English).
        Empty or whitespace-only input logs a warning and returns 'en'.

    Examples:
        >>> detect_language("Mis on sünnitoetus?")
        'et'
        >>> detect_language("Что такое пособие?")
        'ru'
        >>> detect_language("What is the benefit?")
        'en'
    """
    stripped = text.strip() if text else ""
    if not stripped:
        logger.warning(
            "Empty text provided for language detection, defaulting to English"
        )
        return "en"

    # Truncate before normalising so the cost stays bounded for very long inputs.
    text_sample = unicodedata.normalize("NFC", stripped[:_SAMPLE_CHARS])

    # Step 1: Russian, decided by the share of Cyrillic among recognised letters,
    # so that a stray Cyrillic character in Latin text does not flip the result.
    cyrillic_count = len(_CYRILLIC_RE.findall(text_sample))
    total_alpha = len(_KNOWN_ALPHA_RE.findall(text_sample))
    if total_alpha > 0:
        cyrillic_ratio = cyrillic_count / total_alpha
        if cyrillic_ratio > _CYRILLIC_RATIO_THRESHOLD:
            logger.debug(
                f"Detected Russian (Cyrillic: {cyrillic_count}/{total_alpha} = {cyrillic_ratio:.1%})"
            )
            return "ru"

    # Step 2: any Estonian-specific letter (õ, ä, ö, ü, š, ž) settles it.
    estonian_chars = _ESTONIAN_CHAR_RE.findall(text_sample)
    if estonian_chars:
        logger.debug(f"Detected Estonian (special chars: {len(estonian_chars)})")
        return "et"

    # Step 3: count common Estonian words among the lower-cased tokens.
    words = _WORD_RE.findall(text_sample.lower())
    estonian_word_count = sum(word in _ESTONIAN_MARKERS for word in words)

    # Longer texts must show more evidence before being classified as Estonian.
    threshold = 1 if len(words) < _SHORT_TEXT_WORDS else 2
    if estonian_word_count >= threshold:
        logger.debug(
            f"Detected Estonian (marker words: {estonian_word_count}/{len(words)}, threshold: {threshold})"
        )
        return "et"

    # Step 4: nothing matched, fall back to English.
    logger.debug("Detected English (default)")
    return "en"
def get_language_name(language_code: LanguageCode) -> str:
    """
    Translate a language code into its English display name.

    Args:
        language_code: One of the codes 'et', 'ru' or 'en'

    Returns:
        'Estonian', 'Russian' or 'English' for a known code;
        'Unknown' for any other value
    """
    display_names = dict(et="Estonian", ru="Russian", en="English")
    try:
        return display_names[language_code]
    except KeyError:
        # Unrecognised codes get a placeholder name instead of raising.
        return "Unknown"