-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathconfig.yaml
More file actions
105 lines (94 loc) · 4.17 KB
/
config.yaml
File metadata and controls
105 lines (94 loc) · 4.17 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
# Configuration for Voxtral Real-time Streaming (FIXED)
# Network endpoints for the streaming service.
# NOTE(review): nesting rebuilt from a copy whose indentation was stripped;
# confirm the layout against the config loader.
server:
  host: "0.0.0.0"  # IPv4 wildcard address, i.e. not restricted to loopback
  http_port: 8000  # main UI server; the WebSocket is embedded at /ws
  health_port: 8005
  # Optional separate services (not needed for the main WebSocket).
  tcp_ports:
    - 8765  # separate WebSocket server (optional)
    - 8766  # TCP server (optional)
# Model selection and loading options.
# NOTE(review): nesting rebuilt from a copy whose indentation was stripped;
# confirm the layout against the config loader.
model:
  name: "mistralai/Voxtral-Mini-3B-2507"
  cache_dir: "./model_cache"
  device: "cuda"
  # Original note: float16 was chosen for GPU performance over bfloat16.
  # NOTE(review): that speed claim is not demonstrated in this file;
  # benchmark on the target GPU before relying on it.
  torch_dtype: "float16"
  max_memory_per_gpu: "6GB"
  # Authentication and loading options.
  require_auth_token: true  # set to true if HF_TOKEN is required
  # Original note: safetensors is enabled by default, with a fallback if it
  # fails (the fallback lives in the loader, not in this file).
  use_safetensors: true
# Audio capture format.
# NOTE(review): nesting rebuilt from a copy whose indentation was stripped;
# confirm the layout against the config loader.
audio:
  sample_rate: 16000
  # Reduced as part of the <500ms latency goal.
  # NOTE(review): if this is a sample count, 256 samples at 16000 Hz is
  # 16 ms, which does not match frame_duration_ms below - confirm units.
  chunk_size: 256
  format: "int16"
  channels: 1
  frame_duration_ms: 10  # kept minimal for low latency
# Low-latency VAD configuration.
# NOTE(review): reconstructed as a top-level section; the source copy had no
# indentation, so confirm whether the loader expects this under `audio`.
vad:
  threshold: 0.005  # lower threshold for faster detection
  min_voice_duration_ms: 150  # reduced from 200ms
  min_silence_duration_ms: 600  # reduced from 800ms
  chunk_size_ms: 20  # smaller chunks for faster processing
  overlap_ms: 5  # minimal overlap
  sensitivity: "high"  # high sensitivity for real-time use
# Mel spectrogram configuration matching Voxtral.
# NOTE(review): nesting rebuilt from a copy whose indentation was stripped;
# confirm the layout against the config loader.
spectrogram:
  n_mels: 128
  hop_length: 160
  win_length: 400
  n_fft: 1024  # raised to 1024 to resolve the mel filterbank warning
# Connection-level streaming limits.
# NOTE(review): nesting rebuilt from a copy whose indentation was stripped;
# confirm the layout against the config loader.
streaming:
  max_connections: 100
  buffer_size: 1024  # minimal buffer for lowest latency (was 2048)
  timeout_seconds: 300
  latency_target_ms: 50  # aggressive target (was 100)
# TTS (Text-to-Speech) configuration - Kokoro TTS.
# NOTE(review): the source copy had no indentation. `voices` and
# `performance` are reconstructed as children of `tts`; confirm against the
# config loader, in particular whether `performance` is top-level instead.
tts:
  engine: "kokoro"
  # Hindi female voice for an Indian accent (was "af_heart").
  default_voice: "hf_alpha"
  sample_rate: 16000  # standardized to 16kHz to match the audio pipeline
  enabled: true
  # Kokoro TTS settings.
  # Same voice as default_voice above; keep the two in sync when changing.
  voice: "hf_alpha"
  speed: 1.0  # Kokoro speech speed
  lang_code: "h"  # Hindi language code for an Indian accent (was "a")
  # Available voices by language - Kokoro voices only.
  voices:
    english: ["af_heart", "af_bella", "af_nicole", "af_sarah"]
    hindi: ["hm_omega", "hf_alpha", "hf_beta", "hm_psi"]
  # Low-latency performance settings.
  performance:
    batch_size: 1  # single batch for lowest latency
    max_queue_size: 2  # minimal queue for low latency
    num_workers: 1  # single worker for consistency
    target_latency_ms: 150  # stays under the <200ms chunking requirement
# Low-latency speech-to-speech pipeline configuration.
# NOTE(review): the source copy had no indentation. The inner `streaming`
# mapping is nested here because a second top-level `streaming` key would be
# a duplicate; `voice_agent` is placed alongside it as a best guess. Confirm
# both against the config loader.
speech_to_speech:
  enabled: true
  latency_target_ms: 200  # target: <200ms TTS chunking, per requirements
  buffer_size: 2048  # smaller buffer for lower latency
  output_format: "wav"
  quality: "balanced"  # balance between speed and quality
  emotional_expression: true
  # Streaming configuration.
  streaming:
    enabled: true
    mode: "word_level"  # word-level streaming for low latency
    words_trigger_threshold: 2  # start TTS after 2 words
    max_tokens: 100  # shorter responses for faster processing
    interruption_detection: true
    interruption_threshold_ms: 50  # fast interruption detection
  # Voice agent targets, aligned with the <500ms end-to-end goal.
  voice_agent:
    first_word_target_ms: 100  # first word within 100ms
    word_to_audio_target_ms: 150  # word to audio within 150ms
    interruption_response_ms: 50  # fast interruption response
    concurrent_tts_streams: 1  # single stream for consistency
# UI configuration.
# NOTE(review): nesting rebuilt from a copy whose indentation was stripped;
# confirm the layout against the config loader.
ui:
  # "simple" or "advanced"; simple shows only Connect, Start, Status.
  mode: "simple"
  theme: "dark"  # "dark" or "light"
  show_latency_details: true
  enable_performance_indicators: true
# Logging output settings.
# NOTE(review): nesting rebuilt from a copy whose indentation was stripped;
# confirm the layout against the config loader.
logging:
  level: "INFO"
  # %-style format string; must stay quoted because it begins with `%`.
  format: "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
  file: "./logs/voxtral_streaming.log"