---
# Source: Voxtral-Final/config.yaml at main · devasphn/Voxtral-Final · GitHub
# Configuration for Voxtral Real-time Streaming (FIXED)
# Network endpoints used by the service.
server:
  # 0.0.0.0 is the IPv4 wildcard address.
  host: '0.0.0.0'
  # Main UI server; the WebSocket endpoint is embedded at /ws.
  http_port: 8000
  health_port: 8005
  # Optional separate services; the main WebSocket at /ws does not need them.
  tcp_ports:
    # Separate WebSocket server (optional).
    - 8765
    # TCP server (optional).
    - 8766

# Model selection and loading options.
model:
  name: 'mistralai/Voxtral-Mini-3B-2507'
  cache_dir: './model_cache'
  device: 'cuda'
  # float16 was chosen by the original author for GPU performance (their note
  # says it is faster than bfloat16).
  # NOTE(review): that claim cannot be verified from this file.
  torch_dtype: 'float16'
  max_memory_per_gpu: '6GB'
  # Authentication and loading options.
  # Set to true if HF_TOKEN is required.
  require_auth_token: true
  # Safetensors is enabled by default; the original note mentions a fallback
  # if it fails (that logic lives in the loader, not in this file).
  use_safetensors: true

# Audio input format.
audio:
  sample_rate: 16000
  # Kept small for low latency (the stated goal is <500 ms).
  chunk_size: 256
  format: 'int16'
  channels: 1
  # Minimal frame duration, also for low latency.
  frame_duration_ms: 10

# Voice activity detection (VAD), tuned for low latency.
vad:
  # Lower threshold for faster detection.
  threshold: 0.005
  # Reduced from 200 ms.
  min_voice_duration_ms: 150
  # Reduced from 800 ms.
  min_silence_duration_ms: 600
  # Smaller chunks for faster processing.
  chunk_size_ms: 20
  # Minimal overlap.
  overlap_ms: 5
  # High sensitivity for real-time use.
  sensitivity: 'high'

# Mel spectrogram parameters, intended to match Voxtral.
spectrogram:
  n_mels: 128
  hop_length: 160
  win_length: 400
  # Raised to 1024 to resolve a mel filterbank warning.
  # NOTE(review): n_fft is larger than win_length here; confirm the resulting
  # features still match what the model expects.
  n_fft: 1024

# Connection and buffering limits for streaming.
streaming:
  max_connections: 100
  # Minimal buffer for lowest latency (was 2048).
  buffer_size: 1024
  timeout_seconds: 300
  # Aggressive latency target (was 100).
  latency_target_ms: 50

# Text-to-speech (TTS) settings for the Kokoro engine.
tts:
  engine: 'kokoro'
  # Hindi female voice, chosen for an Indian accent (was "af_heart").
  default_voice: 'hf_alpha'
  # Standardized to 16 kHz to match the audio pipeline.
  sample_rate: 16000
  enabled: true
  # Kokoro-specific settings.
  # Hindi female voice for Indian-accent English (was "af_heart").
  voice: 'hf_alpha'
  # Kokoro speech speed.
  speed: 1.0
  # Hindi language code, for an Indian accent (was "a").
  lang_code: 'h'
  # Available Kokoro voices, grouped by language.
  voices:
    english:
      - 'af_heart'
      - 'af_bella'
      - 'af_nicole'
      - 'af_sarah'
    hindi:
      - 'hm_omega'
      - 'hf_alpha'
      - 'hf_beta'
      - 'hm_psi'
  # Low-latency performance settings.
  performance:
    # Single batch for lowest latency.
    batch_size: 1
    # Minimal queue for low latency.
    max_queue_size: 2
    # Single worker for consistency.
    num_workers: 1
    # Target value; the stated chunking requirement is <200 ms.
    target_latency_ms: 150

# Speech-to-speech pipeline settings, tuned for low latency.
speech_to_speech:
  enabled: true
  # TTS chunking target; the stated requirement is <200 ms.
  latency_target_ms: 200
  # Smaller buffer for lower latency.
  buffer_size: 2048
  output_format: 'wav'
  # Balanced between speed and quality.
  quality: 'balanced'
  emotional_expression: true

  # Streaming behaviour.
  streaming:
    enabled: true
    # Word-level mode for low latency.
    mode: 'word_level'
    # Start TTS after 2 words for a faster response.
    words_trigger_threshold: 2
    # Shorter responses for faster processing.
    max_tokens: 100
    interruption_detection: true
    # Fast interruption detection.
    interruption_threshold_ms: 50

  # Voice-agent targets, aligned with the <500 ms end-to-end goal.
  voice_agent:
    # First word within 100 ms.
    first_word_target_ms: 100
    # Word to audio within 150 ms.
    word_to_audio_target_ms: 150
    # Fast interruption response.
    interruption_response_ms: 50
    # Single stream for consistency.
    concurrent_tts_streams: 1

# User interface settings.
ui:
  # Either "simple" or "advanced"; simple has only Connect, Start, Status.
  mode: 'simple'
  # Either "dark" or "light".
  theme: 'dark'
  show_latency_details: true
  enable_performance_indicators: true

# Logging settings: level, message format string, and log file path.
logging:
  level: 'INFO'
  format: '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
  file: './logs/voxtral_streaming.log'