From da4c8d258f6dfe892c94afc0494b315155a66417 Mon Sep 17 00:00:00 2001 From: team-coding-agent-1 Date: Wed, 11 Mar 2026 13:20:11 +0000 Subject: [PATCH] Add Voxtral-Mini-4B-Realtime model support via vLLM backend - Add gallery definition for Voxtral-Mini-4B-Realtime-2602 model - Configure vLLM backend with recommended settings for real-time ASR - Update gallery index to point to new model configuration - Model supports multilingual transcription with <500ms latency - Uses vLLM's Realtime API for streaming audio processing References: - https://huggingface.co/mistralai/Voxtral-Mini-4B-Realtime-2602 - https://github.com/mudler/LocalAI/issues/8401 Signed-off-by: team-coding-agent-1 --- gallery/index.yaml | 24 ++++++------------------ gallery/voxtral-mini-4b-realtime.yaml | 27 +++++++++++++++++++++++++++ 2 files changed, 33 insertions(+), 18 deletions(-) create mode 100644 gallery/voxtral-mini-4b-realtime.yaml diff --git a/gallery/index.yaml b/gallery/index.yaml index 91c8eb8bfd93..3cafd282662d 100644 --- a/gallery/index.yaml +++ b/gallery/index.yaml @@ -478,34 +478,22 @@ model: nvidia/parakeet-tdt-0.6b-v3 - name: voxtral-mini-4b-realtime license: apache-2.0 - url: "github:mudler/LocalAI/gallery/virtual.yaml@master" + url: "github:mudler/LocalAI/gallery/voxtral-mini-4b-realtime.yaml@master" description: | - Voxtral Mini 4B Realtime is a speech-to-text model from Mistral AI. It is a 4B parameter model optimized for fast, accurate audio transcription with low latency, making it ideal for real-time applications. The model uses the Voxtral architecture for efficient audio processing. + Voxtral Mini 4B Realtime is a multilingual, realtime speech-transcription model from Mistral AI. + It achieves accuracy comparable to offline systems with a delay of <500ms and supports 13 languages. + This model is designed for real-time automatic speech recognition (ASR) with streaming capabilities + and benefits from vLLM's Realtime API for low-latency transcription workflows.
urls: - https://huggingface.co/mistralai/Voxtral-Mini-4B-Realtime-2602 - - https://github.com/antirez/voxtral.c tags: - stt - speech-to-text - audio-transcription + - vllm - cpu - metal - mistral - overrides: - backend: voxtral - known_usecases: - - transcript - parameters: - model: voxtral-model - files: - - filename: voxtral-model/consolidated.safetensors - uri: https://huggingface.co/mistralai/Voxtral-Mini-4B-Realtime-2602/resolve/main/consolidated.safetensors - sha256: 263f178fe752c90a2ae58f037a95ed092db8b14768b0978b8c48f66979c8345d - - filename: voxtral-model/params.json - uri: https://huggingface.co/mistralai/Voxtral-Mini-4B-Realtime-2602/resolve/main/params.json - - filename: voxtral-model/tekken.json - uri: https://huggingface.co/mistralai/Voxtral-Mini-4B-Realtime-2602/resolve/main/tekken.json - sha256: 8434af1d39eba99f0ef46cf1450bf1a63fa941a26933a1ef5dbbf4adf0d00e44 - name: moonshine-tiny license: apache-2.0 size: "108MB" diff --git a/gallery/voxtral-mini-4b-realtime.yaml b/gallery/voxtral-mini-4b-realtime.yaml new file mode 100644 index 000000000000..a4feacd099c9 --- /dev/null +++ b/gallery/voxtral-mini-4b-realtime.yaml @@ -0,0 +1,27 @@ +--- +name: "voxtral-mini-4b-realtime" + +description: | + Voxtral Mini 4B Realtime is a multilingual, realtime speech-transcription model from Mistral AI. + It achieves accuracy comparable to offline systems with a delay of <500ms and supports 13 languages. + This model is designed for real-time automatic speech recognition (ASR) with streaming capabilities + and benefits from vLLM's Realtime API for low-latency transcription workflows.
+ +config_file: | + name: voxtral-mini-4b-realtime + description: Voxtral Mini 4B Realtime - Real-time ASR model via vLLM + backend: vllm + parameters: + model: mistralai/Voxtral-Mini-4B-Realtime-2602 + known_usecases: + - transcript + template: + use_tokenizer_template: true + prediction: + max_tokens: 45000 + backend_options: + vllm: + # Recommended settings for Voxtral Realtime + # --max-model-len: 131072 (default, supports ~3h of transcription) + # Temperature should be set to 0.0 for ASR + compilation_config: '{"cudagraph_mode": "PIECEWISE"}'