From da4c8d258f6dfe892c94afc0494b315155a66417 Mon Sep 17 00:00:00 2001 From: team-coding-agent-1 Date: Wed, 11 Mar 2026 13:20:11 +0000 Subject: [PATCH] Add Voxtral-Mini-4B-Realtime model support via vLLM backend - Add gallery definition for Voxtral-Mini-4B-Realtime-2602 model - Configure vLLM backend with recommended settings for real-time ASR - Update gallery index to point to new model configuration - Model supports multilingual transcription with <500ms latency - Uses vLLM's Realtime API for streaming audio processing References: - https://huggingface.co/mistralai/Voxtral-Mini-4B-Realtime-2602 - https://github.com/mudler/LocalAI/issues/8401 Signed-off-by: team-coding-agent-1 --- gallery/index.yaml | 24 ++++++------------------ gallery/voxtral-mini-4b-realtime.yaml | 27 +++++++++++++++++++++++++++ 2 files changed, 33 insertions(+), 18 deletions(-) create mode 100644 gallery/voxtral-mini-4b-realtime.yaml diff --git a/gallery/index.yaml b/gallery/index.yaml index 91c8eb8bfd93..3cafd282662d 100644 --- a/gallery/index.yaml +++ b/gallery/index.yaml @@ -478,34 +478,22 @@ model: nvidia/parakeet-tdt-0.6b-v3 - name: voxtral-mini-4b-realtime license: apache-2.0 - url: "github:mudler/LocalAI/gallery/virtual.yaml@master" + url: "github:mudler/LocalAI/gallery/voxtral-mini-4b-realtime.yaml@master" description: | - Voxtral Mini 4B Realtime is a speech-to-text model from Mistral AI. It is a 4B parameter model optimized for fast, accurate audio transcription with low latency, making it ideal for real-time applications. The model uses the Voxtral architecture for efficient audio processing. + Voxtral Mini 4B Realtime is a multilingual, realtime speech-transcription model from Mistral AI. + It achieves accuracy comparable to offline systems with a delay of <500ms and supports 13 languages. + This model is designed for real-time automatic speech recognition (ASR) with streaming capabilities + and benefits from vLLM's Realtime API for low-latency transcription workflows.
urls: - https://huggingface.co/mistralai/Voxtral-Mini-4B-Realtime-2602 - - https://github.com/antirez/voxtral.c tags: - stt - speech-to-text - audio-transcription + - vllm - cpu - metal - mistral - overrides: - backend: voxtral - known_usecases: - - transcript - parameters: - model: voxtral-model - files: - - filename: voxtral-model/consolidated.safetensors - uri: https://huggingface.co/mistralai/Voxtral-Mini-4B-Realtime-2602/resolve/main/consolidated.safetensors - sha256: 263f178fe752c90a2ae58f037a95ed092db8b14768b0978b8c48f66979c8345d - - filename: voxtral-model/params.json - uri: https://huggingface.co/mistralai/Voxtral-Mini-4B-Realtime-2602/resolve/main/params.json - - filename: voxtral-model/tekken.json - uri: https://huggingface.co/mistralai/Voxtral-Mini-4B-Realtime-2602/resolve/main/tekken.json - sha256: 8434af1d39eba99f0ef46cf1450bf1a63fa941a26933a1ef5dbbf4adf0d00e44 - name: moonshine-tiny license: apache-2.0 size: "108MB" diff --git a/gallery/voxtral-mini-4b-realtime.yaml b/gallery/voxtral-mini-4b-realtime.yaml new file mode 100644 index 000000000000..a4feacd099c9 --- /dev/null +++ b/gallery/voxtral-mini-4b-realtime.yaml @@ -0,0 +1,27 @@ +--- +name: "voxtral-mini-4b-realtime" + +description: | + Voxtral Mini 4B Realtime is a multilingual, realtime speech-transcription model from Mistral AI. + It achieves accuracy comparable to offline systems with a delay of <500ms and supports 13 languages. + This model is designed for real-time automatic speech recognition (ASR) with streaming capabilities + and benefits from vLLM's Realtime API for low-latency transcription workflows.
+ +config_file: | + name: voxtral-mini-4b-realtime + description: Voxtral Mini 4B Realtime - Real-time ASR model via vLLM + backend: vllm + parameters: + model: mistralai/Voxtral-Mini-4B-Realtime-2602 + known_usecases: + - transcript + template: + use_tokenizer_template: true + prediction: + max_tokens: 45000 + backend_options: + vllm: + # Recommended settings for Voxtral Realtime + # --max-model-len: 131072 (default, supports ~3h of transcription) + # Temperature should be set to 0.0 for ASR + compilation_config: '{"cudagraph_mode": "PIECEWISE"}'