Merged
12 changes: 6 additions & 6 deletions docs/CN/source/models/supported_models.rst
Original file line number Diff line number Diff line change
@@ -68,17 +68,17 @@ lightllm 支持大多数的主流的开源大语言模型以及多模态模型
* - 模型
- 备注
* - `Qwen-VL <https://huggingface.co/Qwen/Qwen-VL>`_
- :code:`--trust_remote_code --enable_multimodal`
- :code:`--trust_remote_code`
* - `Qwen-VL-Chat <https://huggingface.co/Qwen/Qwen-VL-Chat>`_
- :code:`--trust_remote_code --enable_multimodal`
- :code:`--trust_remote_code`
* - `Llava-7b <https://huggingface.co/liuhaotian/llava-v1.5-7b>`_
- :code:`--enable_multimodal`
-
* - `Llava-13b <https://huggingface.co/liuhaotian/llava-v1.5-13b>`_
- :code:`--enable_multimodal`
-
* - `Qwen2-VL <https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct>`_
- :code:`--enable_multimodal`
-
* - `Google Gemma3 <https://huggingface.co/google/gemma-3-12b-it>`_
- :code:`--enable_multimodal`
-


Reward模型
8 changes: 4 additions & 4 deletions docs/CN/source/tutorial/api_server_args.rst
@@ -248,13 +248,13 @@ PD 分离模式参数
多模态参数
----------

.. option:: --enable_multimodal
.. option:: --disable_vision

是否允许加载额外的视觉模型
如果模型是多模态模型,设置此参数将不加载视觉部分模型(默认为None,会根据模型自动检测)

.. option:: --enable_multimodal_audio
.. option:: --disable_audio

是否允许加载额外的音频模型(需要 --enable_multimodal)
如果模型是多模态模型,设置此参数将不加载音频部分模型(默认为None,会根据模型自动检测)

.. option:: --enable_mps

6 changes: 2 additions & 4 deletions docs/CN/source/tutorial/multimodal.rst
@@ -15,8 +15,7 @@ LightLLM支持多种多模态模型的推理,下面以InternVL为例,对多
--tp 2 \
--model_dir ${MODEL_PATH} \
--mem_fraction 0.8 \
--trust_remote_code \
--enable_multimodal
--trust_remote_code

核心参数说明
------------
@@ -28,14 +27,13 @@ LightLLM支持多种多模态模型的推理,下面以InternVL为例,对多
- **LOADWORKER**: 设置模型加载的工作进程数

基础服务参数
^^^^^^^^^^^^
^^^^^^^^^^^

- **--port 8080**: API服务器监听端口
- **--tp 2**: 张量并行度(Tensor Parallelism)
- **--model_dir**: InternVL模型文件路径
- **--mem_fraction 0.8**: GPU显存使用比例
- **--trust_remote_code**: 允许加载自定义模型代码
- **--enable_multimodal**: 启用多模态功能

高级配置参数
------------
16 changes: 10 additions & 6 deletions docs/EN/source/models/supported_models.rst
@@ -54,6 +54,8 @@ Large Language Models
-
* - `Qwen3-Moe <https://github.com/QwenLM/Qwen3>`_
-
* - `DeepSeek-V3.2 `_
-

Multimodal Models
^^^^^^^^^^^^^^^^^
@@ -65,17 +67,19 @@ Multimodal Models
* - Model
- Notes
* - `Qwen-VL <https://huggingface.co/Qwen/Qwen-VL>`_
- :code:`--trust_remote_code --enable_multimodal`
- :code:`--trust_remote_code`
* - `Qwen-VL-Chat <https://huggingface.co/Qwen/Qwen-VL-Chat>`_
- :code:`--trust_remote_code --enable_multimodal`
- :code:`--trust_remote_code`
* - `Llava-7b <https://huggingface.co/liuhaotian/llava-v1.5-7b>`_
- :code:`--enable_multimodal`
-
* - `Llava-13b <https://huggingface.co/liuhaotian/llava-v1.5-13b>`_
- :code:`--enable_multimodal`
-
* - `Qwen2-VL <https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct>`_
- :code:`--enable_multimodal`
-
* - `Google Gemma3 <https://huggingface.co/google/gemma-3-12b-it>`_
- :code:`--enable_multimodal`
-
* - `Qwen3-VL `_
-

Reward Models
^^^^^^^^^^^^^
8 changes: 4 additions & 4 deletions docs/EN/source/tutorial/api_server_args.rst
@@ -246,13 +246,13 @@ Output Constraint Parameters
Multimodal Parameters
---------------------

.. option:: --enable_multimodal
.. option:: --disable_vision

Whether to allow loading additional visual models
If the model is multimodal, set this flag to skip loading the vision part of the model (default: None, auto-detected from the model)

.. option:: --enable_multimodal_audio
.. option:: --disable_audio

Whether to allow loading additional audio models (requires --enable_multimodal)
If the model is multimodal, set this flag to skip loading the audio part of the model (default: None, auto-detected from the model)

.. option:: --enable_mps

14 changes: 6 additions & 8 deletions docs/EN/source/tutorial/multimodal.rst
@@ -15,14 +15,13 @@ Basic Launch Command
--tp 2 \
--model_dir ${MODEL_PATH} \
--mem_fraction 0.8 \
--trust_remote_code \
--enable_multimodal
--trust_remote_code

Core Parameter Description
--------------------------

Environment Variables
^^^^^^^^^^^^^^^^^^^^^
^^^^^^^^^^^^^^^^^^^^

- **INTERNVL_IMAGE_LENGTH**: Set the image token length for InternVL model, default is 256
- **LOADWORKER**: Set the number of worker processes for model loading
@@ -35,7 +34,6 @@ Basic Service Parameters
- **--model_dir**: InternVL model file path
- **--mem_fraction 0.8**: GPU memory usage ratio
- **--trust_remote_code**: Allow loading custom model code
- **--enable_multimodal**: Enable multimodal functionality

Advanced Configuration Parameters
---------------------------------
@@ -55,16 +53,16 @@ Advanced Configuration Parameters
.. note:: To ensure equal memory load on each GPU, visual_dp * visual_tp = tp is required. For example, if tp=2, then visual_dp=1, visual_tp=2.

ViT Deployment Methods
----------------------
-----------------------

ViT TP (Tensor Parallel)
^^^^^^^^^^^^^^^^^^^^^^^^^
^^^^^^^^^^^^^^^^^^^^^^^^

- Default usage
- --visual_tp tp_size enables tensor parallelism

ViT DP (Data Parallel)
^^^^^^^^^^^^^^^^^^^^^^^
^^^^^^^^^^^^^^^^^^^^^^

- Distribute different image batches to multiple GPUs
- Each GPU runs a complete ViT model copy
@@ -136,4 +134,4 @@ Testing
if response.status_code == 200:
print(f"Result: {response.json()}")
else:
print(f"Error: {response.status_code}, {response.text}")
print(f"Error: {response.status_code}, {response.text}")
3 changes: 2 additions & 1 deletion lightllm/common/basemodel/attention_vit/fa3/fp.py
@@ -45,7 +45,8 @@ def _vit_att_fwd(
False,
window_size[0],
window_size[1],
0.0,
attention_chunk=0,
softcap=0.0,
is_rotary_interleaved=False,
scheduler_metadata=None,
num_splits=1,
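The hunk above replaces a trailing positional `0.0` with explicit `attention_chunk=0, softcap=0.0` keywords. A minimal sketch (with hypothetical function names, not flash-attn's real API) of the silent misbinding this guards against when an upstream signature inserts a new parameter:

```python
def attn_v1(q, softcap=0.0):
    return ("v1", softcap)

# v2 inserts a new parameter *before* softcap, as upstream releases sometimes do.
def attn_v2(q, attention_chunk=0, softcap=0.0):
    return ("v2", softcap)

# Positional call: the value meant for softcap now lands in attention_chunk.
assert attn_v2("q", 0.5) == ("v2", 0.0)

# Keyword call keeps its meaning across both signatures.
assert attn_v1("q", softcap=0.5) == ("v1", 0.5)
assert attn_v2("q", softcap=0.5) == ("v2", 0.5)
```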
2 changes: 2 additions & 0 deletions lightllm/models/qwen2_vl/qwen2_visual.py
@@ -62,6 +62,8 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
hidden_states = hidden_states.view(
-1, self.in_channels, self.temporal_patch_size, self.patch_size, self.patch_size
)
# Use channels_last_3d to enable cuDNN optimized Conv3D path
hidden_states = hidden_states.contiguous(memory_format=torch.channels_last_3d)
hidden_states = self.proj(hidden_states).view(-1, self.embed_dim)
return hidden_states

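The hunk above (the same change is applied to qwen3_omni and qwen3_vl below) reorders the patch tensor to `channels_last_3d` before the Conv3D patch projection. A minimal sketch of the transformation, using illustrative shapes rather than the models' real dimensions:

```python
import torch

# Illustrative patch-embedding shapes; the real ViT dims differ.
conv = torch.nn.Conv3d(in_channels=3, out_channels=8,
                       kernel_size=(2, 4, 4), stride=(2, 4, 4), bias=False)
x = torch.randn(16, 3, 2, 4, 4)

# Reorder memory to NDHWC so cuDNN can pick its channels-last Conv3D kernels;
# the logical shape is untouched, only the stride layout changes.
x_cl = x.contiguous(memory_format=torch.channels_last_3d)
assert x_cl.shape == x.shape
assert x_cl.is_contiguous(memory_format=torch.channels_last_3d)

out = conv(x_cl).view(-1, 8)  # one 8-dim embedding per patch
```

The conversion is purely a layout change, so the numerical result matches the default-layout path.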
2 changes: 2 additions & 0 deletions lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_visual.py
@@ -68,6 +68,8 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
hidden_states = hidden_states.view(
-1, self.in_channels, self.temporal_patch_size, self.patch_size, self.patch_size
)
# Use channels_last_3d to enable cuDNN optimized Conv3D path
hidden_states = hidden_states.contiguous(memory_format=torch.channels_last_3d)
hidden_states = self.proj(hidden_states).view(-1, self.embed_dim)
return hidden_states

2 changes: 2 additions & 0 deletions lightllm/models/qwen3_vl/qwen3_visual.py
@@ -68,6 +68,8 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
hidden_states = hidden_states.view(
-1, self.in_channels, self.temporal_patch_size, self.patch_size, self.patch_size
)
# Use channels_last_3d to enable cuDNN optimized Conv3D path
hidden_states = hidden_states.contiguous(memory_format=torch.channels_last_3d)
hidden_states = self.proj(hidden_states).view(-1, self.embed_dim)
return hidden_states

10 changes: 7 additions & 3 deletions lightllm/server/api_cli.py
Expand Up @@ -279,12 +279,16 @@ def make_argument_parser() -> argparse.ArgumentParser:
use env FIRST_ALLOWED_TOKENS to set the range, like FIRST_ALLOWED_TOKENS=1,2 ..""",
)
parser.add_argument(
"--enable_multimodal", action="store_true", help="Whether or not to allow to load additional visual models."
"--disable_vision",
action="store_true",
default=None,
help="If the model is multimodal, set this flag to skip loading the vision part of the model.",
)
parser.add_argument(
"--enable_multimodal_audio",
"--disable_audio",
action="store_true",
help="Whether or not to allow to load additional audio models (requird --enable_multimodal).",
default=None,
help="If the model is multimodal, set this flag to skip loading the audio part of the model.",
)
parser.add_argument(
"--enable_mps", action="store_true", help="Whether to enable nvidia mps for multimodal service."
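The new `--disable_vision` / `--disable_audio` flags combine `action="store_true"` with `default=None`, yielding a tri-state: `None` means the user said nothing (auto-detect later), `True` means explicitly disabled. A minimal sketch of the pattern:

```python
import argparse

parser = argparse.ArgumentParser()
# default=None distinguishes "flag not given" from "flag given":
# store_true only sets the value to True when the flag appears.
parser.add_argument("--disable_vision", action="store_true", default=None)

auto = parser.parse_args([])                      # absent -> None -> auto-detect
forced = parser.parse_args(["--disable_vision"])  # present -> True

assert auto.disable_vision is None
assert forced.disable_vision is True
```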
57 changes: 41 additions & 16 deletions lightllm/server/api_start.py
@@ -16,6 +16,7 @@
from lightllm.utils.process_check import is_process_active
from lightllm.utils.multinode_utils import send_and_receive_node_ip
from lightllm.utils.shm_size_check import check_recommended_shm_size
from lightllm.utils.config_utils import has_audio_module, has_vision_module

logger = init_logger(__name__)

@@ -67,9 +68,6 @@ def normal_or_p_d_start(args):

set_unique_server_name(args)

if not args.disable_shm_warning:
check_recommended_shm_size(args)

if args.enable_mps:
from lightllm.utils.device_utils import enable_mps

@@ -78,13 +76,38 @@
if args.run_mode not in ["normal", "prefill", "decode", "nixl_prefill", "nixl_decode"]:
return

# Infer from the model config whether this is a multimodal model and which modalities it contains, then decide whether to start the corresponding modules
if args.disable_vision is None:
if has_vision_module(args.model_dir):
args.disable_vision = False
else:
args.disable_vision = True
if args.disable_audio is None:
if has_audio_module(args.model_dir):
args.disable_audio = False
else:
args.disable_audio = True

# In PD disaggregation mode, do not start the multimodal modules
if args.run_mode in ["decode", "nixl_decode"]:
args.disable_audio = True
args.disable_vision = True

if args.disable_vision and args.disable_audio:
args.enable_multimodal = False
else:
args.enable_multimodal = True

if args.enable_cpu_cache:
# 生成一个用于创建cpu kv cache的共享内存id。
args.cpu_kv_cache_shm_id = uuid.uuid1().int % 123456789

if args.enable_multimodal:
args.multi_modal_cache_shm_id = uuid.uuid1().int % 123456789

if not args.disable_shm_warning:
check_recommended_shm_size(args)

assert args.zmq_mode in ["tcp://", "ipc:///tmp/"]
# 确保单机上多实列不冲突
if args.zmq_mode == "ipc:///tmp/":
Expand Down Expand Up @@ -279,14 +302,16 @@ def normal_or_p_d_start(args):
ports_locker.release_port()

if args.enable_multimodal:
from .visualserver.manager import start_visual_process

process_manager.start_submodule_processes(
start_funcs=[
start_cache_manager,
],
start_args=[(args,)],
)

if not args.disable_vision:
from .visualserver.manager import start_visual_process

process_manager.start_submodule_processes(
start_funcs=[
start_visual_process,
Expand All @@ -296,17 +321,17 @@ def normal_or_p_d_start(args):
],
)

if args.enable_multimodal_audio:
from .audioserver.manager import start_audio_process

process_manager.start_submodule_processes(
start_funcs=[
start_audio_process,
],
start_args=[
(args,),
],
)
if not args.disable_audio:
from .audioserver.manager import start_audio_process

process_manager.start_submodule_processes(
start_funcs=[
start_audio_process,
],
start_args=[
(args,),
],
)

if args.enable_cpu_cache:
from .multi_level_kv_cache.manager import start_multi_level_kv_cache_manager
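The startup path above resolves `disable_vision` / `disable_audio` via `has_vision_module` / `has_audio_module` from `lightllm.utils.config_utils`, whose bodies are not part of this diff. A plausible sketch, assuming a HuggingFace-style `config.json` that nests a `vision_config` section when the checkpoint ships a vision tower (the real helper may inspect different keys):

```python
import json
import os
import tempfile

def has_vision_module(model_dir: str) -> bool:
    # Hypothetical re-implementation for illustration only; the actual
    # helper in lightllm.utils.config_utils may differ.
    with open(os.path.join(model_dir, "config.json")) as f:
        cfg = json.load(f)
    return "vision_config" in cfg

# Fake model dir with a multimodal-looking config.
with tempfile.TemporaryDirectory() as d:
    with open(os.path.join(d, "config.json"), "w") as f:
        json.dump({"model_type": "internvl_chat", "vision_config": {}}, f)
    assert has_vision_module(d)  # vision module detected
```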
4 changes: 4 additions & 0 deletions lightllm/server/audioserver/manager.py
@@ -138,6 +138,10 @@ async def loop_for_netio_req(self):
while True:
recv_req: GroupReqIndexes = await self.zmq_recv_socket.recv_pyobj()
if isinstance(recv_req, GroupReqIndexes):
logger.info(
f"audio recv req id {recv_req.group_req_id} "
f"audio count {len(recv_req.multimodal_params.audios)}"
)
self.waiting_reqs.append(recv_req)
else:
assert False, f"Error Req Inf {recv_req}"
6 changes: 2 additions & 4 deletions lightllm/server/core/objs/start_args_type.py
@@ -79,7 +79,8 @@ class StartArgs:
output_constraint_mode: str = field(default="none", metadata={"choices": ["none", "simple", "xgrammar"]})
first_token_constraint_mode: bool = field(default=False)
enable_multimodal: bool = field(default=False)
enable_multimodal_audio: bool = field(default=False)
disable_vision: Optional[bool] = field(default=None)
disable_audio: Optional[bool] = field(default=None)
enable_tpsp_mix_mode: bool = field(default=False)
enable_dp_prefill_balance: bool = field(default=False)
enable_decode_microbatch_overlap: bool = field(default=False)
@@ -160,6 +161,3 @@ class StartArgs:
metric_port: int = field(default=None)
multinode_httpmanager_port: int = field(default=12345)
multi_level_kv_cache_port: int = field(default=None)
# multi_modal
enable_multimodal: bool = field(default=False)
enable_multimodal_audio: bool = field(default=False)