From f82436d516f29a66395d4cbfa049da4e88765ecc Mon Sep 17 00:00:00 2001 From: wangzaijun Date: Thu, 12 Mar 2026 08:01:27 +0000 Subject: [PATCH 1/7] fix enable_multimodal_visual or enable_multimodal_audio --- lightllm/server/api_cli.py | 6 ++-- lightllm/server/api_start.py | 31 ++++++++++++-------- lightllm/server/core/objs/start_args_type.py | 1 + lightllm/server/httpserver/manager.py | 19 ++++++++---- 4 files changed, 37 insertions(+), 20 deletions(-) diff --git a/lightllm/server/api_cli.py b/lightllm/server/api_cli.py index 37b2bb3ef3..ea347081a8 100644 --- a/lightllm/server/api_cli.py +++ b/lightllm/server/api_cli.py @@ -279,12 +279,14 @@ def make_argument_parser() -> argparse.ArgumentParser: use env FIRST_ALLOWED_TOKENS to set the range, like FIRST_ALLOWED_TOKENS=1,2 ..""", ) parser.add_argument( - "--enable_multimodal", action="store_true", help="Whether or not to allow to load additional visual models." + "--enable_multimodal_visual", + action="store_true", + help="Whether or not to allow to load additional visual model module.", ) parser.add_argument( "--enable_multimodal_audio", action="store_true", - help="Whether or not to allow to load additional audio models (requird --enable_multimodal).", + help="Whether or not to allow to load additional audio model module.", ) parser.add_argument( "--enable_mps", action="store_true", help="Whether to enable nvidia mps for multimodal service." 
diff --git a/lightllm/server/api_start.py b/lightllm/server/api_start.py index 111def60c2..bf5ba48c35 100644 --- a/lightllm/server/api_start.py +++ b/lightllm/server/api_start.py @@ -78,6 +78,9 @@ def normal_or_p_d_start(args): if args.run_mode not in ["normal", "prefill", "decode", "nixl_prefill", "nixl_decode"]: return + if args.enable_multimodal_visual or args.enable_multimodal_audio: + args.enable_multimodal = True + if args.enable_cpu_cache: # 生成一个用于创建cpu kv cache的共享内存id。 args.cpu_kv_cache_shm_id = uuid.uuid1().int % 123456789 @@ -279,14 +282,16 @@ def normal_or_p_d_start(args): ports_locker.release_port() if args.enable_multimodal: - from .visualserver.manager import start_visual_process - process_manager.start_submodule_processes( start_funcs=[ start_cache_manager, ], start_args=[(args,)], ) + + if args.enable_multimodal_visual: + from .visualserver.manager import start_visual_process + process_manager.start_submodule_processes( start_funcs=[ start_visual_process, @@ -296,17 +301,17 @@ def normal_or_p_d_start(args): ], ) - if args.enable_multimodal_audio: - from .audioserver.manager import start_audio_process - - process_manager.start_submodule_processes( - start_funcs=[ - start_audio_process, - ], - start_args=[ - (args,), - ], - ) + if args.enable_multimodal_audio: + from .audioserver.manager import start_audio_process + + process_manager.start_submodule_processes( + start_funcs=[ + start_audio_process, + ], + start_args=[ + (args,), + ], + ) if args.enable_cpu_cache: from .multi_level_kv_cache.manager import start_multi_level_kv_cache_manager diff --git a/lightllm/server/core/objs/start_args_type.py b/lightllm/server/core/objs/start_args_type.py index 751cb60ecd..ca3d64651c 100644 --- a/lightllm/server/core/objs/start_args_type.py +++ b/lightllm/server/core/objs/start_args_type.py @@ -79,6 +79,7 @@ class StartArgs: output_constraint_mode: str = field(default="none", metadata={"choices": ["none", "simple", "xgrammar"]}) first_token_constraint_mode: bool = 
field(default=False) enable_multimodal: bool = field(default=False) + enable_multimodal_visual: bool = field(default=False) enable_multimodal_audio: bool = field(default=False) enable_tpsp_mix_mode: bool = field(default=False) enable_dp_prefill_balance: bool = field(default=False) diff --git a/lightllm/server/httpserver/manager.py b/lightllm/server/httpserver/manager.py index 27709ee32b..b796f5b988 100644 --- a/lightllm/server/httpserver/manager.py +++ b/lightllm/server/httpserver/manager.py @@ -78,11 +78,19 @@ def __init__( ) self.enable_multimodal = args.enable_multimodal + if self.enable_multimodal: self.cache_client = rpyc.connect("localhost", args.cache_port, config={"allow_pickle": True}) self.cache_client._channel.stream.sock.setsockopt(socket.IPPROTO_TCP, socket.TCP_NODELAY, 1) + + if self.args.enable_multimodal_visual: self.send_to_visual = context.socket(zmq.PUSH) self.send_to_visual.connect(f"{args.zmq_mode}127.0.0.1:{args.visual_port}") + + if self.args.enable_multimodal_audio: + self.send_to_audio = context.socket(zmq.PUSH) + self.send_to_audio.connect(f"{args.zmq_mode}127.0.0.1:{args.audio_port}") + if args.enable_cpu_cache and not self.args.enable_multimodal: self.send_to_multi_level_kv_cache = context.socket(zmq.PUSH) self.send_to_multi_level_kv_cache.connect(f"{args.zmq_mode}127.0.0.1:{args.multi_level_kv_cache_port}") @@ -519,11 +527,12 @@ async def transfer_to_next_module( ): if self.pd_mode.is_P_or_NORMAL(): - if self.enable_multimodal: - self.send_to_visual.send_pyobj( - group_req_objs.to_group_req_index(), - protocol=pickle.HIGHEST_PROTOCOL, - ) + if self.args.enable_multimodal_visual: + self.send_to_visual.send_pyobj(group_req_objs.to_group_req_index(), protocol=pickle.HIGHEST_PROTOCOL) + return + + if self.args.enable_multimodal_audio: + self.send_to_audio.send_pyobj(group_req_objs.to_group_req_index(), protocol=pickle.HIGHEST_PROTOCOL) return if self.args.enable_cpu_cache: From 6686fc58e0d662162afbeb4dceb78f127179bcce Mon Sep 17 00:00:00 
2001 From: wangzaijun Date: Fri, 13 Mar 2026 04:36:24 +0000 Subject: [PATCH 2/7] add docs. --- docs/CN/source/models/supported_models.rst | 12 ++++++------ docs/CN/source/tutorial/api_server_args.rst | 6 +++--- docs/CN/source/tutorial/multimodal.rst | 6 +++--- docs/EN/source/models/supported_models.rst | 12 ++++++------ docs/EN/source/tutorial/api_server_args.rst | 6 +++--- docs/EN/source/tutorial/multimodal.rst | 14 +++++++------- 6 files changed, 28 insertions(+), 28 deletions(-) diff --git a/docs/CN/source/models/supported_models.rst b/docs/CN/source/models/supported_models.rst index 8f567899d9..89398a05c5 100755 --- a/docs/CN/source/models/supported_models.rst +++ b/docs/CN/source/models/supported_models.rst @@ -68,17 +68,17 @@ lightllm 支持大多数的主流的开源大语言模型以及多模态模型 * - 模型 - 备注 * - `Qwen-VL `_ - - :code:`--trust_remote_code --enable_multimodal` + - :code:`--trust_remote_code --enable_multimodal_visual` * - `Qwen-VL-Chat `_ - - :code:`--trust_remote_code --enable_multimodal` + - :code:`--trust_remote_code --enable_multimodal_visual` * - `Llava-7b `_ - - :code:`--enable_multimodal` + - :code:`--enable_multimodal_visual` * - `Llava-13b `_ - - :code:`--enable_multimodal` + - :code:`--enable_multimodal_visual` * - `Qwen2-VL `_ - - :code:`--enable_multimodal` + - :code:`--enable_multimodal_visual` * - `Google Gemma3 `_ - - :code:`--enable_multimodal` + - :code:`--enable_multimodal_visual` Reward模型 diff --git a/docs/CN/source/tutorial/api_server_args.rst b/docs/CN/source/tutorial/api_server_args.rst index 191a84bbe6..a991848a38 100644 --- a/docs/CN/source/tutorial/api_server_args.rst +++ b/docs/CN/source/tutorial/api_server_args.rst @@ -248,13 +248,13 @@ PD 分离模式参数 多模态参数 ---------- -.. option:: --enable_multimodal +.. option:: --enable_multimodal_visual - 是否允许加载额外的视觉模型 + 是否允许加载额外的视觉模型模块 .. option:: --enable_multimodal_audio - 是否允许加载额外的音频模型(需要 --enable_multimodal) + 是否允许加载额外的音频模型模块 .. 
option:: --enable_mps diff --git a/docs/CN/source/tutorial/multimodal.rst b/docs/CN/source/tutorial/multimodal.rst index 31c9dd144e..4701395ede 100644 --- a/docs/CN/source/tutorial/multimodal.rst +++ b/docs/CN/source/tutorial/multimodal.rst @@ -16,7 +16,7 @@ LightLLM支持多种多模态模型的推理,下面以InternVL为例,对多 --model_dir ${MODEL_PATH} \ --mem_fraction 0.8 \ --trust_remote_code \ - --enable_multimodal + --enable_multimodal_visual 核心参数说明 ------------ @@ -28,14 +28,14 @@ LightLLM支持多种多模态模型的推理,下面以InternVL为例,对多 - **LOADWORKER**: 设置模型加载的工作进程数 基础服务参数 -^^^^^^^^^^^^ +^^^^^^^^^^^ - **--port 8080**: API服务器监听端口 - **--tp 2**: 张量并行度(Tensor Parallelism) - **--model_dir**: InternVL模型文件路径 - **--mem_fraction 0.8**: GPU显存使用比例 - **--trust_remote_code**: 允许加载自定义模型代码 -- **--enable_multimodal**: 启用多模态功能 +- **--enable_multimodal_visual**: 启用视觉多模态功能 高级配置参数 ------------ diff --git a/docs/EN/source/models/supported_models.rst b/docs/EN/source/models/supported_models.rst index bc19456668..193278cc30 100755 --- a/docs/EN/source/models/supported_models.rst +++ b/docs/EN/source/models/supported_models.rst @@ -65,17 +65,17 @@ Multimodal Models * - Model - Notes * - `Qwen-VL `_ - - :code:`--trust_remote_code --enable_multimodal` + - :code:`--trust_remote_code --enable_multimodal_visual` * - `Qwen-VL-Chat `_ - - :code:`--trust_remote_code --enable_multimodal` + - :code:`--trust_remote_code --enable_multimodal_visual` * - `Llava-7b `_ - - :code:`--enable_multimodal` + - :code:`--enable_multimodal_visual` * - `Llava-13b `_ - - :code:`--enable_multimodal` + - :code:`--enable_multimodal_visual` * - `Qwen2-VL `_ - - :code:`--enable_multimodal` + - :code:`--enable_multimodal_visual` * - `Google Gemma3 `_ - - :code:`--enable_multimodal` + - :code:`--enable_multimodal_visual` Reward Models ^^^^^^^^^^^^^ diff --git a/docs/EN/source/tutorial/api_server_args.rst b/docs/EN/source/tutorial/api_server_args.rst index a7686a44b0..19477857fd 100644 --- a/docs/EN/source/tutorial/api_server_args.rst +++ 
b/docs/EN/source/tutorial/api_server_args.rst @@ -246,13 +246,13 @@ Output Constraint Parameters Multimodal Parameters --------------------- -.. option:: --enable_multimodal +.. option:: --enable_multimodal_visual - Whether to allow loading additional visual models + Whether to allow loading additional visual model module .. option:: --enable_multimodal_audio - Whether to allow loading additional audio models (requires --enable_multimodal) + Whether to allow loading additional audio model module .. option:: --enable_mps diff --git a/docs/EN/source/tutorial/multimodal.rst b/docs/EN/source/tutorial/multimodal.rst index f8eead96cf..b0e60f59e8 100755 --- a/docs/EN/source/tutorial/multimodal.rst +++ b/docs/EN/source/tutorial/multimodal.rst @@ -16,13 +16,13 @@ Basic Launch Command --model_dir ${MODEL_PATH} \ --mem_fraction 0.8 \ --trust_remote_code \ - --enable_multimodal + --enable_multimodal_visual Core Parameter Description -------------------------- Environment Variables -^^^^^^^^^^^^^^^^^^^^^ +^^^^^^^^^^^^^^^^^^^^ - **INTERNVL_IMAGE_LENGTH**: Set the image token length for InternVL model, default is 256 - **LOADWORKER**: Set the number of worker processes for model loading @@ -35,7 +35,7 @@ Basic Service Parameters - **--model_dir**: InternVL model file path - **--mem_fraction 0.8**: GPU memory usage ratio - **--trust_remote_code**: Allow loading custom model code -- **--enable_multimodal**: Enable multimodal functionality +- **--enable_multimodal_visual**: Enable visual multimodal functionality Advanced Configuration Parameters --------------------------------- @@ -55,16 +55,16 @@ Advanced Configuration Parameters .. note:: To ensure equal memory load on each GPU, visual_dp * visual_tp = tp is required. For example, if tp=2, then visual_dp=1, visual_tp=2. 
ViT Deployment Methods ----------------------- +----------------------- ViT TP (Tensor Parallel) -^^^^^^^^^^^^^^^^^^^^^^^^^ +^^^^^^^^^^^^^^^^^^^^^^^^ - Default usage - --visual_tp tp_size enables tensor parallelism ViT DP (Data Parallel) -^^^^^^^^^^^^^^^^^^^^^^^ +^^^^^^^^^^^^^^^^^^^^^^ - Distribute different image batches to multiple GPUs - Each GPU runs a complete ViT model copy @@ -136,4 +136,4 @@ Testing if response.status_code == 200: print(f"Result: {response.json()}") else: - print(f"Error: {response.status_code}, {response.text}") \ No newline at end of file + print(f"Error: {response.status_code}, {response.text}") From 86ef75f059f58aaba943b94ba399a1a87b558826 Mon Sep 17 00:00:00 2001 From: wangzaijun Date: Fri, 13 Mar 2026 05:18:30 +0000 Subject: [PATCH 3/7] fix all --- lightllm/common/basemodel/attention_vit/fa3/fp.py | 3 ++- lightllm/models/qwen2_vl/qwen2_visual.py | 2 ++ .../qwen3_omni_moe_thinker/qwen3_omni_visual.py | 2 ++ lightllm/models/qwen3_vl/qwen3_visual.py | 2 ++ lightllm/server/audioserver/manager.py | 4 ++++ lightllm/server/visualserver/manager.py | 4 ++++ test/acc/test_qwen3_vl.sh | 5 +++++ test/acc/test_vlm_models.py | 12 +++--------- 8 files changed, 24 insertions(+), 10 deletions(-) create mode 100644 test/acc/test_qwen3_vl.sh diff --git a/lightllm/common/basemodel/attention_vit/fa3/fp.py b/lightllm/common/basemodel/attention_vit/fa3/fp.py index 406ff7408d..f804116f1f 100644 --- a/lightllm/common/basemodel/attention_vit/fa3/fp.py +++ b/lightllm/common/basemodel/attention_vit/fa3/fp.py @@ -45,7 +45,8 @@ def _vit_att_fwd( False, window_size[0], window_size[1], - 0.0, + attention_chunk=0, + softcap=0.0, is_rotary_interleaved=False, scheduler_metadata=None, num_splits=1, diff --git a/lightllm/models/qwen2_vl/qwen2_visual.py b/lightllm/models/qwen2_vl/qwen2_visual.py index 0e2af0cbb2..a29cb8758b 100644 --- a/lightllm/models/qwen2_vl/qwen2_visual.py +++ b/lightllm/models/qwen2_vl/qwen2_visual.py @@ -62,6 +62,8 @@ def forward(self, 
hidden_states: torch.Tensor) -> torch.Tensor: hidden_states = hidden_states.view( -1, self.in_channels, self.temporal_patch_size, self.patch_size, self.patch_size ) + # Use channels_last_3d to enable cuDNN optimized Conv3D path + hidden_states = hidden_states.contiguous(memory_format=torch.channels_last_3d) hidden_states = self.proj(hidden_states).view(-1, self.embed_dim) return hidden_states diff --git a/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_visual.py b/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_visual.py index ffa2e19bd6..c20c227996 100644 --- a/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_visual.py +++ b/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_visual.py @@ -68,6 +68,8 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: hidden_states = hidden_states.view( -1, self.in_channels, self.temporal_patch_size, self.patch_size, self.patch_size ) + # Use channels_last_3d to enable cuDNN optimized Conv3D path + hidden_states = hidden_states.contiguous(memory_format=torch.channels_last_3d) hidden_states = self.proj(hidden_states).view(-1, self.embed_dim) return hidden_states diff --git a/lightllm/models/qwen3_vl/qwen3_visual.py b/lightllm/models/qwen3_vl/qwen3_visual.py index 00ad6c05a7..d389c853d5 100644 --- a/lightllm/models/qwen3_vl/qwen3_visual.py +++ b/lightllm/models/qwen3_vl/qwen3_visual.py @@ -68,6 +68,8 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: hidden_states = hidden_states.view( -1, self.in_channels, self.temporal_patch_size, self.patch_size, self.patch_size ) + # Use channels_last_3d to enable cuDNN optimized Conv3D path + hidden_states = hidden_states.contiguous(memory_format=torch.channels_last_3d) hidden_states = self.proj(hidden_states).view(-1, self.embed_dim) return hidden_states diff --git a/lightllm/server/audioserver/manager.py b/lightllm/server/audioserver/manager.py index 945e67681d..bb0a745302 100644 --- a/lightllm/server/audioserver/manager.py +++ 
b/lightllm/server/audioserver/manager.py @@ -138,6 +138,10 @@ async def loop_for_netio_req(self): while True: recv_req: GroupReqIndexes = await self.zmq_recv_socket.recv_pyobj() if isinstance(recv_req, GroupReqIndexes): + logger.info( + f"audio recv req id {recv_req.group_req_id} " + f"audio count {len(recv_req.multimodal_params.audios)}" + ) self.waiting_reqs.append(recv_req) else: assert False, f"Error Req Inf {recv_req}" diff --git a/lightllm/server/visualserver/manager.py b/lightllm/server/visualserver/manager.py index 202c2fc453..f1cc62aa23 100644 --- a/lightllm/server/visualserver/manager.py +++ b/lightllm/server/visualserver/manager.py @@ -189,6 +189,10 @@ async def loop_for_netio_req(self): for _ in range(self.visual_recv_max_count): recv_req: GroupReqIndexes = self.zmq_recv_socket.recv_pyobj(zmq.NOBLOCK) if isinstance(recv_req, GroupReqIndexes): + logger.info( + f"visual recv req id {recv_req.group_req_id} " + f"img count {len(recv_req.multimodal_params.images)}" + ) self.waiting_reqs.append(recv_req) else: assert False, f"Error Req Inf {recv_req}" diff --git a/test/acc/test_qwen3_vl.sh b/test/acc/test_qwen3_vl.sh new file mode 100644 index 0000000000..6cb223cf85 --- /dev/null +++ b/test/acc/test_qwen3_vl.sh @@ -0,0 +1,5 @@ +# first +LOADWORKER=18 CUDA_VISIBLE_DEVICES=6,7 python -m lightllm.server.api_server --model_dir /mtc/models/Qwen3-VL-8B-Instruct --tp 2 --port 8089 --enable_multimodal_visual + +# second +python test_vlm_models.py \ No newline at end of file diff --git a/test/acc/test_vlm_models.py b/test/acc/test_vlm_models.py index 738cc5f64a..64ceede02f 100644 --- a/test/acc/test_vlm_models.py +++ b/test/acc/test_vlm_models.py @@ -14,14 +14,8 @@ """ # VLM models for testing -MODELS = [ - SimpleNamespace( - model="Qwen/Qwen2.5-VL-7B-Instruct", - mmmu_accuracy=0.4, - ), -] os.environ["OPENAI_API_KEY"] = "lightllm123" -os.environ["OPENAI_API_BASE"] = "http://localhost:8000/v1" +os.environ["OPENAI_API_BASE"] = "http://localhost:8089/v1" def 
run_mmmu_eval( @@ -37,7 +31,7 @@ def run_mmmu_eval( model = "openai_compatible" tp = 1 tasks = "mmmu_val" - batch_size = 16 + batch_size = 900 log_suffix = "openai_compatible" os.makedirs(output_path, exist_ok=True) @@ -72,4 +66,4 @@ def run_mmmu_eval( ) -run_mmmu_eval("/mtc/sangchengmeng/models/Qwen3-VL-8B-Instruct/", "./logs") +run_mmmu_eval("/mtc/models/Qwen3-VL-8B-Instruct", "./logs") From 0afe722a2b46e5d73cb117a640c184b96fdb558b Mon Sep 17 00:00:00 2001 From: wangzaijun Date: Fri, 13 Mar 2026 07:40:12 +0000 Subject: [PATCH 4/7] fix --- lightllm/server/api_cli.py | 10 +-- lightllm/server/api_start.py | 21 ++++++- lightllm/server/core/objs/start_args_type.py | 4 +- lightllm/server/httpserver/manager.py | 8 +-- lightllm/server/visualserver/manager.py | 4 +- lightllm/utils/config_utils.py | 66 ++++++++++++++++++++ 6 files changed, 98 insertions(+), 15 deletions(-) diff --git a/lightllm/server/api_cli.py b/lightllm/server/api_cli.py index ea347081a8..762d84575b 100644 --- a/lightllm/server/api_cli.py +++ b/lightllm/server/api_cli.py @@ -279,14 +279,16 @@ def make_argument_parser() -> argparse.ArgumentParser: use env FIRST_ALLOWED_TOKENS to set the range, like FIRST_ALLOWED_TOKENS=1,2 ..""", ) parser.add_argument( - "--enable_multimodal_visual", + "--disable_vision", action="store_true", - help="Whether or not to allow to load additional visual model module.", + default=None, + help="if the model is a multimodal model, set to not load vision part model.", ) parser.add_argument( - "--enable_multimodal_audio", + "--disable_audio", action="store_true", - help="Whether or not to allow to load additional audio model module.", + default=None, + help="if the model is a multimodal model, set to not load audio part model.", ) parser.add_argument( "--enable_mps", action="store_true", help="Whether to enable nvidia mps for multimodal service." 
diff --git a/lightllm/server/api_start.py b/lightllm/server/api_start.py index bf5ba48c35..400c022e25 100644 --- a/lightllm/server/api_start.py +++ b/lightllm/server/api_start.py @@ -16,6 +16,7 @@ from lightllm.utils.process_check import is_process_active from lightllm.utils.multinode_utils import send_and_receive_node_ip from lightllm.utils.shm_size_check import check_recommended_shm_size +from lightllm.utils.config_utils import has_audio_module, has_vision_module logger = init_logger(__name__) @@ -78,7 +79,21 @@ def normal_or_p_d_start(args): if args.run_mode not in ["normal", "prefill", "decode", "nixl_prefill", "nixl_decode"]: return - if args.enable_multimodal_visual or args.enable_multimodal_audio: + # 通过模型的参数判断是否是多模态模型,包含哪几种模态, 并设置是否启动相应得模块 + if args.disable_vision is None: + if has_vision_module(args.model_dir): + args.disable_vision = False + else: + args.disable_vision = True + if args.disable_audio is None: + if has_audio_module(args.model_dir): + args.disable_audio = False + else: + args.disable_audio = True + + if args.disable_vision and args.disable_audio: + args.enable_multimodal = False + else: args.enable_multimodal = True if args.enable_cpu_cache: @@ -289,7 +304,7 @@ def normal_or_p_d_start(args): start_args=[(args,)], ) - if args.enable_multimodal_visual: + if not args.disable_vision: from .visualserver.manager import start_visual_process process_manager.start_submodule_processes( @@ -301,7 +316,7 @@ def normal_or_p_d_start(args): ], ) - if args.enable_multimodal_audio: + if not args.disable_audio: from .audioserver.manager import start_audio_process process_manager.start_submodule_processes( diff --git a/lightllm/server/core/objs/start_args_type.py b/lightllm/server/core/objs/start_args_type.py index ca3d64651c..087ba02505 100644 --- a/lightllm/server/core/objs/start_args_type.py +++ b/lightllm/server/core/objs/start_args_type.py @@ -79,8 +79,8 @@ class StartArgs: output_constraint_mode: str = field(default="none", metadata={"choices": ["none", 
"simple", "xgrammar"]}) first_token_constraint_mode: bool = field(default=False) enable_multimodal: bool = field(default=False) - enable_multimodal_visual: bool = field(default=False) - enable_multimodal_audio: bool = field(default=False) + disable_vision: Optional[bool] = field(default=None) + disable_audio: Optional[bool] = field(default=None) enable_tpsp_mix_mode: bool = field(default=False) enable_dp_prefill_balance: bool = field(default=False) enable_decode_microbatch_overlap: bool = field(default=False) diff --git a/lightllm/server/httpserver/manager.py b/lightllm/server/httpserver/manager.py index b796f5b988..470ea1d3d3 100644 --- a/lightllm/server/httpserver/manager.py +++ b/lightllm/server/httpserver/manager.py @@ -83,11 +83,11 @@ def __init__( self.cache_client = rpyc.connect("localhost", args.cache_port, config={"allow_pickle": True}) self.cache_client._channel.stream.sock.setsockopt(socket.IPPROTO_TCP, socket.TCP_NODELAY, 1) - if self.args.enable_multimodal_visual: + if not self.args.disable_vision: self.send_to_visual = context.socket(zmq.PUSH) self.send_to_visual.connect(f"{args.zmq_mode}127.0.0.1:{args.visual_port}") - if self.args.enable_multimodal_audio: + if not self.args.disable_audio: self.send_to_audio = context.socket(zmq.PUSH) self.send_to_audio.connect(f"{args.zmq_mode}127.0.0.1:{args.audio_port}") @@ -527,11 +527,11 @@ async def transfer_to_next_module( ): if self.pd_mode.is_P_or_NORMAL(): - if self.args.enable_multimodal_visual: + if not self.args.disable_vision: self.send_to_visual.send_pyobj(group_req_objs.to_group_req_index(), protocol=pickle.HIGHEST_PROTOCOL) return - if self.args.enable_multimodal_audio: + if not self.args.disable_audio: self.send_to_audio.send_pyobj(group_req_objs.to_group_req_index(), protocol=pickle.HIGHEST_PROTOCOL) return diff --git a/lightllm/server/visualserver/manager.py b/lightllm/server/visualserver/manager.py index f1cc62aa23..ae68b82647 100644 --- a/lightllm/server/visualserver/manager.py +++ 
b/lightllm/server/visualserver/manager.py @@ -32,8 +32,8 @@ def __init__( visual_model_rpc_ports, ): context = zmq.Context(2) - - if args.enable_multimodal_audio: + enable_audio = not args.disable_audio + if enable_audio: self.send_to_next_module = context.socket(zmq.PUSH) self.send_to_next_module.connect(f"{args.zmq_mode}127.0.0.1:{args.audio_port}") else: diff --git a/lightllm/utils/config_utils.py b/lightllm/utils/config_utils.py index 790f185f25..77add61eef 100644 --- a/lightllm/utils/config_utils.py +++ b/lightllm/utils/config_utils.py @@ -145,3 +145,69 @@ def get_fixed_kv_len(): return len(model_cfg["prompt_cache_token_ids"]) else: return 0 + + +@lru_cache(maxsize=None) +def has_vision_module(model_path: str) -> bool: + try: + from transformers.configuration_utils import PretrainedConfig + + model_cfg, _ = PretrainedConfig.get_config_dict(model_path) + model_type = model_cfg["model_type"] + if model_type == "qwen": + # QWenVisionTransformer + model_cfg["visual"] + return True + elif model_type == "qwen2_vl": + # Qwen2VisionTransformerPretrainedModel + model_cfg["vision_config"] + return True + elif model_type == "qwen2_5_vl": + # Qwen2_5_VisionTransformerPretrainedModel + model_cfg["vision_config"] + return True + elif model_type in ["qwen3_vl", "qwen3_vl_moe"]: + # Qwen3VisionTransformerPretrainedModel + model_cfg["vision_config"] + return True + elif model_cfg["architectures"][0] == "TarsierForConditionalGeneration": + # TarsierVisionTransformerPretrainedModel + return True + elif model_type == "internvl_chat": + return True + elif model_type == "gemma3": + return True + elif ( + model_cfg.get("thinker_config", {}).get("vision_config", {}).get("model_type") + == "qwen3_omni_moe_vision_encoder" + ): + # Qwen3OmniMoeVisionTransformerPretrainedModel + return True + else: + raise Exception("unknown vision model type") + except: + logger.info(f"model path: {model_path} does not has vision module") + return False + + +@lru_cache(maxsize=None) +def 
has_audio_module(model_path: str) -> bool: + try: + from transformers.configuration_utils import PretrainedConfig + + model_cfg, _ = PretrainedConfig.get_config_dict(model_path) + if model_cfg.get("thinker_config") is not None: + model_cfg = model_cfg["thinker_config"] + audio_config = model_cfg["audio_config"] + model_type = audio_config["model_type"] + if model_type == "clap_audio_model" or model_type == "whisper": + # WhisperAudioModel + return True + elif model_type == "qwen3_omni_moe_audio_encoder": + # Qwen3OmniMoeAudioEncoder + return True + else: + raise Exception("unknown audio model type") + except: + logger.info(f"model path: {model_path} does not has audio module") + return False From 8d7d453c01fa207fa29b84fff6d25f177dda8eb8 Mon Sep 17 00:00:00 2001 From: wangzaijun Date: Fri, 13 Mar 2026 07:50:34 +0000 Subject: [PATCH 5/7] fix --- docs/CN/source/models/supported_models.rst | 12 ++++++------ docs/CN/source/tutorial/api_server_args.rst | 8 ++++---- docs/CN/source/tutorial/multimodal.rst | 4 +--- docs/EN/source/models/supported_models.rst | 12 ++++++------ docs/EN/source/tutorial/api_server_args.rst | 8 ++++---- docs/EN/source/tutorial/multimodal.rst | 4 +--- lightllm/server/api_start.py | 5 +++++ 7 files changed, 27 insertions(+), 26 deletions(-) diff --git a/docs/CN/source/models/supported_models.rst b/docs/CN/source/models/supported_models.rst index 89398a05c5..3d66d2e073 100755 --- a/docs/CN/source/models/supported_models.rst +++ b/docs/CN/source/models/supported_models.rst @@ -68,17 +68,17 @@ lightllm 支持大多数的主流的开源大语言模型以及多模态模型 * - 模型 - 备注 * - `Qwen-VL `_ - - :code:`--trust_remote_code --enable_multimodal_visual` + - :code:`--trust_remote_code` * - `Qwen-VL-Chat `_ - - :code:`--trust_remote_code --enable_multimodal_visual` + - :code:`--trust_remote_code` * - `Llava-7b `_ - - :code:`--enable_multimodal_visual` + - * - `Llava-13b `_ - - :code:`--enable_multimodal_visual` + - * - `Qwen2-VL `_ - - :code:`--enable_multimodal_visual` + - * - `Google Gemma3 
`_ - - :code:`--enable_multimodal_visual` + - Reward模型 diff --git a/docs/CN/source/tutorial/api_server_args.rst b/docs/CN/source/tutorial/api_server_args.rst index a991848a38..2354818efa 100644 --- a/docs/CN/source/tutorial/api_server_args.rst +++ b/docs/CN/source/tutorial/api_server_args.rst @@ -248,13 +248,13 @@ PD 分离模式参数 多模态参数 ---------- -.. option:: --enable_multimodal_visual +.. option:: --disable_vision - 是否允许加载额外的视觉模型模块 + 如果模型是多模态模型,设置此参数将不加载视觉部分模型(默认为None,会根据模型自动检测) -.. option:: --enable_multimodal_audio +.. option:: --disable_audio - 是否允许加载额外的音频模型模块 + 如果模型是多模态模型,设置此参数将不加载音频部分模型(默认为None,会根据模型自动检测) .. option:: --enable_mps diff --git a/docs/CN/source/tutorial/multimodal.rst b/docs/CN/source/tutorial/multimodal.rst index 4701395ede..26f34dd1c8 100644 --- a/docs/CN/source/tutorial/multimodal.rst +++ b/docs/CN/source/tutorial/multimodal.rst @@ -15,8 +15,7 @@ LightLLM支持多种多模态模型的推理,下面以InternVL为例,对多 --tp 2 \ --model_dir ${MODEL_PATH} \ --mem_fraction 0.8 \ - --trust_remote_code \ - --enable_multimodal_visual + --trust_remote_code 核心参数说明 ------------ @@ -35,7 +34,6 @@ LightLLM支持多种多模态模型的推理,下面以InternVL为例,对多 - **--model_dir**: InternVL模型文件路径 - **--mem_fraction 0.8**: GPU显存使用比例 - **--trust_remote_code**: 允许加载自定义模型代码 -- **--enable_multimodal_visual**: 启用视觉多模态功能 高级配置参数 ------------ diff --git a/docs/EN/source/models/supported_models.rst b/docs/EN/source/models/supported_models.rst index 193278cc30..791373866a 100755 --- a/docs/EN/source/models/supported_models.rst +++ b/docs/EN/source/models/supported_models.rst @@ -65,17 +65,17 @@ Multimodal Models * - Model - Notes * - `Qwen-VL `_ - - :code:`--trust_remote_code --enable_multimodal_visual` + - :code:`--trust_remote_code` * - `Qwen-VL-Chat `_ - - :code:`--trust_remote_code --enable_multimodal_visual` + - :code:`--trust_remote_code` * - `Llava-7b `_ - - :code:`--enable_multimodal_visual` + - * - `Llava-13b `_ - - :code:`--enable_multimodal_visual` + - * - `Qwen2-VL `_ - - :code:`--enable_multimodal_visual` + - * - `Google 
Gemma3 `_ - - :code:`--enable_multimodal_visual` + - Reward Models ^^^^^^^^^^^^^ diff --git a/docs/EN/source/tutorial/api_server_args.rst b/docs/EN/source/tutorial/api_server_args.rst index 19477857fd..ac4c1b87ec 100644 --- a/docs/EN/source/tutorial/api_server_args.rst +++ b/docs/EN/source/tutorial/api_server_args.rst @@ -246,13 +246,13 @@ Output Constraint Parameters Multimodal Parameters --------------------- -.. option:: --enable_multimodal_visual +.. option:: --disable_vision - Whether to allow loading additional visual model module + If the model is a multimodal model, set this to not load the vision part model (default is None, auto-detected based on model) -.. option:: --enable_multimodal_audio +.. option:: --disable_audio - Whether to allow loading additional audio model module + If the model is a multimodal model, set this to not load the audio part model (default is None, auto-detected based on model) .. option:: --enable_mps diff --git a/docs/EN/source/tutorial/multimodal.rst b/docs/EN/source/tutorial/multimodal.rst index b0e60f59e8..6df11de847 100755 --- a/docs/EN/source/tutorial/multimodal.rst +++ b/docs/EN/source/tutorial/multimodal.rst @@ -15,8 +15,7 @@ Basic Launch Command --tp 2 \ --model_dir ${MODEL_PATH} \ --mem_fraction 0.8 \ - --trust_remote_code \ - --enable_multimodal_visual + --trust_remote_code Core Parameter Description -------------------------- @@ -35,7 +34,6 @@ Basic Service Parameters - **--model_dir**: InternVL model file path - **--mem_fraction 0.8**: GPU memory usage ratio - **--trust_remote_code**: Allow loading custom model code -- **--enable_multimodal_visual**: Enable visual multimodal functionality Advanced Configuration Parameters --------------------------------- diff --git a/lightllm/server/api_start.py b/lightllm/server/api_start.py index 400c022e25..d790e9adfc 100644 --- a/lightllm/server/api_start.py +++ b/lightllm/server/api_start.py @@ -91,6 +91,11 @@ def normal_or_p_d_start(args): else: args.disable_audio = True + # pd 
分离模式下,不启动多模态的模块 + if args.run_mode in ["decode", "nixl_decode"]: + args.disable_audio = True + args.disable_vision = True + if args.disable_vision and args.disable_audio: args.enable_multimodal = False else: From f8c7214ff69fc631cf8ea378df4afa5175627756 Mon Sep 17 00:00:00 2001 From: wangzaijun Date: Fri, 13 Mar 2026 08:08:23 +0000 Subject: [PATCH 6/7] fix --- lightllm/server/api_start.py | 6 +++--- lightllm/server/core/objs/start_args_type.py | 3 --- lightllm/server/httpserver/manager.py | 2 +- test/acc/test_qwen3_vl.sh | 2 +- 4 files changed, 5 insertions(+), 8 deletions(-) diff --git a/lightllm/server/api_start.py b/lightllm/server/api_start.py index d790e9adfc..0db786d0bf 100644 --- a/lightllm/server/api_start.py +++ b/lightllm/server/api_start.py @@ -68,9 +68,6 @@ def normal_or_p_d_start(args): set_unique_server_name(args) - if not args.disable_shm_warning: - check_recommended_shm_size(args) - if args.enable_mps: from lightllm.utils.device_utils import enable_mps @@ -108,6 +105,9 @@ def normal_or_p_d_start(args): if args.enable_multimodal: args.multi_modal_cache_shm_id = uuid.uuid1().int % 123456789 + if not args.disable_shm_warning: + check_recommended_shm_size(args) + assert args.zmq_mode in ["tcp://", "ipc:///tmp/"] # 确保单机上多实列不冲突 if args.zmq_mode == "ipc:///tmp/": diff --git a/lightllm/server/core/objs/start_args_type.py b/lightllm/server/core/objs/start_args_type.py index 087ba02505..4b54cdccef 100644 --- a/lightllm/server/core/objs/start_args_type.py +++ b/lightllm/server/core/objs/start_args_type.py @@ -161,6 +161,3 @@ class StartArgs: metric_port: int = field(default=None) multinode_httpmanager_port: int = field(default=12345) multi_level_kv_cache_port: int = field(default=None) - # multi_modal - enable_multimodal: bool = field(default=False) - enable_multimodal_audio: bool = field(default=False) diff --git a/lightllm/server/httpserver/manager.py b/lightllm/server/httpserver/manager.py index 470ea1d3d3..6481098eb9 100644 --- 
a/lightllm/server/httpserver/manager.py +++ b/lightllm/server/httpserver/manager.py @@ -444,7 +444,7 @@ async def _encode( len(multimodal_params.images + multimodal_params.audios) <= self.args.cache_capacity ), "too many multimodal items!" if multimodal_params.audios: - assert self.args.enable_multimodal_audio, "audio multimodal not enabled" + assert not self.args.disable_audio, "audio multimodal not enabled" await self._alloc_multimodal_resources(multimodal_params, sampling_params) prompt_ids = self.tokenizer.encode( prompt, multimodal_params, add_special_tokens=sampling_params.add_special_tokens diff --git a/test/acc/test_qwen3_vl.sh b/test/acc/test_qwen3_vl.sh index 6cb223cf85..ec5521f225 100644 --- a/test/acc/test_qwen3_vl.sh +++ b/test/acc/test_qwen3_vl.sh @@ -1,5 +1,5 @@ # first -LOADWORKER=18 CUDA_VISIBLE_DEVICES=6,7 python -m lightllm.server.api_server --model_dir /mtc/models/Qwen3-VL-8B-Instruct --tp 2 --port 8089 --enable_multimodal_visual +LOADWORKER=18 CUDA_VISIBLE_DEVICES=6,7 python -m lightllm.server.api_server --model_dir /mtc/models/Qwen3-VL-8B-Instruct --tp 2 --port 8089 # second python test_vlm_models.py \ No newline at end of file From 07129835ac0ab0104b7a50d23f880a4a209dc979 Mon Sep 17 00:00:00 2001 From: wangzaijun Date: Fri, 13 Mar 2026 08:31:46 +0000 Subject: [PATCH 7/7] fix --- docs/EN/source/models/supported_models.rst | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/docs/EN/source/models/supported_models.rst b/docs/EN/source/models/supported_models.rst index 791373866a..1b1d4fcd03 100755 --- a/docs/EN/source/models/supported_models.rst +++ b/docs/EN/source/models/supported_models.rst @@ -54,6 +54,8 @@ Large Language Models - * - `Qwen3-Moe `_ - + * - `DeepSeek-V3.2 `_ + - Multimodal Models ^^^^^^^^^^^^^^^^^ @@ -75,6 +77,8 @@ Multimodal Models * - `Qwen2-VL `_ - * - `Google Gemma3 `_ + - + * - `Qwen3-VL `_ - Reward Models