diff --git a/docs/CN/source/models/supported_models.rst b/docs/CN/source/models/supported_models.rst index 8f567899d9..3d66d2e073 100755 --- a/docs/CN/source/models/supported_models.rst +++ b/docs/CN/source/models/supported_models.rst @@ -68,17 +68,17 @@ lightllm 支持大多数的主流的开源大语言模型以及多模态模型 * - 模型 - 备注 * - `Qwen-VL `_ - - :code:`--trust_remote_code --enable_multimodal` + - :code:`--trust_remote_code` * - `Qwen-VL-Chat `_ - - :code:`--trust_remote_code --enable_multimodal` + - :code:`--trust_remote_code` * - `Llava-7b `_ - - :code:`--enable_multimodal` + - * - `Llava-13b `_ - - :code:`--enable_multimodal` + - * - `Qwen2-VL `_ - - :code:`--enable_multimodal` + - * - `Google Gemma3 `_ - - :code:`--enable_multimodal` + - Reward模型 diff --git a/docs/CN/source/tutorial/api_server_args.rst b/docs/CN/source/tutorial/api_server_args.rst index 191a84bbe6..2354818efa 100644 --- a/docs/CN/source/tutorial/api_server_args.rst +++ b/docs/CN/source/tutorial/api_server_args.rst @@ -248,13 +248,13 @@ PD 分离模式参数 多模态参数 ---------- -.. option:: --enable_multimodal +.. option:: --disable_vision - 是否允许加载额外的视觉模型 + 如果模型是多模态模型,设置此参数将不加载视觉部分模型(默认为None,会根据模型自动检测) -.. option:: --enable_multimodal_audio +.. option:: --disable_audio - 是否允许加载额外的音频模型(需要 --enable_multimodal) + 如果模型是多模态模型,设置此参数将不加载音频部分模型(默认为None,会根据模型自动检测) .. 
option:: --enable_mps diff --git a/docs/CN/source/tutorial/multimodal.rst b/docs/CN/source/tutorial/multimodal.rst index 31c9dd144e..26f34dd1c8 100644 --- a/docs/CN/source/tutorial/multimodal.rst +++ b/docs/CN/source/tutorial/multimodal.rst @@ -15,8 +15,7 @@ LightLLM支持多种多模态模型的推理,下面以InternVL为例,对多 --tp 2 \ --model_dir ${MODEL_PATH} \ --mem_fraction 0.8 \ - --trust_remote_code \ - --enable_multimodal + --trust_remote_code 核心参数说明 ------------ @@ -28,14 +27,13 @@ LightLLM支持多种多模态模型的推理,下面以InternVL为例,对多 - **LOADWORKER**: 设置模型加载的工作进程数 基础服务参数 -^^^^^^^^^^^^ +^^^^^^^^^^^ - **--port 8080**: API服务器监听端口 - **--tp 2**: 张量并行度(Tensor Parallelism) - **--model_dir**: InternVL模型文件路径 - **--mem_fraction 0.8**: GPU显存使用比例 - **--trust_remote_code**: 允许加载自定义模型代码 -- **--enable_multimodal**: 启用多模态功能 高级配置参数 ------------ diff --git a/docs/EN/source/models/supported_models.rst b/docs/EN/source/models/supported_models.rst index bc19456668..1b1d4fcd03 100755 --- a/docs/EN/source/models/supported_models.rst +++ b/docs/EN/source/models/supported_models.rst @@ -54,6 +54,8 @@ Large Language Models - * - `Qwen3-Moe `_ - + * - `DeepSeek-V3.2 `_ + - Multimodal Models ^^^^^^^^^^^^^^^^^ @@ -65,17 +67,19 @@ Multimodal Models * - Model - Notes * - `Qwen-VL `_ - - :code:`--trust_remote_code --enable_multimodal` + - :code:`--trust_remote_code` * - `Qwen-VL-Chat `_ - - :code:`--trust_remote_code --enable_multimodal` + - :code:`--trust_remote_code` * - `Llava-7b `_ - - :code:`--enable_multimodal` + - * - `Llava-13b `_ - - :code:`--enable_multimodal` + - * - `Qwen2-VL `_ - - :code:`--enable_multimodal` + - * - `Google Gemma3 `_ - - :code:`--enable_multimodal` + - + * - `Qwen3-VL `_ + - Reward Models ^^^^^^^^^^^^^ diff --git a/docs/EN/source/tutorial/api_server_args.rst b/docs/EN/source/tutorial/api_server_args.rst index a7686a44b0..ac4c1b87ec 100644 --- a/docs/EN/source/tutorial/api_server_args.rst +++ b/docs/EN/source/tutorial/api_server_args.rst @@ -246,13 +246,13 @@ Output Constraint Parameters Multimodal 
Parameters --------------------- -.. option:: --enable_multimodal +.. option:: --disable_vision - Whether to allow loading additional visual models + If the model is a multimodal model, set this to not load the vision part model (default is None, auto-detected based on model) -.. option:: --enable_multimodal_audio +.. option:: --disable_audio - Whether to allow loading additional audio models (requires --enable_multimodal) + If the model is a multimodal model, set this to not load the audio part model (default is None, auto-detected based on model) .. option:: --enable_mps diff --git a/docs/EN/source/tutorial/multimodal.rst b/docs/EN/source/tutorial/multimodal.rst index f8eead96cf..6df11de847 100755 --- a/docs/EN/source/tutorial/multimodal.rst +++ b/docs/EN/source/tutorial/multimodal.rst @@ -15,14 +15,13 @@ Basic Launch Command --tp 2 \ --model_dir ${MODEL_PATH} \ --mem_fraction 0.8 \ - --trust_remote_code \ - --enable_multimodal + --trust_remote_code Core Parameter Description -------------------------- Environment Variables -^^^^^^^^^^^^^^^^^^^^^ +^^^^^^^^^^^^^^^^^^^^ - **INTERNVL_IMAGE_LENGTH**: Set the image token length for InternVL model, default is 256 - **LOADWORKER**: Set the number of worker processes for model loading @@ -35,7 +34,6 @@ Basic Service Parameters - **--model_dir**: InternVL model file path - **--mem_fraction 0.8**: GPU memory usage ratio - **--trust_remote_code**: Allow loading custom model code -- **--enable_multimodal**: Enable multimodal functionality Advanced Configuration Parameters --------------------------------- @@ -55,16 +53,16 @@ Advanced Configuration Parameters .. note:: To ensure equal memory load on each GPU, visual_dp * visual_tp = tp is required. For example, if tp=2, then visual_dp=1, visual_tp=2. 
ViT Deployment Methods ----------------------- +----------------------- ViT TP (Tensor Parallel) -^^^^^^^^^^^^^^^^^^^^^^^^^ +^^^^^^^^^^^^^^^^^^^^^^^^ - Default usage - --visual_tp tp_size enables tensor parallelism ViT DP (Data Parallel) -^^^^^^^^^^^^^^^^^^^^^^^ +^^^^^^^^^^^^^^^^^^^^^^ - Distribute different image batches to multiple GPUs - Each GPU runs a complete ViT model copy @@ -136,4 +134,4 @@ Testing if response.status_code == 200: print(f"Result: {response.json()}") else: - print(f"Error: {response.status_code}, {response.text}") \ No newline at end of file + print(f"Error: {response.status_code}, {response.text}") diff --git a/lightllm/common/basemodel/attention_vit/fa3/fp.py b/lightllm/common/basemodel/attention_vit/fa3/fp.py index 406ff7408d..f804116f1f 100644 --- a/lightllm/common/basemodel/attention_vit/fa3/fp.py +++ b/lightllm/common/basemodel/attention_vit/fa3/fp.py @@ -45,7 +45,8 @@ def _vit_att_fwd( False, window_size[0], window_size[1], - 0.0, + attention_chunk=0, + softcap=0.0, is_rotary_interleaved=False, scheduler_metadata=None, num_splits=1, diff --git a/lightllm/models/qwen2_vl/qwen2_visual.py b/lightllm/models/qwen2_vl/qwen2_visual.py index 0e2af0cbb2..a29cb8758b 100644 --- a/lightllm/models/qwen2_vl/qwen2_visual.py +++ b/lightllm/models/qwen2_vl/qwen2_visual.py @@ -62,6 +62,8 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: hidden_states = hidden_states.view( -1, self.in_channels, self.temporal_patch_size, self.patch_size, self.patch_size ) + # Use channels_last_3d to enable cuDNN optimized Conv3D path + hidden_states = hidden_states.contiguous(memory_format=torch.channels_last_3d) hidden_states = self.proj(hidden_states).view(-1, self.embed_dim) return hidden_states diff --git a/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_visual.py b/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_visual.py index ffa2e19bd6..c20c227996 100644 --- a/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_visual.py +++ 
b/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_visual.py @@ -68,6 +68,8 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: hidden_states = hidden_states.view( -1, self.in_channels, self.temporal_patch_size, self.patch_size, self.patch_size ) + # Use channels_last_3d to enable cuDNN optimized Conv3D path + hidden_states = hidden_states.contiguous(memory_format=torch.channels_last_3d) hidden_states = self.proj(hidden_states).view(-1, self.embed_dim) return hidden_states diff --git a/lightllm/models/qwen3_vl/qwen3_visual.py b/lightllm/models/qwen3_vl/qwen3_visual.py index 00ad6c05a7..d389c853d5 100644 --- a/lightllm/models/qwen3_vl/qwen3_visual.py +++ b/lightllm/models/qwen3_vl/qwen3_visual.py @@ -68,6 +68,8 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: hidden_states = hidden_states.view( -1, self.in_channels, self.temporal_patch_size, self.patch_size, self.patch_size ) + # Use channels_last_3d to enable cuDNN optimized Conv3D path + hidden_states = hidden_states.contiguous(memory_format=torch.channels_last_3d) hidden_states = self.proj(hidden_states).view(-1, self.embed_dim) return hidden_states diff --git a/lightllm/server/api_cli.py b/lightllm/server/api_cli.py index 37b2bb3ef3..762d84575b 100644 --- a/lightllm/server/api_cli.py +++ b/lightllm/server/api_cli.py @@ -279,12 +279,16 @@ def make_argument_parser() -> argparse.ArgumentParser: use env FIRST_ALLOWED_TOKENS to set the range, like FIRST_ALLOWED_TOKENS=1,2 ..""", ) parser.add_argument( - "--enable_multimodal", action="store_true", help="Whether or not to allow to load additional visual models." 
+ "--disable_vision", + action="store_true", + default=None, + help="if the model is a multimodal model, set to not load vision part model.", ) parser.add_argument( - "--enable_multimodal_audio", + "--disable_audio", action="store_true", - help="Whether or not to allow to load additional audio models (requird --enable_multimodal).", + default=None, + help="if the model is a multimodal model, set to not load audio part model.", ) parser.add_argument( "--enable_mps", action="store_true", help="Whether to enable nvidia mps for multimodal service." diff --git a/lightllm/server/api_start.py b/lightllm/server/api_start.py index 111def60c2..0db786d0bf 100644 --- a/lightllm/server/api_start.py +++ b/lightllm/server/api_start.py @@ -16,6 +16,7 @@ from lightllm.utils.process_check import is_process_active from lightllm.utils.multinode_utils import send_and_receive_node_ip from lightllm.utils.shm_size_check import check_recommended_shm_size +from lightllm.utils.config_utils import has_audio_module, has_vision_module logger = init_logger(__name__) @@ -67,9 +68,6 @@ def normal_or_p_d_start(args): set_unique_server_name(args) - if not args.disable_shm_warning: - check_recommended_shm_size(args) - if args.enable_mps: from lightllm.utils.device_utils import enable_mps @@ -78,6 +76,28 @@ def normal_or_p_d_start(args): if args.run_mode not in ["normal", "prefill", "decode", "nixl_prefill", "nixl_decode"]: return + # 通过模型的参数判断是否是多模态模型,包含哪几种模态, 并设置是否启动相应得模块 + if args.disable_vision is None: + if has_vision_module(args.model_dir): + args.disable_vision = False + else: + args.disable_vision = True + if args.disable_audio is None: + if has_audio_module(args.model_dir): + args.disable_audio = False + else: + args.disable_audio = True + + # pd 分离模式下,不启动多模态的模块 + if args.run_mode in ["decode", "nixl_decode"]: + args.disable_audio = True + args.disable_vision = True + + if args.disable_vision and args.disable_audio: + args.enable_multimodal = False + else: + args.enable_multimodal = True + if 
args.enable_cpu_cache: # 生成一个用于创建cpu kv cache的共享内存id。 args.cpu_kv_cache_shm_id = uuid.uuid1().int % 123456789 @@ -85,6 +105,9 @@ def normal_or_p_d_start(args): if args.enable_multimodal: args.multi_modal_cache_shm_id = uuid.uuid1().int % 123456789 + if not args.disable_shm_warning: + check_recommended_shm_size(args) + assert args.zmq_mode in ["tcp://", "ipc:///tmp/"] # 确保单机上多实列不冲突 if args.zmq_mode == "ipc:///tmp/": @@ -279,14 +302,16 @@ def normal_or_p_d_start(args): ports_locker.release_port() if args.enable_multimodal: - from .visualserver.manager import start_visual_process - process_manager.start_submodule_processes( start_funcs=[ start_cache_manager, ], start_args=[(args,)], ) + + if not args.disable_vision: + from .visualserver.manager import start_visual_process + process_manager.start_submodule_processes( start_funcs=[ start_visual_process, @@ -296,17 +321,17 @@ def normal_or_p_d_start(args): ], ) - if args.enable_multimodal_audio: - from .audioserver.manager import start_audio_process - - process_manager.start_submodule_processes( - start_funcs=[ - start_audio_process, - ], - start_args=[ - (args,), - ], - ) + if not args.disable_audio: + from .audioserver.manager import start_audio_process + + process_manager.start_submodule_processes( + start_funcs=[ + start_audio_process, + ], + start_args=[ + (args,), + ], + ) if args.enable_cpu_cache: from .multi_level_kv_cache.manager import start_multi_level_kv_cache_manager diff --git a/lightllm/server/audioserver/manager.py b/lightllm/server/audioserver/manager.py index 945e67681d..bb0a745302 100644 --- a/lightllm/server/audioserver/manager.py +++ b/lightllm/server/audioserver/manager.py @@ -138,6 +138,10 @@ async def loop_for_netio_req(self): while True: recv_req: GroupReqIndexes = await self.zmq_recv_socket.recv_pyobj() if isinstance(recv_req, GroupReqIndexes): + logger.info( + f"audio recv req id {recv_req.group_req_id} " + f"audio count {len(recv_req.multimodal_params.audios)}" + ) 
self.waiting_reqs.append(recv_req) else: assert False, f"Error Req Inf {recv_req}" diff --git a/lightllm/server/core/objs/start_args_type.py b/lightllm/server/core/objs/start_args_type.py index 751cb60ecd..4b54cdccef 100644 --- a/lightllm/server/core/objs/start_args_type.py +++ b/lightllm/server/core/objs/start_args_type.py @@ -79,7 +79,8 @@ class StartArgs: output_constraint_mode: str = field(default="none", metadata={"choices": ["none", "simple", "xgrammar"]}) first_token_constraint_mode: bool = field(default=False) enable_multimodal: bool = field(default=False) - enable_multimodal_audio: bool = field(default=False) + disable_vision: Optional[bool] = field(default=None) + disable_audio: Optional[bool] = field(default=None) enable_tpsp_mix_mode: bool = field(default=False) enable_dp_prefill_balance: bool = field(default=False) enable_decode_microbatch_overlap: bool = field(default=False) @@ -160,6 +161,3 @@ class StartArgs: metric_port: int = field(default=None) multinode_httpmanager_port: int = field(default=12345) multi_level_kv_cache_port: int = field(default=None) - # multi_modal - enable_multimodal: bool = field(default=False) - enable_multimodal_audio: bool = field(default=False) diff --git a/lightllm/server/httpserver/manager.py b/lightllm/server/httpserver/manager.py index 27709ee32b..6481098eb9 100644 --- a/lightllm/server/httpserver/manager.py +++ b/lightllm/server/httpserver/manager.py @@ -78,11 +78,19 @@ def __init__( ) self.enable_multimodal = args.enable_multimodal + if self.enable_multimodal: self.cache_client = rpyc.connect("localhost", args.cache_port, config={"allow_pickle": True}) self.cache_client._channel.stream.sock.setsockopt(socket.IPPROTO_TCP, socket.TCP_NODELAY, 1) + + if not self.args.disable_vision: self.send_to_visual = context.socket(zmq.PUSH) self.send_to_visual.connect(f"{args.zmq_mode}127.0.0.1:{args.visual_port}") + + if not self.args.disable_audio: + self.send_to_audio = context.socket(zmq.PUSH) + 
self.send_to_audio.connect(f"{args.zmq_mode}127.0.0.1:{args.audio_port}") + if args.enable_cpu_cache and not self.args.enable_multimodal: self.send_to_multi_level_kv_cache = context.socket(zmq.PUSH) self.send_to_multi_level_kv_cache.connect(f"{args.zmq_mode}127.0.0.1:{args.multi_level_kv_cache_port}") @@ -436,7 +444,7 @@ async def _encode( len(multimodal_params.images + multimodal_params.audios) <= self.args.cache_capacity ), "too many multimodal items!" if multimodal_params.audios: - assert self.args.enable_multimodal_audio, "audio multimodal not enabled" + assert not self.args.disable_audio, "audio multimodal not enabled" await self._alloc_multimodal_resources(multimodal_params, sampling_params) prompt_ids = self.tokenizer.encode( prompt, multimodal_params, add_special_tokens=sampling_params.add_special_tokens @@ -519,11 +527,12 @@ async def transfer_to_next_module( ): if self.pd_mode.is_P_or_NORMAL(): - if self.enable_multimodal: - self.send_to_visual.send_pyobj( - group_req_objs.to_group_req_index(), - protocol=pickle.HIGHEST_PROTOCOL, - ) + if not self.args.disable_vision: + self.send_to_visual.send_pyobj(group_req_objs.to_group_req_index(), protocol=pickle.HIGHEST_PROTOCOL) + return + + if not self.args.disable_audio: + self.send_to_audio.send_pyobj(group_req_objs.to_group_req_index(), protocol=pickle.HIGHEST_PROTOCOL) return if self.args.enable_cpu_cache: diff --git a/lightllm/server/visualserver/manager.py b/lightllm/server/visualserver/manager.py index 202c2fc453..ae68b82647 100644 --- a/lightllm/server/visualserver/manager.py +++ b/lightllm/server/visualserver/manager.py @@ -32,8 +32,8 @@ def __init__( visual_model_rpc_ports, ): context = zmq.Context(2) - - if args.enable_multimodal_audio: + enable_audio = not args.disable_audio + if enable_audio: self.send_to_next_module = context.socket(zmq.PUSH) self.send_to_next_module.connect(f"{args.zmq_mode}127.0.0.1:{args.audio_port}") else: @@ -189,6 +189,10 @@ async def loop_for_netio_req(self): for _ in 
range(self.visual_recv_max_count): recv_req: GroupReqIndexes = self.zmq_recv_socket.recv_pyobj(zmq.NOBLOCK) if isinstance(recv_req, GroupReqIndexes): + logger.info( + f"visual recv req id {recv_req.group_req_id} " + f"img count {len(recv_req.multimodal_params.images)}" + ) self.waiting_reqs.append(recv_req) else: assert False, f"Error Req Inf {recv_req}" diff --git a/lightllm/utils/config_utils.py b/lightllm/utils/config_utils.py index 790f185f25..77add61eef 100644 --- a/lightllm/utils/config_utils.py +++ b/lightllm/utils/config_utils.py @@ -145,3 +145,69 @@ def get_fixed_kv_len(): return len(model_cfg["prompt_cache_token_ids"]) else: return 0 + + +@lru_cache(maxsize=None) +def has_vision_module(model_path: str) -> bool: + try: + from transformers.configuration_utils import PretrainedConfig + + model_cfg, _ = PretrainedConfig.get_config_dict(model_path) + model_type = model_cfg["model_type"] + if model_type == "qwen": + # QWenVisionTransformer + model_cfg["visual"] + return True + elif model_type == "qwen2_vl": + # Qwen2VisionTransformerPretrainedModel + model_cfg["vision_config"] + return True + elif model_type == "qwen2_5_vl": + # Qwen2_5_VisionTransformerPretrainedModel + model_cfg["vision_config"] + return True + elif model_type in ["qwen3_vl", "qwen3_vl_moe"]: + # Qwen3VisionTransformerPretrainedModel + model_cfg["vision_config"] + return True + elif model_cfg["architectures"][0] == "TarsierForConditionalGeneration": + # TarsierVisionTransformerPretrainedModel + return True + elif model_type == "internvl_chat": + return True + elif model_type == "gemma3": + return True + elif ( + model_cfg.get("thinker_config", {}).get("vision_config", {}).get("model_type") + == "qwen3_omni_moe_vision_encoder" + ): + # Qwen3OmniMoeVisionTransformerPretrainedModel + return True + else: + raise Exception("unknown vision model type") + except: + logger.info(f"model path: {model_path} does not has vision module") + return False + + +@lru_cache(maxsize=None) +def 
has_audio_module(model_path: str) -> bool: + try: + from transformers.configuration_utils import PretrainedConfig + + model_cfg, _ = PretrainedConfig.get_config_dict(model_path) + if model_cfg.get("thinker_config") is not None: + model_cfg = model_cfg["thinker_config"] + audio_config = model_cfg["audio_config"] + model_type = audio_config["model_type"] + if model_type == "clap_audio_model" or model_type == "whisper": + # WhisperAudioModel + return True + elif model_type == "qwen3_omni_moe_audio_encoder": + # Qwen3OmniMoeAudioEncoder + return True + else: + raise Exception("unknown audio model type") + except: + logger.info(f"model path: {model_path} does not has audio module") + return False diff --git a/test/acc/test_qwen3_vl.sh b/test/acc/test_qwen3_vl.sh new file mode 100644 index 0000000000..ec5521f225 --- /dev/null +++ b/test/acc/test_qwen3_vl.sh @@ -0,0 +1,5 @@ +# first +LOADWORKER=18 CUDA_VISIBLE_DEVICES=6,7 python -m lightllm.server.api_server --model_dir /mtc/models/Qwen3-VL-8B-Instruct --tp 2 --port 8089 + +# second +python test_vlm_models.py \ No newline at end of file diff --git a/test/acc/test_vlm_models.py b/test/acc/test_vlm_models.py index 738cc5f64a..64ceede02f 100644 --- a/test/acc/test_vlm_models.py +++ b/test/acc/test_vlm_models.py @@ -14,14 +14,8 @@ """ # VLM models for testing -MODELS = [ - SimpleNamespace( - model="Qwen/Qwen2.5-VL-7B-Instruct", - mmmu_accuracy=0.4, - ), -] os.environ["OPENAI_API_KEY"] = "lightllm123" -os.environ["OPENAI_API_BASE"] = "http://localhost:8000/v1" +os.environ["OPENAI_API_BASE"] = "http://localhost:8089/v1" def run_mmmu_eval( @@ -37,7 +31,7 @@ def run_mmmu_eval( model = "openai_compatible" tp = 1 tasks = "mmmu_val" - batch_size = 16 + batch_size = 900 log_suffix = "openai_compatible" os.makedirs(output_path, exist_ok=True) @@ -72,4 +66,4 @@ def run_mmmu_eval( ) -run_mmmu_eval("/mtc/sangchengmeng/models/Qwen3-VL-8B-Instruct/", "./logs") +run_mmmu_eval("/mtc/models/Qwen3-VL-8B-Instruct", "./logs")