Commit c25c078

Author: niushengxiao
Commit message: refine
1 parent: 100088a

12 files changed: 58 additions & 122 deletions
lightllm/common/basemodel/attention_vit/fa3/fp.py

Lines changed: 2 additions & 3 deletions
@@ -18,7 +18,7 @@ def _vit_att_fwd(
     head_dim = q.shape[-1]
     softmax_scale = head_dim ** -0.5
     window_size = (-1, -1)
-    attn_output = flash_attn_varlen_func(
+    o = flash_attn_varlen_func(
         q,
         k,
         v,
@@ -29,8 +29,7 @@ def _vit_att_fwd(
         softmax_scale=softmax_scale,
         causal=False,
         window_size=window_size,
+        attention_chunk=0,
         softcap=0.0,
     )
-    o.copy_(attn_output)
-
     return o
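
This hunk removes a redundant device copy: instead of materializing flash-attn's result and then `copy_`-ing it into a preallocated `o`, the wrapper returns the tensor that `flash_attn_varlen_func` allocates itself. A minimal sketch of the before/after pattern, where `attn_impl` is a stand-in for any attention call that returns its output (not the real wrapper signature):

import torch

def attn_fwd_old(q, k, v, o, attn_impl):
    # old: the output is materialized twice -- once by the attention
    # call, once more by the copy into the caller's buffer
    attn_output = attn_impl(q, k, v)
    o.copy_(attn_output)
    return o

def attn_fwd_new(q, k, v, attn_impl):
    # new: hand back the kernel's own output tensor, no extra copy
    return attn_impl(q, k, v)

# toy attention as the stand-in implementation
def toy_attn(q, k, v):
    scale = q.shape[-1] ** -0.5
    return torch.softmax(q @ k.transpose(-1, -2) * scale, dim=-1) @ v

q = k = v = torch.randn(4, 16, 64)
out = attn_fwd_new(q, k, v, toy_attn)  # one allocation instead of two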

lightllm/models/qwen_vl/layer_infer/pre_layer_infer.py

Lines changed: 9 additions & 29 deletions
@@ -6,7 +6,7 @@
 from lightllm.models.llama.layer_weights.pre_and_post_layer_weight import LlamaPreAndPostLayerWeight
 from lightllm.models.llama.infer_struct import LlamaInferStateInfo
 from lightllm.models.llama.layer_infer.pre_layer_infer import LlamaPreLayerInfer
-from lightllm.server.embed_cache.utils import bytes2tensor, read_shm, get_shm_name_embed, load_tensor_afs
+from lightllm.server.embed_cache.utils import get_shm_name_embed, load_tensor_afs
 from lightllm.common.basemodel.triton_kernel.multimodal_emb import multimodal_emb
 from lightllm.distributed.communication_op import all_reduce
 from lightllm.utils.envs_utils import get_env_start_args
@@ -41,9 +41,7 @@ def _copy_loaded_embed_to_cache(
         self, embed_tensor: torch.Tensor, cpu_embed_cache_tensor: torch.Tensor, start_index: int
     ):
         if embed_tensor.ndim == 2:
-            token_num, hidden_size = embed_tensor.shape
-            cpu_embed_cache_tensor[start_index : start_index + token_num, 0, :hidden_size].copy_(embed_tensor)
-            return
+            embed_tensor = embed_tensor.unsqueeze(1)

         token_num, layer_num, hidden_size = embed_tensor.shape
         cpu_embed_cache_tensor[start_index : start_index + token_num, :layer_num, :hidden_size].copy_(embed_tensor)
@@ -53,18 +51,20 @@ def context_forward(self, input_ids, infer_state: LlamaInferStateInfo, layer_wei
         img_start_token_ids = []
         img_token_lens = []
         img_start_locs_in_cache = []
+        unique_uids = []
         device = layer_weight.wte_weight_.weight.device
         dtype = layer_weight.wte_weight_.weight.dtype
         hidden_size = layer_weight.wte_weight_.weight.shape[1]

-        for batch_id, p in enumerate(infer_state.multimodal_params):
+        for _, p in enumerate(infer_state.multimodal_params):
             for img in p["images"] + p["audios"]:
                 # skip the same image
                 if img["token_id"] in img_start_token_ids:
                     continue
                 img_start_token_ids.append(img["token_id"])
                 img_token_lens.append(img["token_num"])
                 img_start_locs_in_cache.append(img["start_index_in_embed_cache"])
+                unique_uids.append(img["uuid"])
         out = torch.zeros((len(input_ids), hidden_size), dtype=dtype, device=device)

         from lightllm.server.router.model_infer.infer_batch import g_infer_context
@@ -77,33 +77,13 @@ def context_forward(self, input_ids, infer_state: LlamaInferStateInfo, layer_wei
         )

         if self.args.enable_remote_vit:
-            unique_multimodal_items = []
-            seen_uuids = set()
             release_ids = []
-            for batch_id, p in enumerate(infer_state.multimodal_params):
+            for _, p in enumerate(infer_state.multimodal_params):
                 for img in p["images"] + p["audios"]:
-                    if img["token_num"] is None:
-                        continue
-                    uid = img["uuid"]
-                    release_ids.append(uid)
-                    if uid in seen_uuids:
-                        continue
-                    seen_uuids.add(uid)
-                    unique_multimodal_items.append((uid, img["start_index_in_embed_cache"]))
+                    release_ids.append(img["uuid"])

-            if self.args.image_embed_dir:
-                image_embed_dir = self.args.image_embed_dir
-
-                def load_embed_tensor(uid):
-                    return load_tensor_afs(get_shm_name_embed(uid), image_embed_dir)
-
-            else:
-
-                def load_embed_tensor(uid):
-                    return bytes2tensor(read_shm(get_shm_name_embed(uid)))
-
-            for uid, start_index_in_embed_cache in unique_multimodal_items:
-                embed_tensor = load_embed_tensor(uid)
+            for uid, start_index_in_embed_cache in zip(unique_uids, img_start_locs_in_cache):
+                embed_tensor = load_tensor_afs(get_shm_name_embed(uid), self.args.image_embed_dir)
                 self._copy_loaded_embed_to_cache(embed_tensor, cpu_embed_cache_tensor, start_index_in_embed_cache)

             if release_ids:
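
The `_copy_loaded_embed_to_cache` rewrite folds the 2-D special case into the 3-D path: a `(token_num, hidden_size)` embed is lifted to `(token_num, 1, hidden_size)` via `unsqueeze(1)`, so one slice-and-copy serves both shapes. A standalone sketch of that normalization (the cache dimensions below are made up for illustration):

import torch

def copy_embed_to_cache(embed: torch.Tensor, cache: torch.Tensor, start: int) -> None:
    # normalize (token_num, hidden) -> (token_num, 1, hidden) so a single
    # code path handles single-layer and multi-layer embeds
    if embed.ndim == 2:
        embed = embed.unsqueeze(1)
    token_num, layer_num, hidden = embed.shape
    cache[start : start + token_num, :layer_num, :hidden].copy_(embed)

cache = torch.zeros(64, 4, 128)  # hypothetical cache: 64 slots, up to 4 layers
copy_embed_to_cache(torch.randn(7, 128), cache, start=0)     # 2-D input
copy_embed_to_cache(torch.randn(7, 4, 128), cache, start=7)  # 3-D input

The loader loop also simplifies because `unique_uids` and `img_start_locs_in_cache` are built by the same dedup pass earlier in `context_forward`, so `zip`-ping them pairs each unique uuid with its cache offset without a second dedup structure.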

lightllm/models/vit/triton_kernel/flashattention_nopad.py

Lines changed: 2 additions & 4 deletions
@@ -167,7 +167,7 @@ def flash_attention_v3_fwd(
         head_dim = q.shape[-1]
         softmax_scale = head_dim ** -0.5
         window_size = (-1, -1)
-        attn_output = flash_attn_varlen_func(
+        o = flash_attn_varlen_func(
             q,
             k,
             v,
@@ -180,9 +180,7 @@ def flash_attention_v3_fwd(
             window_size=window_size,
             softcap=0.0,
         )
-        o.copy_(attn_output)
-
-        return
+        return o

 except ImportError:
     print("Failed to import _flash_attn_forward from hopper.flash_attn_interface.")

lightllm/server/api_lightllm.py

Lines changed: 1 addition & 2 deletions
@@ -1,11 +1,10 @@
 import collections
 from typing import AsyncGenerator
 from fastapi import BackgroundTasks, Request
-from fastapi.responses import Response, StreamingResponse
+from fastapi.responses import Response, StreamingResponse, JSONResponse
 from lightllm.server.core.objs.sampling_params import SamplingParams
 from .multimodal_params import MultimodalParams
 from .httpserver.manager import HttpServerManager
-from fastapi.responses import JSONResponse
 import ujson as json


lightllm/server/api_start.py

Lines changed: 26 additions & 26 deletions
@@ -87,7 +87,7 @@ def signal_handler(sig, frame):
     return


-def check_and_set_args(args):
+def normal_or_p_d_start(args, only_prepare=False):
     from lightllm.server.core.objs.start_args_type import StartArgs

     args: StartArgs = args
@@ -219,20 +219,18 @@ def check_and_set_args(args):
         if args.batch_max_tokens is None:
             args.batch_max_tokens = args.max_req_total_len
         else:
-            assert args.batch_max_tokens >= args.max_req_total_len, (
-                f"batch_max_tokens must >= max_req_total_len"
-                f"but got {args.batch_max_tokens}, {args.max_req_total_len}"
-            )
+            assert args.batch_max_tokens >= args.max_req_total_len, f"batch_max_tokens must >= max_req_total_len"
+            f"but got {args.batch_max_tokens}, {args.max_req_total_len}"
     else:
         # in chunked prefill mode
         if args.batch_max_tokens is None:
             args.batch_max_tokens = 16384 // args.dp
         if args.chunked_prefill_size is None:
             args.chunked_prefill_size = args.batch_max_tokens // 2
-        assert args.batch_max_tokens >= args.chunked_prefill_size, (
-            "chunked prefill mode, batch_max_tokens must >= chunked_prefill_size, "
-            f"but got {args.batch_max_tokens}, {args.chunked_prefill_size}"
-        )
+        assert (
+            args.batch_max_tokens >= args.chunked_prefill_size
+        ), "chunked prefill mode, batch_max_tokens must >= chunked_prefill_size, "
+        f"but got {args.batch_max_tokens}, {args.chunked_prefill_size}"

     # help to manage data stored on Ceph
     if "s3://" in args.model_dir:
@@ -252,9 +250,8 @@
     args.data_type = get_dtype(args.model_dir)
     assert args.data_type in ["fp16", "float16", "bf16", "bfloat16", "fp32", "float32"]

-
-def normal_or_p_d_start(args):
-    check_and_set_args(args)
+    if only_prepare:
+        return

     already_uesd_ports = [args.port]
     if args.nccl_port is not None:
@@ -291,17 +288,19 @@ def normal_or_p_d_start(args):
     can_use_ports = can_use_ports[10:]

     visual_model_tp_ports = []
+    visual_nccl_ports = []
     for _ in range(args.visual_dp):
         tp_ports_for_dp = can_use_ports[0 : args.visual_tp]
         visual_model_tp_ports.append(tp_ports_for_dp)
         can_use_ports = can_use_ports[args.visual_tp :]
+        if args.visual_nccl_ports is None:
+            visual_nccl_ports.append(can_use_ports[0])
+            can_use_ports = can_use_ports[1:]

-    if args.visual_nccl_ports is None:
-        visual_nccl_ports = can_use_ports[0 : args.visual_dp]
-        can_use_ports = can_use_ports[args.visual_dp :]
-    else:
-        visual_nccl_ports = args.visual_nccl_ports[: args.visual_dp]
+    if args.visual_nccl_ports is not None:
+        args.visual_nccl_ports = args.visual_nccl_ports[: args.visual_dp]

+    # store the allocated ports back into args
     if args.nccl_port is None:
         args.nccl_port = nccl_port
     if args.pd_decode_rpyc_port is None:
@@ -328,6 +327,7 @@
         args.router_max_wait_tokens = 0

     send_and_receive_node_ip(args)  # exchange node IPs between machines in multi-node setups
+    # dp must be > 1
     if args.enable_dp_prompt_cache_fetch and args.dp <= 1:
         args.enable_dp_prompt_cache_fetch = False
         logger.warning(
@@ -491,7 +491,7 @@ def pd_master_start(args):


 def visual_start(args):
-    check_and_set_args(args)
+    normal_or_p_d_start(args, only_prepare=True)

     already_uesd_ports = [args.remote_vit_port]
     if args.nccl_port is not None:
@@ -515,23 +515,23 @@ def visual_start(args):
     can_use_ports = can_use_ports[5:]

     visual_model_tp_ports = []
+    visual_nccl_ports = []
     for _ in range(args.visual_dp):
         tp_ports_for_dp = can_use_ports[0 : args.visual_tp]
-        can_use_ports = can_use_ports[args.visual_tp :]
         visual_model_tp_ports.append(tp_ports_for_dp)
+        can_use_ports = can_use_ports[args.visual_tp :]
+        if args.visual_nccl_ports is None:
+            visual_nccl_ports.append(can_use_ports[0])
+            can_use_ports = can_use_ports[1:]

-    if args.visual_nccl_ports is None:
-        args.visual_nccl_ports = can_use_ports[0 : args.visual_dp]
-        can_use_ports = can_use_ports[args.visual_dp :]
-    else:
+    if args.visual_nccl_ports is not None:
         args.visual_nccl_ports = args.visual_nccl_ports[: args.visual_dp]

     args.router_port = router_port
     args.visual_port = visual_port
     args.audio_port = audio_port
     args.cache_port = cache_port
     args.metric_port = metric_port
-    args.visual_model_rpc_ports = visual_model_tp_ports
     args.visual_node_id = uuid.uuid4().int

     logger.info(f"all start args:{args}")
@@ -586,9 +586,9 @@ def config_server_start(args):
        "--log-level",
        "info",
        "--access-logfile",
-       "/dev/stdout",
+       "-",
        "--error-logfile",
-       "/dev/stderr",
+       "-",
        "lightllm.server.config_server.api_http:app",
        "--keep-alive",
        f"{get_lightllm_gunicorn_keep_alive()}",

lightllm/server/embed_cache/impl/memory_cache_with_redis.py

Lines changed: 8 additions & 18 deletions
@@ -51,9 +51,11 @@ def set_items_embed(self, ids: list[int]) -> None:
         with self.lock:
             for id in ids:
                 self.redis_cache.insert(str(id))
-                self._records[id].embed = True
-                if self._records[id].ref > 0:
-                    self._update_record_ref_by_id(id, -1)
+                rec = self._records.get(id)
+                if rec is not None:
+                    rec.embed = True
+                    if rec.ref > 0:
+                        self._update_record_ref_by_id(id, -1)
                 # keep one redis reference and only release it after the real consumer
                 # finishes reading, so the LRU cannot evict an entry that VIT just
                 # wrote but the LLM has not read yet.

@@ -66,19 +68,7 @@ def get_items_embed(self, ids: list[int], embeding_only: bool = False) -> list[O
                 exist = self.redis_cache.query_and_incre(str(id))
                 ret.append(exist)
                 if exist:
-                    self._records[id].embed = True
+                    rec = self._records.get(id)
+                    if rec is not None:
+                        rec.embed = True
         return ret
-
-    # def get_items_embed_and_incre(self, ids: list[int]) -> list[Optional[bool]]:
-    #     ret = []
-    #     for id in ids:
-    #         # if self.redis_cache.query(str(id)):
-    #         #     ret.append(True)
-    #         #     continue
-    #         # avoid incrementing the reference count twice
-    #         if self._records[id].embed:
-    #             ret.append(True)
-    #             continue
-    #         self._records[id].embed = self.redis_cache.query_and_incre(str(id))
-    #         ret.append(self._records[id].embed)
-    #     return ret
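
Both methods now tolerate ids whose record has already been dropped: `self._records[id]` raises `KeyError` for an evicted entry, while `dict.get` returns `None` and lets the Redis-side update proceed anyway. A minimal sketch of the pattern, with a `Record` shape assumed from the fields the diff touches:

from dataclasses import dataclass

@dataclass
class Record:
    embed: bool = False
    ref: int = 0

records = {1: Record(ref=1)}

def mark_embedded(record_id: int) -> None:
    rec = records.get(record_id)  # None if the record was evicted meanwhile
    if rec is not None:
        rec.embed = True
        if rec.ref > 0:
            rec.ref -= 1

mark_embedded(1)   # updates the live record
mark_embedded(99)  # missing id is skipped instead of raising KeyError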

lightllm/server/embed_cache/manager.py

Lines changed: 0 additions & 4 deletions
@@ -26,10 +26,6 @@ def on_disconnect(self, conn):
         # (to finalize the service, if needed)
         pass

-    def exposed__check_and_set_new_id_range(self, token_num: int) -> int:
-        token_num = obtain(token_num)
-        return self._impl._check_and_set_new_id_range(token_num)
-
     def exposed_alloc(self, md5sum_list: list[str], token_num_list: list[int]) -> Optional[list[dict]]:
         md5sum_list = obtain(md5sum_list)
         token_num_list = obtain(token_num_list)

lightllm/server/embed_cache/utils.py

Lines changed: 0 additions & 20 deletions
@@ -10,7 +10,6 @@
 from lightllm.utils.log_utils import init_logger

 logger = init_logger(__name__)
-_ENSURED_AFS_DIRS = set()


 def _get_afs_path(base_dir: str, name: str) -> Path:
@@ -19,21 +18,6 @@ def _get_afs_path(base_dir: str, name: str) -> Path:
     return Path(base_dir) / name


-def _ensure_afs_dir(base_dir: Path) -> None:
-    base_dir_key = str(base_dir)
-    if base_dir_key in _ENSURED_AFS_DIRS:
-        return
-    if base_dir.exists():
-        if not base_dir.is_dir():
-            raise ValueError(f"image_embed_dir is not a directory: {base_dir}")
-        _ENSURED_AFS_DIRS.add(base_dir_key)
-        return
-
-    base_dir.mkdir(parents=True, mode=0o777, exist_ok=True)
-    os.chmod(base_dir, 0o777)
-    _ENSURED_AFS_DIRS.add(base_dir_key)
-
-
 def tensor2bytes(t: torch.Tensor):
     buf = BytesIO()
     t = t.detach().cpu()
@@ -50,13 +34,11 @@ def bytes2tensor(b):

 def save_tensor_afs(name: str, tensor: torch.Tensor, base_dir: str) -> None:
     target_path = _get_afs_path(base_dir, name)
-    _ensure_afs_dir(target_path.parent)
     tmp_path = target_path.parent / f".{target_path.name}.tmp-{os.getpid()}-{time.time_ns()}"

     try:
         with open(tmp_path, "wb") as f:
             torch.save(tensor.detach().cpu(), f, _use_new_zipfile_serialization=False, pickle_protocol=4)
-        os.chmod(tmp_path, 0o777)
         os.replace(tmp_path, target_path)
         os.chmod(target_path, 0o777)
     except Exception:
@@ -86,7 +68,6 @@ def create_shm(name, data):

 def create_afs(name, data, path):
     target_path = _get_afs_path(path, name)
-    _ensure_afs_dir(target_path.parent)
     data_size = len(data)
     tmp_path = target_path.parent / f".{target_path.name}.tmp-{os.getpid()}-{time.time_ns()}"

@@ -96,7 +77,6 @@ def create_afs(name, data, path):
             f.write(mem_view[:data_size])
             f.flush()
             os.fsync(f.fileno())
-        os.chmod(tmp_path, 0o777)
         os.replace(tmp_path, target_path)
         os.chmod(target_path, 0o777)
     except Exception:
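
With the eager `_ensure_afs_dir` bookkeeping and the extra chmod on the temp file gone, what remains is the atomic-publish pattern: write to a hidden per-process temp file, then `os.replace` it over the final name, so a concurrent reader sees either the old file or the complete new one, never a partial write. A condensed sketch of that surviving pattern (the `atomic_write` helper is illustrative, simplified from `save_tensor_afs`/`create_afs`):

import os
import time
from pathlib import Path

def atomic_write(target_path: Path, data: bytes) -> None:
    # unique temp name per writer: pid plus a nanosecond timestamp
    tmp_path = target_path.parent / f".{target_path.name}.tmp-{os.getpid()}-{time.time_ns()}"
    try:
        with open(tmp_path, "wb") as f:
            f.write(data)
            f.flush()
            os.fsync(f.fileno())  # make the bytes durable before publishing
        os.replace(tmp_path, target_path)  # atomic rename on POSIX
        os.chmod(target_path, 0o777)
    except Exception:
        tmp_path.unlink(missing_ok=True)  # best-effort temp cleanup
        raise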
