
Commit 5f36055

feat: add GLM-4.7-Flash (glm4_moe_lite) model support
- Add glm4_moe_lite model implementation with MLA attention
- Add glm4_moe_lite_mtp for multi-token prediction support
- Refactor attention kernels to use dynamic batch size
- Add kernel configs for H200 GPU optimization
- Add BFCL evaluation scripts for function calling
1 parent 871ace6 commit 5f36055

37 files changed

Lines changed: 2230 additions & 34 deletions

lightllm/common/basemodel/attention/flashinfer/mla.py

Lines changed: 5 additions & 2 deletions
@@ -16,6 +16,8 @@ def __init__(self, model):
         self.qk_nope_head_dim = model.qk_nope_head_dim
         self.qk_rope_head_dim = model.qk_rope_head_dim
         self.kv_lora_rank = model.kv_lora_rank
+        # v_head_dim may differ from qk_nope_head_dim (e.g., GLM-4.7-Flash: v_head_dim=256, qk_nope_head_dim=192)
+        self.v_head_dim = getattr(model, "v_head_dim", self.qk_nope_head_dim)
         self.q_data_type = model.data_type
         self.kv_data_type = model.data_type
         self.workspace_buffer = torch.empty(256 * 1024 * 1024, dtype=torch.int8, device=get_current_device_id())
@@ -69,7 +71,7 @@ def init_state(self):
             num_qo_heads=self.backend.tp_q_head_num,
             num_kv_heads=self.backend.tp_q_head_num,
             head_dim_qk=self.backend.qk_nope_head_dim + self.backend.qk_rope_head_dim,
-            head_dim_vo=self.backend.qk_nope_head_dim,
+            head_dim_vo=self.backend.v_head_dim,  # Use v_head_dim, not qk_nope_head_dim
             q_data_type=self.backend.q_data_type,
             causal=True,
             sm_scale=self.backend.softmax_scale,
@@ -101,7 +103,8 @@ def _mla_prefill_att(
     ) -> torch.Tensor:
         self.backend: MlaFlashInferAttBackend = self.backend  # for typing
         k_nope, k_rope = k
-        o_tensor = alloc_func((q.shape[0], q.shape[1], k_nope.shape[2]), q.dtype, device="cuda")
+        # Output dimension is v_head_dim (from v.shape[-1]), not qk_nope_head_dim
+        o_tensor = alloc_func((q.shape[0], q.shape[1], v.shape[-1]), q.dtype, device="cuda")
         q_head_num = q.shape[1]
         k = torch.cat([k_nope, torch.repeat_interleave(k_rope, q_head_num, dim=-2)], dim=-1)
         self.prefill_wrapper.run(q, k, v, out=o_tensor)
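The point of both edits is that the attention output inherits V's head dimension, which in GLM-4.7-Flash (v_head_dim=256) no longer matches qk_nope_head_dim (192). A minimal plain-PyTorch sketch of that shape relationship (illustrative dims, not the lightllm API):

import torch
from types import SimpleNamespace

# Stand-in for the model config; older MLA models may not define v_head_dim,
# so the backend falls back to qk_nope_head_dim, mirroring the getattr above.
model = SimpleNamespace(qk_nope_head_dim=192, v_head_dim=256)
v_head_dim = getattr(model, "v_head_dim", model.qk_nope_head_dim)

q_head_num, token_num = 8, 4
q = torch.randn(q_head_num, token_num, model.qk_nope_head_dim)
k = torch.randn(q_head_num, token_num, model.qk_nope_head_dim)
v = torch.randn(q_head_num, token_num, v_head_dim)

attn = torch.softmax(q @ k.transpose(-1, -2) * model.qk_nope_head_dim ** -0.5, dim=-1)
out = attn @ v
# The output takes its last dim from V, so the output buffer must be sized with
# v_head_dim (256), not qk_nope_head_dim (192).
assert out.shape == (q_head_num, token_num, v_head_dim)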

lightllm/common/basemodel/attention/triton/mla.py

Lines changed: 2 additions & 1 deletion
@@ -44,7 +44,8 @@ def _mla_prefill_att(
 
     qk_rope_head_dim = 64
     q_nope, q_rope = q[:, :, :-qk_rope_head_dim], q[:, :, -qk_rope_head_dim:]
-    o_tensor = alloc_func(q_nope.shape, dtype=q_nope.dtype, device=q.device)
+    # GLM-4.7-Flash: v_head_dim != qk_nope_head_dim
+    o_tensor = alloc_func((q_nope.shape[0], q_nope.shape[1], v.shape[-1]), dtype=q_nope.dtype, device=q.device)
     k_nope, k_rope = k
     assert att_control.mla_prefill
     softmax_scale = att_control.mla_prefill_dict["softmax_scale"]

lightllm/common/basemodel/basemodel.py

Lines changed: 1 addition & 0 deletions
@@ -1022,6 +1022,7 @@ def _gen_special_model_input(self, token_num: int):
             "Deepseek3MTPModel" in str(self.__class__)
             or "Qwen3MOEMTPModel" in str(self.__class__)
             or "MistralMTPModel" in str(self.__class__)
+            or "Glm4MoeLiteMTPModel" in str(self.__class__)
         )
         if is_mtp_draft_model:
             special_model_input["mtp_draft_input_hiddens"] = torch.randn(
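The draft-model check works by substring-matching the class name in str(self.__class__); a toy illustration (the class here is a stand-in, not the real lightllm model):

class Glm4MoeLiteMTPModel:  # stand-in class, only to show the string check
    pass

draft = Glm4MoeLiteMTPModel()
# str(cls) looks like "<class '__main__.Glm4MoeLiteMTPModel'>", so a substring test
# is enough to recognize the MTP draft model and allocate mtp_draft_input_hiddens.
assert "Glm4MoeLiteMTPModel" in str(draft.__class__)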

lightllm/common/basemodel/triton_kernel/att/decode_att/gqa/flash_decoding/gqa_flash_decoding_vsm.py

Lines changed: 7 additions & 4 deletions
@@ -81,8 +81,11 @@ def _fwd_kernel_calcu_index_and_block_seq(
     vsm_count,
     batch_size,
     BLOCK_N: tl.constexpr,
+    MAX_BATCH_SIZE: tl.constexpr,
 ):
-    b_seq_len = tl.load(b_seq_len + tl.arange(0, 2048), mask=tl.arange(0, 2048) < batch_size, other=0)
+    b_seq_len = tl.load(
+        b_seq_len + tl.arange(0, MAX_BATCH_SIZE), mask=tl.arange(0, MAX_BATCH_SIZE) < batch_size, other=0
+    )
     total_token_num = tl.sum(b_seq_len)
 
     block_seq = tl.cdiv(total_token_num, vsm_count * 4)
@@ -93,9 +96,9 @@ def _fwd_kernel_calcu_index_and_block_seq(
     cumsum_seq_len = tl.cumsum(block_seq_len)
     batch_start_index = cumsum_seq_len - block_seq_len
     tl.store(
-        mid_o_batch_start_index + tl.arange(0, 2048),
+        mid_o_batch_start_index + tl.arange(0, MAX_BATCH_SIZE),
         batch_start_index,
-        mask=tl.arange(0, 2048) < batch_size,
+        mask=tl.arange(0, MAX_BATCH_SIZE) < batch_size,
     )
     tl.store(mid_o_decode_att_block_seq, block_seq)
 
@@ -455,7 +458,6 @@ def gqa_token_decode_attention_flash_decoding_vsm(
     )
 
     if not hasattr(infer_state, "decode_att_block_seq"):
-        assert batch_size <= 2048
         decode_att_block_seq = torch.empty(
             [
                 1,
@@ -477,6 +479,7 @@ def gqa_token_decode_attention_flash_decoding_vsm(
         num_vsm,
         batch_size,
         BLOCK_N=run_config["BLOCK_N"],
+        MAX_BATCH_SIZE=triton.next_power_of_2(batch_size),
         num_warps=4,
     )
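Instead of a hard-coded tl.arange(0, 2048) and the matching assert batch_size <= 2048, the kernel now receives MAX_BATCH_SIZE, computed on the host as triton.next_power_of_2(batch_size). tl.arange needs a compile-time, power-of-two extent, so the padded range is masked back down to the real batch. A host-side sketch of the same idea (plain PyTorch, not the Triton kernel itself):

import torch

def next_power_of_2(n: int) -> int:
    # Same behaviour as triton.next_power_of_2 for n >= 1.
    return 1 << (n - 1).bit_length()

batch_size = 300
MAX_BATCH_SIZE = next_power_of_2(batch_size)  # 512 here, instead of a fixed 2048
b_seq_len = torch.randint(1, 100, (batch_size,))

# Padded load with a mask, mirroring tl.load(..., mask=arange < batch_size, other=0).
padded = torch.zeros(MAX_BATCH_SIZE, dtype=b_seq_len.dtype)
padded[:batch_size] = b_seq_len  # lanes >= batch_size stay 0, like other=0
total_token_num = int(padded.sum())
assert total_token_num == int(b_seq_len.sum())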

lightllm/common/basemodel/triton_kernel/fused_moe/grouped_topk.py

Lines changed: 1 addition & 1 deletion
@@ -227,7 +227,7 @@ def triton_grouped_topk(
 
     scores_buffer = torch.empty((token_num, total_expert_num), dtype=dtype, device="cuda")
     out_topk_weights = torch.empty((token_num, topk), dtype=torch.float32, device="cuda")
-    out_topk_ids = torch.empty((token_num, topk), dtype=torch.long, device="cuda")
+    out_topk_ids = torch.empty((token_num, topk), dtype=torch.int32, device="cuda")
 
     assert total_expert_num % num_expert_group == 0
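Switching out_topk_ids from torch.long to torch.int32 halves the index buffer; expert ids (at most a few hundred per model) easily fit in 32 bits. A quick check of the footprint (illustrative sizes, run on CPU):

import torch

token_num, topk = 4096, 8
ids_i64 = torch.empty((token_num, topk), dtype=torch.long)
ids_i32 = torch.empty((token_num, topk), dtype=torch.int32)
# 8 bytes vs 4 bytes per id.
assert ids_i64.element_size() == 8 and ids_i32.element_size() == 4
assert ids_i32.nbytes == ids_i64.nbytes // 2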

lightllm/common/basemodel/triton_kernel/fused_moe/topk_select.py

Lines changed: 5 additions & 3 deletions
@@ -196,10 +196,12 @@ def select_experts(
            scoring_func=scoring_func,
        )
    else:
-        group_score_topk_num = 1
-        # for deepseek v3
-        if topk_group == 4 and num_expert_group == 8 and top_k == 8:
+        if correction_bias is not None:
            group_score_topk_num = 2
+        elif topk_group == 4 and num_expert_group == 8 and top_k == 8:
+            group_score_topk_num = 2
+        else:
+            group_score_topk_num = 1
 
        topk_weights, topk_ids = triton_grouped_topk(
            hidden_states=hidden_states,
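The rewritten branch picks group_score_topk_num as follows: 2 whenever correction_bias is present, 2 for the DeepSeek-V3 shape (topk_group == 4, num_expert_group == 8, top_k == 8), and 1 otherwise. The same decision restated as a standalone sketch (a helper written here for illustration, not the lightllm function):

from typing import Optional
import torch

def pick_group_score_topk_num(
    correction_bias: Optional[torch.Tensor], topk_group: int, num_expert_group: int, top_k: int
) -> int:
    # Mirrors the branch in select_experts above.
    if correction_bias is not None:
        return 2
    elif topk_group == 4 and num_expert_group == 8 and top_k == 8:
        return 2
    else:
        return 1

assert pick_group_score_topk_num(None, 4, 8, 8) == 2             # deepseek-v3 case
assert pick_group_score_topk_num(torch.zeros(64), 2, 4, 6) == 2  # bias present
assert pick_group_score_topk_num(None, 2, 4, 6) == 1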

lightllm/common/basemodel/triton_kernel/mla_att/decode_att/gqa_flash_decoding.py

Lines changed: 10 additions & 3 deletions
@@ -67,7 +67,6 @@ def gqa_token_decode_attention_flash_decoding(
     )
 
     if not hasattr(infer_state, "decode_att_block_seq"):
-        assert batch_size <= 2048
         decode_att_block_seq = torch.empty(
             [
                 1,
@@ -89,6 +88,7 @@ def gqa_token_decode_attention_flash_decoding(
         vsm_count,
         batch_size,
         BLOCK_N=BLOCK_N,
+        MAX_BATCH_SIZE=triton.next_power_of_2(batch_size),
         num_warps=4,
     )
 
@@ -134,8 +134,11 @@ def _fwd_kernel_calcu_index_and_block_seq(
     num_sm,
     batch_size,
     BLOCK_N: tl.constexpr,
+    MAX_BATCH_SIZE: tl.constexpr,
 ):
-    b_seq_len = tl.load(b_seq_len_ptr + tl.arange(0, 2048), mask=tl.arange(0, 2048) < batch_size, other=0)
+    b_seq_len = tl.load(
+        b_seq_len_ptr + tl.arange(0, MAX_BATCH_SIZE), mask=tl.arange(0, MAX_BATCH_SIZE) < batch_size, other=0
+    )
     total_token_num = tl.sum(b_seq_len)
 
     block_seq = tl.cast(total_token_num / (num_sm * 4), dtype=tl.int32) + 1
@@ -144,6 +147,10 @@ def _fwd_kernel_calcu_index_and_block_seq(
     block_seq_len = tl.cdiv(b_seq_len, block_seq)
     cumsum_seq_len = tl.cumsum(block_seq_len)
     batch_start_index = cumsum_seq_len - block_seq_len
-    tl.store(mid_o_batch_start_index_ptr + tl.arange(0, 2048), batch_start_index, mask=tl.arange(0, 2048) < batch_size)
+    tl.store(
+        mid_o_batch_start_index_ptr + tl.arange(0, MAX_BATCH_SIZE),
+        batch_start_index,
+        mask=tl.arange(0, MAX_BATCH_SIZE) < batch_size,
+    )
     tl.store(mid_o_decode_att_block_seq_ptr, block_seq)
     return
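Both decode kernels split each request's sequence into block_seq-sized chunks and need a per-batch start offset into the mid_o buffers; cumsum(block_seq_len) - block_seq_len is simply the exclusive prefix sum. A host-side sketch of that bookkeeping (plain PyTorch, toy values assumed):

import torch

b_seq_len = torch.tensor([300, 17, 1024, 64])
num_sm = 4
total_token_num = int(b_seq_len.sum())
block_seq = total_token_num // (num_sm * 4) + 1           # same spirit as tl.cast(total / (num_sm * 4)) + 1
block_seq_len = (b_seq_len + block_seq - 1) // block_seq  # tl.cdiv
cumsum_seq_len = torch.cumsum(block_seq_len, dim=0)
batch_start_index = cumsum_seq_len - block_seq_len        # exclusive prefix sum: where each batch's blocks start
assert batch_start_index[0] == 0
assert torch.equal(batch_start_index[1:], cumsum_seq_len[:-1])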

lightllm/common/basemodel/triton_kernel/mla_att/prefill_att/context_flashattention_nopad_with_v.py

Lines changed: 32 additions & 14 deletions
@@ -36,6 +36,9 @@ def _fwd_kernel_with_v(
     BLOCK_DMODEL: tl.constexpr,
     BLOCK_ROPE_DMODEL: tl.constexpr,
     BLOCK_N: tl.constexpr,
+    BLOCK_V_DMODEL: tl.constexpr,
+    ACTUAL_DMODEL: tl.constexpr,
+    ACTUAL_V_DMODEL: tl.constexpr,
 ):
     cur_batch = tl.program_id(0)
     cur_head = tl.program_id(1)
@@ -53,8 +56,13 @@ def _fwd_kernel_with_v(
     # initialize offsets
     offs_n = tl.arange(0, BLOCK_N)
     offs_d = tl.arange(0, BLOCK_DMODEL)
+    offs_v_d = tl.arange(0, BLOCK_V_DMODEL)
     offs_rope_d = tl.arange(0, BLOCK_ROPE_DMODEL)
     offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M)
+
+    d_mask = offs_d < ACTUAL_DMODEL
+    v_d_mask = offs_v_d < ACTUAL_V_DMODEL
+
     off_q = (cur_batch_in_q_start_index + offs_m[:, None]) * stride_q_bs + cur_head * stride_q_h + offs_d[None, :]
     off_q_rope = (
         (cur_batch_in_q_start_index + offs_m[:, None]) * stride_q_rope_bs
@@ -63,9 +71,10 @@ def _fwd_kernel_with_v(
     )
     off_k = offs_n[None, :] * stride_k_bs + cur_k_head * stride_k_h + offs_d[:, None]
     off_k_rope = offs_n[None, :] * stride_k_rope_bs + offs_rope_d[:, None]
-    off_v = offs_n[:, None] * stride_vbs + cur_k_head * stride_vh + offs_d[None, :]
+    off_v = offs_n[:, None] * stride_vbs + cur_k_head * stride_vh + offs_v_d[None, :]
 
-    q = tl.load(Q_nope + off_q, mask=offs_m[:, None] < cur_batch_seq_len, other=0.0)
+    q_mask = (offs_m[:, None] < cur_batch_seq_len) & d_mask[None, :]
+    q = tl.load(Q_nope + off_q, mask=q_mask, other=0.0)
     q_rope = tl.load(Q_rope + off_q_rope, mask=offs_m[:, None] < cur_batch_seq_len, other=0.0)
 
     k_ptrs = K_nope + off_k
@@ -75,22 +84,24 @@ def _fwd_kernel_with_v(
     # initialize pointer to m and l
     m_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf")
     l_i = tl.zeros([BLOCK_M], dtype=tl.float32)
-    acc = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=tl.float32)
+    acc = tl.zeros([BLOCK_M, BLOCK_V_DMODEL], dtype=tl.float32)
 
     block_mask = tl.where(block_start_loc < cur_batch_seq_len, 1, 0)
     block_end_loc = tl.minimum((start_m + 1) * BLOCK_M + prompt_cache_len, cur_batch_seq_len + prompt_cache_len)
 
     for start_n in range(0, block_mask * block_end_loc, BLOCK_N):
         start_n = tl.multiple_of(start_n, BLOCK_N)
         # -- compute qk ----
+        k_seq_mask = (start_n + offs_n[None, :]) < block_end_loc
+        k_mask = k_seq_mask & d_mask[:, None]
         k = tl.load(
             k_ptrs + (cur_batch_in_kv_start_index + start_n) * stride_k_bs,
-            mask=(start_n + offs_n[None, :]) < block_end_loc,
+            mask=k_mask,
             other=0.0,
         )
         k_rope = tl.load(
             k_rope_ptrs + (cur_batch_in_kv_start_index + start_n) * stride_k_rope_bs,
-            mask=(start_n + offs_n[None, :]) < block_end_loc,
+            mask=k_seq_mask,
             other=0.0,
         )
 
@@ -112,9 +123,11 @@ def _fwd_kernel_with_v(
         # -- update output accumulator --
         acc = acc * alpha[:, None]
         # update acc
+        v_seq_mask = (start_n + offs_n[:, None]) < block_end_loc
+        v_mask = v_seq_mask & v_d_mask[None, :]
         v = tl.load(
             v_ptrs + (cur_batch_in_kv_start_index + start_n) * stride_vbs,
-            mask=(start_n + offs_n[:, None]) < block_end_loc,
+            mask=v_mask,
             other=0.0,
         )
         p = p.to(v.dtype)
@@ -124,9 +137,10 @@ def _fwd_kernel_with_v(
 
     acc = acc / l_i[:, None]
     # initialize pointers to output
-    off_o = (cur_batch_in_q_start_index + offs_m[:, None]) * stride_obs + cur_head * stride_oh + offs_d[None, :]
+    off_o = (cur_batch_in_q_start_index + offs_m[:, None]) * stride_obs + cur_head * stride_oh + offs_v_d[None, :]
     out_ptrs = Out + off_o
-    tl.store(out_ptrs, acc, mask=offs_m[:, None] < cur_batch_seq_len)
+    o_mask = (offs_m[:, None] < cur_batch_seq_len) & v_d_mask[None, :]
+    tl.store(out_ptrs, acc, mask=o_mask)
     return
 
 
@@ -149,13 +163,14 @@ def context_attention_fwd_with_v(
     BLOCK = 128 if not is_tesla() else 64
     q_nope_dim = q_nope.shape[-1]
     q_rope_dim = q_rope.shape[-1]
+    v_dim = v.shape[-1]
     assert q_nope_dim == k_nope.shape[-1]
     assert q_rope_dim == k_rope.shape[-1]
-    assert q_nope_dim in {16, 32, 64, 128, 256, 512}
-    assert q_rope_dim in {16, 32, 64, 128, 256}
-    assert q_nope_dim == v.shape[-1]
 
-    if q_nope_dim >= 512:
+    q_nope_dim_padded = triton.next_power_of_2(q_nope_dim)
+    v_dim_padded = triton.next_power_of_2(v_dim)
+
+    if q_nope_dim_padded >= 512 or v_dim_padded >= 512:
        BLOCK = 64 if not is_tesla() else 32
     else:
        BLOCK = 128 if not is_tesla() else 64
@@ -167,7 +182,7 @@ def context_attention_fwd_with_v(
     batch, head = b_seq_len.shape[0], q_nope.shape[1]
 
     grid = (batch, head, triton.cdiv(max_input_len, BLOCK))  # batch, head,
-    num_warps = 4 if q_nope_dim <= 64 else 8
+    num_warps = 4 if q_nope_dim_padded <= 64 else 8
 
     _fwd_kernel_with_v[grid](
         q_nope,
@@ -194,9 +209,12 @@ def context_attention_fwd_with_v(
         o.stride(1),
         b_prompt_cache_len=b_prompt_cache_len,
         BLOCK_M=BLOCK,
-        BLOCK_DMODEL=q_nope_dim,
+        BLOCK_DMODEL=q_nope_dim_padded,
         BLOCK_ROPE_DMODEL=q_rope_dim,
         BLOCK_N=BLOCK,
+        BLOCK_V_DMODEL=v_dim_padded,
+        ACTUAL_DMODEL=q_nope_dim,
+        ACTUAL_V_DMODEL=v_dim,
         num_warps=num_warps,
         num_stages=1,
     )
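The kernel changes follow a standard Triton pattern: tl.arange extents must be powers of two, so a head dim like 192 is padded up to BLOCK_DMODEL = next_power_of_2(192) = 256 and every load/store is masked with offs < ACTUAL_DMODEL so the padded lanes contribute zeros. A plain-PyTorch sketch of the pad-and-mask idea (illustrative sizes, not the kernel itself):

import torch

def next_power_of_2(n: int) -> int:
    # Same behaviour as triton.next_power_of_2 for n >= 1.
    return 1 << (n - 1).bit_length()

actual_dmodel = 192                             # e.g. GLM-4.7-Flash qk_nope_head_dim
block_dmodel = next_power_of_2(actual_dmodel)   # 256
offs_d = torch.arange(block_dmodel)
d_mask = offs_d < actual_dmodel

row = torch.randn(actual_dmodel)
padded = torch.zeros(block_dmodel)
padded[d_mask] = row                            # masked load: padded lanes stay 0.0
# Zero padding is harmless for the q @ k dot product: the extra lanes add 0 to every score.
assert torch.allclose(padded[:actual_dmodel], row)
assert float(padded[actual_dmodel:].abs().sum()) == 0.0
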
lightllm/common/quantization/no_quant.py

Lines changed: 2 additions & 2 deletions
@@ -28,8 +28,8 @@ def apply(
         device = input_tensor.device
         if use_custom_tensor_mananger:
             out = g_cache_manager.alloc_tensor(shape, dtype, device=device)
-        else:
-            out = torch.empty(shape, dtype=dtype, device=device)
+        else:
+            out = torch.empty(shape, dtype=dtype, device=device)
         if bias is None:
             return torch.mm(input_tensor, weight, out=out)
         return torch.addmm(bias, input_tensor, weight, out=out)
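For reference, the two return paths differ only in whether a bias is folded in: torch.mm writes input @ weight into out, while torch.addmm writes bias + input @ weight. A tiny check with toy shapes:

import torch

inp, weight = torch.randn(3, 4), torch.randn(4, 5)
bias, out = torch.randn(5), torch.empty(3, 5)

torch.mm(inp, weight, out=out)
assert torch.allclose(out, inp @ weight)

torch.addmm(bias, inp, weight, out=out)
assert torch.allclose(out, inp @ weight + bias)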
