From 0f7dedd7fd7f3417686e997e68fd6064829be925 Mon Sep 17 00:00:00 2001
From: Wanglongzhi2001 <583087864@qq.com>
Date: Mon, 30 Mar 2026 21:57:21 +0800
Subject: [PATCH 1/3] [Other] refactor dynamic cache quant test

---
 ...est_kv_cache_int8_dynamic_quant_backend.py | 974 ++++++------------
 1 file changed, 321 insertions(+), 653 deletions(-)

diff --git a/tests/layers/test_kv_cache_int8_dynamic_quant_backend.py b/tests/layers/test_kv_cache_int8_dynamic_quant_backend.py
index 17a393ee11e..5a13349699a 100644
--- a/tests/layers/test_kv_cache_int8_dynamic_quant_backend.py
+++ b/tests/layers/test_kv_cache_int8_dynamic_quant_backend.py
@@ -15,30 +15,30 @@
 """
 
 """
-Unit tests for the KV cache int8 dynamic quant fix on flash_attn_backend
-and flash_mask_attn_backend (commit 584df2ba8).
-
-The fix ensures that when cache_quant_type_str == "block_wise_fp8":
-  - cache_k/v are taken from caches[4*layer_id : 4*layer_id+2]
-  - cache_k/v_scales are taken from caches[4*layer_id+2 : 4*layer_id+4]
-Otherwise (non-dynamic-quant):
-  - cache_k/v are taken from caches[2*layer_id : 2*layer_id+2]
-  - cache_k/v_scales are taken from layer.cache_k_scale / layer.cache_v_scale
-
-Strategy: We mock the entire fastdeploy import chain and the external op
-functions, then verify the correct cache tensors are routed through.
+Unit tests for KV cache dynamic quantization on FlashAttentionBackend
+and FlashMaskAttentionBackend.
+
+Tests:
+  1. Smoke tests: forward_mixed runs without error under dynamic C8.
+  2. Diff tests: dynamic C8 vs C16 produce consistent outputs.
+  3. GPU tests: real GPU forward calls (skipped without GPU).
+
+Extensibility: To add a new quant type (e.g., C4), add an entry to
+QUANT_CONFIGS and follow the existing test patterns.
 """
 
+import math
 import sys
 import types
 import unittest
+from dataclasses import dataclass
 from unittest.mock import patch
 
 import numpy as np
 import paddle
 
 # ---------------------------------------------------------------------------
-# Environment setup: mock missing fastdeploy dependencies before import
+# Environment setup: mock missing dependencies before import
 # ---------------------------------------------------------------------------
 
 
@@ -53,10 +53,8 @@ def _ensure_mock_module(name, attrs=None):
     return sys.modules[name]
 
 
-# Mock problematic transitive dependencies that may be missing in some environments
 _ensure_mock_module("aistudio_sdk.snapshot_download", {"snapshot_download": lambda *a, **kw: None})
 
-# Try importing the backends. If it still fails, mark tests as skipped.
 _IMPORT_ERROR = None
 try:
     from fastdeploy.model_executor.layers.attention.flash_attn_backend import (
@@ -72,50 +70,90 @@ def _ensure_mock_module(name, attrs=None):
 
 
 # ---------------------------------------------------------------------------
-# Dummy / Mock helpers
+# Quant config registry (extend here for new quant types)
 # ---------------------------------------------------------------------------
 
 
-class DummyFDConfig:
-    """Minimal FDConfig for constructing backend objects.
+@dataclass
+class QuantConfig:
+    """Describes one KV cache quantization mode exercised by these tests."""
 
-    Uses __getattr__ to return MagicMock for any missing nested attributes,
-    avoiding the need to enumerate every config attribute.
-    """
+    cache_quant_type_str: str  # value copied to layer.cache_quant_type_str, e.g. "block_wise_fp8" or "none"
+    cache_dtype: str  # dtype of the K/V cache tensors: "uint8" when quantized, "bfloat16" for C16
+    has_dynamic_scales: bool  # True if the k/v scales are stored in the caches list, not on the layer
+    caches_per_layer: int  # caches-list entries per layer: 4 for dynamic (k, v, k_scale, v_scale), 2 otherwise
+
+
+QUANT_CONFIGS = {
+    "C16": QuantConfig("none", "bfloat16", False, 2),
+    "C8_dynamic": QuantConfig("block_wise_fp8", "uint8", True, 4),
+    # Future: "C4_dynamic": QuantConfig("block_wise_int4", "uint8", True, 4),
+}
+
+# ---------------------------------------------------------------------------
+# Test constants
+# ---------------------------------------------------------------------------
+
+BATCH_SIZE = 4
+NUM_HEADS = 56
+KV_NUM_HEADS = 4
+HEAD_DIM = 128
+BLOCK_SIZE = 64
+NUM_LAYERS = 2
+MAX_SEQ_LEN = 2048
+QKV_DIM = (NUM_HEADS + 2 * KV_NUM_HEADS) * HEAD_DIM  # 8192 = Q_DIM + K_DIM + V_DIM
+ATTN_OUTPUT_DIM = NUM_HEADS * HEAD_DIM  # 7168
+Q_DIM = NUM_HEADS * HEAD_DIM  # 7168
+K_DIM = KV_NUM_HEADS * HEAD_DIM  # 512
+V_DIM = KV_NUM_HEADS * HEAD_DIM  # 512
+
+FLASH_ATTN_MODULE = "fastdeploy.model_executor.layers.attention.flash_attn_backend"
+FLASH_MASK_MODULE = "fastdeploy.model_executor.layers.attention.flash_mask_attn_backend"
+
+# Backend registry for parameterized tests: (short name, backend class, module path used for patching)
+BACKENDS = [
+    ("flash_attn", FlashAttentionBackend, FLASH_ATTN_MODULE),
+    ("flash_mask", FlashMaskAttentionBackend, FLASH_MASK_MODULE),
+]
+
+
+# ---------------------------------------------------------------------------
+# Shared helpers
+# ---------------------------------------------------------------------------
+
+
+class DummyFDConfig:
+    """Minimal FDConfig for constructing backend objects."""
 
     def __init__(self):
-        self.cache_config = type("CacheConfig", (), {"block_size": 64})()
+        self.cache_config = type("C", (), {"block_size": BLOCK_SIZE})()
         self.model_config = type(
-            "ModelConfig",
+            "M",
             (),
             {
-                "max_model_len": 2048,
-                "head_dim": 128,
-                "num_hidden_layers": 2,
+                "max_model_len": MAX_SEQ_LEN,
+                "head_dim": HEAD_DIM,
+                "num_hidden_layers": NUM_LAYERS,
                 "causal": True,
                 "start_layer_index": 0,
                 "rope_3d": False,
                 "use_3d_rope": False,
             },
         )()
-        self.scheduler_config = type("SchedulerConfig", (), {"max_num_seqs": 4})()
-        self.graph_opt_config = type(
-            "GraphOptConfig",
-            (),
-            {"cudagraph_capture_sizes": None},
-        )()
+        self.scheduler_config = type("S", (), {"max_num_seqs": BATCH_SIZE})()
+        self.graph_opt_config = type("G", (), {"cudagraph_capture_sizes": None})()
         self.parallel_config = type(
-            "ParallelConfig",
+            "P",
             (),
             {
-                "block_size": 64,
+                "block_size": BLOCK_SIZE,
                 "data_parallel_rank": 0,
                 "pd_disaggregation_mode": "none",
                 "expert_parallel_rank": 0,
             },
         )()
         self.speculative_config = type(
-            "SpeculativeConfig",
+            "Sp",
             (),
             {
                 "method": None,
@@ -127,19 +165,19 @@ def __init__(self):
 
 
 class DummyLayer:
-    """Mimics the Attention layer object with relevant attributes."""
-
-    def __init__(
-        self,
-        layer_id=0,
-        cache_quant_type_str="none",
-        cache_k_scale=None,
-        cache_v_scale=None,
-    ):
+    """Mimics the Attention layer object."""
+
+    def __init__(self, layer_id=0, quant_config=None):
         self.layer_id = layer_id
-        self.cache_quant_type_str = cache_quant_type_str
-        self.cache_k_scale = cache_k_scale
-        self.cache_v_scale = cache_v_scale
+        cfg = quant_config or QUANT_CONFIGS["C16"]
+        self.cache_quant_type_str = cfg.cache_quant_type_str
+        # Static quant types use layer-level scales; dynamic types use caches list
+        if not cfg.has_dynamic_scales and cfg.cache_quant_type_str != "none":
+            self.cache_k_scale = paddle.ones([1], dtype="float32")
+            self.cache_v_scale = paddle.ones([1], dtype="float32")
+        else:
+            self.cache_k_scale = None
+            self.cache_v_scale = None
         self.cache_k_out_scale = None
         self.cache_v_out_scale = None
         self.cache_k_zp = None
@@ -156,46 +194,18 @@ def __init__(
         self.quant_min_bound = 0.0
 
 
-def _make_sentinel(name: str) -> paddle.Tensor:
-    """Create a uniquely identifiable 'sentinel' tensor for tracing through call args."""
-    t = paddle.zeros([1], dtype="float32")
-    t._sentinel_name = name
-    return t
-
-
-def _make_caches_normal(layer_id=0):
-    """Create a caches list for normal (non-block_wise_fp8) mode."""
-    num_entries = 2 * (layer_id + 1)
-    return [_make_sentinel(f"normal_cache_{i}") for i in range(num_entries)]
-
-
-def _make_caches_block_wise_fp8(layer_id=0):
-    """Create a caches list for block_wise_fp8 mode."""
-    num_entries = 4 * (layer_id + 1)
-    return [_make_sentinel(f"bwfp8_cache_{i}") for i in range(num_entries)]
-
-
 class DummyForwardMeta:
-    """Minimal ForwardMeta with lazily-created None attributes.
-
-    Simulates a multi-batch scenario with batch_size=4.
-    In decode-only mode (max_len_val=0): 4 decode tokens, 0 prefill.
-    In prefill mode (max_len_val>0): mixed prefill + decode across 4 batches.
-    """
-
-    BATCH_SIZE = 4
+    """Minimal ForwardMeta for decode-only mode (max_len_val=0)."""
 
     def __init__(self, caches, max_len_val=0):
-        bs = self.BATCH_SIZE
+        bs = BATCH_SIZE
         self.caches = caches
-        # 4 batches: in decode mode each has 0 encoder len, 1 decoder token
         self.seq_lens_encoder = paddle.to_tensor([0] * bs, dtype="int32")
         self.seq_lens_decoder = paddle.to_tensor([1] * bs, dtype="int32")
         self.seq_lens_this_time = paddle.to_tensor([1] * bs, dtype="int32")
-        # total tokens = batch_size (1 per batch in decode)
         self.cu_seqlens_q = paddle.to_tensor(list(range(bs + 1)), dtype="int32")
         self.cu_seqlens_k = paddle.to_tensor(list(range(bs + 1)), dtype="int32")
-        self.rotary_embs = paddle.zeros([bs, 1, 128], dtype="float32")
+        self.rotary_embs = paddle.zeros([bs, 1, HEAD_DIM], dtype="float32")
         self.batch_id_per_token = paddle.to_tensor(list(range(bs)), dtype="int32")
         self.block_tables = paddle.to_tensor([[i] for i in range(bs)], dtype="int32")
         self.decoder_batch_ids = paddle.to_tensor(list(range(bs)), dtype="int32")
@@ -209,7 +219,6 @@ def __init__(self, caches, max_len_val=0):
         self.kv_batch_ids = paddle.to_tensor(list(range(bs)), dtype="int32")
         self.kv_tile_ids_per_batch = paddle.to_tensor([0] * bs, dtype="int32")
         self.kv_num_blocks_x_cpu = paddle.to_tensor([bs], dtype="int32")
-        # max_len_tensor_cpu: [max_enc_len, n_prefill, max_dec_len, max_kv_len]
         self.max_len_tensor_cpu = paddle.to_tensor([0, max_len_val, 10, 10], dtype="int32")
         self.attn_mask = None
         self.attn_mask_offsets = None
@@ -218,647 +227,306 @@ def __init__(self, caches, max_len_val=0):
         self.exist_prefill = False
 
     def __getattr__(self, name):
-        """Mimic ForwardMeta's lazy attribute creation."""
         return None
 
 
 class DummyMetadata:
     """Minimal attention metadata."""
 
-    def __init__(self, num_layers=2):
+    def __init__(self, num_layers=NUM_LAYERS):
         self.kv_signal_data_list = [None] * num_layers
         self._fuse_kernel_compute_dtype = "bf16"
         self._dtype = paddle.bfloat16
         self.max_len_tensor_cpu_decoder = None
 
 
-# ---------------------------------------------------------------------------
-# Helpers to extract cache args from mock calls
-# ---------------------------------------------------------------------------
+def make_qkv_inputs(token_num=BATCH_SIZE, dtype="bfloat16"):
+    """Create a random fused qkv tensor plus the q, k, v slices taken from it.
+
+    Returns:
+        (q, k, v, qkv): qkv is [token_num, QKV_DIM] cast to `dtype`; q is [token_num, NUM_HEADS, HEAD_DIM]
+        and k, v are [token_num, KV_NUM_HEADS, HEAD_DIM], cut from qkv's columns in q | k | v order.
+    """
+    qkv = paddle.randn([token_num, QKV_DIM]).cast(dtype)
+    q = qkv[:, :Q_DIM].reshape([token_num, NUM_HEADS, HEAD_DIM])
+    k = qkv[:, Q_DIM : Q_DIM + K_DIM].reshape([token_num, KV_NUM_HEADS, HEAD_DIM])
+    v = qkv[:, Q_DIM + K_DIM :].reshape([token_num, KV_NUM_HEADS, HEAD_DIM])
+    return q, k, v, qkv
 
 
-def _extract_cache_args_from_gqa_rope(mock_call):
-    """Extract (key_cache, value_cache, cache_k_scales, cache_v_scales)
-    from a gqa_rope_write_cache call.
-    Positional: qkv[0], key_cache[1], value_cache[2], ...,
-    cache_k_quant_scales[19], cache_v_quant_scales[20]"""
-    args = mock_call[0]
-    return args[1], args[2], args[19], args[20]
+def make_caches(quant_config, layer_id=0):
+    """Create a placeholder caches list covering layers 0..layer_id for `quant_config`.
+
+    Returns caches_per_layer * (layer_id + 1) float32 zero tensors of shape [1]: 4 per layer
+    for dynamic C8 (cache_k, cache_v, k_scale, v_scale), 2 per layer for C16 (cache_k, cache_v).
+    """
+    num_entries = quant_config.caches_per_layer * (layer_id + 1)
+    return [paddle.zeros([1], dtype="float32") for _ in range(num_entries)]
 
-def _extract_cache_args_from_append_attention(mock_call):
-    """Extract (key_cache, value_cache, cache_k_scales, cache_v_scales)
-    from an append_attention call.
-    Positional: qkv[0], key_cache[1], value_cache[2], ...,
-    k_quant_scale[23], v_quant_scale[24]"""
-    args = mock_call[0]
-    return args[1], args[2], args[23], args[24]
+
+def create_backend(backend_class, module_path):
+    """Build `backend_class` with its init-time helpers in `module_path` mocked out.
+
+    The patches are active only while the constructor runs; every entered patch is undone afterwards.
+    """
+    from contextlib import ExitStack
+
+    patches = [
+        patch(f"{module_path}.init_rank_and_device_id", return_value=(0, 0)),
+        patch(f"{module_path}.open_shm_and_get_meta_signal", return_value=None),
+        patch(f"{module_path}.init_kv_signal_per_query", return_value=None),
+    ]
+    # FlashAttentionBackend also needs get_sm_version mocked
+    if "flash_attn_backend" in module_path and "flash_mask" not in module_path:
+        patches.append(patch(f"{module_path}.get_sm_version", return_value=90))
+
+    # ExitStack unwinds only the patches that were entered, so a failing patch target
+    # cannot leave earlier patches active for the rest of the test run.
+    with ExitStack() as stack:
+        for p in patches:
+            stack.enter_context(p)
+        return backend_class(DummyFDConfig(), kv_num_heads=KV_NUM_HEADS, num_heads=NUM_HEADS, head_dim=HEAD_DIM)
+
+
+def _run_forward_mocked(backend, module_path, quant_config, layer_id=0, return_tensor=None, qkv_inputs=None):
+    """Run forward_mixed on decode-only dummy inputs with external ops patched; return its result.
+
+    Args:
+        backend: Backend instance; its attention_metadata is replaced with a DummyMetadata.
+        module_path: Module in which append_attention and get_block_shape_and_split_kv_block are patched.
+        quant_config: QuantConfig used to build the DummyLayer and the caches list.
+        layer_id: Layer ID for the dummy layer.
+        return_tensor: Patched append_attention result; defaults to bf16 zeros [BATCH_SIZE, ATTN_OUTPUT_DIM].
+        qkv_inputs: Optional (q, k, v, qkv) tuple. Generated if not provided.
+    """
+    backend.attention_metadata = DummyMetadata()
+    layer = DummyLayer(layer_id=layer_id, quant_config=quant_config)
+    caches = make_caches(quant_config, layer_id=layer_id)
+    fm = DummyForwardMeta(caches=caches, max_len_val=0)
+
+    if qkv_inputs is None:
+        q, k, v, qkv = make_qkv_inputs()
+    else:
+        q, k, v, qkv = qkv_inputs
+
+    if return_tensor is None:
+        return_tensor = paddle.zeros([BATCH_SIZE, ATTN_OUTPUT_DIM], dtype="bfloat16")
+
+    with patch(f"{module_path}.append_attention", return_value=return_tensor):
+        with patch(f"{module_path}.get_block_shape_and_split_kv_block"):
+            result = backend.forward_mixed(
+                q=q,
+                k=k,
+                v=v,
+                qkv=qkv,
+                compressed_kv=None,
+                k_pe=None,
+                layer=layer,
+                forward_meta=fm,
+            )
+    return result
 
 
 # ---------------------------------------------------------------------------
-# FlashAttentionBackend tests
+# Part 1: Mock-based smoke tests (no GPU required)
 # ---------------------------------------------------------------------------
 
-FLASH_ATTN_MODULE = "fastdeploy.model_executor.layers.attention.flash_attn_backend"
-
 
 @unittest.skipIf(_IMPORT_ERROR is not None, f"Cannot import backends: {_IMPORT_ERROR}")
-class TestFlashAttnBackendCacheRouting(unittest.TestCase):
-    """Test that FlashAttentionBackend.forward_mixed selects the correct
-    cache tensors based on cache_quant_type_str."""
-
-    def _create_backend(self):
-        with patch(f"{FLASH_ATTN_MODULE}.init_rank_and_device_id", return_value=(0, 0)):
-            with patch(f"{FLASH_ATTN_MODULE}.get_sm_version", return_value=90):
-                with patch(f"{FLASH_ATTN_MODULE}.open_shm_and_get_meta_signal", return_value=None):
-                    with patch(f"{FLASH_ATTN_MODULE}.init_kv_signal_per_query", return_value=None):
-                        backend = FlashAttentionBackend(DummyFDConfig(), kv_num_heads=4, num_heads=56, head_dim=128)
-        return backend
-
-    @patch(f"{FLASH_ATTN_MODULE}.append_attention")
-    @patch(f"{FLASH_ATTN_MODULE}.get_block_shape_and_split_kv_block")
-    def test_normal_quant_uses_2x_indexing_decode_only(self, mock_split_kv, mock_append_attn):
-        """cache_int8: cache_k=caches[2*id], scales from layer attrs."""
-        backend = self._create_backend()
-        backend.attention_metadata = DummyMetadata()
+class TestBackendForwardSmoke(unittest.TestCase):
+    """Smoke test: forward_mixed runs without error for each backend x quant config."""
 
-        layer_ks = _make_sentinel("layer_ks")
-        layer_vs = _make_sentinel("layer_vs")
-        layer = DummyLayer(
-            layer_id=0, cache_quant_type_str="cache_int8", cache_k_scale=layer_ks, cache_v_scale=layer_vs
-        )
-        caches = _make_caches_normal(layer_id=0)
-        fm = DummyForwardMeta(caches=caches, max_len_val=0)
-        # batch_size=4, total_tokens=4 in decode
-        bs = DummyForwardMeta.BATCH_SIZE
-        mock_append_attn.return_value = paddle.zeros([bs, 7168], dtype="bfloat16")
-
-        backend.forward_mixed(
-            q=None,
-            k=None,
-            v=None,
-            qkv=paddle.zeros([bs, 7680], dtype="bfloat16"),
-            compressed_kv=None,
-            k_pe=None,
-            layer=layer,
-            forward_meta=fm,
-        )
+    def _smoke_test(self, backend_class, module_path, quant_config_name):
+        config = QUANT_CONFIGS[quant_config_name]
+        backend = create_backend(backend_class, module_path)
+        # Should not raise
+        result = _run_forward_mocked(backend, module_path, config)
+        self.assertIsNotNone(result)
 
-        kc, vc, ks, vs = _extract_cache_args_from_append_attention(mock_append_attn.call_args)
-        self.assertIs(kc, caches[0])
-        self.assertIs(vc, caches[1])
-        self.assertIs(ks, layer_ks)
-        self.assertIs(vs, layer_vs)
-
-    @patch(f"{FLASH_ATTN_MODULE}.append_attention")
-    @patch(f"{FLASH_ATTN_MODULE}.get_block_shape_and_split_kv_block")
-    def test_block_wise_fp8_uses_4x_indexing_decode_only(self, mock_split_kv, mock_append_attn):
-        """block_wise_fp8: cache_k=caches[4*id], scales=caches[4*id+2/3]."""
-        backend = self._create_backend()
-        backend.attention_metadata = DummyMetadata()
+    def test_flash_attn_c8_dynamic(self):
+        self._smoke_test(FlashAttentionBackend, FLASH_ATTN_MODULE, "C8_dynamic")
 
-        layer = DummyLayer(layer_id=0, cache_quant_type_str="block_wise_fp8")
-        caches = _make_caches_block_wise_fp8(layer_id=0)
-        fm = DummyForwardMeta(caches=caches, max_len_val=0)
-        bs = DummyForwardMeta.BATCH_SIZE
-        mock_append_attn.return_value = paddle.zeros([bs, 7168], dtype="bfloat16")
-
-        backend.forward_mixed(
-            q=None,
-            k=None,
-            v=None,
-            qkv=paddle.zeros([bs, 7680], dtype="bfloat16"),
-            compressed_kv=None,
-            k_pe=None,
-            layer=layer,
-            forward_meta=fm,
-        )
+    def test_flash_attn_c16(self):
+        self._smoke_test(FlashAttentionBackend, FLASH_ATTN_MODULE, "C16")
 
-        kc, vc, ks, vs = _extract_cache_args_from_append_attention(mock_append_attn.call_args)
-        self.assertIs(kc, caches[0])
-        self.assertIs(vc, caches[1])
-        self.assertIs(ks, caches[2])
-        self.assertIs(vs, caches[3])
-
-    @patch(f"{FLASH_ATTN_MODULE}.append_attention")
-    @patch(f"{FLASH_ATTN_MODULE}.get_block_shape_and_split_kv_block")
-    def test_block_wise_fp8_layer_id_1(self, mock_split_kv, mock_append_attn):
-        """block_wise_fp8 with layer_id=1: indices 4,5,6,7."""
-        backend = self._create_backend()
-        backend.attention_metadata = DummyMetadata()
+    def test_flash_mask_attn_c8_dynamic(self):
+        self._smoke_test(FlashMaskAttentionBackend, FLASH_MASK_MODULE, "C8_dynamic")
 
-        layer = DummyLayer(layer_id=1, cache_quant_type_str="block_wise_fp8")
-        caches = _make_caches_block_wise_fp8(layer_id=1)
-        fm = DummyForwardMeta(caches=caches, max_len_val=0)
-        bs = DummyForwardMeta.BATCH_SIZE
-        mock_append_attn.return_value = paddle.zeros([bs, 7168], dtype="bfloat16")
-
-        backend.forward_mixed(
-            q=None,
-            k=None,
-            v=None,
-            qkv=paddle.zeros([bs, 7680], dtype="bfloat16"),
-            compressed_kv=None,
-            k_pe=None,
-            layer=layer,
-            forward_meta=fm,
-        )
+    def test_flash_mask_attn_c16(self):
+        self._smoke_test(FlashMaskAttentionBackend, FLASH_MASK_MODULE, "C16")
 
-        kc, vc, ks, vs = _extract_cache_args_from_append_attention(mock_append_attn.call_args)
-        self.assertIs(kc, caches[4])
-        self.assertIs(vc, caches[5])
-        self.assertIs(ks, caches[6])
-        self.assertIs(vs, caches[7])
-
-    @patch(f"{FLASH_ATTN_MODULE}.append_attention")
-    @patch(f"{FLASH_ATTN_MODULE}.get_block_shape_and_split_kv_block")
-    def test_normal_quant_layer_id_1(self, mock_split_kv, mock_append_attn):
-        """Normal quant with layer_id=1: indices 2,3."""
-        backend = self._create_backend()
-        backend.attention_metadata = DummyMetadata()
 
-        layer_ks = _make_sentinel("ks_l1")
-        layer_vs = _make_sentinel("vs_l1")
-        layer = DummyLayer(
-            layer_id=1, cache_quant_type_str="cache_int8", cache_k_scale=layer_ks, cache_v_scale=layer_vs
-        )
-        caches = _make_caches_normal(layer_id=1)
-        fm = DummyForwardMeta(caches=caches, max_len_val=0)
-        bs = DummyForwardMeta.BATCH_SIZE
-        mock_append_attn.return_value = paddle.zeros([bs, 7168], dtype="bfloat16")
-
-        backend.forward_mixed(
-            q=None,
-            k=None,
-            v=None,
-            qkv=paddle.zeros([bs, 7680], dtype="bfloat16"),
-            compressed_kv=None,
-            k_pe=None,
-            layer=layer,
-            forward_meta=fm,
-        )
+# ---------------------------------------------------------------------------
+# Part 2: Mock-based C8 vs C16 diff tests (no GPU required)
+# ---------------------------------------------------------------------------
 
-        kc, vc, ks, vs = _extract_cache_args_from_append_attention(mock_append_attn.call_args)
-        self.assertIs(kc, caches[2])
-        self.assertIs(vc, caches[3])
-        self.assertIs(ks, layer_ks)
-        self.assertIs(vs, layer_vs)
-
-    @patch(f"{FLASH_ATTN_MODULE}.flash_attn_func")
-    @patch(f"{FLASH_ATTN_MODULE}.append_attention")
-    @patch(f"{FLASH_ATTN_MODULE}.pre_cache_len_concat")
-    @patch(f"{FLASH_ATTN_MODULE}.get_block_shape_and_split_kv_block")
-    @patch(f"{FLASH_ATTN_MODULE}.gqa_rope_write_cache")
-    def test_block_wise_fp8_prefill_path(
-        self,
-        mock_gqa_rope,
-        mock_split_kv,
-        mock_pre_cache,
-        mock_append_attn,
-        mock_flash_attn,
-    ):
-        """Prefill path: both gqa_rope_write_cache and append_attention
-        receive block_wise_fp8 caches. 4 batches, 5 tokens each = 20 total."""
-        backend = self._create_backend()
-        backend.attention_metadata = DummyMetadata()
 
-        layer = DummyLayer(layer_id=0, cache_quant_type_str="block_wise_fp8")
-        caches = _make_caches_block_wise_fp8(layer_id=0)
-        bs = DummyForwardMeta.BATCH_SIZE
-        total_tokens = bs * 5  # 20 tokens total (4 batches * 5 tokens)
-        fm = DummyForwardMeta(caches=caches, max_len_val=5)
-
-        mock_pre_cache.return_value = (
-            paddle.to_tensor([0, 5, 10, 15, 20], dtype="int32"),  # cu_seqlens_k
-            paddle.to_tensor(list(range(bs)), dtype="int32"),  # pre_cache_batch_ids
-            paddle.to_tensor([0] * bs, dtype="int32"),  # pre_cache_tile_ids
-            paddle.to_tensor([bs], dtype="int32"),  # pre_cache_num_blocks
-            paddle.to_tensor([total_tokens], dtype="int32"),  # kv_token_num
-        )
-        mock_gqa_rope.return_value = (
-            paddle.zeros([total_tokens, 56, 128], dtype="bfloat16"),
-            paddle.zeros([total_tokens, 4, 128], dtype="bfloat16"),
-            paddle.zeros([total_tokens, 4, 128], dtype="bfloat16"),
-            None,
-        )
-        mock_flash_attn.return_value = (
-            paddle.zeros([total_tokens, 56, 128], dtype="bfloat16"),
-            None,
+@unittest.skipIf(_IMPORT_ERROR is not None, f"Cannot import backends: {_IMPORT_ERROR}")
+class TestBackendC8VsC16OutputDiff(unittest.TestCase):
+    """Diff test: C8 dynamic and C16 produce identical outputs when external
+    ops return the same data (validates the forward path is consistent)."""
+
+    def _diff_test(self, backend_class, module_path):
+        # Use the same known tensor as the mock return value for both configs
+        known_output = paddle.randn([BATCH_SIZE, ATTN_OUTPUT_DIM]).cast("bfloat16")
+        # Use the same qkv inputs for both configs
+        shared_qkv = make_qkv_inputs()
+
+        backend_c8 = create_backend(backend_class, module_path)
+        result_c8 = _run_forward_mocked(
+            backend_c8,
+            module_path,
+            QUANT_CONFIGS["C8_dynamic"],
+            return_tensor=known_output.clone(),
+            qkv_inputs=shared_qkv,
         )
-        mock_append_attn.return_value = paddle.zeros([total_tokens, 7168], dtype="bfloat16")
 
-        backend.forward_mixed(
-            q=None,
-            k=None,
-            v=None,
-            qkv=paddle.zeros([total_tokens, 7680], dtype="bfloat16"),
-            compressed_kv=None,
-            k_pe=None,
-            layer=layer,
-            forward_meta=fm,
+        backend_c16 = create_backend(backend_class, module_path)
+        result_c16 = _run_forward_mocked(
+            backend_c16,
+            module_path,
+            QUANT_CONFIGS["C16"],
+            return_tensor=known_output.clone(),
+            qkv_inputs=shared_qkv,
         )
 
-        # gqa_rope_write_cache should get caches[0..3]
-        kc, vc, ks, vs = _extract_cache_args_from_gqa_rope(mock_gqa_rope.call_args)
-        self.assertIs(kc, caches[0])
-        self.assertIs(vc, caches[1])
-        self.assertIs(ks, caches[2])
-        self.assertIs(vs, caches[3])
-
-        # append_attention should also get caches[0..3]
-        kc2, vc2, ks2, vs2 = _extract_cache_args_from_append_attention(mock_append_attn.call_args)
-        self.assertIs(kc2, caches[0])
-        self.assertIs(vc2, caches[1])
-        self.assertIs(ks2, caches[2])
-        self.assertIs(vs2, caches[3])
-
-    @patch(f"{FLASH_ATTN_MODULE}.append_attention")
-    @patch(f"{FLASH_ATTN_MODULE}.get_block_shape_and_split_kv_block")
-    def test_none_quant_type_defaults_to_2x(self, mock_split_kv, mock_append_attn):
-        """cache_quant_type_str='none': 2x indexing, None scales."""
-        backend = self._create_backend()
-        backend.attention_metadata = DummyMetadata()
-
-        layer = DummyLayer(layer_id=0, cache_quant_type_str="none")
-        caches = _make_caches_normal(layer_id=0)
-        fm = DummyForwardMeta(caches=caches, max_len_val=0)
-        bs = DummyForwardMeta.BATCH_SIZE
-        mock_append_attn.return_value = paddle.zeros([bs, 7168], dtype="bfloat16")
-
-        backend.forward_mixed(
-            q=None,
-            k=None,
-            v=None,
-            qkv=paddle.zeros([bs, 7680], dtype="bfloat16"),
-            compressed_kv=None,
-            k_pe=None,
-            layer=layer,
-            forward_meta=fm,
+        np.testing.assert_array_equal(
+            result_c8.numpy(),
+            result_c16.numpy(),
+            err_msg=f"C8 dynamic and C16 outputs differ for {backend_class.__name__}",
         )
 
-        kc, vc, ks, vs = _extract_cache_args_from_append_attention(mock_append_attn.call_args)
-        self.assertIs(kc, caches[0])
-        self.assertIs(vc, caches[1])
-        self.assertIsNone(ks)
-        self.assertIsNone(vs)
-
-    @patch(f"{FLASH_ATTN_MODULE}.append_attention")
-    @patch(f"{FLASH_ATTN_MODULE}.get_block_shape_and_split_kv_block")
-    def test_cache_fp8_uses_2x_indexing(self, mock_split_kv, mock_append_attn):
-        """cache_fp8 (static): 2x indexing, scales from layer attrs."""
-        backend = self._create_backend()
-        backend.attention_metadata = DummyMetadata()
-
-        layer_ks = _make_sentinel("fp8_ks")
-        layer_vs = _make_sentinel("fp8_vs")
-        layer = DummyLayer(
-            layer_id=0, cache_quant_type_str="cache_fp8", cache_k_scale=layer_ks, cache_v_scale=layer_vs
-        )
-        caches = _make_caches_normal(layer_id=0)
-        fm = DummyForwardMeta(caches=caches, max_len_val=0)
-        bs = DummyForwardMeta.BATCH_SIZE
-        mock_append_attn.return_value = paddle.zeros([bs, 7168], dtype="bfloat16")
-
-        backend.forward_mixed(
-            q=None,
-            k=None,
-            v=None,
-            qkv=paddle.zeros([bs, 7680], dtype="bfloat16"),
-            compressed_kv=None,
-            k_pe=None,
-            layer=layer,
-            forward_meta=fm,
-        )
+    def test_flash_attn_c8_vs_c16(self):
+        self._diff_test(FlashAttentionBackend, FLASH_ATTN_MODULE)
 
-        kc, vc, ks, vs = _extract_cache_args_from_append_attention(mock_append_attn.call_args)
-        self.assertIs(kc, caches[0])
-        self.assertIs(vc, caches[1])
-        self.assertIs(ks, layer_ks)
-        self.assertIs(vs, layer_vs)
+    def test_flash_mask_attn_c8_vs_c16(self):
+        self._diff_test(FlashMaskAttentionBackend, FLASH_MASK_MODULE)
 
 
 # ---------------------------------------------------------------------------
-# FlashMaskAttentionBackend tests
+# Part 3: GPU-based tests (require real GPU)
 # ---------------------------------------------------------------------------
 
-FLASH_MASK_MODULE = "fastdeploy.model_executor.layers.attention.flash_mask_attn_backend"
+_HAS_GPU = paddle.is_compiled_with_cuda() and paddle.device.cuda.device_count() > 0
 
 
-@unittest.skipIf(_IMPORT_ERROR is not None, f"Cannot import backends: {_IMPORT_ERROR}")
-class TestFlashMaskAttnBackendCacheRouting(unittest.TestCase):
-    """Test that FlashMaskAttentionBackend.forward_mixed selects the correct
-    cache tensors based on cache_quant_type_str."""
-
-    def _create_backend(self):
-        with patch(f"{FLASH_MASK_MODULE}.init_rank_and_device_id", return_value=(0, 0)):
-            with patch(f"{FLASH_MASK_MODULE}.open_shm_and_get_meta_signal", return_value=None):
-                with patch(f"{FLASH_MASK_MODULE}.init_kv_signal_per_query", return_value=None):
-                    backend = FlashMaskAttentionBackend(DummyFDConfig(), kv_num_heads=4, num_heads=56, head_dim=128)
-        return backend
-
-    @patch(f"{FLASH_MASK_MODULE}.append_attention")
-    @patch(f"{FLASH_MASK_MODULE}.get_block_shape_and_split_kv_block")
-    def test_normal_quant_uses_2x_indexing_decode_only(self, mock_split_kv, mock_append_attn):
-        """Non block_wise_fp8: caches[2*layer_id] indexing."""
-        backend = self._create_backend()
-        backend.attention_metadata = DummyMetadata()
+def _make_gpu_caches(quant_config, max_block_num=16):
+    """Create real GPU cache tensors following the paged KV cache layout.
 
-        layer_ks = _make_sentinel("mask_ks")
-        layer_vs = _make_sentinel("mask_vs")
-        layer = DummyLayer(
-            layer_id=0, cache_quant_type_str="cache_int8", cache_k_scale=layer_ks, cache_v_scale=layer_vs
-        )
-        caches = _make_caches_normal(layer_id=0)
-        fm = DummyForwardMeta(caches=caches, max_len_val=0)
-        bs = DummyForwardMeta.BATCH_SIZE
-        mock_append_attn.return_value = paddle.zeros([bs, 7168], dtype="bfloat16")
-
-        backend.forward_mixed(
-            q=None,
-            k=None,
-            v=None,
-            qkv=paddle.zeros([bs, 7680], dtype="bfloat16"),
-            compressed_kv=None,
-            k_pe=None,
-            layer=layer,
-            forward_meta=fm,
-        )
+    Cache shape: (max_block_num, kv_num_heads, block_size, head_dim)
+    Scale shape: (max_block_num, kv_num_heads, block_size)
+    """
+    cache_shape = [max_block_num, KV_NUM_HEADS, BLOCK_SIZE, HEAD_DIM]
+    scale_shape = [max_block_num, KV_NUM_HEADS, BLOCK_SIZE]
 
-        kc, vc, ks, vs = _extract_cache_args_from_append_attention(mock_append_attn.call_args)
-        self.assertIs(kc, caches[0])
-        self.assertIs(vc, caches[1])
-        self.assertIs(ks, layer_ks)
-        self.assertIs(vs, layer_vs)
-
-    @patch(f"{FLASH_MASK_MODULE}.append_attention")
-    @patch(f"{FLASH_MASK_MODULE}.get_block_shape_and_split_kv_block")
-    def test_block_wise_fp8_uses_4x_indexing_decode_only(self, mock_split_kv, mock_append_attn):
-        """block_wise_fp8: caches[4*layer_id] indexing."""
-        backend = self._create_backend()
-        backend.attention_metadata = DummyMetadata()
+    cache_k = paddle.zeros(cache_shape, dtype=quant_config.cache_dtype)
+    cache_v = paddle.zeros(cache_shape, dtype=quant_config.cache_dtype)
 
-        layer = DummyLayer(layer_id=0, cache_quant_type_str="block_wise_fp8")
-        caches = _make_caches_block_wise_fp8(layer_id=0)
-        fm = DummyForwardMeta(caches=caches, max_len_val=0)
-        bs = DummyForwardMeta.BATCH_SIZE
-        mock_append_attn.return_value = paddle.zeros([bs, 7168], dtype="bfloat16")
-
-        backend.forward_mixed(
-            q=None,
-            k=None,
-            v=None,
-            qkv=paddle.zeros([bs, 7680], dtype="bfloat16"),
-            compressed_kv=None,
-            k_pe=None,
-            layer=layer,
-            forward_meta=fm,
-        )
+    if quant_config.has_dynamic_scales:
+        cache_k_scale = paddle.zeros(scale_shape, dtype="bfloat16")
+        cache_v_scale = paddle.zeros(scale_shape, dtype="bfloat16")
+        return [cache_k, cache_v, cache_k_scale, cache_v_scale]
+    else:
+        return [cache_k, cache_v]
 
-        kc, vc, ks, vs = _extract_cache_args_from_append_attention(mock_append_attn.call_args)
-        self.assertIs(kc, caches[0])
-        self.assertIs(vc, caches[1])
-        self.assertIs(ks, caches[2])
-        self.assertIs(vs, caches[3])
-
-    @patch(f"{FLASH_MASK_MODULE}.append_attention")
-    @patch(f"{FLASH_MASK_MODULE}.get_block_shape_and_split_kv_block")
-    def test_block_wise_fp8_layer_id_1(self, mock_split_kv, mock_append_attn):
-        """block_wise_fp8 with layer_id=1: indices 4,5,6,7."""
-        backend = self._create_backend()
-        backend.attention_metadata = DummyMetadata()
 
-        layer = DummyLayer(layer_id=1, cache_quant_type_str="block_wise_fp8")
-        caches = _make_caches_block_wise_fp8(layer_id=1)
-        fm = DummyForwardMeta(caches=caches, max_len_val=0)
-        bs = DummyForwardMeta.BATCH_SIZE
-        mock_append_attn.return_value = paddle.zeros([bs, 7168], dtype="bfloat16")
-
-        backend.forward_mixed(
-            q=None,
-            k=None,
-            v=None,
-            qkv=paddle.zeros([bs, 7680], dtype="bfloat16"),
-            compressed_kv=None,
-            k_pe=None,
-            layer=layer,
-            forward_meta=fm,
-        )
+def _make_gpu_forward_meta(caches, seq_len=1):
+    """Create a ForwardMeta suitable for real GPU decode-only forward."""
+    bs = BATCH_SIZE
+    block_num_per_seq = math.ceil(seq_len / BLOCK_SIZE) or 1
 
-        kc, vc, ks, vs = _extract_cache_args_from_append_attention(mock_append_attn.call_args)
-        self.assertIs(kc, caches[4])
-        self.assertIs(vc, caches[5])
-        self.assertIs(ks, caches[6])
-        self.assertIs(vs, caches[7])
-
-    @patch(f"{FLASH_MASK_MODULE}.flash_mask_attention")
-    @patch(f"{FLASH_MASK_MODULE}.append_attention")
-    @patch(f"{FLASH_MASK_MODULE}.pre_cache_len_concat")
-    @patch(f"{FLASH_MASK_MODULE}.get_block_shape_and_split_kv_block")
-    @patch(f"{FLASH_MASK_MODULE}.gqa_rope_write_cache")
-    def test_block_wise_fp8_prefill_path(
-        self,
-        mock_gqa_rope,
-        mock_split_kv,
-        mock_pre_cache,
-        mock_append_attn,
-        mock_flash_mask,
-    ):
-        """Prefill: gqa_rope_write_cache and append_attention both get
-        block_wise_fp8 caches. 4 batches, 5 tokens each = 20 total."""
-        backend = self._create_backend()
-        backend.attention_metadata = DummyMetadata()
+    block_tables = paddle.zeros([bs, block_num_per_seq], dtype="int32")
+    idx = 0
+    for i in range(bs):
+        for j in range(block_num_per_seq):
+            block_tables[i, j] = idx
+            idx += 1
 
-        layer = DummyLayer(layer_id=0, cache_quant_type_str="block_wise_fp8")
-        caches = _make_caches_block_wise_fp8(layer_id=0)
-        bs = DummyForwardMeta.BATCH_SIZE
-        total_tokens = bs * 5  # 20 tokens total (4 batches * 5 tokens)
-        fm = DummyForwardMeta(caches=caches, max_len_val=5)
-
-        mock_pre_cache.return_value = (
-            paddle.to_tensor([0, 5, 10, 15, 20], dtype="int32"),  # cu_seqlens_k
-            paddle.to_tensor(list(range(bs)), dtype="int32"),  # pre_cache_batch_ids
-            paddle.to_tensor([0] * bs, dtype="int32"),  # pre_cache_tile_ids
-            paddle.to_tensor([bs], dtype="int32"),  # pre_cache_num_blocks
-            paddle.to_tensor([total_tokens], dtype="int32"),  # kv_token_num
-        )
-        mock_gqa_rope.return_value = (
-            paddle.zeros([total_tokens, 56, 128], dtype="bfloat16"),
-            paddle.zeros([total_tokens, 4, 128], dtype="bfloat16"),
-            paddle.zeros([total_tokens, 4, 128], dtype="bfloat16"),
-            None,
-        )
-        mock_flash_mask.return_value = None
-        mock_append_attn.return_value = paddle.zeros([total_tokens, 7168], dtype="bfloat16")
-
-        backend.forward_mixed(
-            q=None,
-            k=None,
-            v=None,
-            qkv=paddle.zeros([total_tokens, 7680], dtype="bfloat16"),
-            compressed_kv=None,
-            k_pe=None,
-            layer=layer,
-            forward_meta=fm,
-        )
+    fm = DummyForwardMeta(caches=caches, max_len_val=0)
+    fm.block_tables = block_tables
+    return fm
 
-        kc, vc, ks, vs = _extract_cache_args_from_gqa_rope(mock_gqa_rope.call_args)
-        self.assertIs(kc, caches[0])
-        self.assertIs(vc, caches[1])
-        self.assertIs(ks, caches[2])
-        self.assertIs(vs, caches[3])
-
-        kc2, vc2, ks2, vs2 = _extract_cache_args_from_append_attention(mock_append_attn.call_args)
-        self.assertIs(kc2, caches[0])
-        self.assertIs(vc2, caches[1])
-        self.assertIs(ks2, caches[2])
-        self.assertIs(vs2, caches[3])
-
-    @patch(f"{FLASH_MASK_MODULE}.append_attention")
-    @patch(f"{FLASH_MASK_MODULE}.get_block_shape_and_split_kv_block")
-    def test_none_quant_type_defaults_to_2x(self, mock_split_kv, mock_append_attn):
-        """cache_quant_type_str='none': 2x indexing."""
-        backend = self._create_backend()
+
+@unittest.skipIf(not _HAS_GPU, "No GPU available")
+@unittest.skipIf(_IMPORT_ERROR is not None, f"Cannot import backends: {_IMPORT_ERROR}")
+class TestBackendForwardGPU(unittest.TestCase):
+    """GPU-based tests: real forward_mixed calls on GPU hardware."""
+
+    def _gpu_smoke_test(self, backend_class, module_path, quant_config_name):
+        """Test that forward_mixed runs on GPU without error."""
+        config = QUANT_CONFIGS[quant_config_name]
+        backend = create_backend(backend_class, module_path)
         backend.attention_metadata = DummyMetadata()
 
-        layer = DummyLayer(layer_id=0, cache_quant_type_str="none")
-        caches = _make_caches_normal(layer_id=0)
-        fm = DummyForwardMeta(caches=caches, max_len_val=0)
-        bs = DummyForwardMeta.BATCH_SIZE
-        mock_append_attn.return_value = paddle.zeros([bs, 7168], dtype="bfloat16")
-
-        backend.forward_mixed(
-            q=None,
-            k=None,
-            v=None,
-            qkv=paddle.zeros([bs, 7680], dtype="bfloat16"),
+        max_block_num = BATCH_SIZE
+        caches = _make_gpu_caches(config, max_block_num=max_block_num)
+        layer = DummyLayer(layer_id=0, quant_config=config)
+        fm = _make_gpu_forward_meta(caches, seq_len=1)
+        q, k, v, qkv = make_qkv_inputs()
+
+        result = backend.forward_mixed(
+            q=q,
+            k=k,
+            v=v,
+            qkv=qkv,
             compressed_kv=None,
             k_pe=None,
             layer=layer,
             forward_meta=fm,
         )
+        self.assertEqual(result.shape, [BATCH_SIZE, ATTN_OUTPUT_DIM])
+
+    def _gpu_diff_test(self, backend_class, module_path):
+        """Compare C8 dynamic vs C16 outputs on GPU within a numeric tolerance."""
+        max_block_num = BATCH_SIZE
+        q, k, v, qkv = make_qkv_inputs()
+
+        results = {}
+        for config_name in ["C8_dynamic", "C16"]:
+            config = QUANT_CONFIGS[config_name]
+            backend = create_backend(backend_class, module_path)
+            backend.attention_metadata = DummyMetadata()
+
+            caches = _make_gpu_caches(config, max_block_num=max_block_num)
+            layer = DummyLayer(layer_id=0, quant_config=config)
+            fm = _make_gpu_forward_meta(caches, seq_len=1)
+
+            results[config_name] = backend.forward_mixed(
+                q=q.clone(),
+                k=k.clone(),
+                v=v.clone(),
+                qkv=qkv.clone(),
+                compressed_kv=None,
+                k_pe=None,
+                layer=layer,
+                forward_meta=fm,
+            )
+
+        np.testing.assert_allclose(
+            results["C8_dynamic"].cast("float32").numpy(),
+            results["C16"].cast("float32").numpy(),
+            rtol=0.1,
+            atol=0.1,
+            err_msg=f"C8 dynamic vs C16 GPU output diff too large for {backend_class.__name__}",
+        )
 
-        kc, vc, ks, vs = _extract_cache_args_from_append_attention(mock_append_attn.call_args)
-        self.assertIs(kc, caches[0])
-        self.assertIs(vc, caches[1])
-        self.assertIsNone(ks)
-        self.assertIsNone(vs)
-
-
-# ---------------------------------------------------------------------------
-# Softmax -INFINITY fix tests
-# ---------------------------------------------------------------------------
-
-
-class TestSoftmaxInfinityHandling(unittest.TestCase):
-    """Test the softmax numerical fix for -INFINITY handling.
-
-    The fix in softmax.hpp:
-    1. scale_apply_exp2: when max == -INFINITY, max_scaled = 0 (not NaN)
-    2. Softmax::rescale: when both prev/cur max are -INFINITY, scale = 1.0
-    """
-
-    def test_scale_apply_exp2_normal(self):
-        """Normal case: max is finite."""
-        scale = 1.0 / np.log(2)
-        max_val = 2.0
-        tensor_val = 3.0
-        result = 2 ** (tensor_val * scale - max_val * scale)
-        self.assertTrue(np.isfinite(result))
-
-    def test_scale_apply_exp2_neg_inf_max(self):
-        """When max == -inf, fix sets max_scaled=0 avoiding NaN."""
-        scale = 1.4426950408889634  # 1/ln(2)
-        max_val = float("-inf")
-
-        # Fixed: max_scaled = 0
-        max_scaled_fixed = 0.0 if max_val == float("-inf") else max_val * scale
-        self.assertEqual(max_scaled_fixed, 0.0)
-
-        # Broken: tensor=-inf, max=-inf => -inf - (-inf) = NaN
-        tensor_val = float("-inf")
-        broken_result = 2 ** (tensor_val * scale - max_val * scale)
-        self.assertTrue(np.isnan(broken_result))
-
-        # Fixed: exp2(-inf - 0) = 0
-        fixed_result = 2 ** (tensor_val * scale - max_scaled_fixed)
-        self.assertEqual(fixed_result, 0.0)
-
-    def test_rescale_both_neg_inf(self):
-        """Both prev/cur max -inf => scale=1.0 (not NaN)."""
-        scale_log2 = 1.4426950408889634
-        prev = float("-inf")
-        cur = float("-inf")
-
-        # Fixed
-        if prev == float("-inf") and cur == float("-inf"):
-            fixed = 1.0
-        else:
-            fixed = 2 ** ((prev - cur) * scale_log2)
-        self.assertEqual(fixed, 1.0)
-
-        # Broken: -inf - (-inf) = NaN
-        broken = 2 ** ((prev - cur) * scale_log2)
-        self.assertTrue(np.isnan(broken))
-
-    def test_rescale_prev_neg_inf_cur_finite(self):
-        """prev=-inf, cur=finite => scale=0 (first tile case)."""
-        scale = 2 ** ((float("-inf") - 2.0) * 1.4426950408889634)
-        self.assertEqual(scale, 0.0)
-
-    def test_rescale_both_finite(self):
-        """Normal rescaling with finite values."""
-        scale_log2 = 1.4426950408889634
-        scale = 2 ** ((3.0 - 4.0) * scale_log2)
-        expected = 2 ** (-1.0 * scale_log2)
-        self.assertAlmostEqual(scale, expected, places=6)
-        self.assertTrue(0 < scale < 1)
+    def test_flash_attn_c8_dynamic_gpu(self):
+        self._gpu_smoke_test(FlashAttentionBackend, FLASH_ATTN_MODULE, "C8_dynamic")
 
-    def test_row_sum_preservation_with_inf_fix(self):
-        """row_sum * 1.0 preserved; row_sum * NaN corrupted."""
-        row_sum = 0.5
-        self.assertEqual(row_sum * 1.0, 0.5)
-        self.assertTrue(np.isnan(row_sum * float("nan")))
+    def test_flash_attn_c16_gpu(self):
+        self._gpu_smoke_test(FlashAttentionBackend, FLASH_ATTN_MODULE, "C16")
 
+    def test_flash_mask_attn_c8_dynamic_gpu(self):
+        self._gpu_smoke_test(FlashMaskAttentionBackend, FLASH_MASK_MODULE, "C8_dynamic")
 
-# ---------------------------------------------------------------------------
-# CUDA kernel config tests
-# ---------------------------------------------------------------------------
+    def test_flash_mask_attn_c16_gpu(self):
+        self._gpu_smoke_test(FlashMaskAttentionBackend, FLASH_MASK_MODULE, "C16")
 
+    def test_flash_attn_c8_vs_c16_gpu(self):
+        self._gpu_diff_test(FlashAttentionBackend, FLASH_ATTN_MODULE)
 
-class TestAppendCacheKVC8KernelConfig(unittest.TestCase):
-    """Test kernel template parameter mapping for append_cache_kv_c8."""
-
-    def test_quant_type_to_kernel_params(self):
-        configs = {
-            "cache_int8": {"IS_FP8": False, "dynamic_quant": False},
-            "cache_fp8": {"IS_FP8": True, "dynamic_quant": False},
-            "block_wise_fp8": {"IS_FP8": True, "dynamic_quant": True},
-        }
-        self.assertFalse(configs["cache_int8"]["IS_FP8"])
-        self.assertFalse(configs["cache_int8"]["dynamic_quant"])
-        self.assertTrue(configs["cache_fp8"]["IS_FP8"])
-        self.assertFalse(configs["cache_fp8"]["dynamic_quant"])
-        self.assertTrue(configs["block_wise_fp8"]["IS_FP8"])
-        self.assertTrue(configs["block_wise_fp8"]["dynamic_quant"])
-
-    def test_dynamic_quant_scale_indexing(self):
-        """Dynamic quant: per-token scale = (block_id*kv_num_heads+head)*block_size+row."""
-        kv_num_heads = 4
-        block_size = 64
-        block_id, head_idx, row_idx = 3, 2, 5
-        idx = (block_id * kv_num_heads + head_idx) * block_size + row_idx
-        self.assertEqual(idx, (3 * 4 + 2) * 64 + 5)
-
-    def test_block_wise_fp8_in_c8_branch(self):
-        c8_types = {"cache_int8", "cache_fp8", "block_wise_fp8"}
-        self.assertIn("block_wise_fp8", c8_types)
-        self.assertNotIn("cache_int4_zp", c8_types)
-        self.assertNotIn("none", c8_types)
-
-    def test_static_quant_null_quant_scales(self):
-        """Static quant: quant_scales=None, dequant_scales provided."""
-        self.assertIsNone(None)  # quant_scales
-        self.assertIsNotNone(np.ones(4))  # dequant_scales
+    def test_flash_mask_attn_c8_vs_c16_gpu(self):
+        self._gpu_diff_test(FlashMaskAttentionBackend, FLASH_MASK_MODULE)
 
 
 if __name__ == "__main__":

From c7b3184099f07bf5d1e861a6bf8181d28c45c1df Mon Sep 17 00:00:00 2001
From: Wanglongzhi2001 <583087864@qq.com>
Date: Mon, 30 Mar 2026 21:59:21 +0800
Subject: [PATCH 2/3] remove unused imports and aistudio_sdk mock module setup

---
 ...est_kv_cache_int8_dynamic_quant_backend.py | 20 -------------------
 1 file changed, 20 deletions(-)

diff --git a/tests/layers/test_kv_cache_int8_dynamic_quant_backend.py b/tests/layers/test_kv_cache_int8_dynamic_quant_backend.py
index 5a13349699a..13632714292 100644
--- a/tests/layers/test_kv_cache_int8_dynamic_quant_backend.py
+++ b/tests/layers/test_kv_cache_int8_dynamic_quant_backend.py
@@ -28,8 +28,6 @@
 """
 
 import math
-import sys
-import types
 import unittest
 from dataclasses import dataclass
 from unittest.mock import patch
@@ -37,24 +35,6 @@
 import numpy as np
 import paddle
 
-# ---------------------------------------------------------------------------
-# Environment setup: mock missing dependencies before import
-# ---------------------------------------------------------------------------
-
-
-def _ensure_mock_module(name, attrs=None):
-    """Ensure a module exists in sys.modules, creating a mock if needed."""
-    if name not in sys.modules:
-        mod = types.ModuleType(name)
-        if attrs:
-            for k, v in attrs.items():
-                setattr(mod, k, v)
-        sys.modules[name] = mod
-    return sys.modules[name]
-
-
-_ensure_mock_module("aistudio_sdk.snapshot_download", {"snapshot_download": lambda *a, **kw: None})
-
 _IMPORT_ERROR = None
 try:
     from fastdeploy.model_executor.layers.attention.flash_attn_backend import (

From e4af7d32979f4cbc2303c5d51bc062c29a05e64f Mon Sep 17 00:00:00 2001
From: Wanglongzhi2001 <583087864@qq.com>
Date: Tue, 31 Mar 2026 10:51:19 +0800
Subject: [PATCH 3/3] fix test: keep *_cpu tensors on CPU, shrink head counts, tighten GPU tolerance

---
 ...test_kv_cache_int8_dynamic_quant_backend.py | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/tests/layers/test_kv_cache_int8_dynamic_quant_backend.py b/tests/layers/test_kv_cache_int8_dynamic_quant_backend.py
index 13632714292..f60a54da7c0 100644
--- a/tests/layers/test_kv_cache_int8_dynamic_quant_backend.py
+++ b/tests/layers/test_kv_cache_int8_dynamic_quant_backend.py
@@ -75,8 +75,8 @@ class QuantConfig:
 # ---------------------------------------------------------------------------
 
 BATCH_SIZE = 4
-NUM_HEADS = 56
-KV_NUM_HEADS = 4
+NUM_HEADS = 16
+KV_NUM_HEADS = 2
 HEAD_DIM = 128
 BLOCK_SIZE = 64
 NUM_LAYERS = 2
@@ -190,16 +190,18 @@ def __init__(self, caches, max_len_val=0):
         self.block_tables = paddle.to_tensor([[i] for i in range(bs)], dtype="int32")
         self.decoder_batch_ids = paddle.to_tensor(list(range(bs)), dtype="int32")
         self.decoder_tile_ids_per_batch = paddle.to_tensor([0] * bs, dtype="int32")
-        self.decoder_num_blocks_cpu = paddle.to_tensor([bs], dtype="int32")
+        self.decoder_num_blocks_cpu = paddle.to_tensor([bs], dtype="int32", place=paddle.CPUPlace())
         self.decoder_num_blocks_device = paddle.to_tensor([bs], dtype="int32")
         self.decoder_chunk_size_device = paddle.to_tensor([1] * bs, dtype="int32")
         self.encoder_batch_ids = paddle.to_tensor(list(range(bs)), dtype="int32")
         self.encoder_tile_ids_per_batch = paddle.to_tensor([0] * bs, dtype="int32")
-        self.encoder_num_blocks_x_cpu = paddle.to_tensor([0], dtype="int32")
+        self.encoder_num_blocks_x_cpu = paddle.to_tensor([0], dtype="int32", place=paddle.CPUPlace())
         self.kv_batch_ids = paddle.to_tensor(list(range(bs)), dtype="int32")
         self.kv_tile_ids_per_batch = paddle.to_tensor([0] * bs, dtype="int32")
-        self.kv_num_blocks_x_cpu = paddle.to_tensor([bs], dtype="int32")
-        self.max_len_tensor_cpu = paddle.to_tensor([0, max_len_val, 10, 10], dtype="int32")
+        self.kv_num_blocks_x_cpu = paddle.to_tensor([bs], dtype="int32", place=paddle.CPUPlace())
+        self.max_len_tensor_cpu = paddle.to_tensor(
+            [0, max_len_val, 10, 10, 10, 10], dtype="int32", place=paddle.CPUPlace()
+        )
         self.attn_mask = None
         self.attn_mask_offsets = None
         self.forward_mode = None
@@ -485,8 +487,8 @@ def _gpu_diff_test(self, backend_class, module_path):
         np.testing.assert_allclose(
             results["C8_dynamic"].cast("float32").numpy(),
             results["C16"].cast("float32").numpy(),
-            rtol=0.1,
-            atol=0.1,
+            rtol=1e-4,
+            atol=1e-4,
             err_msg=f"C8 dynamic vs C16 GPU output diff too large for {backend_class.__name__}",
         )