diff --git a/custom_ops/gpu_ops/swap_cache_batch.cu b/custom_ops/gpu_ops/swap_cache_batch.cu
index d8c2c1d59bc..86554197464 100644
--- a/custom_ops/gpu_ops/swap_cache_batch.cu
+++ b/custom_ops/gpu_ops/swap_cache_batch.cu
@@ -127,7 +127,6 @@ void SwapCacheAllLayers(
     const std::vector<int64_t>& swap_block_ids_cpu,
     int rank,
     int mode) {
-  checkCudaErrors(cudaSetDevice(rank));  // used for distributed launch
   assert(cache_gpu_tensors.size() > 0 &&
          cache_gpu_tensors.size() == cache_cpu_ptrs.size());
   switch (cache_gpu_tensors[0].dtype()) {
diff --git a/custom_ops/gpu_ops/swap_cache_optimized.cu b/custom_ops/gpu_ops/swap_cache_optimized.cu
new file mode 100644
index 00000000000..8844e4752f4
--- /dev/null
+++ b/custom_ops/gpu_ops/swap_cache_optimized.cu
@@ -0,0 +1,400 @@
+// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+ * @file swap_cache_optimized.cu
+ * @brief Optimized KV cache swap operators using warp-level parallelism.
+ *
+ * This file implements high-performance operators for KV cache transfer
+ * between GPU and CPU pinned memory:
+ *
+ * swap_cache_per_layer:       Single-layer transfer (sync, backward compatible)
+ * swap_cache_per_layer_async: Single-layer transfer (async, no cudaStreamSync)
+ *
+ * Key optimizations vs original:
+ * 1. Consecutive block fast path: detects consecutive block ID runs and uses
+ *    cudaMemcpyAsync instead of warp kernel (avoids kernel launch overhead).
+ * 2. Async variant: swap_cache_per_layer_async omits cudaStreamSynchronize,
+ *    enabling true async pipelining when called on a dedicated cupy stream.
+ * 3. Warp-level PTX: non-temporal load/store for non-consecutive blocks to
+ *    avoid L2 cache pollution.
+ */
+
+#include "cuda_multiprocess.h"
+#include "helper.h"
+#include "paddle/extension.h"
+
+#include <cstdint>
+#include <vector>
+
+// ============================================================================
+// Device Functions: Warp-Level Parallel Transfer
+// ============================================================================
+
+/**
+ * @brief Warp-level parallel data transfer function.
+ *
+ * Uses PTX inline assembly for optimized memory access:
+ * - ld.global.nc.b64: Non-cacheable load (avoids L2 cache pollution)
+ * - st.global.cg.b64: Cache-globing store (optimizes write performance)
+ *
+ * @param lane_id Thread lane ID within the warp (0-WARP_SIZE-1)
+ * @param src_addr Source memory address
+ * @param dst_addr Destination memory address
+ * @param item_size_bytes Size of the item in bytes (must be 8-byte aligned)
+ */
+__device__ __forceinline__ void transfer_item_warp(int32_t lane_id,
+                                                   const void* src_addr,
+                                                   void* dst_addr,
+                                                   int64_t item_size_bytes) {
+  const uint64_t* __restrict__ src = static_cast<const uint64_t*>(src_addr);
+  uint64_t* __restrict__ dst = static_cast<uint64_t*>(dst_addr);
+  const int total_chunks = item_size_bytes / sizeof(uint64_t);
+
+#pragma unroll
+  for (int j = lane_id; j < total_chunks; j += WARP_SIZE) {
+    uint64_t tmp;
+#ifdef PADDLE_WITH_HIP
+    // ROCm/HIP path using built-in nontemporal operations
+    tmp = __builtin_nontemporal_load(src + j);
+    __builtin_nontemporal_store(tmp, dst + j);
+#else
+    // NVIDIA CUDA path using PTX inline assembly
+    asm volatile("ld.global.nc.b64 %0,[%1];"
+                 : "=l"(tmp)
+                 : "l"(src + j)
+                 : "memory");
+    asm volatile("st.global.cg.b64 [%0],%1;" ::"l"(dst + j), "l"(tmp)
+                 : "memory");
+#endif
+  }
+}
+
+// ============================================================================
+// Kernels
+// ============================================================================
+
+/**
+ * @brief CUDA kernel for single-layer KV cache transfer (non-consecutive path).
+ *
+ * Each warp processes one block using warp-level parallel PTX loads/stores.
+ * Used only when block IDs are non-consecutive; consecutive runs are handled
+ * by cudaMemcpyAsync in the host-side fast path.
+ *
+ * @tparam D2H true = Device->Host (evict), false = Host->Device (load)
+ */
+template <bool D2H>
+__global__ void swap_cache_per_layer_kernel(
+    const void* __restrict__ src_ptr,
+    void* __restrict__ dst_ptr,
+    const int64_t* __restrict__ src_block_ids,
+    const int64_t* __restrict__ dst_block_ids,
+    int64_t num_blocks,
+    int64_t item_size_bytes) {
+  int32_t tid = blockIdx.x * blockDim.x + threadIdx.x;
+  int32_t lane_id = tid % WARP_SIZE;
+  int32_t warp_id = tid / WARP_SIZE;
+
+  if (warp_id >= num_blocks) return;
+
+  int64_t src_block_id = src_block_ids[warp_id];
+  int64_t dst_block_id = dst_block_ids[warp_id];
+
+  const char* src_now =
+      static_cast<const char*>(src_ptr) + src_block_id * item_size_bytes;
+  char* dst_now = static_cast<char*>(dst_ptr) + dst_block_id * item_size_bytes;
+
+  transfer_item_warp(lane_id, src_now, dst_now, item_size_bytes);
+}
+
+// ============================================================================
+// Helper: Consecutive Block Fast Path
+// ============================================================================
+
+/**
+ * @brief Transfer a single layer using consecutive-block detection.
+ *
+ * Scans src/dst block ID pairs for consecutive runs. For each run, issues
+ * a single cudaMemcpyAsync (like swap_cache_all_layers). Non-consecutive
+ * blocks are batched and handled by the warp kernel.
+ *
+ * @tparam D2H true = Device->Host, false = Host->Device
+ * @param src_ptr     Source base pointer (GPU or CPU depending on D2H)
+ * @param dst_ptr     Destination base pointer
+ * @param src_block_ids Host vector of source block IDs
+ * @param dst_block_ids Host vector of destination block IDs
+ * @param num_blocks  Number of blocks to transfer
+ * @param item_size_bytes Bytes per block
+ * @param stream      CUDA stream
+ */
+template <bool D2H>
+void TransferSingleLayerWithFastPath(const void* src_ptr,
+                                     void* dst_ptr,
+                                     const std::vector<int64_t>& src_block_ids,
+                                     const std::vector<int64_t>& dst_block_ids,
+                                     int64_t num_blocks,
+                                     int64_t item_size_bytes,
+                                     cudaStream_t stream) {
+  // --- Pass 1: handle consecutive runs with cudaMemcpyAsync ---
+  // Collect indices of non-consecutive blocks for the kernel fallback.
+  std::vector<int64_t> nc_src, nc_dst;
+  const cudaMemcpyKind kind =
+      D2H ? cudaMemcpyDeviceToHost : cudaMemcpyHostToDevice;
+
+  int64_t run_start = 0;
+  for (int64_t i = 1; i <= num_blocks; ++i) {
+    bool end_of_run = (i == num_blocks) ||
+                      (src_block_ids[i] != src_block_ids[i - 1] + 1) ||
+                      (dst_block_ids[i] != dst_block_ids[i - 1] + 1);
+    if (!end_of_run) continue;
+
+    int64_t run_len = i - run_start;
+    if (run_len > 1) {
+      // Consecutive run: merge into a single cudaMemcpyAsync
+      const char* src_run = static_cast<const char*>(src_ptr) +
+                            src_block_ids[run_start] * item_size_bytes;
+      char* dst_run = static_cast<char*>(dst_ptr) +
+                      dst_block_ids[run_start] * item_size_bytes;
+      checkCudaErrors(cudaMemcpyAsync(
+          dst_run, src_run, run_len * item_size_bytes, kind, stream));
+    } else {
+      // Single non-consecutive block: defer to warp kernel
+      nc_src.push_back(src_block_ids[run_start]);
+      nc_dst.push_back(dst_block_ids[run_start]);
+    }
+    run_start = i;
+  }
+
+  // --- Pass 2: warp kernel for remaining non-consecutive blocks ---
+  if (!nc_src.empty()) {
+    int64_t nc_count = static_cast<int64_t>(nc_src.size());
+    int64_t *d_src, *d_dst;
+    checkCudaErrors(
+        cudaMallocAsync(&d_src, nc_count * sizeof(int64_t), stream));
+    checkCudaErrors(
+        cudaMallocAsync(&d_dst, nc_count * sizeof(int64_t), stream));
+    checkCudaErrors(cudaMemcpyAsync(d_src,
+                                    nc_src.data(),
+                                    nc_count * sizeof(int64_t),
+                                    cudaMemcpyHostToDevice,
+                                    stream));
+    checkCudaErrors(cudaMemcpyAsync(d_dst,
+                                    nc_dst.data(),
+                                    nc_count * sizeof(int64_t),
+                                    cudaMemcpyHostToDevice,
+                                    stream));
+
+    constexpr int kWarpsPerBlock = 4;
+    const int threads_per_block = kWarpsPerBlock * WARP_SIZE;
+    const int grid =
+        (static_cast<int>(nc_count) + kWarpsPerBlock - 1) / kWarpsPerBlock;
+
+    swap_cache_per_layer_kernel<D2H><<<grid, threads_per_block, 0, stream>>>(
+        src_ptr, dst_ptr, d_src, d_dst, nc_count, item_size_bytes);
+
+    checkCudaErrors(cudaFreeAsync(d_src, stream));
+    checkCudaErrors(cudaFreeAsync(d_dst, stream));
+  }
+}
+
+// ============================================================================
+// Implementation: Single Layer
+// ============================================================================
+
+/**
+ * @brief Core implementation for single-layer KV cache transfer.
+ *
+ * @param do_sync  If true, calls cudaStreamSynchronize at end (sync op).
+ *                 Set to false for the async variant.
+ */
+template <paddle::DataType D, bool D2H>
+void SwapCachePerLayerImpl(const paddle::Tensor& cache_gpu,
+                           int64_t cache_cpu_ptr,
+                           int64_t max_block_num_cpu,
+                           const std::vector<int64_t>& swap_block_ids_gpu,
+                           const std::vector<int64_t>& swap_block_ids_cpu,
+                           cudaStream_t stream,
+                           bool do_sync) {
+  typedef typename PDTraits<D>::DataType DataType_;
+  typedef typename PDTraits<D>::data_t data_t;
+
+  auto cache_shape = cache_gpu.shape();
+  const int64_t max_block_num_gpu = cache_shape[0];
+  const int64_t num_heads = cache_shape[1];
+  const int64_t block_size = cache_shape[2];
+  const int64_t head_dim = cache_shape.size() == 4 ? cache_shape[3] : 1;
+  const int64_t item_size_bytes =
+      num_heads * block_size * head_dim * sizeof(DataType_);
+
+  const int64_t num_blocks = swap_block_ids_gpu.size();
+  if (num_blocks == 0) return;
+
+  // Validate block IDs
+  for (size_t i = 0; i < swap_block_ids_gpu.size(); ++i) {
+    if (swap_block_ids_gpu[i] < 0 ||
+        swap_block_ids_gpu[i] >= max_block_num_gpu) {
+      PD_THROW("Invalid swap_block_ids_gpu at index " + std::to_string(i) +
+               ": " + std::to_string(swap_block_ids_gpu[i]) +
+               " out of range [0, " + std::to_string(max_block_num_gpu) + ")");
+    }
+    if (swap_block_ids_cpu[i] < 0 ||
+        swap_block_ids_cpu[i] >= max_block_num_cpu) {
+      PD_THROW("Invalid swap_block_ids_cpu at index " + std::to_string(i) +
+               ": " + std::to_string(swap_block_ids_cpu[i]) +
+               " out of range [0, " + std::to_string(max_block_num_cpu) + ")");
+    }
+  }
+
+  // D2H: src=GPU, dst=CPU; H2D: src=CPU, dst=GPU
+  const auto& src_block_ids = D2H ? swap_block_ids_gpu : swap_block_ids_cpu;
+  const auto& dst_block_ids = D2H ? swap_block_ids_cpu : swap_block_ids_gpu;
+
+  const void* src_ptr;
+  void* dst_ptr;
+  if (D2H) {
+    src_ptr = cache_gpu.data<data_t>();
+    dst_ptr = reinterpret_cast<void*>(cache_cpu_ptr);
+  } else {
+    src_ptr = reinterpret_cast<const void*>(cache_cpu_ptr);
+    dst_ptr = const_cast<data_t*>(cache_gpu.data<data_t>());
+  }
+
+  TransferSingleLayerWithFastPath<D2H>(src_ptr,
+                                       dst_ptr,
+                                       src_block_ids,
+                                       dst_block_ids,
+                                       num_blocks,
+                                       item_size_bytes,
+                                       stream);
+
+  if (do_sync) {
+    checkCudaErrors(cudaStreamSynchronize(stream));
+  }
+}
+
+// ============================================================================
+// Operator Registration
+// ============================================================================
+// Operator Entry Points
+// ============================================================================
+
+// Helper macro to dispatch dtype and direction for SwapCachePerLayerImpl
+#define DISPATCH_PER_LAYER(DTYPE, MODE, DO_SYNC, ...)                         \
+  switch (DTYPE) {                                                            \
+    case paddle::DataType::BFLOAT16:                                          \
+      if ((MODE) == 0)                                                        \
+        SwapCachePerLayerImpl<paddle::DataType::BFLOAT16, true>(__VA_ARGS__,  \
+                                                                DO_SYNC);     \
+      else                                                                    \
+        SwapCachePerLayerImpl<paddle::DataType::BFLOAT16, false>(__VA_ARGS__, \
+                                                                 DO_SYNC);    \
+      break;                                                                  \
+    case paddle::DataType::FLOAT16:                                           \
+      if ((MODE) == 0)                                                        \
+        SwapCachePerLayerImpl<paddle::DataType::FLOAT16, true>(__VA_ARGS__,   \
+                                                               DO_SYNC);      \
+      else                                                                    \
+        SwapCachePerLayerImpl<paddle::DataType::FLOAT16, false>(__VA_ARGS__,  \
+                                                                DO_SYNC);     \
+      break;                                                                  \
+    case paddle::DataType::UINT8:                                             \
+      if ((MODE) == 0)                                                        \
+        SwapCachePerLayerImpl<paddle::DataType::UINT8, true>(__VA_ARGS__,     \
+                                                             DO_SYNC);        \
+      else                                                                    \
+        SwapCachePerLayerImpl<paddle::DataType::UINT8, false>(__VA_ARGS__,    \
+                                                              DO_SYNC);       \
+      break;                                                                  \
+    default:                                                                  \
+      PD_THROW("Unsupported data type for swap_cache_per_layer.");            \
+  }
+
+/**
+ * @brief Single-layer KV cache swap (synchronous, backward compatible).
+ */
+void SwapCachePerLayer(const paddle::Tensor& cache_gpu,
+                       int64_t cache_cpu_ptr,
+                       int64_t max_block_num_cpu,
+                       const std::vector<int64_t>& swap_block_ids_gpu,
+                       const std::vector<int64_t>& swap_block_ids_cpu,
+                       int rank,
+                       int mode) {
+  auto stream = cache_gpu.stream();
+  DISPATCH_PER_LAYER(cache_gpu.dtype(),
+                     mode,
+                     /*do_sync=*/true,
+                     cache_gpu,
+                     cache_cpu_ptr,
+                     max_block_num_cpu,
+                     swap_block_ids_gpu,
+                     swap_block_ids_cpu,
+                     stream);
+}
+
+/**
+ * @brief Single-layer KV cache swap (async, no cudaStreamSynchronize).
+ *
+ * Designed for use inside a cupy stream context. Completion is tracked
+ * by the caller via CUDA events (record_input_stream_event).
+ */
+void SwapCachePerLayerAsync(const paddle::Tensor& cache_gpu,
+                            int64_t cache_cpu_ptr,
+                            int64_t max_block_num_cpu,
+                            const std::vector<int64_t>& swap_block_ids_gpu,
+                            const std::vector<int64_t>& swap_block_ids_cpu,
+                            int rank,
+                            int mode) {
+  auto stream = cache_gpu.stream();
+  DISPATCH_PER_LAYER(cache_gpu.dtype(),
+                     mode,
+                     /*do_sync=*/false,
+                     cache_gpu,
+                     cache_cpu_ptr,
+                     max_block_num_cpu,
+                     swap_block_ids_gpu,
+                     swap_block_ids_cpu,
+                     stream);
+}
+
+// ============================================================================
+// Operator Registration
+// ============================================================================
+
+PD_BUILD_STATIC_OP(swap_cache_per_layer)
+    .Inputs({"cache_gpu"})
+    .Attrs({
+        "cache_cpu_ptr: int64_t",
+        "max_block_num_cpu: int64_t",
+        "swap_block_ids_gpu: std::vector<int64_t>",
+        "swap_block_ids_cpu: std::vector<int64_t>",
+        "rank: int",
+        "mode: int",
+    })
+    .Outputs({"cache_dst_out"})
+    .SetInplaceMap({{"cache_gpu", "cache_dst_out"}})
+    .SetKernelFn(PD_KERNEL(SwapCachePerLayer));
+
+PD_BUILD_STATIC_OP(swap_cache_per_layer_async)
+    .Inputs({"cache_gpu"})
+    .Attrs({
+        "cache_cpu_ptr: int64_t",
+        "max_block_num_cpu: int64_t",
+        "swap_block_ids_gpu: std::vector<int64_t>",
+        "swap_block_ids_cpu: std::vector<int64_t>",
+        "rank: int",
+        "mode: int",
+    })
+    .Outputs({"cache_dst_out"})
+    .SetInplaceMap({{"cache_gpu", "cache_dst_out"}})
+    .SetKernelFn(PD_KERNEL(SwapCachePerLayerAsync));
diff --git a/custom_ops/setup_ops.py b/custom_ops/setup_ops.py
index 180116bf2c7..12f09ee331a 100644
--- a/custom_ops/setup_ops.py
+++ b/custom_ops/setup_ops.py
@@ -315,6 +315,7 @@ def find_end_files(directory, end_str):
         "gpu_ops/swap_cache_batch.cu",
         "gpu_ops/swap_cache.cu",
         "gpu_ops/swap_cache_layout.cu",
+        "gpu_ops/swap_cache_optimized.cu",  # 新增：优化的 KV cache 换入算子
         "gpu_ops/step_system_cache.cu",
         "gpu_ops/cpp_extensions.cc",
         "gpu_ops/share_external_data.cu",
diff --git a/fastdeploy/cache_manager/ops.py b/fastdeploy/cache_manager/ops.py
index ff52a8b861e..ea1e594d372 100644
--- a/fastdeploy/cache_manager/ops.py
+++ b/fastdeploy/cache_manager/ops.py
@@ -23,6 +23,12 @@
 
 try:
     if current_platform.is_cuda():
+        from fastdeploy.model_executor.ops.gpu import (
+            swap_cache_per_layer,  # 单层 KV cache 换入算子（同步）
+        )
+        from fastdeploy.model_executor.ops.gpu import (
+            swap_cache_per_layer_async,  # 单层 KV cache 换入算子（异步，无强制 sync）
+        )
         from fastdeploy.model_executor.ops.gpu import (
             cuda_host_alloc,
             cuda_host_free,
@@ -43,6 +49,12 @@ def get_peer_mem_addr(*args, **kwargs):
             raise RuntimeError("CUDA no need of get_peer_mem_addr!")
 
     elif current_platform.is_maca():
+        from fastdeploy.model_executor.ops.gpu import (
+            swap_cache_per_layer,  # 单层 KV cache 换入算子（同步）
+        )
+        from fastdeploy.model_executor.ops.gpu import (
+            swap_cache_per_layer_async,  # 单层 KV cache 换入算子（异步，无强制 sync）
+        )
         from fastdeploy.model_executor.ops.gpu import (  # get_output_kv_signal,; ipc_sent_key_value_cache_by_remote_ptr_block_sync,
             cuda_host_alloc,
             cuda_host_free,
@@ -89,6 +101,12 @@ def ipc_sent_key_value_cache_by_remote_ptr(*args, **kwargs):
         def ipc_sent_key_value_cache_by_remote_ptr_block_sync(*args, **kwargs):
             raise RuntimeError("XPU No ipc_sent_key_value_cache_by_remote_ptr UNIMPLENENTED")
 
+        def swap_cache_per_layer(*args, **kwargs):  # 单层 KV cache 换入算子（同步）
+            raise RuntimeError("XPU swap_cache_per_layer UNIMPLENENTED")
+
+        def swap_cache_per_layer_async(*args, **kwargs):  # 单层 KV cache 换入算子（异步）
+            raise RuntimeError("XPU swap_cache_per_layer_async UNIMPLENENTED")
+
     else:
         raise RuntimeError("Prefix cache ops only supported CUDA nor XPU platform ")
 
@@ -128,6 +146,8 @@ def get_all_visible_devices():
     set_data_ipc = None
     share_external_data_ = None
     swap_cache_all_layers = None
+    swap_cache_per_layer = None  # 单层 KV cache 换入算子（同步）
+    swap_cache_per_layer_async = None  # 单层 KV cache 换入算子（异步）
     unset_data_ipc = None
     set_device = None
     memory_allocated = None
@@ -146,6 +166,8 @@ def get_all_visible_devices():
     "set_data_ipc",
     "share_external_data_",
     "swap_cache_all_layers",
+    "swap_cache_per_layer",  # 单层 KV cache 换入算子（同步）
+    "swap_cache_per_layer_async",  # 单层 KV cache 换入算子（异步，无强制 sync）
     "unset_data_ipc",  # XPU是 None
     "set_device",
     "memory_allocated",
diff --git a/fastdeploy/cache_manager/v1/__init__.py b/fastdeploy/cache_manager/v1/__init__.py
new file mode 100644
index 00000000000..ca9380f8528
--- /dev/null
+++ b/fastdeploy/cache_manager/v1/__init__.py
@@ -0,0 +1,71 @@
+"""
+# Copyright (c) 2025  PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+
+from .base import KVCacheBase
+from .cache_controller import CacheController
+from .cache_manager import CacheManager
+from .cache_utils import LayerDoneCounter, LayerSwapTimeoutError
+from .metadata import (
+    AsyncTaskHandler,
+    BlockNode,
+    CacheBlockMetadata,
+    CacheStatus,
+    MatchResult,
+    PDTransferMetadata,
+    StorageConfig,
+    StorageMetadata,
+    StorageType,
+    TransferConfig,
+    TransferResult,
+    TransferStatus,
+    TransferTask,
+    TransferType,
+)
+from .storage import create_storage_connector, create_storage_scheduler
+from .transfer import create_transfer_connector
+from .transfer_manager import CacheTransferManager
+
+__all__ = [
+    # Base classes
+    "KVCacheBase",
+    # Managers
+    "CacheManager",
+    "CacheController",
+    "CacheTransferManager",
+    # Exceptions
+    "LayerSwapTimeoutError",
+    # Utils
+    "LayerDoneCounter",
+    # Metadata
+    "CacheBlockMetadata",
+    "BlockNode",
+    "CacheStatus",
+    "TransferTask",
+    "TransferStatus",
+    "TransferConfig",
+    "TransferResult",
+    "AsyncTaskHandler",
+    "MatchResult",
+    "StorageMetadata",
+    "PDTransferMetadata",
+    "StorageConfig",
+    "StorageType",
+    "TransferType",
+    # Factory functions
+    "create_storage_scheduler",
+    "create_storage_connector",
+    "create_transfer_connector",
+]
diff --git a/fastdeploy/cache_manager/v1/base.py b/fastdeploy/cache_manager/v1/base.py
new file mode 100644
index 00000000000..2f9c8db9c99
--- /dev/null
+++ b/fastdeploy/cache_manager/v1/base.py
@@ -0,0 +1,80 @@
+"""
+# Copyright (c) 2025  PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+
+from abc import ABC, abstractmethod
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from fastdeploy.config import FDConfig
+
+
+class KVCacheBase(ABC):
+    """
+    Abstract base class for KV cache management.
+
+    This class defines the common interface for cache management operations.
+    Subclasses (CacheManager and CacheController) implement specific behaviors
+    based on their roles in the system.
+
+    CacheManager (Scheduler process):
+        - Manages DeviceBlockPool and HostBlockPool
+        - Handles block allocation and release
+        - Coordinates storage operations via StorageScheduler
+
+    CacheController (Worker process):
+        - Manages cache transfer operations
+        - Handles layer-by-layer transfer synchronization
+        - Coordinates cross-node transfer via TransferConnector
+    """
+
+    def __init__(self, config: "FDConfig"):
+        """
+        Initialize the KV cache base.
+
+        Args:
+            config: FDConfig instance containing all fastdeploy configuration
+        """
+        self.config = config
+
+        # Extract configuration from FDConfig
+        self.model_config = config.model_config
+        self.cache_config = config.cache_config
+        self.quant_config = config.quant_config
+        self.parallel_config = config.parallel_config
+
+        self._initialized = False
+
+    @abstractmethod
+    def reset_cache(self) -> bool:
+        """
+        Reset the cache state.
+
+        This method should be implemented by subclasses to reset their
+        specific cache state (e.g., clear block pools, reset transfer state).
+
+        Returns:
+            True if reset was successful, False otherwise
+        """
+        pass
+
+    def is_initialized(self) -> bool:
+        """
+        Check if the cache has been initialized.
+
+        Returns:
+            True if initialized, False otherwise
+        """
+        return self._initialized
diff --git a/fastdeploy/cache_manager/v1/block_pool.py b/fastdeploy/cache_manager/v1/block_pool.py
new file mode 100644
index 00000000000..7a2a9bdffbd
--- /dev/null
+++ b/fastdeploy/cache_manager/v1/block_pool.py
@@ -0,0 +1,250 @@
+"""
+# Copyright (c) 2025  PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+
+import threading
+import traceback
+from abc import ABC
+from typing import Any, Dict, List, Optional
+
+from fastdeploy.utils import get_logger
+
+from .metadata import CacheBlockMetadata
+
+logger = get_logger("block_pool", "cache_manager.log")
+
+
+class BlockPool(ABC):
+    """
+    Abstract base class for block pool management.
+    """
+
+    def __init__(
+        self,
+        num_blocks: int,
+        block_size: int,
+    ):
+        """
+        Initialize the block pool.
+
+        Args:
+            num_blocks: Total number of blocks in the pool
+            block_size: Size of each block in bytes
+        """
+        self.num_blocks = num_blocks
+        self.block_size = block_size
+        self._lock = threading.RLock()
+
+        # Track free and used blocks
+        self._free_blocks: List[int] = list(range(num_blocks))
+        self._used_blocks: set = set()
+
+        # Block metadata
+        self._metadata: Dict[int, CacheBlockMetadata] = {}
+
+    def allocate(self, num_blocks: int) -> Optional[List[int]]:
+        """
+        Allocate blocks from the pool.
+
+        Args:
+            num_blocks: Number of blocks to allocate
+
+        Returns:
+            List of allocated block indices if successful, None if not enough blocks
+        """
+        with self._lock:
+            if num_blocks > len(self._free_blocks):
+                logger.warning(
+                    f"BlockPool.allocate failed: not enough blocks, "
+                    f"requested={num_blocks}, available={len(self._free_blocks)}"
+                )
+                return None
+
+            allocated = []
+            for _ in range(num_blocks):
+                block_idx = self._free_blocks.pop(0)
+                self._used_blocks.add(block_idx)
+                allocated.append(block_idx)
+
+            return allocated
+
+    def release(self, block_indices: List[int]) -> None:
+        """
+        Release blocks back to the pool.
+
+        Args:
+            block_indices: List of block indices to release
+        """
+        with self._lock:
+            for idx in block_indices:
+                if idx in self._used_blocks:
+                    self._used_blocks.remove(idx)
+                    self._free_blocks.append(idx)
+                    # Clear metadata
+                    self._metadata.pop(idx, None)
+                else:
+                    logger.error(
+                        f"BlockPool.release: block_id={idx} NOT in used_blocks! "
+                        f"request_blocks={block_indices}, "
+                        f"is_in_free_blocks={idx in self._free_blocks}, "
+                        f"is_valid_block_id={0 <= idx < self.num_blocks}"
+                    )
+                    logger.error(f"BlockPool.release callstack:\n{traceback.format_exc()}")
+
+    def get_metadata(self, block_idx: int) -> Optional[CacheBlockMetadata]:
+        """
+        Get metadata for a block.
+
+        Args:
+            block_idx: Block index
+
+        Returns:
+            Block metadata or None if not found
+        """
+        return self._metadata.get(block_idx)
+
+    def set_metadata(
+        self,
+        block_idx: int,
+        metadata: CacheBlockMetadata,
+    ) -> None:
+        """
+        Set metadata for a block.
+
+        Args:
+            block_idx: Block index
+            metadata: Block metadata to set
+        """
+        self._metadata[block_idx] = metadata
+
+    def available_blocks(self) -> int:
+        """Get number of available blocks."""
+        return len(self._free_blocks)
+
+    def used_blocks(self) -> int:
+        """Get number of used blocks."""
+        return len(self._used_blocks)
+
+    def reset(self) -> None:
+        """Reset the block pool."""
+        with self._lock:
+            self._free_blocks = list(range(self.num_blocks))
+            self._used_blocks.clear()
+            self._metadata.clear()
+
+    def resize(self, new_num_blocks: int) -> bool:
+        """
+        Resize the block pool.
+
+        Supports both expansion and shrinking. Shrinking will fail if
+        there are more used blocks than the new size.
+
+        Args:
+            new_num_blocks: New total number of blocks
+
+        Returns:
+            True if resize was successful, False otherwise
+        """
+        with self._lock:
+            current_used = len(self._used_blocks)
+
+            # Cannot shrink below currently used blocks
+            if new_num_blocks < current_used:
+                return False
+
+            old_num_blocks = self.num_blocks
+            self.num_blocks = new_num_blocks
+
+            if new_num_blocks > old_num_blocks:
+                # Expansion: add new free blocks
+                new_blocks = list(range(old_num_blocks, new_num_blocks))
+                self._free_blocks.extend(new_blocks)
+            elif new_num_blocks < old_num_blocks:
+                # Shrinking: remove free blocks beyond new size
+                blocks_to_keep = set(range(new_num_blocks))
+                self._free_blocks = [b for b in self._free_blocks if b in blocks_to_keep]
+                # Clean up metadata for removed blocks
+                for block_id in range(new_num_blocks, old_num_blocks):
+                    self._metadata.pop(block_id, None)
+
+            return True
+
+    def get_stats(self) -> Dict[str, Any]:
+        """Get pool statistics."""
+        return {
+            "num_blocks": self.num_blocks,
+            "block_size": self.block_size,
+            "available": len(self._free_blocks),
+            "used": len(self._used_blocks),
+        }
+
+
+class DeviceBlockPool(BlockPool):
+    """
+    GPU device memory block pool.
+
+    Manages KV cache blocks on GPU memory.
+    Does not track per-device blocks - device affinity is handled elsewhere.
+    """
+
+    def __init__(
+        self,
+        num_blocks: int,
+        block_size: int,
+    ):
+        """
+        Initialize the device block pool.
+
+        Args:
+            num_blocks: Total number of blocks in the pool
+            block_size: Size of each block in bytes
+        """
+        super().__init__(num_blocks, block_size)
+
+    def get_stats(self) -> Dict[str, Any]:
+        """Get device pool statistics."""
+        stats = super().get_stats()
+        return stats
+
+
+class HostBlockPool(BlockPool):
+    """
+    CPU host memory block pool.
+
+    Manages KV cache blocks on CPU memory (pinned memory for fast GPU transfer).
+    """
+
+    def __init__(
+        self,
+        num_blocks: int,
+        block_size: int,
+        use_pinned_memory: bool = True,
+    ):
+        """
+        Initialize the host block pool.
+
+        Args:
+            num_blocks: Total number of blocks
+            block_size: Size of each block in bytes
+            use_pinned_memory: Whether to use pinned (page-locked) memory
+        """
+        super().__init__(num_blocks, block_size)
+        self.use_pinned_memory = use_pinned_memory
+
+    def get_stats(self) -> Dict[str, Any]:
+        """Get host pool statistics."""
+        stats = super().get_stats()
+        stats["use_pinned_memory"] = self.use_pinned_memory
+        return stats
diff --git a/fastdeploy/cache_manager/v1/cache_controller.py b/fastdeploy/cache_manager/v1/cache_controller.py
new file mode 100644
index 00000000000..a331f5e6914
--- /dev/null
+++ b/fastdeploy/cache_manager/v1/cache_controller.py
@@ -0,0 +1,1088 @@
+"""
+# Copyright (c) 2025  PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+
+import ctypes
+import os
+import threading
+import time
+from concurrent.futures import ThreadPoolExecutor
+from typing import TYPE_CHECKING, Any, Dict, List, Optional
+
+import paddle
+from paddleformers.utils.log import logger
+
+if TYPE_CHECKING:
+    from fastdeploy.config import FDConfig
+
+# Import ops for CPU cache allocation
+from fastdeploy.cache_manager.ops import cuda_host_alloc, cuda_host_free
+
+from .base import KVCacheBase
+from .cache_utils import LayerDoneCounter
+from .metadata import (
+    AsyncTaskHandler,
+    CacheLevel,
+    CacheSwapMetadata,
+    PDTransferMetadata,
+    StorageMetadata,
+    TransferResult,
+)
+from .transfer_manager import CacheTransferManager
+
+
+class CacheController(KVCacheBase):
+    """
+    Cache Controller for Worker process.
+
+    Inherits KVCacheBase, handles transfer tasks by block index only, does NOT manage BlockPool.
+    BlockPool is managed by CacheManager. CacheController only executes transfers
+    based on block IDs provided by Scheduler.
+
+    All transfer methods are async - they submit tasks and return immediately,
+    returning an AsyncTaskHandler for the caller to track completion.
+
+    Three-level cache hierarchy:
+        Level 1: Device (GPU) - Fastest access, directly used for inference
+        Level 2: Host (CPU) - Medium speed, needs to be loaded to Device
+        Level 3: Storage - Slowest, needs to be fetched to Host first
+
+    Attributes:
+        transfer_manager: CacheTransferManager instance.
+        layer_counter: LayerDoneCounter instance.
+        num_layers: Total number of model layers.
+    """
+
+    def __init__(self, config: "FDConfig", local_rank: int, device_id: int):
+        """
+        Initialize the Cache Controller.
+
+        Args:
+            config: FDConfig instance containing all fastdeploy configuration
+        """
+        super().__init__(config)
+
+        self._num_layers = self.model_config.num_hidden_layers
+        self._local_rank = local_rank
+        self._device_id = device_id
+
+        # cache_kvs_map: stores created kv cache tensors by name
+        self.cache_kvs_map: Dict[str, Any] = {}
+        # host_cache_kvs_map: stores Host (pinned memory) kv cache tensors by name for swap space
+        self.host_cache_kvs_map: Dict[str, Any] = {}
+
+        # Thread safety
+        self._lock = threading.RLock()
+
+        # Thread pool executor for async operations
+        self._executor = ThreadPoolExecutor(max_workers=4, thread_name_prefix="cache_transfer")
+
+        # Initialize transfer manager
+        self._transfer_manager = CacheTransferManager(config, local_rank, device_id)
+
+        # Note: LayerDoneCounter is no longer a singleton
+        # Each submit_swap_tasks call creates a new LayerDoneCounter instance
+        self._layer_done_counter = None
+
+        # Pending evict LayerDoneCounters for write_back mode ordering
+        self._pending_evict_counters: List["LayerDoneCounter"] = []
+
+        self._initialized = True
+
+        # NUMA binding flag
+        self._numa_bound = False
+
+    @property
+    def write_policy(self) -> Optional[str]:
+        """Get the write policy for cache operations."""
+        if self.cache_config and hasattr(self.cache_config, "write_policy"):
+            return self.cache_config.write_policy
+        return None
+
+    def _should_wait_for_swap_out(self) -> bool:
+        """
+        Determine if swap-out operations should wait synchronously.
+
+        Returns:
+            True if write_policy is 'write_back', otherwise False.
+        """
+        return self.write_policy == "write_back"
+
+    def submit_swap_tasks(
+        self,
+        evict_metadata: Optional["CacheSwapMetadata"],
+        swap_in_metadata: Optional["CacheSwapMetadata"],
+    ) -> Optional["LayerDoneCounter"]:
+        """
+        Submit evict and swap-in tasks with proper synchronization.
+
+        Logic:
+        1. Before submitting evict, wait for existing pending evict counters to complete
+        2. write_back: Wait for evict to complete before submitting swap-in
+        3. Other policies: Submit both evict and swap-in immediately
+
+        Args:
+            evict_metadata: CacheSwapMetadata for device-to-host eviction (can be None)
+            swap_in_metadata: CacheSwapMetadata for host-to-device swap-in (can be None)
+
+        Returns:
+            LayerDoneCounter for swap-in task, or None if no swap-in metadata provided.
+        """
+        # Step 1: Wait for existing pending evict counters before submitting new evict
+        self._wait_for_pending_evict_counters()
+
+        # Step 2: Submit evict task if provided
+        # Note: evict returns LayerDoneCounter but we don't wait on it layer-by-layer
+        # (except in write_back mode where we wait synchronously via wait_all)
+        if evict_metadata is not None:
+            evict_counter = self.evict_device_to_host(evict_metadata)
+            self._pending_evict_counters.append(evict_counter)
+
+            # Step 3: For write_back, wait for evict to complete before submitting swap-in
+            if self._should_wait_for_swap_out():
+                self._wait_for_pending_evict_counters()
+
+        # Step 4: Submit swap-in task if provided
+        # Returns LayerDoneCounter for tracking layer completion
+        if swap_in_metadata is not None:
+            self._layer_done_counter = self.load_host_to_device(swap_in_metadata)
+            return self._layer_done_counter
+
+        return None
+
+    def _wait_for_pending_evict_counters(self) -> None:
+        """
+        Wait for all pending evict counters to complete.
+
+        This is called before submitting new evict tasks to ensure proper ordering.
+        Uses LayerDoneCounter.wait_all() for efficient waiting.
+        """
+        if not self._pending_evict_counters:
+            return
+
+        evict_wait_start = time.time()
+        evict_length = len(self._pending_evict_counters)
+
+        for counter in self._pending_evict_counters:
+            counter.wait_all()
+
+        self._pending_evict_counters.clear()
+        evict_wait_ms = (time.time() - evict_wait_start) * 1000
+        if evict_wait_ms > 0.1:
+            logger.info(f"cache evict wait time: {evict_wait_ms:.2f}ms, {evict_length} pending evictions")
+
+    # ============ Properties ============
+
+    @property
+    def transfer_manager(self) -> CacheTransferManager:
+        """Get the transfer manager."""
+        return self._transfer_manager
+
+    @property
+    def swap_layer_done_counter(self) -> Optional["LayerDoneCounter"]:
+        """Get the layer done counter for layer swap."""
+        return self._layer_done_counter
+
+    # ============ Helper Methods ============
+
+    def _get_kv_cache_quant_type(self) -> Optional[str]:
+        """Get KV cache quantization type."""
+        if (
+            self.quant_config
+            and hasattr(self.quant_config, "kv_cache_quant_type")
+            and self.quant_config.kv_cache_quant_type is not None
+        ):
+            return self.quant_config.kv_cache_quant_type
+        return None
+
+    def _is_fp8_quantization(self, quant_type: Optional[str] = None) -> bool:
+        """Check if using fp8 quantization."""
+        if quant_type is None:
+            quant_type = self._get_kv_cache_quant_type()
+        return quant_type == "block_wise_fp8"
+
+    def _get_cache_names(self, layer_idx: int) -> Dict[str, str]:
+        """
+        Generate cache names for a layer.
+
+        Args:
+            layer_idx: Layer index.
+
+        Returns:
+            Dictionary with cache names: {
+                "key": "key_caches_{layer}_rank{rank}.device{device}",
+                "value": "value_caches_{layer}_rank{rank}.device{device}",
+                "key_scale": "key_cache_scales_{layer}_rank{rank}.device{device}",
+                "value_scale": "value_cache_scales_{layer}_rank{rank}.device{device}",
+            }
+        """
+        local_rank = self._local_rank % self.parallel_config.tensor_parallel_size
+
+        return {
+            "key": f"key_caches_{layer_idx}_rank{local_rank}.device{self._device_id}",
+            "value": f"value_caches_{layer_idx}_rank{local_rank}.device{self._device_id}",
+            "key_scale": f"key_cache_scales_{layer_idx}_rank{local_rank}.device{self._device_id}",
+            "value_scale": f"value_cache_scales_{layer_idx}_rank{local_rank}.device{self._device_id}",
+        }
+
+    # ============ KV Cache Management ============
+
+    def get_kv_caches(self) -> Optional[Dict[str, Any]]:
+        """
+        Get the current KV Cache tensor dictionary.
+
+        Returns:
+            KV Cache tensor dictionary, None if not initialized.
+        """
+        with self._lock:
+            return self.cache_kvs_map
+
+    def initialize_kv_cache(
+        self,
+        attn_backend: Any,
+        num_gpu_blocks: int,
+    ) -> List[Any]:
+        """
+        Initialize KV Cache tensors.
+
+        Create KV Cache tensors on GPU for storing attention Key and Value.
+
+        Args:
+            attn_backend: Attention backend instance for getting kv cache shape.
+            num_gpu_blocks: Maximum number of blocks on GPU.
+
+        Returns:
+            cache_kvs_list: KV Cache tensor list in [key_cache_layer0, value_cache_layer0, ...] order.
+        """
+        # Get kv cache quantization type
+        kv_cache_quant_type = self._get_kv_cache_quant_type()
+
+        # Get kv cache shape
+        key_cache_shape, value_cache_shape = attn_backend.get_kv_cache_shape(
+            max_num_blocks=num_gpu_blocks, kv_cache_quant_type=kv_cache_quant_type
+        )
+
+        # Get scale shape for block_wise_fp8 quantization
+        kv_cache_scale_shape = None
+        if self._is_fp8_quantization(kv_cache_quant_type):
+            kv_cache_scale_shape = [key_cache_shape[0], key_cache_shape[1], key_cache_shape[2]]
+
+        logger.info(f"Initializing kv cache for all layers. num_layers={self._num_layers}")
+        cache_kvs_list = []
+
+        for i in range(self._num_layers):
+            # Generate cache names
+            cache_names = self._get_cache_names(i)
+
+            logger.info(f"..creating kv cache for layer {i}: key:{key_cache_shape}, value:{value_cache_shape}")
+
+            # Create key cache and value cache
+            key_cache = paddle.full(shape=key_cache_shape, fill_value=0, dtype=self.model_config.dtype)
+            self.cache_kvs_map[cache_names["key"]] = key_cache
+
+            val_cache = paddle.full(shape=value_cache_shape, fill_value=0, dtype=self.model_config.dtype)
+            self.cache_kvs_map[cache_names["value"]] = val_cache
+            cache_kvs_list.extend([key_cache, val_cache])
+
+            # Create scale caches for block_wise_fp8 quantization
+            if self._is_fp8_quantization(kv_cache_quant_type) and kv_cache_scale_shape:
+                key_cache_scales = paddle.full(
+                    shape=kv_cache_scale_shape, fill_value=0, dtype=paddle.get_default_dtype()
+                )
+                val_cache_scales = paddle.full(
+                    shape=kv_cache_scale_shape, fill_value=0, dtype=paddle.get_default_dtype()
+                )
+                self.cache_kvs_map[cache_names["key_scale"]] = key_cache_scales
+                self.cache_kvs_map[cache_names["value_scale"]] = val_cache_scales
+                cache_kvs_list.extend([key_cache_scales, val_cache_scales])
+
+        paddle.device.cuda.empty_cache()
+        logger.info("kv cache is initialized!")
+
+        # Share cache_kvs_map with transfer manager for data transfer operations
+        self._transfer_manager.set_cache_kvs_map(self.cache_kvs_map)
+
+        # Initialize host cache
+        self.initialize_host_cache(attn_backend)
+
+        return cache_kvs_list
+
+    def initialize_mtp_kv_cache(
+        self,
+        attn_backend: Any,
+        num_gpu_blocks: int,
+        num_mtp_layers: int,
+        layer_offset: int,
+    ) -> List[Any]:
+        """
+        Initialize MTP (speculative decode) KV Cache tensors.
+
+        MTP cache layers use indices [layer_offset, layer_offset + num_mtp_layers),
+        so they share the same cache_kvs_map namespace as the main model cache but
+        with non-overlapping layer indices.  All subsequent transfer operations
+        via CacheController automatically cover MTP layers as well because they
+        live in the same cache_kvs_map.
+
+        Args:
+            attn_backend: MTP attention backend instance (proposer.attn_backends[0]).
+            num_gpu_blocks: Number of GPU blocks for MTP (already expanded by ratio).
+            num_mtp_layers: Number of MTP model layers (proposer.model_config.num_hidden_layers).
+            layer_offset: Starting layer index, equals main model num_hidden_layers.
+
+        Returns:
+            cache_kvs_list: KV Cache tensor list in [key_layer0, val_layer0, ...] order.
+        """
+        kv_cache_quant_type = self._get_kv_cache_quant_type()
+
+        key_cache_shape, value_cache_shape = attn_backend.get_kv_cache_shape(
+            max_num_blocks=num_gpu_blocks, kv_cache_quant_type=kv_cache_quant_type
+        )
+
+        kv_cache_scale_shape = None
+        if self._is_fp8_quantization(kv_cache_quant_type):
+            kv_cache_scale_shape = [key_cache_shape[0], key_cache_shape[1], key_cache_shape[2]]
+
+        logger.info(
+            f"[CacheController] Initializing MTP kv cache for {num_mtp_layers} layers "
+            f"(layer_offset={layer_offset}, num_gpu_blocks={num_gpu_blocks})."
+        )
+        cache_kvs_list = []
+
+        for i in range(layer_offset, layer_offset + num_mtp_layers):
+            cache_names = self._get_cache_names(i)
+
+            key_cache = paddle.full(shape=key_cache_shape, fill_value=0, dtype=self.model_config.dtype)
+            self.cache_kvs_map[cache_names["key"]] = key_cache
+
+            val_cache = paddle.full(shape=value_cache_shape, fill_value=0, dtype=self.model_config.dtype)
+            self.cache_kvs_map[cache_names["value"]] = val_cache
+            cache_kvs_list.extend([key_cache, val_cache])
+
+            if self._is_fp8_quantization(kv_cache_quant_type) and kv_cache_scale_shape:
+                key_cache_scales = paddle.full(
+                    shape=kv_cache_scale_shape, fill_value=0, dtype=paddle.get_default_dtype()
+                )
+                val_cache_scales = paddle.full(
+                    shape=kv_cache_scale_shape, fill_value=0, dtype=paddle.get_default_dtype()
+                )
+                self.cache_kvs_map[cache_names["key_scale"]] = key_cache_scales
+                self.cache_kvs_map[cache_names["value_scale"]] = val_cache_scales
+                cache_kvs_list.extend([key_cache_scales, val_cache_scales])
+
+        paddle.device.cuda.empty_cache()
+        logger.info("[CacheController] MTP kv cache initialized!")
+
+        # Refresh transfer manager so it sees the full map (main + MTP layers)
+        self._transfer_manager.set_cache_kvs_map(self.cache_kvs_map)
+
+        return cache_kvs_list
+
+    def _get_numa_node_for_gpu(self, device_id: int) -> int:
+        """
+        Get the NUMA node closest to the specified GPU device.
+
+        Tries multiple methods in order:
+        1. nvidia-smi topo -C -i <gpu_id> (fastest and most reliable)
+        2. /sys/class/nvidia-gpu/ (direct sysfs)
+        3. /sys/bus/pci/devices/ (fallback)
+
+        Args:
+            device_id: CUDA device ID.
+
+        Returns:
+            NUMA node index, or -1 if cannot be determined.
+        """
+        try:
+            # Method 1: Use nvidia-smi topo -C -i (fastest, SGLang-style)
+            # This directly outputs the NUMA ID for the specific GPU
+            try:
+                import subprocess
+
+                result = subprocess.run(
+                    ["nvidia-smi", "topo", "-C", "-i", str(device_id)], capture_output=True, text=True, timeout=5
+                )
+                if result.returncode == 0:
+                    output_line = result.stdout.strip()
+                    prefix = "NUMA IDs of closest CPU:"
+                    if output_line.startswith(prefix):
+                        numa_str = output_line[len(prefix) :].strip()
+                        # Handle comma-separated or range values (e.g., "0" or "0,1" or "0-1")
+                        if numa_str:
+                            # Take the first NUMA node if multiple are listed
+                            first_numa = numa_str.split(",")[0].split("-")[0].strip()
+                            if first_numa.isdigit():
+                                return int(first_numa)
+            except (subprocess.TimeoutExpired, FileNotFoundError, Exception) as e:
+                logger.debug(f"[CacheController] nvidia-smi topo -C method failed: {e}")
+
+            # Method 2: Try to read from /sys filesystem
+            sys_path = f"/sys/class/nvidia-gpu/nvidia{device_id}/device/numa_node"
+            if os.path.exists(sys_path):
+                with open(sys_path, "r") as f:
+                    return int(f.read().strip())
+
+            # Method 3: Fallback - check all NVIDIA PCI devices
+            import glob
+
+            numa_paths = glob.glob("/sys/bus/pci/devices/*/numa_node")
+            for path in numa_paths:
+                vendor_path = path.replace("numa_node", "vendor")
+                if os.path.exists(vendor_path):
+                    with open(vendor_path, "r") as f:
+                        vendor = f.read().strip()
+                        if vendor == "0x10de":  # NVIDIA vendor ID
+                            with open(path, "r") as f:
+                                return int(f.read().strip())
+
+            return -1
+        except Exception as e:
+            logger.debug(f"[CacheController] Failed to get NUMA node for GPU {device_id}: {e}")
+            return -1
+
+    def _bind_to_closest_numa_node(self) -> bool:
+        """
+        Bind current thread and memory allocation to the NUMA node closest to the GPU.
+
+        This should be called before allocating host memory to ensure the memory
+        is allocated on the NUMA node local to the GPU, reducing cross-NUMA access
+        latency during H2D transfers.
+
+        Returns:
+            True if binding was successful, False otherwise.
+        """
+        if self._numa_bound:
+            return True
+
+        try:
+            # Load libnuma
+            try:
+                libnuma = ctypes.CDLL("libnuma.so.1")
+            except OSError:
+                try:
+                    libnuma = ctypes.CDLL("libnuma.so")
+                except OSError:
+                    logger.warning("[CacheController] libnuma not found, NUMA binding skipped")
+                    return False
+
+            # Check if NUMA is available
+            if libnuma.numa_available() < 0:
+                logger.warning("[CacheController] NUMA is not available on this system")
+                return False
+
+            # Get NUMA node for current GPU
+            numa_node = self._get_numa_node_for_gpu(self._device_id)
+
+            if numa_node < 0:
+                logger.warning(f"[CacheController] Could not determine NUMA node for GPU {self._device_id}")
+                return False
+
+            # Bind current thread to specific NUMA node
+            # numa_run_on_node binds the current thread to run on the specified node
+            result = libnuma.numa_run_on_node(numa_node)
+            if result < 0:
+                logger.warning(f"[CacheController] numa_run_on_node({numa_node}) failed")
+                return False
+
+            # Set memory allocation preference to the specified NUMA node
+            # This affects subsequent memory allocations (including cudaHostAlloc)
+            libnuma.numa_set_preferred(numa_node)
+
+            self._numa_bound = True
+            logger.info(
+                f"[CacheController] NUMA binding successful: " f"GPU {self._device_id} bound to NUMA node {numa_node}"
+            )
+            return True
+
+        except Exception as e:
+            logger.warning(f"[CacheController] NUMA binding failed: {e}")
+            return False
+
+    def initialize_host_cache(
+        self,
+        attn_backend: Any,
+    ) -> Dict[str, Any]:
+        """
+        Initialize Host (Pinned Memory) KV Cache.
+
+        Use cuda_host_alloc to allocate pinned memory for fast Host-Device data transfer.
+        Called during initialization to create Host-side swap space.
+
+        Args:
+            attn_backend: Attention backend instance for getting kv cache shape.
+
+        Returns:
+            host_cache_kvs_map: Host KV Cache pointer dictionary, indexed by name.
+        """
+        num_host_blocks = self.cache_config.num_cpu_blocks
+        if num_host_blocks == 0:
+            logger.info("[CacheController] No swap space (Host cache) specified, skipping initialization.")
+            return
+
+        if len(self.host_cache_kvs_map) > 0:
+            return
+
+        # Step 0: Bind to closest NUMA node before allocating host memory
+        # This ensures subsequent cuda_host_alloc allocations are on the local NUMA node
+        if not self._numa_bound:
+            self._bind_to_closest_numa_node()
+
+        # Get kv cache quantization type
+        kv_cache_quant_type = self._get_kv_cache_quant_type()
+
+        # Get kv cache shape (pass num_host_blocks as max_num_blocks for host cache)
+        key_cache_shape, value_cache_shape = attn_backend.get_kv_cache_shape(
+            max_num_blocks=num_host_blocks, kv_cache_quant_type=kv_cache_quant_type
+        )
+
+        # Calculate cache sizes (elements per block per layer)
+        key_cache_size = key_cache_shape[1] * key_cache_shape[2] * key_cache_shape[3]
+        if value_cache_shape:
+            value_cache_size = value_cache_shape[1] * value_cache_shape[2] * value_cache_shape[3]
+        else:
+            value_cache_size = 0
+
+        # Get cache dtype and bytes per element
+        cache_dtype = self.cache_config.cache_dtype
+        cache_item_bytes = self.cache_config.get_cache_bytes(cache_dtype)
+
+        # Calculate total bytes to allocate
+        key_need_to_allocate_bytes = num_host_blocks * cache_item_bytes * key_cache_size
+        value_need_to_allocate_bytes = num_host_blocks * cache_item_bytes * value_cache_size
+
+        # Calculate scale sizes for block_wise_fp8 quantization
+        scales_key_need_to_allocate_bytes = 0
+        scales_value_need_to_allocate_bytes = 0
+        cache_scale_shape = None
+        if self._is_fp8_quantization(kv_cache_quant_type):
+            cache_scales_size = key_cache_shape[1] * key_cache_shape[2]
+            # Scale tensor uses default dtype (float32)
+            scale_bytes = 4  # float32
+            scales_key_need_to_allocate_bytes = num_host_blocks * scale_bytes * cache_scales_size
+            scales_value_need_to_allocate_bytes = num_host_blocks * scale_bytes * cache_scales_size
+            cache_scale_shape = [num_host_blocks, key_cache_shape[1], key_cache_shape[2]]
+
+        num_layers = self._num_layers + self.config.speculative_config.num_extra_cache_layer
+
+        per_layer_size_gb = (key_need_to_allocate_bytes + value_need_to_allocate_bytes) / (1024**3)
+        actual_alloc_gb = per_layer_size_gb * num_layers
+        logger.info(
+            f"[CacheController] Host swap space allocated: {actual_alloc_gb:.2f}GB "
+            f"({per_layer_size_gb:.2f}GB per layer x {num_layers} layers), "
+            f"num_host_blocks: {num_host_blocks}"
+        )
+
+        logger.info(f"[CacheController] Initializing swap space (Host cache) for {num_layers} layers.")
+
+        # Allocate Host cache for each layer
+        for i in range(num_layers):
+            # Generate cache names
+            cache_names = self._get_cache_names(i)
+
+            logger.info(
+                f"[CacheController] Creating Host cache for layer {i}: "
+                f"key={(key_need_to_allocate_bytes / 1024 ** 3):.2f}GB, "
+                f"value={(value_need_to_allocate_bytes / 1024 ** 3):.2f}GB"
+            )
+
+            # Allocate key cache using cuda_host_alloc (pinned memory)
+            self.host_cache_kvs_map[cache_names["key"]] = cuda_host_alloc(key_need_to_allocate_bytes)
+
+            # Allocate scale cache for block_wise_fp8 quantization
+            if self._is_fp8_quantization(kv_cache_quant_type):
+                self.host_cache_kvs_map[cache_names["key_scale"]] = cuda_host_alloc(scales_key_need_to_allocate_bytes)
+
+            # Allocate value cache if needed
+            if value_need_to_allocate_bytes > 0:
+                self.host_cache_kvs_map[cache_names["value"]] = cuda_host_alloc(value_need_to_allocate_bytes)
+                if self._is_fp8_quantization(kv_cache_quant_type):
+                    self.host_cache_kvs_map[cache_names["value_scale"]] = cuda_host_alloc(
+                        scales_value_need_to_allocate_bytes
+                    )
+
+        logger.info(f"[CacheController] Swap space (Host cache) is ready for {num_layers} layers!")
+
+        # Store shapes for later use
+        self._host_key_cache_shape = [num_host_blocks] + list(key_cache_shape[1:])
+        self._host_value_cache_shape = [num_host_blocks] + list(value_cache_shape[1:]) if value_cache_shape else None
+        self._host_cache_scale_shape = cache_scale_shape
+        self._num_host_blocks = num_host_blocks
+
+        # Share host_cache_kvs_map with transfer manager
+        self._transfer_manager.set_host_cache_kvs_map(self.host_cache_kvs_map)
+
+    def get_host_cache_kvs_map(self) -> Dict[str, Any]:
+        """
+        Get the Host KV Cache pointer dictionary.
+
+        Returns:
+            Host KV Cache pointer dictionary, empty dict if not initialized.
+        """
+        return self.host_cache_kvs_map
+
+    # ============ Worker Methods ============
+
+    def _submit_swap_task(
+        self,
+        meta: CacheSwapMetadata,
+        src_location: CacheLevel,
+        dst_location: CacheLevel,
+        transfer_fn_all: callable,
+        transfer_fn_layer: callable,
+        force_all_layers: bool = False,
+    ) -> LayerDoneCounter:
+        """
+        Submit a single swap transfer task (internal method).
+
+        Creates a LayerDoneCounter for tracking layer completion.
+        The counter is returned to the caller for later waiting.
+
+        H2D (load) always uses layer-by-layer mode for compute-transfer overlap.
+        D2H (evict) always uses all-layers mode via _output_stream (fire-and-forget).
+
+        Args:
+            meta: CacheSwapMetadata containing src_block_ids and dst_block_ids.
+            src_location: Source cache level (CacheLevel.HOST or CacheLevel.DEVICE).
+            dst_location: Destination cache level (CacheLevel.DEVICE or CacheLevel.HOST).
+            transfer_fn_all: All-layer transfer function, signature (src_ids, dst_ids) -> bool.
+            transfer_fn_layer: Layer-by-layer transfer function, signature (layer_indices, on_layer_complete, src_ids, dst_ids) -> bool.
+            force_all_layers: If True, always use all-layers mode (used for D2H evict).
+
+        Returns:
+            LayerDoneCounter instance for tracking layer completion.
+        """
+        # Create LayerDoneCounter for this transfer (independent sync primitive)
+        layer_counter = LayerDoneCounter(self._num_layers)
+
+        src_block_ids = meta.src_block_ids
+        dst_block_ids = meta.dst_block_ids
+
+        if not src_block_ids or not dst_block_ids:
+            logger.info(f"[SwapTask] skip: empty block_ids src={src_block_ids}, dst={dst_block_ids}")
+            meta.success = False
+            meta.error_message = "Empty block IDs in CacheSwapMetadata"
+            return layer_counter
+
+        layers_to_transfer = list(range(self._num_layers))
+
+        def _on_layer_complete(layer_idx: int) -> None:
+            """Callback called after each layer's H2D kernel is submitted to input_stream.
+
+            Records a CUDA event on input_stream so that wait_for_layer() can
+            synchronize on the actual transfer stream (cross-stream dependency).
+            """
+            # Record event on _input_stream so wait_for_layer() waits for the real H2D transfer.
+            # Must use input_stream (not Paddle default stream) to capture the correct dependency.
+            stream_event = self._transfer_manager.record_input_stream_event()
+            if stream_event is not None:
+                layer_counter.set_layer_event(layer_idx, stream_event)
+
+            # Mark layer done (adds to _completed_layers, unblocks polling fallback)
+            layer_counter.mark_layer_done(layer_idx)
+
+        def _do_transfer():
+            try:
+                start_time = time.time()
+                if force_all_layers:
+                    success = transfer_fn_all(src_block_ids, dst_block_ids)
+                    elapsed = time.time() - start_time
+                    if success:
+                        # For H2D transfers: record event on _input_stream so that
+                        # wait_all() synchronizes on the actual transfer stream, not
+                        # Paddle's default stream. set_layer_event must be called
+                        # before mark_all_done() so wait_all()'s loop finds the event.
+                        if dst_location == CacheLevel.DEVICE:
+                            stream_event = self._transfer_manager.record_input_stream_event()
+                            if stream_event is not None:
+                                layer_counter.set_layer_event(self._num_layers - 1, stream_event)
+
+                        # Mark all layers done at once
+                        layer_counter.mark_all_done()
+
+                    result = TransferResult(
+                        src_block_ids=src_block_ids,
+                        dst_block_ids=dst_block_ids,
+                        src_type=src_location,
+                        dst_type=dst_location,
+                        success=success,
+                        error_message=(
+                            None if success else f"All-layer {src_location.value}→{dst_location.value} transfer failed"
+                        ),
+                    )
+                    logger.debug(
+                        f"[SwapTask] all_layers {src_location.value}->{dst_location.value} "
+                        f"{'success' if success else 'FAILED'} "
+                        f"src={src_block_ids} dst={dst_block_ids} elapsed={elapsed*1000:.3f}ms"
+                    )
+                else:
+                    success = transfer_fn_layer(
+                        layers_to_transfer,
+                        _on_layer_complete,
+                        src_block_ids,
+                        dst_block_ids,
+                    )
+                    elapsed = time.time() - start_time
+                    result = TransferResult(
+                        src_block_ids=src_block_ids,
+                        dst_block_ids=dst_block_ids,
+                        src_type=src_location,
+                        dst_type=dst_location,
+                        success=success,
+                        error_message=(
+                            None
+                            if success
+                            else f"Layer-by-layer {src_location.value}→{dst_location.value} transfer failed"
+                        ),
+                    )
+                    logger.debug(
+                        f"[SwapTask] layer_by_layer {src_location.value}->{dst_location.value} "
+                        f"{'success' if success else 'FAILED'} "
+                        f"src={src_block_ids} dst={dst_block_ids} elapsed={elapsed*1000:.3f}ms"
+                    )
+
+                # Update metadata with result
+                meta.success = result.success
+                meta.error_message = result.error_message
+
+            except Exception as e:
+                import traceback
+
+                traceback.print_exc()
+                logger.error(
+                    f"[SwapTask] {src_location.value}->{dst_location.value} "
+                    f"EXCEPTION: {e}\n{traceback.format_exc()}"
+                )
+                meta.success = False
+                meta.error_message = str(e)
+            finally:
+                # Cleanup CUDA events when transfer is complete
+                layer_counter.cleanup()
+
+        self._executor.submit(_do_transfer)
+        return layer_counter
+
+    def load_host_to_device(
+        self,
+        swap_metadata: CacheSwapMetadata,
+    ) -> LayerDoneCounter:
+        """
+        Load host cache to device (async).
+
+        Creates an async transfer task and returns LayerDoneCounter
+        for tracking layer completion.
+
+        Args:
+            swap_metadata: CacheSwapMetadata containing:
+                - src_block_ids: Source host block IDs
+                - dst_block_ids: Destination device block IDs
+
+        Returns:
+            LayerDoneCounter for tracking layer completion.
+        """
+        layer_counter = self._submit_swap_task(
+            meta=swap_metadata,
+            src_location=CacheLevel.HOST,
+            dst_location=CacheLevel.DEVICE,
+            transfer_fn_all=None,
+            transfer_fn_layer=lambda layer_indices, on_layer_complete, src_ids, dst_ids: self._transfer_manager.load_layers_to_device_async(
+                layer_indices=layer_indices,
+                host_block_ids=src_ids,
+                device_block_ids=dst_ids,
+                on_layer_complete=on_layer_complete,
+            ),
+        )
+        return layer_counter
+
+    def evict_device_to_host(
+        self,
+        swap_metadata: CacheSwapMetadata,
+    ) -> LayerDoneCounter:
+        """
+        Evict device cache to host (async).
+
+        Creates an async transfer task and returns LayerDoneCounter
+        for tracking layer completion.
+
+        Args:
+            swap_metadata: CacheSwapMetadata containing:
+                - src_block_ids: Source device block IDs
+                - dst_block_ids: Destination host block IDs
+
+        Returns:
+            LayerDoneCounter for tracking layer completion.
+        """
+        layer_counter = self._submit_swap_task(
+            meta=swap_metadata,
+            src_location=CacheLevel.DEVICE,
+            dst_location=CacheLevel.HOST,
+            transfer_fn_all=lambda src_ids, dst_ids: self._transfer_manager.evict_to_host_async(src_ids, dst_ids),
+            transfer_fn_layer=None,
+            force_all_layers=True,  # Eviction always uses output_stream for all-layers async transfer
+        )
+        return layer_counter
+
+    def prefetch_from_storage(
+        self,
+        metadata: StorageMetadata,
+    ) -> AsyncTaskHandler:
+        """
+        Prefetch storage cache to host (async).
+
+        When Scheduler matches cache in storage, Worker uses this method
+        to pull data from storage to host.
+
+        Args:
+            metadata: Storage transfer metadata, containing:
+                - hash_values: Hash values to fetch
+                - block_ids: Destination host block IDs (pre-allocated by Scheduler)
+                - Other storage-specific parameters
+
+        Returns:
+            AsyncTaskHandler for tracking the async transfer task.
+        """
+
+        handler = AsyncTaskHandler()
+
+        # TODO: Implement storage prefetch logic
+        handler.set_error("Storage prefetch not implemented yet")
+
+        return handler
+
+    def backup_device_to_storage(
+        self,
+        device_block_ids: List[int],
+        metadata: StorageMetadata,
+    ) -> AsyncTaskHandler:
+        """
+        Backup device cache to storage (async).
+
+        Backup KV cache from device memory to external storage
+        for reuse by subsequent requests.
+
+        Args:
+            device_block_ids: Device block IDs to backup.
+            metadata: Storage transfer metadata.
+
+        Returns:
+            AsyncTaskHandler for tracking the async transfer task.
+        """
+
+        handler = AsyncTaskHandler()
+
+        # TODO: Implement storage backup logic
+        handler.set_error("Storage backup not implemented yet")
+
+        return handler
+
+    def backup_host_to_storage(
+        self,
+        host_block_ids: List[int],
+        metadata: StorageMetadata,
+    ) -> AsyncTaskHandler:
+        """
+        Backup host cache to storage (async).
+
+        Backup KV cache from host memory to external storage.
+
+        Args:
+            host_block_ids: Host block IDs to backup.
+            metadata: Storage transfer metadata.
+
+        Returns:
+            AsyncTaskHandler for tracking the async transfer task.
+        """
+
+        handler = AsyncTaskHandler()
+
+        # TODO: Implement storage backup logic
+        handler.set_error("Storage backup not implemented yet")
+
+        return handler
+
+    def send_to_node(
+        self,
+        metadata: PDTransferMetadata,
+    ) -> AsyncTaskHandler:
+        """
+        Send cache to another node (PD separation, async).
+
+        In PD separation architecture, P node uses this method
+        to send KV cache to D node.
+
+        Args:
+            metadata: PD transfer metadata, containing:
+                - target_node_id: Target node identifier
+                - block_ids: Block IDs to transfer
+                - Other transfer-specific parameters
+
+        Returns:
+            AsyncTaskHandler for tracking the async transfer task.
+        """
+
+        handler = AsyncTaskHandler()
+
+        # TODO: Implement PD separation transfer logic
+        handler.set_error("PD transfer not implemented yet")
+
+        return handler
+
+    def wait_for_transfer_from_node(
+        self,
+        metadata: PDTransferMetadata,
+    ) -> AsyncTaskHandler:
+        """
+        Wait for cache transfer from another node (PD separation, async).
+
+        In PD separation architecture, D node uses this method
+        to wait for P node to send KV cache.
+
+        Args:
+            metadata: PD transfer metadata, containing:
+                - source_node_id: Source node identifier
+                - block_ids: Block IDs to receive
+                - Other transfer-specific parameters
+
+        Returns:
+            AsyncTaskHandler for tracking the async transfer task.
+        """
+
+        handler = AsyncTaskHandler()
+
+        # TODO: Implement PD separation transfer wait logic
+        handler.set_error("PD transfer not implemented yet")
+
+        return handler
+
+    # ============ Public Interface Implementation ============
+
+    def reset_cache(self) -> bool:
+        """
+        Reset cache state (clear content only, do NOT free storage).
+
+        This method only clears the transfer state:
+        - Clears pending evict counters
+
+        It does NOT free any storage (GPU memory, CPU pinned memory, or storage).
+        Use free_cache() to release storage resources.
+
+        Returns:
+            True if successful, False otherwise.
+        """
+        try:
+            with self._lock:
+                # Clear pending evict counters
+                self._pending_evict_counters.clear()
+            return True
+        except Exception:
+            return False
+
+    def free_cache(self) -> bool:
+        """
+        Free all cache storage (GPU memory + CPU pinned memory + storage).
+
+        This releases all underlying storage resources, not just clears content.
+        Use this when shutting down or wanting to fully release cache resources.
+
+        Returns:
+            True if successful, False otherwise.
+        """
+        try:
+            # First reset transfer state
+            self.reset_cache()
+
+            # Free GPU cache
+            self.free_gpu_cache()
+
+            # Free CPU cache (pinned memory)
+            self._free_host_cache()
+
+            # Clear storage
+            self._clear_storage()
+
+            return True
+        except Exception:
+            return False
+
+    def free_gpu_cache(self) -> None:
+        """Free GPU cache tensors stored in cache_kvs_map."""
+        if not hasattr(self, "cache_kvs_map") or not self.cache_kvs_map:
+            return
+
+        logger.info(f"[CacheController] Freeing GPU cache memory, {len(self.cache_kvs_map)} tensors.")
+        self.cache_kvs_map.clear()
+        paddle.device.cuda.empty_cache()
+        logger.info("[CacheController] GPU cache memory released.")
+
+    def _clear_storage(self) -> None:
+        """Clear storage connector cache."""
+        storage_connector = getattr(self._transfer_manager, "_storage_connector", None)
+        if not storage_connector:
+            return
+
+        try:
+            if hasattr(storage_connector, "clear") and callable(storage_connector.clear):
+                count = storage_connector.clear()
+                logger.info(f"[CacheController] Cleared {count} entries from storage.")
+            elif hasattr(storage_connector, "disconnect") and callable(storage_connector.disconnect):
+                storage_connector.disconnect()
+                logger.info("[CacheController] Storage connector disconnected.")
+        except Exception as e:
+            logger.warning(f"[CacheController] Failed to clear storage: {e}")
+
+    # ============ Statistics Methods ============
+
+    def get_stats(self) -> Dict[str, Any]:
+        """Get controller statistics."""
+        with self._lock:
+            return {
+                "initialized": self._initialized,
+                "num_layers": self._num_layers,
+                "pending_evict_counters": len(self._pending_evict_counters),
+                "transfer_manager": self._transfer_manager.get_stats(),
+            }
+
+    def start(self) -> None:
+        """Start the transfer manager."""
+        self._transfer_manager.start()
+
+    def stop(self) -> None:
+        """Stop the transfer manager and shutdown thread pool."""
+        self._transfer_manager.stop()
+        # Shutdown thread pool executor
+        self._executor.shutdown(wait=False)
+
+    def __del__(self) -> None:
+        """Destructor to release pinned host memory."""
+        try:
+            self._free_host_cache()
+        except Exception:
+            pass
+
+    def _free_host_cache(self) -> None:
+        """Free pinned host memory allocated for swap space."""
+        if not hasattr(self, "host_cache_kvs_map"):
+            return
+
+        if not self.host_cache_kvs_map:
+            return
+
+        logger.info(f"[CacheController] Freeing host cache memory, {len(self.host_cache_kvs_map)} tensors.")
+        for name, ptr in list(self.host_cache_kvs_map.items()):
+            if ptr != 0:
+                try:
+                    cuda_host_free(ptr)
+                except Exception as e:
+                    logger.warning(f"[CacheController] Failed to free host cache {name}: {e}")
+        self.host_cache_kvs_map.clear()
+        logger.info("[CacheController] Host cache memory released.")
diff --git a/fastdeploy/cache_manager/v1/cache_manager.py b/fastdeploy/cache_manager/v1/cache_manager.py
new file mode 100644
index 00000000000..0a6c3b37b99
--- /dev/null
+++ b/fastdeploy/cache_manager/v1/cache_manager.py
@@ -0,0 +1,1060 @@
+"""
+# Copyright (c) 2025  PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+
+from __future__ import annotations
+
+import threading
+import traceback
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple
+
+from fastdeploy.utils import get_logger
+
+if TYPE_CHECKING:
+    from fastdeploy.engine.request import Request
+    from fastdeploy.config import FDConfig
+    from fastdeploy.cache_manager.v1.storage import StorageScheduler
+
+from .base import KVCacheBase
+from .block_pool import DeviceBlockPool, HostBlockPool
+from .metadata import BlockNode, CacheLevel, CacheStatus, CacheSwapMetadata, MatchResult
+from .radix_tree import RadixTree
+from .storage import create_storage_scheduler
+
+logger = get_logger("prefix_cache_manager", "cache_manager.log")
+
+
+class CacheManager(KVCacheBase):
+    """
+    Cache Manager for Scheduler process.
+
+    Inherits from KVCacheBase and uniquely owns DeviceBlockPool and HostBlockPool.
+    Responsible for block allocation/release, cache matching, and eviction decisions.
+
+    Three-level cache hierarchy:
+        Level 1: Device (GPU) - Fastest access, directly used for inference
+        Level 2: Host (CPU) - Medium speed, needs to be loaded to Device
+        Level 3: Storage - Slowest, needs to be fetched to Host first
+
+    Attributes:
+        device_pool: DeviceBlockPool instance.
+        host_pool: HostBlockPool instance.
+        radix_tree: RadixTree instance for prefix matching.
+    """
+
+    def __init__(
+        self,
+        config: "FDConfig",
+    ):
+        """
+        Initialize the Cache Manager.
+
+        Args:
+            config: FDConfig instance containing all fastdeploy configuration
+        """
+        super().__init__(config)
+
+        # Extract configuration from FDConfig
+        self.num_gpu_blocks = self.cache_config.total_block_num
+        self.num_cpu_blocks = self.cache_config.num_cpu_blocks
+        self.block_size = self.cache_config.block_size
+        self.enable_host_cache = self.num_cpu_blocks > 0
+        self.enable_prefix_caching = self.cache_config.enable_prefix_caching
+
+        # Write policy for backup (write_through, write_through_selective, write_back)
+        # Normalize write_policy: "write_through" is a special case of "write_through_selective" with threshold=1
+        self._write_policy = self.cache_config.write_policy
+        self._write_through_threshold = self.cache_config.write_through_threshold
+        if self._write_policy == "write_through":
+            self._write_through_threshold = 1
+            self._write_policy = "write_through_selective"
+
+        # Thread safety
+        self._lock = threading.RLock()
+
+        # Initialize block pools
+        self._device_pool = DeviceBlockPool(
+            num_blocks=self.num_gpu_blocks,
+            block_size=self.block_size,
+        )
+        self._host_pool = HostBlockPool(
+            num_blocks=self.num_cpu_blocks,
+            block_size=self.block_size,
+        )
+
+        # Initialize radix tree for prefix matching
+        self._radix_tree = None
+        if self.enable_prefix_caching:
+            self._radix_tree = RadixTree(
+                enable_host_cache=self.enable_host_cache,
+                write_policy=self._write_policy,
+            )
+
+        # Pending backup list: nodes waiting to be backed up, to be issued via request's cache_evict_metadata
+        self._pending_backup: List[Tuple[List[BlockNode], List[int]]] = []
+        self._pending_block_ids: List[int] = []
+
+        # Storage scheduler (create using factory method if backend is configured)
+        self._storage_scheduler = create_storage_scheduler(self.cache_config)
+
+        # Eviction tracking
+        self._eviction_in_progress = False
+
+        self._initialized = True
+
+        logger.info(
+            f"CacheManager initialized, num_gpu_blocks: {self.num_gpu_blocks}, "
+            f"num_cpu_blocks: {self.num_cpu_blocks}, block_size: {self.block_size}, "
+            f"enable_prefix_caching: {self.enable_prefix_caching}, "
+            f"enable_host_cache: {self.enable_host_cache}, "
+            f"write_policy: {self._write_policy}, "
+            f"write_through_threshold: {self._write_through_threshold}"
+        )
+
+    # ============ Properties ============
+
+    @property
+    def device_pool(self) -> DeviceBlockPool:
+        """Get the device block pool."""
+        return self._device_pool
+
+    @property
+    def host_pool(self) -> HostBlockPool:
+        """Get the host block pool."""
+        return self._host_pool
+
+    @property
+    def radix_tree(self) -> RadixTree:
+        """Get the radix tree."""
+        return self._radix_tree
+
+    @property
+    def num_free_device_blocks(self) -> int:
+        """Get number of free device blocks."""
+        return self._device_pool.available_blocks()
+
+    @property
+    def num_free_host_blocks(self) -> int:
+        """Get number of free host blocks."""
+        return self._host_pool.available_blocks()
+
+    @property
+    def storage_scheduler(self) -> Optional["StorageScheduler"]:
+        """Get the storage scheduler."""
+        return self._storage_scheduler
+
+    # ============ Block Allocation/Release Methods ============
+
+    def can_allocate_device_blocks(self, num: int) -> bool:
+        """
+        Check if current resources can allocate the specified number of device blocks.
+
+        Args:
+            num: Number of blocks to check
+
+        Returns:
+            True if allocation is possible, False otherwise
+        """
+        if self._device_pool.available_blocks() >= num:
+            return True
+
+        elif self.enable_prefix_caching:
+            stats = self._radix_tree.get_stats()
+            if self._device_pool.available_blocks() + stats.evictable_device_count >= num:
+                return True
+
+        return False
+
+    def can_allocate_host_blocks(self, num: int) -> bool:
+        """
+        Check if current resources can allocate the specified number of host blocks.
+
+        Args:
+            num: Number of blocks to check
+
+        Returns:
+            True if allocation is possible, False otherwise
+        """
+        if self._host_pool.available_blocks() >= num:
+            return True
+
+        elif self.enable_prefix_caching:
+            stats = self._radix_tree.get_stats()
+            if self._host_pool.available_blocks() + stats.evictable_host_count >= num:
+                return True
+
+        return False
+
+    def allocate_device_blocks(
+        self,
+        request: Request,
+        num_blocks: int,
+    ) -> Optional[List[int]]:
+        """
+        Allocate device blocks for a request.
+
+        This method handles:
+        1. Evicting device blocks if needed
+        2. Swapping host blocks to device if matched
+        3. Inserting new blocks into RadixTree
+
+        Args:
+            request: Request object containing match result and prompt hashes
+            num_blocks: Number of new device blocks to allocate
+
+        Returns:
+            List of allocated device block indices, or empty list if allocation failed
+        """
+        try:
+            with self._lock:
+                match_result = request.match_result
+
+                need_block_num = num_blocks
+
+                if not self.can_allocate_device_blocks(need_block_num):
+                    return []
+
+                if need_block_num > self._device_pool.available_blocks():
+                    evicted_result = self._evict_blocks(need_block_num - self._device_pool.available_blocks())
+                    if evicted_result is None:
+                        logger.error(f"evict_device_blocks failed, request_id: {request.request_id}")
+                        return []
+
+                    if self.enable_host_cache and self._write_policy == "write_back":
+                        evicted_blocks, host_block_ids = evicted_result
+                        if len(evicted_blocks) != len(host_block_ids):
+                            logger.error(
+                                f"evict_blocks to host failed, request_id: {request.request_id}, "
+                                f"evicted_blocks: {evicted_blocks}, host_block_ids: {host_block_ids}"
+                            )
+                            return []
+                        request.cache_evict_metadata.append(
+                            CacheSwapMetadata(
+                                src_block_ids=evicted_blocks,
+                                dst_block_ids=host_block_ids,
+                                src_type=CacheLevel.DEVICE,
+                                dst_type=CacheLevel.HOST,
+                            )
+                        )
+
+                allocated = self._device_pool.allocate(need_block_num)
+                if allocated is None:
+                    logger.error(
+                        f"allocate device blocks failed, request_id: {request.request_id}, need: {need_block_num}"
+                    )
+                    return []
+
+                if self.enable_host_cache and match_result.matched_host_nums > 0:
+                    device_blocks = allocated[: match_result.matched_host_nums]
+
+                    free_host_block_ids = self._radix_tree.swap_to_device(match_result.host_nodes, device_blocks)
+                    logger.debug(
+                        f"[allocate_device_blocks] request_id={request.request_id} "
+                        f"swap host->device: host_block_ids={free_host_block_ids} -> device_block_ids={device_blocks}"
+                    )
+
+                    request.cache_swap_metadata.append(
+                        CacheSwapMetadata(
+                            src_block_ids=free_host_block_ids,
+                            dst_block_ids=device_blocks,
+                            src_type=CacheLevel.HOST,
+                            dst_type=CacheLevel.DEVICE,
+                        )
+                    )
+
+                    if self._write_policy == "write_through_selective":
+                        self._radix_tree.backup_blocks(match_result.host_nodes, free_host_block_ids)
+                    else:
+                        self.free_host_blocks(free_host_block_ids)
+
+                    match_result.device_nodes.extend(match_result.host_nodes)
+                    match_result.host_nodes = []
+
+                if self.enable_prefix_caching:
+                    block_hashes = request.prompt_hashes[match_result.matched_device_nums :]
+                    all_device_blocks = request.block_tables + allocated
+                    uncached_device_blocks = all_device_blocks[match_result.matched_device_nums :]
+                    num_block_lens = min(len(uncached_device_blocks), len(block_hashes))
+
+                    if num_block_lens > 0:
+                        blocks = list(zip(block_hashes[:num_block_lens], uncached_device_blocks[:num_block_lens]))
+                        start_node = match_result.device_nodes[-1] if match_result.device_nodes else None
+
+                        device_nodes, wasted_block_ids = self._radix_tree.insert(blocks=blocks, start_node=start_node)
+                        match_result.device_nodes.extend(device_nodes)
+
+                        inserted_block_ids = [n.block_id for n in device_nodes]
+                        logger.debug(
+                            f"[allocate_device_blocks] request_id={request.request_id} "
+                            f"newly allocated={allocated} "
+                            f"inserted_into_path_block_ids={inserted_block_ids} "
+                            f"wasted_block_ids(not_in_path)={wasted_block_ids}"
+                        )
+
+                        # Release any blocks that were wasted due to node reuse
+                        # and update allocated with actual block_ids
+                        if wasted_block_ids:
+                            match_result.uncached_block_ids.extend(wasted_block_ids)
+
+                return allocated
+        except Exception as e:
+            logger.error(f"allocate_device_blocks error: {e}, {str(traceback.format_exc())}")
+            return []
+
+    def allocate_host_blocks(self, num: int) -> List[int]:
+        """
+        Allocate host blocks from the pool.
+
+        Args:
+            num: Number of blocks to allocate
+
+        Returns:
+            List of allocated block indices (may be fewer than requested or empty on error)
+        """
+        try:
+            if self._host_pool.available_blocks() < num:
+                evict_blocks = self._radix_tree.evict_host_nodes(num - self._host_pool.available_blocks())
+                if evict_blocks is not None:
+                    self._host_pool.release(evict_blocks)
+            return self._host_pool.allocate(num) or []
+        except Exception as e:
+            logger.error(f"allocate_host_blocks error: {e}, {str(traceback.format_exc())}")
+            return []
+
+    def free_device_blocks(self, block_ids: List[int]) -> None:
+        """
+        Free device blocks back to the pool.
+
+        Args:
+            block_ids: List of block indices to free
+        """
+        if not block_ids:
+            return
+
+        with self._lock:
+            self._device_pool.release(block_ids)
+
+    def free_host_blocks(self, block_ids: List[int]) -> None:
+        """
+        Free host blocks back to the pool.
+
+        Args:
+            block_ids: List of block indices to free
+        """
+        if not block_ids:
+            return
+        self._host_pool.release(block_ids)
+
+    def free_all_device_blocks(self) -> int:
+        """
+        Free all device blocks.
+
+        Returns:
+            Number of blocks freed
+        """
+        with self._lock:
+            freed = self._device_pool.used_blocks()
+            self._device_pool.reset()
+            return freed
+
+    def free_all_host_blocks(self) -> int:
+        """
+        Free all host blocks.
+
+        Returns:
+            Number of blocks freed
+        """
+        with self._lock:
+            freed = self._host_pool.used_blocks()
+            self._host_pool.reset()
+            return freed
+
+    def resize_device_pool(self, new_num_blocks: int) -> bool:
+        """
+        Resize the device block pool.
+
+        Supports both expansion and shrinking. Shrinking will fail if
+        there are more used blocks than the new size.
+
+        Args:
+            new_num_blocks: New total number of blocks for device pool
+
+        Returns:
+            True if resize was successful, False otherwise
+        """
+        logger.info(f"resize_device_pool: {self._device_pool.available_blocks()} -> {new_num_blocks}")
+        with self._lock:
+            if self._device_pool.resize(new_num_blocks):
+                self.num_gpu_blocks = new_num_blocks
+                return True
+            return False
+
+    # ============ Legacy Compatibility Methods ============
+    # These methods provide backward compatibility with PrefixCacheManager interface
+    # for resource_manager.py
+
+    def write_cache_to_storage(self, req: Any) -> None:
+        """
+        Write request cache to storage if storage is enabled.
+
+        Args:
+            req: The request object containing cache data to write
+        """
+        if self._storage_scheduler is None:
+            return
+        # TODO: Implement storage write logic when storage is enabled
+        pass
+
+    @property
+    def gpu_free_block_list(self) -> List[int]:
+        """
+        Get list of free GPU block indices (legacy alias).
+
+        Returns list of available device block IDs for compatibility
+        with PrefixCacheManager.gpu_free_block_list.
+        """
+        # Return list representation of available blocks
+        return list(range(self._device_pool.available_blocks()))
+
+    @property
+    def available_gpu_resource(self) -> float:
+        """
+        Get available GPU resource ratio (legacy alias).
+
+        Returns the ratio of free blocks to total blocks.
+        """
+        if self.num_gpu_blocks == 0:
+            return 0.0
+        return self._device_pool.available_blocks() / self.num_gpu_blocks
+
+    def allocate_gpu_blocks(self, request: Request, num_blocks: int) -> Optional[List[int]]:
+        """
+        Allocate GPU blocks (legacy alias for allocate_device_blocks).
+
+        Args:
+            request: Request object containing match result
+            num_blocks: Number of blocks to allocate
+
+        Returns:
+            List of allocated block indices, or None if allocation failed
+        """
+        return self.allocate_device_blocks(request, num_blocks)
+
+    def can_allocate_gpu_blocks(self, num_blocks: int) -> bool:
+        """
+        Check if GPU blocks can be allocated (legacy alias).
+
+        Args:
+            num_blocks: Number of blocks to check
+
+        Returns:
+            True if allocation is possible, False otherwise
+        """
+        return self.can_allocate_device_blocks(num_blocks)
+
+    def update_cache_config(self, new_cfg) -> None:
+        """
+        Update cache configuration.
+
+        Args:
+            new_cfg: New cache configuration object
+        """
+        self.cache_config = new_cfg
+        new_num_blocks = getattr(new_cfg, "total_block_num", None)
+        if new_num_blocks is not None:
+            self.resize_device_pool(new_num_blocks)
+
+    # ============ Three-Level Cache Matching ============
+
+    def match_prefix(
+        self,
+        request: Request,
+        skip_storage: bool = False,
+    ) -> None:
+        """
+        Execute three-level cache matching (Device -> Host -> Storage).
+
+        This is the main entry point for prefix matching during scheduling.
+        Only effective when prefix caching is enabled. The result is stored
+        in request._match_result.
+
+        Args:
+            request: Request object containing prompt hashes
+            skip_storage: If True, skip storage-level matching
+
+        Returns:
+            None. Match result is stored in request._match_result.
+        """
+        if not self.enable_prefix_caching or self._radix_tree is None:
+            return
+
+        with self._lock:
+            try:
+                result = MatchResult()
+                block_hashes = request.prompt_hashes
+
+                # Step 1: Match Device and Host cache via RadixTree
+                matched_nodes = self._radix_tree.find_prefix(block_hashes)
+
+                #   Split matched_nodes into device blocks and host blocks
+                if self.enable_host_cache:
+                    for node in matched_nodes:
+                        if node.is_on_device():
+                            result.device_nodes.append(node)
+                        elif node.is_on_host():
+                            result.host_nodes.append(node)
+                else:
+                    result.device_nodes = matched_nodes
+
+                #   Calculate remaining hashes to match
+                matched_count = result.matched_device_nums + result.matched_host_nums
+                remaining_hashes = block_hashes[matched_count:]
+
+                # Step 2: Match Storage (if enabled and not skipped)
+                if not skip_storage and self._storage_scheduler and remaining_hashes:
+                    storage_matches = self._match_storage(remaining_hashes)
+                    result.storage_nodes = self.prepare_prefetch_metadata(storage_matches)
+
+                # Step 3: Increment ref count for matched blocks(only first match node)
+                if not (self._storage_scheduler and skip_storage):
+                    self._radix_tree.increment_ref_nodes(matched_nodes)
+
+                matched_device_ids = [n.block_id for n in result.device_nodes]
+                matched_host_ids = [n.block_id for n in result.host_nodes]
+                logger.info(
+                    f"match_prefix for request_id: {request.request_id} total_hashes: {len(block_hashes)}, "
+                    f"total_matched: {result.total_matched_blocks} (device_blocks={result.matched_device_nums}, "
+                    f"host_blocks={result.matched_host_nums}, storage_hashes={result.matched_storage_nums})"
+                )
+                logger.debug(
+                    f"[match_prefix] request_id={request.request_id} "
+                    f"matched_device_block_ids={matched_device_ids} "
+                    f"matched_host_block_ids={matched_host_ids}"
+                )
+                request._match_result = result
+            except Exception as e:
+                logger.error(f"match_prefix error: {e}, {str(traceback.format_exc())}")
+
+    def _match_storage(self, hash_values: List[str]) -> List[str]:
+        """
+        Match hash values against storage.
+
+        Args:
+            hash_values: List of hash values to check
+
+        Returns:
+            List of hashes that exist in storage
+        """
+        if not self._storage_scheduler:
+            return []
+
+        try:
+            if not self._storage_scheduler.is_connected():
+                self._storage_scheduler.connect()
+
+            existence_map = self._storage_scheduler.query(hash_values)
+            return [h for h, exists in existence_map.items() if exists]
+        except Exception:
+            return []
+
+    # ============ Eviction Methods ============
+
+    def _evict_blocks(self, num_blocks: int) -> Optional[List[int]]:
+        """
+        Evict device blocks to free device memory.
+
+        In write_through_selective policy:
+        - Blocks with backup (backuped=True): Update metadata only, no actual data transfer needed
+        - Blocks without backup but hit_count >= threshold: Trigger emergency backup, then evict
+        - Blocks without backup and hit_count < threshold: Release directly
+
+        Eviction flow (for other policies):
+        1. Try to allocate host block ids for device->host eviction
+        2. If not enough host blocks, evict host nodes first to free host blocks
+        3. Evict device blocks to host using RadixTree.evict_device_to_host()
+        4. Free the evicted device blocks back to the pool
+
+        Args:
+            num_blocks: Number of device blocks to evict
+
+        Returns:
+            List of evicted device block ids, or None if eviction failed
+        """
+        if not self.enable_prefix_caching or self._radix_tree is None:
+            logger.warning("_evict_blocks: prefix caching not enabled")
+            return None
+
+        if num_blocks <= 0:
+            return [], []
+
+        try:
+            with self._lock:
+                host_block_ids = []
+
+                # Step 1: Check if we have enough evictable device blocks
+                stats = self._radix_tree.get_stats()
+                if stats.evictable_device_count < num_blocks:
+                    logger.warning(
+                        f"_evict_blocks: not enough evictable device blocks, "
+                        f"needed: {num_blocks}, available: {stats.evictable_device_count}"
+                    )
+                    return None
+
+                # Step 2: Handle eviction based on write policy
+                if self.enable_host_cache:
+                    if self._write_policy == "write_through_selective":
+                        # write_through_selective policy: optimize eviction based on backup status
+                        released_device_ids = self._radix_tree.evict_nodes_selective(num_blocks=num_blocks)
+                    elif self._write_policy == "write_back":
+                        # write_back policy:: allocate host blocks and evict to host
+                        host_block_ids = self.allocate_host_blocks(num_blocks)
+                        if host_block_ids is None or len(host_block_ids) < num_blocks:
+                            logger.warning("_evict_blocks: failed to allocate host blocks")
+                            return None
+
+                        released_device_ids = self._radix_tree.evict_device_to_host(
+                            num_blocks=num_blocks,
+                            host_block_ids=host_block_ids,
+                        )
+                else:
+                    # No host cache, evict device nodes directly
+                    released_device_ids = self._radix_tree.evict_device_nodes(num_blocks)
+
+                if released_device_ids is None:
+                    return None
+
+                # Step 3: Free the evicted device blocks
+                self._device_pool.release(released_device_ids)
+
+                logger.debug(
+                    f"[_evict_blocks] evicted_device_block_ids={released_device_ids} "
+                    f"host_block_ids={host_block_ids} "
+                    f"write_policy={self._write_policy} "
+                    f"free_device_after={self._device_pool.available_blocks()}"
+                )
+
+                return released_device_ids, host_block_ids
+        except Exception as e:
+            logger.error(f"_evict_blocks error: {e}, {str(traceback.format_exc())}")
+            return None
+
+    # ============ Request Lifecycle Methods ============
+
+    def request_finish(
+        self,
+        request: Request,
+    ) -> None:
+        """
+        Update cache state when a request finishes.
+
+        This method:
+        1. Inserts new blocks into the RadixTree (for caching)
+        2. Decrements reference counts for matched blocks
+        3. Releases blocks that cannot be cached:
+           - Blocks without hash (partial blocks)
+           - Blocks wasted due to node reuse
+
+        Note: Blocks successfully inserted into RadixTree are managed by
+        the tree and will be freed when evicted.
+
+        Only effective when prefix caching is enabled.
+
+        Args:
+            request: Request object containing match result and block tables
+        """
+        with self._lock:
+            try:
+                if self.enable_prefix_caching and self._radix_tree is not None:
+                    match_result = request.match_result
+
+                    block_hashes = request.prompt_hashes[match_result.matched_device_nums :]
+                    device_blocks = request.block_tables[match_result.matched_device_nums :]
+                    num_block_lens = min(len(device_blocks), len(block_hashes))
+
+                    if num_block_lens > 0:
+                        blocks = list(zip(block_hashes[:num_block_lens], device_blocks[:num_block_lens]))
+                        start_node = match_result.device_nodes[-1] if match_result.device_nodes else None
+
+                        device_nodes, wasted_block_ids = self._radix_tree.insert(blocks=blocks, start_node=start_node)
+                        match_result.device_nodes.extend(device_nodes)
+
+                        # Release blocks that were wasted due to node reuse
+                        if wasted_block_ids:
+                            match_result.uncached_block_ids.extend(wasted_block_ids)
+
+                    # Release uncached blocks
+                    uncached_blocks = match_result.uncached_block_ids
+                    uncached_blocks.extend(request.block_tables[match_result.matched_device_nums :])
+
+                    # Decrement ref count - blocks become evictable if ref_count reaches 0
+                    self._radix_tree.decrement_ref_nodes(match_result.device_nodes)
+                    self._device_pool.release(uncached_blocks)
+
+                    cached_block_ids = [n.block_id for n in match_result.device_nodes]
+                    logger.debug(
+                        f"[request_finish] request_id={request.request_id} "
+                        f"cached_block_ids(in_radix_tree)={cached_block_ids} "
+                        f"released_uncached_block_ids={uncached_blocks}"
+                    )
+                    logger.info(
+                        f"request {request.request_id} finished, cached blocks: {match_result.matched_device_nums}, "
+                        f"uncached blocks freed: {len(uncached_blocks)}, "
+                        f"total_free: {self._device_pool.available_blocks()}"
+                    )
+                else:
+                    self._device_pool.release(request.block_tables)
+
+                    logger.debug(
+                        f"[request_finish] request_id={request.request_id} "
+                        f"prefix_caching=disabled released_block_ids={request.block_tables}"
+                    )
+                    logger.info(
+                        f"request {request.request_id} finished, release blocks: {len(request.block_tables)}, "
+                        f"total_free: {self._device_pool.available_blocks()}"
+                    )
+            except Exception as e:
+                logger.error(f"request_finish error: {e}, {str(traceback.format_exc())}")
+
+    # ============ Write-through Selective Backup Methods ============
+
+    def get_pending_backup_count(self) -> int:
+        """
+        Get the number of pending backup tasks.
+
+        Returns:
+            Number of pending backup tasks in the queue.
+        """
+        return len(self._pending_backup)
+
+    def issue_pending_backup_to_batch_request(
+        self,
+    ) -> Optional[CacheSwapMetadata]:
+        """
+        Issue pending backup tasks and return a CacheSwapMetadata for BatchRequest.
+
+        This method is called during scheduling to prepare pending backup tasks
+        to be attached to a BatchRequest. The BatchRequest will pass this metadata
+        to the worker, which will execute the backup (Device->Host transfer).
+
+        Returns:
+            CacheSwapMetadata containing backup tasks, or None if no pending backup.
+        """
+        if not self._pending_backup:
+            return None
+
+        if not self.enable_host_cache or not self._radix_tree:
+            # No host cache, clear pending backup
+            self._pending_backup.clear()
+            return None
+
+        try:
+            with self._lock:
+                if not self._pending_backup:
+                    return None
+
+                all_device_block_ids = []
+                all_host_block_ids = []
+                freed_host_ids = []
+
+                for nodes, host_block_ids in self._pending_backup:
+                    # Filter out nodes that are no longer valid (already evicted, etc.)
+                    valid_nodes = []
+                    valid_host_ids = []
+
+                    for node, host_block_id in zip(nodes, host_block_ids):
+                        # Check if node is still in evictable_device and not already backed up
+                        if (
+                            node.node_id in self._radix_tree._evictable_device
+                            and not node.backuped
+                            and node.cache_status == CacheStatus.DEVICE
+                        ):
+                            valid_nodes.append(node)
+                            valid_host_ids.append(host_block_id)
+                        else:
+                            # Node no longer valid, release the allocated host block
+                            freed_host_ids.append(host_block_id)
+
+                    if valid_nodes:
+                        # Mark nodes as backed up
+                        self._radix_tree.backup_blocks(valid_nodes, valid_host_ids)
+
+                        # Collect device block IDs
+                        all_device_block_ids.extend([node.block_id for node in valid_nodes])
+                        all_host_block_ids.extend(valid_host_ids)
+
+                # Release invalid host block allocations
+                if freed_host_ids:
+                    self._host_pool.release(freed_host_ids)
+
+                # Clear pending backup
+                self._pending_backup.clear()
+                self._pending_block_ids.clear()
+
+                # Create and return CacheSwapMetadata
+                if all_device_block_ids:
+                    evict_metadata = CacheSwapMetadata(
+                        src_block_ids=all_device_block_ids,
+                        dst_block_ids=all_host_block_ids,
+                        src_type=CacheLevel.DEVICE,
+                        dst_type=CacheLevel.HOST,
+                    )
+                    return evict_metadata
+
+                return None
+
+        except Exception as e:
+            logger.error(f"issue_pending_backup_to_batch_request error: {e}, {str(traceback.format_exc())}")
+            # Clear pending backup on error to avoid infinite accumulation
+            self._pending_backup.clear()
+            self._pending_block_ids.clear()
+            return None
+
+    def check_and_add_pending_backup(
+        self,
+    ) -> None:
+        """
+        Check for nodes that meet backup criteria and add them to pending backup queue.
+
+        This method is called after request_finish to check if any nodes
+        in the radix tree meet the write_through_selective backup criteria.
+
+        For write_through_selective policy:
+        - Nodes with hit_count >= threshold that are not yet backed up
+        - are added to the pending backup queue
+
+        The pending backup will be issued to the next scheduled request.
+        """
+        if not self.enable_host_cache or not self._radix_tree:
+            return
+
+        if self._write_policy != "write_through_selective":
+            return
+
+        try:
+            with self._lock:
+                # Get candidates from radix tree
+                candidates = self._radix_tree.get_candidates_for_backup(
+                    self._write_through_threshold,
+                    self._pending_block_ids,
+                )
+
+                if not candidates:
+                    return
+
+                # Allocate host blocks for backup
+                host_block_ids = self.allocate_host_blocks(len(candidates))
+                if host_block_ids is None or len(host_block_ids) < len(candidates):
+                    logger.warning(
+                        f"check_and_add_pending_backup: failed to allocate host blocks, "
+                        f"needed={len(candidates)}, got={len(host_block_ids) if host_block_ids else 0}"
+                    )
+                    if host_block_ids:
+                        self._host_pool.release(host_block_ids)
+                    return
+
+                # Add to pending backup queue
+                self._pending_backup.append((candidates, host_block_ids))
+                self._pending_block_ids.extend([node.block_id for node in candidates])
+
+        except Exception as e:
+            logger.error(f"check_and_add_pending_backup error: {e}, {str(traceback.format_exc())}")
+
+    # ============ Host/Device Transfer Coordination ============
+
+    def offload_to_host(self, block_indices: List[int]) -> bool:
+        """
+        Offload blocks from device to host memory.
+
+        This is a coordination method. Actual data transfer happens in Worker.
+
+        Args:
+            block_indices: List of block indices to offload
+
+        Returns:
+            True if successful, False otherwise
+        """
+        try:
+            with self._lock:
+                # Allocate host blocks
+                host_indices = self._host_pool.allocate(len(block_indices))
+                if host_indices is None or len(host_indices) != len(block_indices):
+                    # Not enough host memory, release what we allocated
+                    if host_indices:
+                        self._host_pool.release(host_indices)
+                    return False
+
+                # Perform the offload (actual data transfer would happen in Worker)
+                for i, dev_idx in enumerate(block_indices):
+                    host_idx = host_indices[i]
+                    metadata = self._device_pool.get_metadata(dev_idx)
+                    if metadata:
+                        self._host_pool.set_metadata(host_idx, metadata)
+
+                # Release device blocks
+                self._device_pool.release(block_indices)
+
+                return True
+        except Exception as e:
+            logger.error(f"offload_to_host error: {e}, {str(traceback.format_exc())}")
+            return False
+
+    def load_from_host(self, block_indices: List[int]) -> bool:
+        """
+        Load blocks from host to device memory.
+
+        This is a coordination method. Actual data transfer happens in Worker.
+
+        Args:
+            block_indices: List of host block indices to load
+
+        Returns:
+            True if successful, False otherwise
+        """
+        try:
+            with self._lock:
+                # Allocate device blocks
+                dev_indices = self._device_pool.allocate(len(block_indices))
+                if dev_indices is None or len(dev_indices) != len(block_indices):
+                    if dev_indices:
+                        self._device_pool.release(dev_indices)
+                    return False
+
+                # Perform the load (actual data transfer would happen in Worker)
+
+                # Release host blocks
+                self._host_pool.release(block_indices)
+
+                return True
+        except Exception as e:
+            logger.error(f"load_from_host error: {e}, {str(traceback.format_exc())}")
+            return False
+
+    # ============ Prefetch Methods ============
+
+    def prepare_prefetch_metadata(
+        self,
+        storage_hashes: List[str],
+    ) -> Optional[List["BlockNode"]]:
+        """
+        Prepare metadata for storage prefetch operation.
+
+        Called when storage cache is matched, allocates host blocks
+        for the prefetch target.
+
+        Args:
+            storage_hashes: List of storage hash values to prefetch
+
+        Returns:
+            List of BlockNode objects if successful, None or empty list otherwise.
+            Each node's block_id contains the actual block assigned
+            (may differ from originally allocated if node was reused).
+        """
+        if not storage_hashes:
+            return None
+
+        try:
+            with self._lock:
+                # Check if we have enough host blocks
+                if not self.can_allocate_host_blocks(len(storage_hashes)):
+                    return []
+
+                # Allocate host blocks for prefetch
+                host_block_ids = self._host_pool.allocate(len(storage_hashes))
+                if host_block_ids is None or len(host_block_ids) == 0:
+                    return []
+
+                blocks = list(zip(storage_hashes, host_block_ids))
+                prefetch_nodes, wasted_block_ids = self._radix_tree.insert(
+                    blocks=blocks, cache_status=CacheStatus.LOADING_FROM_STORAGE
+                )
+                # Release any blocks that were wasted due to node reuse
+                if wasted_block_ids:
+                    self._host_pool.release(wasted_block_ids)
+
+                return prefetch_nodes
+        except Exception as e:
+            logger.error(f"prepare_prefetch_metadata error: {e}, {str(traceback.format_exc())}")
+            return []
+
+    # ============ Reset Methods ============
+
+    def reset_cache(self) -> bool:
+        """
+        Reset cache state.
+
+        Implements abstract method from KVCacheBase.
+        Clears block pools and radix tree.
+
+        Returns:
+            True if successful, False otherwise
+        """
+        try:
+            with self._lock:
+                self._device_pool.reset()
+                self._host_pool.reset()
+                if self._radix_tree is not None:
+                    self._radix_tree.reset()
+                self._eviction_in_progress = False
+            logger.info("reset_cache: all cache state cleared")
+            return True
+        except Exception as e:
+            logger.error(f"reset_cache failed: {e}, {str(traceback.format_exc())}")
+            return False
+
+    # ============ Statistics Methods ============
+
+    def get_stats(self) -> Dict[str, Any]:
+        """Get cache manager statistics."""
+        return {
+            "initialized": self._initialized,
+            "num_gpu_blocks": self.num_gpu_blocks,
+            "num_cpu_blocks": self.num_cpu_blocks,
+            "block_size": self.block_size,
+            "device_pool": self._device_pool.get_stats(),
+            "host_pool": self._host_pool.get_stats(),
+            "radix_tree": self._radix_tree.get_stats() if self._radix_tree else None,
+            "num_free_device_blocks": self.num_free_device_blocks,
+            "num_free_host_blocks": self.num_free_host_blocks,
+            "storage_enabled": self._storage_scheduler is not None,
+        }
+
+    def get_memory_usage(self) -> Dict[str, Any]:
+        """
+        Get memory usage statistics.
+
+        Returns:
+            Dictionary with memory usage information
+        """
+        device_stats = self._device_pool.get_stats()
+        host_stats = self._host_pool.get_stats()
+
+        return {
+            "device": {
+                "total_blocks": device_stats["num_blocks"],
+                "used_blocks": device_stats["used"],
+                "free_blocks": device_stats["available"],
+                "usage_percent": (
+                    device_stats["used"] / device_stats["num_blocks"] * 100 if device_stats["num_blocks"] > 0 else 0
+                ),
+            },
+            "host": {
+                "total_blocks": host_stats["num_blocks"],
+                "used_blocks": host_stats["used"],
+                "free_blocks": host_stats["available"],
+                "usage_percent": (
+                    host_stats["used"] / host_stats["num_blocks"] * 100 if host_stats["num_blocks"] > 0 else 0
+                ),
+            },
+        }
diff --git a/fastdeploy/cache_manager/v1/cache_utils.py b/fastdeploy/cache_manager/v1/cache_utils.py
new file mode 100644
index 00000000000..589d2c46e7a
--- /dev/null
+++ b/fastdeploy/cache_manager/v1/cache_utils.py
@@ -0,0 +1,628 @@
+"""
+# Copyright (c) 2025  PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+
+import hashlib
+import pickle
+import threading
+import time
+from typing import Any, Callable, Dict, List, Optional, Sequence, Set
+
+from paddleformers.utils.log import logger
+
+
+class LayerDoneCounter:
+    """
+    Independent synchronization primitive for tracking layer completion of a single transfer.
+
+    Used in compute-transfer overlap scenarios:
+    - Each LayerDoneCounter instance tracks layer completion for one transfer task.
+    - Uses CUDA Events for efficient waiting (no polling).
+    - Thread-safe.
+
+    Attributes:
+        _num_layers: Total number of layers.
+        _lock: Thread lock.
+        _completed_layers: Set of completed layer indices.
+        _callbacks: List of layer-completion callbacks.
+        _cuda_events: CUDA event per layer.
+        _layer_complete_times: Mapping of layer index to completion time.
+        _wait_count: Count of active waiters.
+    """
+
+    def __init__(self, num_layers: int):
+        """
+        Initialize the layer done counter.
+
+        Args:
+            num_layers: Total number of layers to track
+        """
+        self._num_layers = num_layers
+        self._lock = threading.RLock()
+        self._completed_layers: Set[int] = set()
+        self._callbacks: List[Callable[[int], None]] = []
+        self._start_time: float = time.time()
+
+        # ============ CUDA Events for efficient waiting (no polling) ============
+        # Initialized to None; set by set_layer_event() after kernel submission to transfer stream.
+        # None means no event recorded yet for that layer (must fall back to polling).
+        self._cuda_events: List[Any] = [None] * num_layers
+        self._layer_complete_times: Dict[int, float] = {}
+
+        # ============ Reference count for active waiters (prevents premature cleanup) ============
+        self._wait_count: int = 0
+
+    def get_num_layers(self) -> int:
+        """Get the total number of layers."""
+        return self._num_layers
+
+    # ============ Mark Methods (called by transfer thread) ============
+
+    def set_layer_event(self, layer_idx: int, cuda_event: Any) -> None:
+        """
+        Set the CUDA event for a specific layer (used for cross-stream synchronization).
+
+        Called by transfer thread after submitting a layer's kernel to a non-default
+        stream (e.g., input_stream), so that wait_for_layer() can correctly synchronize
+        on the actual stream where the transfer runs.
+
+        Args:
+            layer_idx: Index of the layer
+            cuda_event: CUDA event recorded on the transfer stream after kernel submission
+        """
+        with self._lock:
+            if 0 <= layer_idx < len(self._cuda_events):
+                self._cuda_events[layer_idx] = cuda_event
+
+    def mark_layer_done(self, layer_idx: int, cuda_event: Any = None) -> bool:
+        """
+        Mark a layer as completed.
+
+        Args:
+            layer_idx: Index of the completed layer
+            cuda_event: Optional CUDA event to record completion
+
+        Returns:
+            True if this was the last layer, False otherwise
+        """
+        with self._lock:
+            if layer_idx in self._completed_layers:
+                logger.warning(f"[mark_layer_done] layer {layer_idx} already marked done")
+                return len(self._completed_layers) >= self._num_layers
+
+            self._completed_layers.add(layer_idx)
+            self._layer_complete_times[layer_idx] = time.time()
+
+            # Record CUDA event if provided
+            if cuda_event is not None:
+                try:
+                    cuda_event.record()
+                except Exception as e:
+                    logger.warning(f"Failed to record CUDA event for layer {layer_idx}: {e}")
+
+            # Execute callbacks for this layer
+            for callback in self._callbacks:
+                try:
+                    callback(layer_idx)
+                except Exception:
+                    pass
+
+            return len(self._completed_layers) >= self._num_layers
+
+    def mark_all_done(self, cuda_event: Any = None) -> bool:
+        """
+        Mark all layers as completed at once (used for D2H all-layers evict mode).
+
+        Args:
+            cuda_event: Optional CUDA event to record completion
+
+        Returns:
+            True (always returns True since all layers are marked done)
+        """
+        with self._lock:
+            now = time.time()
+            self._completed_layers = set(range(self._num_layers))
+            self._layer_complete_times = {i: now for i in range(self._num_layers)}
+
+            # Record CUDA event if provided
+            if cuda_event is not None:
+                try:
+                    cuda_event.record()
+                except Exception as e:
+                    logger.warning(f"Failed to record CUDA event: {e}")
+
+            # Execute all callbacks (call with -1 to indicate all layers done)
+            for callback in self._callbacks:
+                try:
+                    callback(-1)
+                except Exception:
+                    pass
+
+            return True
+
+    # ============ Query Methods ============
+
+    def is_layer_done(self, layer_idx: int) -> bool:
+        """
+        Check if a specific layer is completed.
+
+        Args:
+            layer_idx: Index of the layer to check
+
+        Returns:
+            True if the layer is completed, False otherwise
+        """
+        with self._lock:
+            return layer_idx in self._completed_layers
+
+    def is_all_done(self) -> bool:
+        """
+        Check if all layers are completed.
+
+        Returns:
+            True if all layers are completed, False otherwise
+        """
+        with self._lock:
+            return len(self._completed_layers) >= self._num_layers
+
+    def get_completed_count(self) -> int:
+        """
+        Get the number of completed layers.
+
+        Returns:
+            Number of completed layers
+        """
+        with self._lock:
+            return len(self._completed_layers)
+
+    def get_pending_layers(self) -> List[int]:
+        """
+        Get list of pending layer indices.
+
+        Returns:
+            List of pending layer indices
+        """
+        with self._lock:
+            return [i for i in range(self._num_layers) if i not in self._completed_layers]
+
+    # ============ Wait Methods (called by forward thread) ============
+
+    def wait_for_layer(self, layer_idx: int, timeout: Optional[float] = None) -> bool:
+        """
+        Wait for a specific layer to complete (CUDA Event synchronization).
+
+        Always synchronizes the CUDA event before returning to guarantee the GPU
+        transfer has actually completed, not just that the kernel was submitted.
+        The fast path that only checked is_layer_done() was unsafe because
+        mark_layer_done() is called immediately after kernel submission (async),
+        before the GPU has finished the transfer.
+
+        Args:
+            layer_idx: Index of the layer to wait for
+            timeout: Maximum wait time in seconds (default: 1s)
+
+        Returns:
+            True if layer completed
+
+        Raises:
+            LayerSwapTimeoutError: If timeout occurs before layer completes
+        """
+        self._increment_wait_count()
+        try:
+            start_time = time.time()
+            timeout = timeout if timeout is not None else 1.0
+            while True:
+                # Always try CUDA event sync first: set_layer_event() is called before
+                # mark_layer_done(), so once is_layer_done() is True the event is present.
+                cuda_event = self._cuda_events[layer_idx] if layer_idx < len(self._cuda_events) else None
+                if cuda_event is not None:
+                    try:
+                        cuda_event.synchronize()
+                        return True
+                    except Exception as e:
+                        logger.warning(f"CUDA event sync failed for layer {layer_idx}: {e}")
+                        # Event sync failed; fall through to is_layer_done check
+
+                # No event yet (or sync failed): check software state as fallback
+                # (covers non-cupy scenarios where events are never set)
+                if self.is_layer_done(layer_idx):
+                    return True
+
+                elapsed = time.time() - start_time
+                if elapsed >= timeout:
+                    logger.error(f"[WaitForLayer] layer={layer_idx} TIMEOUT after {elapsed:.2f}s")
+                    raise LayerSwapTimeoutError(f"Layer swap timeout: layer={layer_idx}, elapsed={elapsed:.2f}s")
+
+                time.sleep(0.001)
+        finally:
+            self._decrement_wait_count()
+
+    def wait_all(self, timeout: Optional[float] = None) -> bool:
+        """
+        Wait for all layers to complete (used for D2H all-layers evict mode).
+
+        Always synchronizes _cuda_events[-1] (set by set_layer_event for the last layer)
+        before returning, for the same reason as wait_for_layer.
+
+        Args:
+            timeout: Maximum wait time in seconds (default: 300s)
+
+        Returns:
+            True if all layers completed
+
+        Raises:
+            LayerSwapTimeoutError: If timeout occurs
+        """
+        self._increment_wait_count()
+        try:
+            start_time = time.time()
+            timeout = timeout if timeout is not None else 300.0
+            while True:
+                # _cuda_events[-1] is set by set_layer_event(num_layers-1, ...) before mark_all_done()
+                last_event = self._cuda_events[-1] if self._cuda_events else None
+                if last_event is not None:
+                    try:
+                        last_event.synchronize()
+                        return True
+                    except Exception as e:
+                        logger.warning(f"CUDA event sync failed for wait_all: {e}")
+
+                # No event yet (or sync failed): check software state as fallback
+                if self.is_all_done():
+                    return True
+
+                elapsed = time.time() - start_time
+                if elapsed >= timeout:
+                    logger.error(f"[wait_all] TIMEOUT after {elapsed:.2f}s")
+                    raise LayerSwapTimeoutError(f"wait_all timeout: elapsed={elapsed:.2f}s")
+
+                time.sleep(0.001)
+        finally:
+            self._decrement_wait_count()
+
+    # ============ Callback Methods ============
+
+    def register_callback(self, callback: Callable[[int], None]) -> None:
+        """
+        Register a callback to be called when each layer completes.
+
+        Args:
+            callback: Function to call with layer index when completed
+        """
+        with self._lock:
+            self._callbacks.append(callback)
+
+    # ============ Internal Helper Methods ============
+
+    def _increment_wait_count(self) -> None:
+        """Increment the wait count."""
+        with self._lock:
+            self._wait_count += 1
+
+    def _decrement_wait_count(self) -> None:
+        """Decrement the wait count."""
+        with self._lock:
+            if self._wait_count > 0:
+                self._wait_count -= 1
+
+    def _should_cleanup(self) -> bool:
+        """Check if cleanup is safe (no active waiters and all done)."""
+        with self._lock:
+            return self._wait_count == 0 and self.is_all_done()
+
+    # ============ Time Tracking Methods ============
+
+    def get_layer_complete_time(self, layer_idx: int) -> Optional[float]:
+        """
+        Get the completion time for a specific layer.
+
+        Args:
+            layer_idx: Index of the layer
+
+        Returns:
+            Completion time as Unix timestamp, or None if not completed
+        """
+        with self._lock:
+            return self._layer_complete_times.get(layer_idx)
+
+    def get_layer_wait_time(self, layer_idx: int) -> Optional[float]:
+        """
+        Get the time from transfer start to layer completion.
+
+        Args:
+            layer_idx: Index of the layer
+
+        Returns:
+            Time in seconds, or None if not completed
+        """
+        with self._lock:
+            complete_time = self._layer_complete_times.get(layer_idx)
+            if complete_time is None:
+                return None
+            return complete_time - self._start_time
+
+    def get_all_layer_times(self) -> Dict[int, float]:
+        """
+        Get completion times for all layers.
+
+        Returns:
+            Dictionary mapping layer_idx to completion time
+        """
+        with self._lock:
+            return self._layer_complete_times.copy()
+
+    def get_elapsed_time(self) -> float:
+        """
+        Get elapsed time since transfer start.
+
+        Returns:
+            Elapsed time in seconds
+        """
+        return time.time() - self._start_time
+
+    def get_stats(self) -> Dict:
+        """
+        Get current statistics.
+
+        Returns:
+            Dictionary with statistics
+        """
+        with self._lock:
+            return {
+                "num_layers": self._num_layers,
+                "completed_layers": len(self._completed_layers),
+                "pending_layers": self._num_layers - len(self._completed_layers),
+                "wait_count": self._wait_count,
+            }
+
+    # ============ Cleanup Methods ============
+
+    def cleanup(self) -> None:
+        """
+        Explicit cleanup method to release CUDA events.
+
+        Called when the transfer is complete and no more waiting is needed.
+        """
+        with self._lock:
+            # Check if safe to cleanup
+            if self._wait_count > 0:
+                return
+
+            # Clear CUDA events
+            self._cuda_events.clear()
+
+    def __del__(self) -> None:
+        """
+        Destructor to ensure CUDA events are released.
+
+        Note: This is a fallback. For explicit cleanup, call cleanup() method.
+        """
+        try:
+            if self._cuda_events:
+                self._cuda_events.clear()
+        except Exception:
+            pass  # Ignore errors during destruction
+
+
+class LayerSwapTimeoutError(Exception):
+    """Exception raised when layer swap operation times out."""
+
+    pass
+
+
+# ============ Block Hash Computation ============
+
+
+def hash_block_tokens(
+    token_ids: Sequence[int],
+    parent_block_hash: str | None = None,
+    extra_keys: Any = None,
+) -> str:
+    """
+    Compute hash value for a single block.
+
+    Reference: vLLM's hash_block_tokens implementation using chained hash:
+    hash = SHA256((parent_block_hash, token_ids_tuple, extra_keys))
+
+    Args:
+        token_ids: Token IDs of the current block.
+        parent_block_hash: Hash of the parent block (chained hash).
+        extra_keys: Additional keys (e.g., multimodal info, LoRA).
+
+    Returns:
+        Computed block hash as hex string.
+    """
+    if parent_block_hash is None:
+        parent_block_hash = ""
+
+    value = (parent_block_hash, tuple(token_ids), extra_keys)
+    return hashlib.sha256(pickle.dumps(value)).hexdigest()
+
+
+def get_block_hash_extra_keys(
+    request: Any,
+    start_idx: int,
+    end_idx: int,
+    mm_idx: int,
+) -> tuple:
+    """
+    Retrieve additional hash keys for a block based on multimodal information.
+
+    Mirrors the logic from prefix_cache_manager.PrefixCacheManager.get_block_hash_extra_keys.
+
+    For each block [start_idx, end_idx), scans the multimodal positions starting
+    from mm_idx and collects hashes of any multimodal items that overlap with the block.
+
+    Args:
+        request: Request object.  Must expose a ``multimodal_inputs`` attribute which
+            is either None or a dict with keys:
+                - ``mm_positions``: list of objects with ``.offset`` and ``.length``
+                - ``mm_hashes``:    list of hash strings, one per multimodal item
+        start_idx: Token index of the block start (inclusive).
+        end_idx:   Token index of the block end (exclusive).
+        mm_idx:    Index into mm_positions / mm_hashes to start scanning from
+                   (avoids re-scanning already-processed items).
+
+    Returns:
+        (next_mm_idx, hash_keys):
+            next_mm_idx: updated mm_idx for the next block.
+            hash_keys  : list of multimodal hash strings that fall within this block.
+    """
+    hash_keys: List[str] = []
+    mm_inputs = getattr(request, "multimodal_inputs", None)
+    if (
+        mm_inputs is None
+        or "mm_positions" not in mm_inputs
+        or "mm_hashes" not in mm_inputs
+        or len(mm_inputs["mm_positions"]) == 0
+    ):
+        return mm_idx, hash_keys
+
+    mm_positions = mm_inputs["mm_positions"]
+    mm_hashes = mm_inputs["mm_hashes"]
+
+    # Fast exit: last multimodal item ends before this block starts
+    if mm_positions[-1].offset + mm_positions[-1].length <= start_idx:
+        return mm_idx, hash_keys
+
+    for img_idx in range(mm_idx, len(mm_positions)):
+        image_offset = mm_positions[img_idx].offset
+        image_length = mm_positions[img_idx].length
+
+        if image_offset + image_length <= start_idx:
+            # Multimodal item ends before block starts – skip
+            continue
+        elif image_offset >= end_idx:
+            # Multimodal item starts after block ends – stop
+            return img_idx, hash_keys
+        elif image_offset + image_length > end_idx:
+            # Multimodal item spans beyond block end – include hash, stop at this item
+            hash_keys.append(mm_hashes[img_idx])
+            return img_idx, hash_keys
+        else:
+            # Multimodal item is fully contained within the block
+            hash_keys.append(mm_hashes[img_idx])
+
+    return len(mm_positions) - 1, hash_keys
+
+
+def get_request_block_hasher(
+    block_size: int,
+) -> Callable[[Any], List[str]]:
+    """
+    Factory function: returns a block hash calculator bound to block_size.
+
+    The returned function computes hashes for new complete blocks in a request.
+    Computation logic:
+    1. Get all token IDs (prompt + output)
+    2. Determine starting position based on existing block_hashes count
+    3. Compute hashes for new complete blocks (chained hash, with multimodal extra_keys)
+
+    Usage:
+        # Create hasher at service startup
+        block_hasher = get_request_block_hasher(block_size=64)
+
+        # Use in Request.prompt_hashes property
+        new_hashes = block_hasher(self)
+        self._prompt_hashes.extend(new_hashes)
+
+    Args:
+        block_size: Number of tokens per block.
+
+    Returns:
+        A function that takes a request and returns a list of newly computed
+        block hashes.
+    """
+
+    def request_block_hasher(request: Any) -> List[str]:
+        """
+        Compute hashes for uncomputed complete blocks in a request.
+
+        Args:
+            request: Request object with the following attributes:
+                - prompt_token_ids: Input token IDs.
+                - _prompt_hashes: List of existing block hashes (private attr).
+                - output_token_ids: Output token IDs (optional).
+                - multimodal_inputs (optional): Multimodal info dict with
+                  ``mm_positions`` and ``mm_hashes``.
+
+        Returns:
+            List of newly computed block hashes (only new complete blocks).
+        """
+        # Get prompt token IDs
+        prompt_ids = request.prompt_token_ids
+        if hasattr(prompt_ids, "tolist"):
+            prompt_ids = prompt_ids.tolist()
+        if prompt_ids is None:
+            prompt_ids = []
+
+        # Get output token IDs
+        output_ids = getattr(request, "output_token_ids", [])
+        if hasattr(output_ids, "tolist"):
+            output_ids = output_ids.tolist()
+        if output_ids is None:
+            output_ids = []
+
+        # Combine all token IDs
+        all_token_ids = list(prompt_ids) + list(output_ids)
+        num_tokens = len(all_token_ids)
+
+        # Get existing block hashes
+        existing_hashes = getattr(request, "_prompt_hashes", [])
+        if existing_hashes is None:
+            existing_hashes = []
+
+        # Calculate starting position (skip already computed blocks)
+        start_token_idx = len(existing_hashes) * block_size
+
+        # Return empty if no new complete blocks
+        if start_token_idx + block_size > num_tokens:
+            return []
+
+        new_block_hashes: List[str] = []
+        prev_block_hash = existing_hashes[-1] if existing_hashes else None
+
+        # mm_idx tracks which multimodal item to scan from, avoiding redundant iteration
+        mm_idx = 0
+
+        # Compute hashes for new complete blocks
+        while True:
+            end_token_idx = start_token_idx + block_size
+            if end_token_idx > num_tokens:
+                break
+
+            # Get tokens for current block
+            block_tokens = all_token_ids[start_token_idx:end_token_idx]
+
+            # Collect multimodal extra_keys for this block
+            mm_idx, extra_keys = get_block_hash_extra_keys(
+                request=request,
+                start_idx=start_token_idx,
+                end_idx=end_token_idx,
+                mm_idx=mm_idx,
+            )
+            extra_keys_value = tuple(extra_keys) if extra_keys else None
+
+            # Compute hash (chained hash)
+            block_hash = hash_block_tokens(block_tokens, prev_block_hash, extra_keys_value)
+            new_block_hashes.append(block_hash)
+
+            # Update state
+            start_token_idx += block_size
+            prev_block_hash = block_hash
+
+        return new_block_hashes
+
+    return request_block_hasher
diff --git a/fastdeploy/cache_manager/v1/metadata.py b/fastdeploy/cache_manager/v1/metadata.py
new file mode 100644
index 00000000000..5337eeb5458
--- /dev/null
+++ b/fastdeploy/cache_manager/v1/metadata.py
@@ -0,0 +1,590 @@
+"""
+# Copyright (c) 2025  PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+
+import time
+import uuid
+from dataclasses import dataclass, field
+from enum import Enum, auto
+from typing import Any, Dict, List, Optional
+
+
+class TransferStatus(Enum):
+    """Status of a transfer task."""
+
+    PENDING = "pending"
+    IN_PROGRESS = "in_progress"
+    COMPLETED = "completed"
+    FAILED = "failed"
+    CANCELLED = "cancelled"
+
+
+class StorageType(Enum):
+    """Supported storage backend types."""
+
+    MOONCAKE = "mooncake"
+    ATTNSTORE = "attnstore"
+    LOCAL = "local"
+
+
+class TransferType(Enum):
+    """Supported transfer mechanism types."""
+
+    RDMA = "rdma"
+    IPC = "ipc"
+
+
+class CacheLevel(Enum):
+    """Cache hierarchy levels for transfer operations."""
+
+    DEVICE = "device"
+    HOST = "host"
+    STORAGE = "storage"
+
+
+class CacheStatus(Enum):
+    """Cache status enum representing the current location and state of a BlockNode.
+
+    Attributes:
+        DEVICE: Block is in device (GPU) memory, ready for use. Can be matched.
+        HOST: Block is in host (CPU) memory, needs to be loaded to device. Can be matched.
+        SWAP_TO_HOST: Block is being evicted from device to host. Cannot be matched.
+        SWAP_TO_DEVICE: Block is being loaded from host to device.
+        LOADING_FROM_STORAGE: Block is being loaded from storage.
+        DELETING: Block is being deleted (removed from host or deleted when no host cache). Cannot be matched.
+    """
+
+    DEVICE = auto()
+    HOST = auto()
+    SWAP_TO_HOST = auto()
+    SWAP_TO_DEVICE = auto()
+    DELETING = auto()
+    LOADING_FROM_STORAGE = auto()
+
+
+@dataclass
+class RadixTreeStats:
+    """
+    Snapshot of RadixTree statistics.
+
+    Encapsulates all state counters for monitoring and statistics.
+    Returns as a snapshot to ensure consistent values across all fields.
+
+    Attributes:
+        node_count: Total number of nodes in the tree.
+        evictable_device_count: GPU nodes available for eviction (ref_count==0, status==DEVICE).
+        evictable_host_count: CPU nodes available for deletion (ref_count==0, status==HOST).
+    """
+
+    node_count: int = 0
+    evictable_device_count: int = 0
+    evictable_host_count: int = 0
+
+    @property
+    def evictable_count(self) -> int:
+        """Total evictable nodes count."""
+        return self.evictable_device_count + self.evictable_host_count
+
+    def to_dict(self) -> Dict[str, Any]:
+        """Convert to dictionary for JSON serialization."""
+        return {
+            "node_count": self.node_count,
+            "evictable_device_count": self.evictable_device_count,
+            "evictable_host_count": self.evictable_host_count,
+            "evictable_count": self.evictable_count,
+        }
+
+
+@dataclass
+class CacheBlockMetadata:
+    """
+    Metadata for a cache block.
+
+    Attributes:
+        block_id: Unique identifier for the block
+        device_id: GPU device ID where the block resides
+        block_size: Size of the block in bytes
+        ref_count: Reference count for the block
+        is_pinned: Whether the block is pinned in memory
+        layer_indices: List of layer indices stored in this block
+        token_count: Number of tokens in this block
+        hash_value: Hash value for the block content
+        last_access_time: Last access timestamp
+    """
+
+    block_id: int
+    device_id: int
+    block_size: int
+    ref_count: int = 0
+    is_pinned: bool = False
+    layer_indices: List[int] = field(default_factory=list)
+    token_count: int = 0
+    hash_value: Optional[str] = None
+    last_access_time: float = 0.0
+
+
+@dataclass
+class TransferTask:
+    """
+    Represents a cache transfer task.
+
+    Attributes:
+        task_id: Unique identifier for the task
+        src_location: Source location (device/host/storage/remote)
+        dst_location: Destination location
+        block_indices: List of block indices to transfer
+        layer_indices: List of layer indices to transfer
+        status: Current status of the task
+        priority: Task priority (lower is higher priority)
+        created_time: Task creation timestamp
+        started_time: Task start timestamp
+        completed_time: Task completion timestamp
+        error_message: Error message if task failed
+        metadata: Additional task metadata
+    """
+
+    task_id: str
+    src_location: str
+    dst_location: str
+    block_indices: List[int] = field(default_factory=list)
+    layer_indices: List[int] = field(default_factory=list)
+    status: TransferStatus = TransferStatus.PENDING
+    priority: int = 0
+    created_time: float = 0.0
+    started_time: Optional[float] = None
+    completed_time: Optional[float] = None
+    error_message: Optional[str] = None
+    metadata: Dict[str, Any] = field(default_factory=dict)
+
+
+@dataclass
+class StorageConfig:
+    """
+    Configuration for storage backend.
+
+    Attributes:
+        storage_type: Type of storage backend
+        storage_path: Base path for storage
+        max_size_bytes: Maximum storage size in bytes
+        enable_compression: Whether to enable compression
+        compression_algorithm: Compression algorithm to use
+        connection_timeout: Connection timeout in seconds
+        read_timeout: Read timeout in seconds
+        write_timeout: Write timeout in seconds
+        extra_config: Additional backend-specific configuration
+    """
+
+    storage_type: StorageType = StorageType.MOONCAKE
+    storage_path: str = ""
+    max_size_bytes: int = 0
+    enable_compression: bool = False
+    compression_algorithm: str = "lz4"
+    connection_timeout: float = 30.0
+    read_timeout: float = 60.0
+    write_timeout: float = 60.0
+    extra_config: Dict[str, Any] = field(default_factory=dict)
+
+
+@dataclass
+class TransferConfig:
+    """
+    Configuration for transfer mechanism.
+
+    Attributes:
+        transfer_type: Type of transfer mechanism
+        enable_async: Whether to enable async transfer
+        max_concurrent_transfers: Maximum concurrent transfer tasks
+        buffer_size: Buffer size for transfer in bytes
+        enable_checksum: Whether to enable checksum verification
+        retry_count: Number of retries on failure
+        retry_delay: Delay between retries in seconds
+        extra_config: Additional transfer-specific configuration
+    """
+
+    transfer_type: TransferType = TransferType.RDMA
+    enable_async: bool = True
+    max_concurrent_transfers: int = 4
+    buffer_size: int = 1024 * 1024  # 1MB
+    enable_checksum: bool = True
+    retry_count: int = 3
+    retry_delay: float = 1.0
+    extra_config: Dict[str, Any] = field(default_factory=dict)
+
+
+@dataclass
+class BlockNode:
+    """
+    Node in the block management tree.
+
+    Represents a node in the radix tree or block allocation structure,
+    tracking block relationships and reference counts.
+
+    Attributes:
+        node_id: Globally unique identifier for this node (UUID)
+        block_id: Block identifier (may be reused across device/host)
+        parent: Parent BlockNode reference (None for root)
+        children: Dict mapping hash values to child BlockNodes (for radix tree)
+        children_ids: List of child block IDs
+        ref_count: Number of references to this block (defaults to 1 on creation)
+        token_count: Number of tokens stored in this block
+        hash_value: Hash value for prefix matching
+        cache_status: Current cache status (DEVICE/HOST/SWAP_TO_HOST/SWAP_TO_DEVICE)
+        last_access_time: Last access timestamp (defaults to current time on creation)
+        backuped: Whether this block has a backup on host memory
+        host_block_id: Host block ID where the backup is stored (if backuped=True)
+    """
+
+    node_id: str = field(default_factory=lambda: str(uuid.uuid4()))
+    block_id: int = 0
+    parent: Optional["BlockNode"] = None
+    children: Dict[str, "BlockNode"] = field(default_factory=dict)
+    children_ids: List[int] = field(default_factory=list)
+    ref_count: int = 0
+    token_count: int = 0
+    hash_value: Optional[str] = None
+    cache_status: CacheStatus = CacheStatus.DEVICE
+    last_access_time: float = field(default_factory=time.time)
+    # Backup-related fields
+    backuped: bool = False  # Whether a backup exists on host memory
+    host_block_id: Optional[int] = None  # Host block ID where the backup is stored
+    hit_count: int = 1  # triggers backup when reaching the threshold
+
+    def __post_init__(self):
+        """Initialize instance with current time if last_access_time not set."""
+        if self.last_access_time == 0.0:
+            self.last_access_time = time.time()
+
+    def add_child(self, child_id: int) -> None:
+        """Add a child block ID."""
+        if child_id not in self.children_ids:
+            self.children_ids.append(child_id)
+
+    def remove_child(self, child_id: int) -> bool:
+        """Remove a child block ID. Returns True if removed."""
+        if child_id in self.children_ids:
+            self.children_ids.remove(child_id)
+            return True
+        return False
+
+    def increment_ref(self) -> int:
+        """Increment reference count and return new count."""
+        self.ref_count += 1
+        return self.ref_count
+
+    def decrement_ref(self) -> int:
+        """Decrement reference count and return new count."""
+        if self.ref_count > 0:
+            self.ref_count -= 1
+        return self.ref_count
+
+    def touch(self) -> None:
+        """
+        Update last_access_time to current time.
+
+        This method should be called whenever the block is accessed
+        to track access recency for eviction policies.
+        """
+        self.last_access_time = time.time()
+
+    def update_access(self, delta_ref: int = 0) -> None:
+        """
+        Update reference count and last_access_time.
+
+        Args:
+            delta_ref: Change in reference count (positive to increment, negative to decrement)
+        """
+        if delta_ref > 0:
+            self.ref_count += delta_ref
+        elif delta_ref < 0:
+            self.ref_count = max(0, self.ref_count + delta_ref)
+        self.touch()
+
+    def is_leaf(self) -> bool:
+        """Check if this is a leaf node (no children)."""
+        return len(self.children_ids) == 0 and len(self.children) == 0
+
+    def is_root(self) -> bool:
+        """Check if this is a root node (no parent)."""
+        return self.parent is None
+
+    def is_on_device(self) -> bool:
+        """Check if block is on device (GPU) memory."""
+        return self.cache_status == CacheStatus.DEVICE
+
+    def is_on_host(self) -> bool:
+        """Check if block is on host (CPU) memory."""
+        return self.cache_status == CacheStatus.HOST
+
+    def is_swapping(self) -> bool:
+        """Check if block is currently being swapped or deleted."""
+        return self.cache_status in (
+            CacheStatus.SWAP_TO_HOST,
+            CacheStatus.SWAP_TO_DEVICE,
+            CacheStatus.DELETING,
+        )
+
+
+@dataclass
+class MatchResult:
+    """
+    Three-level cache prefix match result.
+
+    Contains matched nodes from Device, Host, and Storage levels.
+
+    Attributes:
+        storage_nodes: List of matched BlockNodes in Storage.
+        device_nodes: List of matched BlockNodes in Device.
+        host_nodes: List of matched BlockNodes in Host.
+    """
+
+    device_nodes: List["BlockNode"] = field(default_factory=list)
+    host_nodes: List["BlockNode"] = field(default_factory=list)
+    storage_nodes: List["BlockNode"] = field(default_factory=list)
+    uncached_block_ids: List[int] = field(default_factory=list)
+
+    @property
+    def device_block_ids(self) -> List[int]:
+        """Get list of matched device block IDs."""
+        return [node.block_id for node in self.device_nodes]
+
+    @property
+    def total_matched_blocks(self) -> int:
+        """Get total number of matched device blocks."""
+        return self.matched_device_nums + self.matched_host_nums + self.matched_storage_nums
+
+    @property
+    def matched_device_nums(self) -> int:
+        """Get total number of matched device blocks."""
+        return len(self.device_nodes)
+
+    @property
+    def matched_host_nums(self) -> int:
+        """Get total number of matched host blocks."""
+        return len(self.host_nodes)
+
+    @property
+    def matched_storage_nums(self) -> int:
+        """Get total number of matched storage hashes."""
+        return len(self.storage_nodes)
+
+
+@dataclass
+class StorageMetadata:
+    """
+    Base metadata for storage transfer operations.
+
+    Encapsulates all information for storage load/evict operations.
+    Different storage implementations can extend this class with additional fields.
+
+    Attributes:
+        hash_values: List of hash values to transfer.
+        block_ids: Target/source host block IDs (pre-allocated by Scheduler).
+        direction: Transfer direction ("load" from storage, "evict" to storage).
+        storage_type: Storage type ("mooncake", "attnstore", "rdma", etc.).
+        endpoint: Storage service endpoint address.
+        timeout: Operation timeout in seconds.
+        layer_num: Number of layers to transfer (for layer-by-layer transfer).
+        extra_params: Storage-specific extra parameters.
+    """
+
+    hash_values: List[str] = field(default_factory=list)
+    block_ids: List[int] = field(default_factory=list)
+    direction: str = "load"
+    storage_type: str = "mooncake"
+    endpoint: Optional[str] = None
+    timeout: float = 30.0
+    layer_num: int = 0
+    extra_params: Dict[str, Any] = field(default_factory=dict)
+
+
+@dataclass
+class PDTransferMetadata:
+    """
+    Base metadata for PD separation transfer operations.
+
+    Encapsulates all information for cross-node transfer in PD separation architecture.
+    Different transfer mechanisms (RDMA, IPC) can extend this class with additional fields.
+
+    Attributes:
+        source_node_id: Source node identifier (P node ID).
+        target_node_id: Target node identifier (D node ID).
+        block_ids: List of block IDs to transfer.
+        layer_num: Total number of model layers (for layer-by-layer transfer sync).
+        timeout: Operation timeout in seconds.
+        extra_params: Transfer-specific extra parameters.
+    """
+
+    source_node_id: str = ""
+    target_node_id: str = ""
+    block_ids: List[int] = field(default_factory=list)
+    layer_num: int = 0
+    timeout: float = 30.0
+    extra_params: Dict[str, Any] = field(default_factory=dict)
+
+
+@dataclass
+class CacheSwapMetadata:
+    """
+    Metadata for cache transfer operations.
+
+    Encapsulates the mapping between source and destination block IDs
+    for Host↔Device, Storage→Host, and other transfer operations.
+
+    Attributes:
+        src_block_ids: Source block IDs (transfer origin).
+        dst_block_ids: Destination block IDs (transfer target).
+        src_type: Source cache level (CacheLevel.DEVICE/HOST/STORAGE).
+        dst_type: Destination cache level (CacheLevel.DEVICE/HOST/STORAGE).
+        hash_values: Corresponding hash values (used for storage-related operations).
+        success: Whether the transfer succeeded.
+        error_message: Error message if transfer failed.
+        async_handler: Async task handler for tracking the swap task execution state.
+    """
+
+    src_block_ids: List[int] = field(default_factory=list)
+    dst_block_ids: List[int] = field(default_factory=list)
+    src_type: Optional[CacheLevel] = None
+    dst_type: Optional[CacheLevel] = None
+    hash_values: List[str] = field(default_factory=list)
+    success: bool = False
+    error_message: Optional[str] = None
+    async_handler: Optional["AsyncTaskHandler"] = None
+
+    def is_success(self) -> bool:
+        """Return whether the transfer succeeded."""
+        return self.success
+
+    @property
+    def mapping(self) -> Dict[int, int]:
+        """Get the src -> dst block ID mapping dict."""
+        if not self.success:
+            return {}
+        return dict(zip(self.src_block_ids, self.dst_block_ids))
+
+
+@dataclass
+class TransferResult:
+    """
+    Cache transfer operation result.
+
+    Encapsulates the mapping between source and destination block IDs
+    for Host↔Device, Storage→Host, and other transfer operations.
+
+    Attributes:
+        src_block_ids: Source block IDs (transfer origin).
+        dst_block_ids: Destination block IDs (transfer target).
+        src_type: Source cache level (CacheLevel.DEVICE/HOST/STORAGE).
+        dst_type: Destination cache level (CacheLevel.DEVICE/HOST/STORAGE).
+        success: Whether the transfer succeeded.
+        error_message: Error message if transfer failed.
+    """
+
+    src_block_ids: List[int] = field(default_factory=list)
+    dst_block_ids: List[int] = field(default_factory=list)
+    src_type: Optional[CacheLevel] = None
+    dst_type: Optional[CacheLevel] = None
+    success: bool = True
+    error_message: Optional[str] = None
+
+
+@dataclass
+class AsyncTaskHandler:
+    """
+    Async task handler.
+
+    Used for submitting and tracking the state of async tasks.
+    External callers use this handler to check whether a task has completed.
+
+    Attributes:
+        task_id: Unique task identifier.
+        is_completed: Whether the task has completed.
+        result: Task result (available after completion).
+        error: Task error message (if failed).
+    """
+
+    task_id: str = field(default_factory=lambda: str(uuid.uuid4()))
+    is_completed: bool = False
+    result: Optional[Any] = None
+    error: Optional[str] = None
+    _event: Any = field(default=None, repr=False)
+
+    def __post_init__(self):
+        """Initialize event for synchronization."""
+        import threading
+
+        object.__setattr__(self, "_event", threading.Event())
+
+    def wait(self, timeout: Optional[float] = None) -> bool:
+        """
+        Wait for the task to complete.
+
+        Args:
+            timeout: Maximum wait time in seconds. None means wait indefinitely.
+
+        Returns:
+            True if completed, False if timed out.
+        """
+        return self._event.wait(timeout=timeout)
+
+    def cancel(self) -> bool:
+        """
+        Cancel the task.
+
+        Returns:
+            True if successfully cancelled, False otherwise.
+        """
+        if self.is_completed:
+            return False
+        self.error = "Task cancelled"
+        self.is_completed = True
+        self._event.set()
+        return True
+
+    def get_result(self) -> Any:
+        """
+        Get the task result (blocking).
+
+        Returns:
+            Task result.
+
+        Raises:
+            RuntimeError: If the task failed or was cancelled.
+        """
+        self._event.wait()
+        if self.error:
+            raise RuntimeError(self.error)
+        return self.result
+
+    def set_result(self, result: Any) -> None:
+        """
+        Set the task result and mark as completed.
+
+        Args:
+            result: Task result.
+        """
+        self.result = result
+        self.is_completed = True
+        self._event.set()
+
+    def set_error(self, error: str) -> None:
+        """
+        Set the error message and mark as completed.
+
+        Args:
+            error: Error message.
+        """
+        self.error = error
+        self.is_completed = True
+        self._event.set()
diff --git a/fastdeploy/cache_manager/v1/radix_tree.py b/fastdeploy/cache_manager/v1/radix_tree.py
new file mode 100644
index 00000000000..f8f2639fb86
--- /dev/null
+++ b/fastdeploy/cache_manager/v1/radix_tree.py
@@ -0,0 +1,731 @@
+"""
+# Copyright (c) 2025  PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+
+import heapq
+import threading
+from typing import Dict, List, Optional, Tuple
+
+from fastdeploy.utils import get_logger
+
+from .metadata import BlockNode, CacheStatus, RadixTreeStats
+
+logger = get_logger("radix_tree", "cache_manager.log")
+
+
+class RadixTree:
+    """
+    Radix tree for efficient prefix matching in KV cache.
+
+    Used to find matching prefixes across different sequences,
+    enabling KV cache reuse for shared prefixes.
+
+    Uses separate min-heaps for DEVICE and HOST evictable nodes with true deletion,
+    ensuring heap contents are always consistent with the evictable set.
+
+    API Usage Guidelines
+    ====================
+
+    1. Reference Count Management (CRITICAL)
+    -----------------------------------------
+    The reference count (ref_count) determines whether a node can be evicted.
+    A node is evictable ONLY when ref_count == 0.
+
+    IMPORTANT: You MUST pair increment_ref_nodes() and decrement_ref_nodes() calls:
+    - After insert(): nodes have ref_count >= 1, NOT evictable
+    - After decrement_ref_nodes(): ref_count decreases, may become evictable
+    - After increment_ref_nodes(): ref_count increases, removed from evictable set
+
+    WARNING: Unbalanced ref_count management can cause:
+    - Memory leaks: nodes never become evictable (ref_count > 0 forever)
+    - Premature eviction: nodes evicted while still in use (ref_count == 0)
+
+    Example:
+        nodes, wasted_ids = tree.insert(blocks)  # ref_count = 1, wasted_ids may be non-empty if nodes were reused
+        if wasted_ids:
+            # Release wasted block_ids that were not used due to node reuse
+            release_blocks(wasted_ids)
+        # ... use the nodes ...
+        tree.decrement_ref_nodes(nodes)       # ref_count = 0, now evictable
+        # Do NOT use nodes after decrement - they may be evicted!
+
+    2. Eviction Operation Order
+    ---------------------------
+    The correct eviction order is:
+
+        DEVICE -> HOST -> Storage
+
+    Step 1: evict_device_to_host() - Move DEVICE nodes to HOST
+        - Input: num_blocks, host_block_ids (pre-allocated)
+        - Output: released device block_ids
+        - Nodes transition: DEVICE -> HOST (still in tree)
+
+    Step 2: evict_host_nodes() - Remove HOST nodes permanently
+        - Input: num_blocks
+        - Output: evicted host block_ids
+        - Nodes removed from tree completely
+
+    WARNING: Do NOT call evict_host_nodes() before evict_device_to_host() for
+    the same nodes - this will fail since nodes are still in DEVICE state.
+
+    3. Atomicity Guarantee
+    ----------------------
+    All eviction methods provide atomic operation:
+    - Pre-check: verify enough evictable nodes exist
+    - If pre-check fails, return None immediately (no partial eviction)
+    - If success, all requested blocks are processed
+
+    Check return value:
+    - None: Not enough evictable blocks, operation failed
+    - Empty list: num_blocks == 0, nothing to do
+    - List of block_ids: Success
+
+    4. Thread Safety
+    ----------------
+    All public methods are thread-safe using RLock.
+    However, be careful with the following pattern:
+
+    WARNING: Do NOT hold references to nodes across method calls:
+        # DANGEROUS - node may be evicted by another thread
+        nodes = tree.find_prefix(hashes)
+        # ... some operation without lock ...
+        tree.increment_ref_nodes(nodes)  # nodes may already be evicted!
+
+    Instead, use the returned nodes immediately:
+        nodes = tree.find_prefix(hashes)
+        tree.increment_ref_nodes(nodes)  # Safe: immediate operation
+
+    5. Node Lifecycle
+    -----------------
+    Node states and valid transitions:
+
+        [New] --insert()--> DEVICE (ref_count >= 1)
+        DEVICE --decrement_ref()--> DEVICE (ref_count == 0, evictable)
+        DEVICE --evict_device_to_host()--> HOST (ref_count == 0)
+        HOST --evict_host_nodes()--> [Deleted from tree]
+
+        HOST --swap_to_device()--> SWAP_TO_DEVICE
+        SWAP_TO_DEVICE --complete_swap_to_device()--> DEVICE
+
+    WARNING: Once a node's ref_count becomes 0, it can be evicted at any time.
+    Do NOT access or modify a node after decrementing its ref_count unless
+    you increment it first.
+
+    6. Common Pitfalls
+    ------------------
+    a) Forgetting to decrement ref_count after use:
+       -> Memory leak, blocks never released
+
+    b) Decrementing ref_count multiple times:
+       -> ref_count becomes negative, undefined behavior
+
+    c) Using nodes after decrement_ref_nodes():
+       -> Nodes may be evicted, accessing invalid memory
+
+    d) Evicting nodes with ref_count > 0:
+       -> Not possible, eviction methods skip non-zero ref_count nodes
+
+    e) Calling find_prefix() on DELETING/SWAP_TO_HOST nodes:
+       -> These states are skipped, prefix match stops at these nodes
+    """
+
+    def __init__(
+        self,
+        enable_host_cache: bool = False,
+        write_policy: str = "write_through",
+    ):
+        """
+        Initialize the radix tree.
+
+        Args:
+            enable_host_cache: If True, evict() moves nodes to HOST state
+                              instead of removing them from tree.
+            write_policy: Write policy for backup to lower tier.
+                          - "write_through": Every matched node triggers backup check
+                          - "write_through_selective": Only nodes with hit_count >= threshold trigger backup
+                          - "write_back": Backup only when evicted (not implemented yet)
+        """
+        self._root = BlockNode()
+        self._lock = threading.RLock()
+        self._node_count = 1  # Root node
+        self._enable_host_cache = enable_host_cache
+        self._write_policy = write_policy
+
+        # Use dict for O(1) add/remove instead of heap's O(n) removal
+        # Format: {node_id: (last_access_time, node)}
+        self._evictable_device: Dict[str, Tuple[float, BlockNode]] = {}
+        self._evictable_host: Dict[str, Tuple[float, BlockNode]] = {}
+
+    def insert(
+        self,
+        blocks: List[Tuple[str, int]],
+        cache_status: CacheStatus = CacheStatus.DEVICE,
+        start_node: Optional[BlockNode] = None,
+    ) -> Tuple[List[BlockNode], List[int]]:
+        """
+        Insert a sequence of blocks into the tree.
+
+        Args:
+            blocks: List of (block_hash, block_id) tuples.
+                    Each tuple represents a complete block.
+            cache_status: Initial cache status for new nodes.
+                         Defaults to DEVICE.
+            start_node: Node to start insertion from. If None, starts from root.
+                       Used for incremental insertion after prefix match.
+
+        Returns:
+            Tuple of (result_nodes, wasted_block_ids):
+            - result_nodes: List of inserted or updated BlockNode objects.
+            - wasted_block_ids: List of block_ids that were not used due to
+              node reuse (should be released by caller).
+        """
+        result_nodes = []
+        wasted_block_ids = []
+
+        if not blocks:
+            return result_nodes, wasted_block_ids
+
+        with self._lock:
+            node = self._root if start_node is None else start_node
+            for i, (block_hash, block_id) in enumerate(blocks):
+                if block_hash not in node.children:
+                    # Create new BlockNode with block_id, parent, and hash_value
+                    new_node = BlockNode(
+                        block_id=block_id,
+                        parent=node,
+                        hash_value=block_hash,
+                        cache_status=cache_status,
+                    )
+                    node.children[block_hash] = new_node
+                    self._node_count += 1
+                else:
+                    # Node already exists for this hash - the new block_id is wasted
+                    existing_node = node.children[block_hash]
+                    if existing_node.block_id != block_id:
+                        # Track the wasted block_id for caller to release
+                        wasted_block_ids.append(block_id)
+
+                node = node.children[block_hash]
+                # Increment ref and update evictable status
+                node.increment_ref()
+                # If node in evictable, remove it from evictable dict
+                if node.cache_status == CacheStatus.DEVICE and node.node_id in self._evictable_device:
+                    del self._evictable_device[node.node_id]
+                elif node.cache_status == CacheStatus.HOST and node.node_id in self._evictable_host:
+                    del self._evictable_host[node.node_id]
+                result_nodes.append(node)
+
+        return result_nodes, wasted_block_ids
+
+    def find_prefix(
+        self,
+        block_hashes: List[str],
+    ) -> List[BlockNode]:
+        """
+        Find the longest matching prefix.
+
+        Args:
+            block_hashes: List of block hash values to match.
+
+        Returns:
+            List of matched BlockNode objects in order.
+            Empty list if no match found.
+        """
+        matched_nodes = []
+
+        with self._lock:
+            node = self._root
+            for i, block_hash in enumerate(block_hashes):
+                if block_hash not in node.children:
+                    break
+
+                node = node.children[block_hash]
+                if node.cache_status in (CacheStatus.DELETING, CacheStatus.SWAP_TO_HOST):
+                    break
+
+                node.touch()
+                matched_nodes.append(node)
+
+        return matched_nodes
+
+    def increment_ref_nodes(self, nodes: List[BlockNode]) -> None:
+        """
+        Increment reference count for a list of nodes.
+
+        Removes nodes from evictable set (no longer available for eviction).
+        Also updates last_access_time for each node.
+
+        Args:
+            nodes: List of BlockNode objects to increment ref_count.
+        """
+        if not nodes:
+            return
+        with self._lock:
+            for node in nodes:
+                node.increment_ref()
+                node.hit_count += 1
+                node.touch()
+                self._remove_from_evictable(node)
+
+    def decrement_ref_nodes(self, nodes: List[BlockNode]) -> None:
+        """
+        Decrement reference count for a list of nodes.
+
+        When ref_count becomes 0, the node is added to evictable heap
+        and becomes available for eviction. Also updates last_access_time.
+
+        Args:
+            nodes: List of BlockNode objects to decrement ref_count.
+        """
+        if not nodes:
+            return
+        with self._lock:
+            for node in nodes:
+                old_ref = node.ref_count
+                node.decrement_ref()
+                node.touch()
+                # If ref_count goes from 1 to 0, add to evictable
+                if old_ref == 1 and node.ref_count == 0:
+                    self._add_to_evictable(node)
+
+    def reset(self) -> None:
+        """
+        Reset the tree to initial state.
+
+        Clears all nodes except root, evictable tracking, and node mappings.
+        """
+        with self._lock:
+            self._root = BlockNode(block_id=0)
+            self._node_count = 1
+            self._evictable_device.clear()
+            self._evictable_host.clear()
+
+    def get_stats(self) -> RadixTreeStats:
+        """
+        Get tree statistics snapshot.
+
+        Returns a snapshot of all tree statistics. Using a snapshot ensures
+        consistent values across all fields in a single call.
+
+        Returns:
+            RadixTreeStats containing all tree statistics.
+        """
+        return RadixTreeStats(
+            node_count=self._node_count,
+            evictable_device_count=len(self._evictable_device),
+            evictable_host_count=len(self._evictable_host),
+        )
+
+    def node_count(self) -> int:
+        """Get total number of nodes in the tree."""
+        return self._node_count
+
+    def evict_host_nodes(
+        self,
+        num_blocks: int,
+    ) -> Optional[List[int]]:
+        """
+        Evict HOST nodes from the tree.
+
+        Removes HOST nodes permanently and returns their block_ids.
+
+        Args:
+            num_blocks: Number of HOST blocks to evict
+
+        Returns:
+            List of evicted host block_ids, or None if not enough
+            evictable HOST blocks.
+        """
+        if num_blocks == 0:
+            return []
+
+        with self._lock:
+            if len(self._evictable_host) < num_blocks:
+                return None
+
+            nodes = self._get_lru_nodes(self._evictable_host, num_blocks)
+            evicted_block_ids = []
+
+            for node in nodes:
+                self._remove_node_from_tree(node)
+                evicted_block_ids.append(node.block_id)
+
+            logger.debug(
+                f"evict_host_nodes: evicted={evicted_block_ids}, " f"remaining_host={len(self._evictable_host)}"
+            )
+
+        return evicted_block_ids
+
+    def _get_lru_nodes(
+        self,
+        evictable_dict: Dict[str, Tuple[float, BlockNode]],
+        num_blocks: int,
+    ) -> List[BlockNode]:
+        """
+        Get the coldest (LRU) nodes from an evictable dict.
+
+        Args:
+            evictable_dict: The evictable dict to get nodes from (_evictable_device or _evictable_host).
+            num_blocks: Number of nodes to get.
+
+        Returns:
+            List of BlockNode objects in LRU order (coldest first).
+        """
+        if num_blocks <= 0 or not evictable_dict:
+            return []
+
+        smallest = heapq.nsmallest(
+            min(num_blocks, len(evictable_dict)), evictable_dict.items(), key=lambda item: item[1][0]
+        )
+
+        nodes = [node for _, (_, node) in smallest]
+        for node_id, _ in smallest:
+            del evictable_dict[node_id]
+        return nodes
+
+    def evict_device_nodes(
+        self,
+        num_blocks: int,
+    ) -> Optional[List[int]]:
+        """
+        Evict DEVICE nodes from the tree directly.
+
+        Removes DEVICE nodes permanently without moving to HOST.
+        This is used when host cache is disabled.
+
+        Args:
+            num_blocks: Number of DEVICE blocks to evict.
+
+        Returns:
+            List of evicted device block_ids, or None if not enough
+            evictable DEVICE blocks.
+        """
+        if num_blocks == 0:
+            return []
+
+        with self._lock:
+            if len(self._evictable_device) < num_blocks:
+                return None
+
+            nodes = self._get_lru_nodes(self._evictable_device, num_blocks)
+            evicted_block_ids = []
+
+            for node in nodes:
+                self._remove_node_from_tree(node)
+                evicted_block_ids.append(node.block_id)
+
+            logger.debug(
+                f"evict_device_nodes: evicted={evicted_block_ids}, " f"remaining_device={len(self._evictable_device)}"
+            )
+
+        return evicted_block_ids
+
+    def evict_device_to_host(
+        self,
+        num_blocks: int,
+        host_block_ids: List[int],
+    ) -> Optional[List[int]]:
+        """
+        Evict DEVICE nodes to host memory.
+
+        Changes node status from DEVICE to HOST and updates block_id
+        to the provided host_block_ids.
+
+        Args:
+            num_blocks: Number of DEVICE blocks to evict
+            host_block_ids: Pre-allocated host block IDs to use
+
+        Returns:
+            List of released device block_ids, or None if not enough
+            evictable DEVICE blocks.
+        """
+        if num_blocks == 0:
+            return []
+
+        if len(host_block_ids) < num_blocks:
+            return None
+
+        released_block_ids = []
+
+        with self._lock:
+            if len(self._evictable_device) < num_blocks:
+                return None
+
+            nodes = self._get_lru_nodes(self._evictable_device, num_blocks)
+            released_block_ids = []
+
+            for i, node in enumerate(nodes):
+                # Save the original device block_id
+                original_block_id = node.block_id
+                new_host_block_id = host_block_ids[i]
+
+                # Update status and block_id
+                node.cache_status = CacheStatus.HOST
+                node.block_id = new_host_block_id
+                node.touch()
+
+                # Add to host evictable dict
+                self._evictable_host[node.node_id] = (node.last_access_time, node)
+
+                released_block_ids.append(original_block_id)
+
+            logger.debug(
+                f"evict_device_to_host: released_device={released_block_ids} -> host={host_block_ids[:len(released_block_ids)]}, "
+                f"evictable_device={len(self._evictable_device)}, evictable_host={len(self._evictable_host)}"
+            )
+
+        return released_block_ids
+
+    def _add_to_evictable(self, node: BlockNode) -> None:
+        """
+        Add a node to the appropriate evictable dict based on cache status.
+        """
+        if node.cache_status == CacheStatus.DEVICE:
+            if node.node_id not in self._evictable_device:
+                self._evictable_device[node.node_id] = (node.last_access_time, node)
+        elif node.cache_status == CacheStatus.HOST:
+            if node.node_id not in self._evictable_host:
+                self._evictable_host[node.node_id] = (node.last_access_time, node)
+
+    def _remove_from_evictable(self, node: BlockNode) -> None:
+        """
+        Remove a node from evictable tracking (O(1) deletion from dict).
+        """
+        if node.cache_status == CacheStatus.DEVICE and node.node_id in self._evictable_device:
+            del self._evictable_device[node.node_id]
+        elif node.cache_status == CacheStatus.HOST and node.node_id in self._evictable_host:
+            del self._evictable_host[node.node_id]
+
+    def _remove_node_from_tree(self, node: BlockNode) -> None:
+        """
+        Remove a single node from the tree permanently.
+
+        Args:
+            node: Node to remove
+        """
+        if node.parent is None:
+            return  # Cannot remove root
+
+        # Remove from parent's children
+        if node.hash_value and node.hash_value in node.parent.children:
+            del node.parent.children[node.hash_value]
+            self._node_count -= 1
+
+    def swap_to_device(
+        self,
+        nodes: List[BlockNode],
+        gpu_block_ids: List[int],
+    ) -> List[int]:
+        """
+        Swap CPU blocks to device.
+
+        Changes node status to SWAP_TO_DEVICE and updates block_id to GPU block ID.
+        This is used when loading host blocks back to device memory.
+
+        Args:
+            nodes: List of BlockNode objects on host to swap to device.
+                   Caller guarantees all nodes are on HOST.
+            gpu_block_ids: Corresponding GPU block IDs
+
+        Returns:
+            List of original host block_ids
+        """
+        if len(nodes) != len(gpu_block_ids):
+            return []
+
+        original_block_ids = []
+
+        with self._lock:
+            for node, gpu_block_id in zip(nodes, gpu_block_ids):
+                # Save the original host block_id
+                original_block_ids.append(node.block_id)
+
+                # Remove from evictable before changing status
+                self._remove_from_evictable(node)
+
+                # Update status to SWAP_TO_DEVICE and block_id to GPU block ID
+                node.cache_status = CacheStatus.DEVICE  # Temporary status for test
+                node.block_id = gpu_block_id
+                node.touch()
+
+        return original_block_ids
+
+    def complete_swap_to_device(
+        self,
+        nodes: List[BlockNode],
+    ) -> List[int]:
+        """
+        Complete the swap to device operation.
+
+        Changes node status from SWAP_TO_DEVICE to DEVICE.
+        This should be called after the actual data transfer is complete.
+
+        Args:
+            nodes: List of BlockNode objects that were swapped to device
+
+        Returns:
+            List of GPU block_ids
+        """
+        gpu_block_ids = []
+
+        with self._lock:
+            for node in nodes:
+                # Update status to DEVICE
+                node.cache_status = CacheStatus.DEVICE
+                node.touch()
+
+                gpu_block_ids.append(node.block_id)
+
+        return gpu_block_ids
+
+    def select_blocks_for_backup(
+        self,
+        needed_num: int,
+    ) -> List[BlockNode]:
+        """
+        Select blocks to backup from evictable device nodes.
+
+        Selects the coldest blocks (LRU) from _evictable_device that don't
+        already have a backup.
+
+        Args:
+            needed_num: Number of blocks to select for backup
+
+        Returns:
+            List of BlockNode objects to backup
+        """
+        if needed_num <= 0:
+            return []
+
+        with self._lock:
+            # Find candidates: evictable device nodes without backup
+            candidates = []
+            for node_id, (_, node) in self._evictable_device.items():
+                if not node.backuped:
+                    candidates.append(node)
+
+            if not candidates:
+                return []
+
+            # Sort by last_access_time (LRU - oldest first)
+            candidates.sort(key=lambda n: n.last_access_time)
+
+            return candidates[:needed_num]
+
+    def backup_blocks(
+        self,
+        nodes: List[BlockNode],
+        host_block_ids: List[int],
+    ) -> List[int]:
+        """
+        Mark blocks as backed up and record their host block IDs.
+
+        This method marks the given nodes as backuped and stores the
+        host block IDs. It does NOT perform the actual data transfer -
+        that should be done by the caller via cache_evict_metadata.
+
+        Args:
+            nodes: List of BlockNode objects to backup
+            host_block_ids: Corresponding host block IDs for the backup
+
+        Returns:
+            List of device block IDs that were marked as backuped
+        """
+        if len(nodes) != len(host_block_ids):
+            return []
+
+        backed_up_ids = []
+
+        with self._lock:
+            for node, host_block_id in zip(nodes, host_block_ids):
+                node.backuped = True
+                node.host_block_id = host_block_id
+                backed_up_ids.append(node.block_id)
+
+        return backed_up_ids
+
+    def get_candidates_for_backup(self, threshold: int, pending_block_ids: list[int] = []) -> List[BlockNode]:
+        """
+        Get nodes that are candidates for backup based on write_through_selective policy.
+
+        Returns evictable device nodes that:
+        1. Have hit_count >= threshold
+        2. Are not already backed up
+
+        Args:
+            threshold: Minimum hit_count required for backup candidacy.
+            pending_block_ids: List of block IDs already in the pending backup queue,
+                               used to avoid duplicate scheduling.
+
+        Returns:
+            List of BlockNode objects that are candidates for backup,
+            sorted by LRU (coldest first).
+        """
+        if self._write_policy != "write_through_selective":
+            return []
+
+        candidates = []
+        with self._lock:
+            for node_id, (_, node) in self._evictable_device.items():
+                if not node.backuped and node.hit_count >= threshold and node.block_id not in pending_block_ids:
+                    candidates.append(node)
+
+            # Sort by LRU (oldest last_access_time first)
+            candidates.sort(key=lambda n: n.last_access_time)
+
+        return candidates
+
+    def evict_nodes_selective(
+        self,
+        num_blocks: int,
+    ) -> List[int]:
+        """
+        Evict device nodes with write_through_selective optimization.
+
+        First selects the coldest (LRU) nodes, then categorizes them:
+        - without_backup: Release directly (cold data, no transfer needed)
+        - with_backup: Update metadata to HOST (data already in host)
+
+        Args:
+            num_blocks: Number of blocks to evict
+
+        Returns:
+            List of released device block IDs
+        """
+        if num_blocks <= 0:
+            return []
+
+        with self._lock:
+            if len(self._evictable_device) < num_blocks:
+                return []
+
+            # Get LRU nodes first (this pops them from _evictable_device)
+            nodes = self._get_lru_nodes(self._evictable_device, num_blocks)
+
+            released_device_ids = []
+            for node in nodes:
+                if node.backuped:
+                    released_device_ids.append(node.block_id)
+
+                    node.cache_status = CacheStatus.HOST
+                    node.block_id = node.host_block_id
+                    node.touch()
+                    # Move to host evictable
+                    self._evictable_host[node.node_id] = (node.last_access_time, node)
+                else:
+                    self._remove_node_from_tree(node)
+                    released_device_ids.append(node.block_id)
+
+            return released_device_ids
diff --git a/fastdeploy/cache_manager/v1/storage/__init__.py b/fastdeploy/cache_manager/v1/storage/__init__.py
new file mode 100644
index 00000000000..b1c986b9a4e
--- /dev/null
+++ b/fastdeploy/cache_manager/v1/storage/__init__.py
@@ -0,0 +1,232 @@
+"""
+# Copyright (c) 2025  PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+
+from typing import TYPE_CHECKING, Any, Dict, Optional
+
+if TYPE_CHECKING:
+    from fastdeploy.config import CacheConfig
+
+from ..metadata import StorageType
+from .base import StorageConnector, StorageScheduler
+
+
+def create_storage_scheduler(
+    config: Any,
+) -> Optional[StorageScheduler]:
+    """
+    Create a StorageScheduler instance based on configuration.
+
+    This is a factory function that creates the appropriate StorageScheduler
+    based on the storage backend type specified in the configuration.
+
+    Args:
+        config: Configuration object, can be:
+            - CacheConfig: FastDeploy configuration object
+            - Dict: Dictionary with 'storage_type' and backend-specific settings
+            - StorageConfig: StorageConfig dataclass instance
+
+    Returns:
+        StorageScheduler instance if successful, None otherwise
+
+    Example:
+        # Using CacheConfig
+        scheduler = create_storage_scheduler(fd_config)
+
+        # Using dict config
+        config = {
+            'storage_type': 'mooncake',
+            'server_addr': 'localhost:8080',
+            'namespace': 'kv_cache',
+        }
+        scheduler = create_storage_scheduler(config)
+    """
+    if config.kvcache_storage_backend is None:
+        return None
+
+    scheduler: Optional[StorageScheduler] = None
+
+    # Create scheduler based on storage type
+    if config.kvcache_storage_backend == "mooncake":
+        from .mooncake.connector import MooncakeStorageScheduler
+
+        scheduler = MooncakeStorageScheduler(config)
+
+    elif config.kvcache_storage_backend == "attention_store":
+        from .attnstore.connector import AttnStoreScheduler
+
+        scheduler = AttnStoreScheduler(config)
+
+    else:
+        raise ValueError(
+            f"Unsupported storage type: {config.kvcache_storage_backend}. "
+            f"Supported types: mooncake, attention_store, local"
+        )
+
+    # Attempt connection
+    if scheduler is not None:
+        if not scheduler.connect():
+            # Log warning but still return the scheduler
+            pass
+
+    return scheduler
+
+
+def create_storage_connector(
+    config: Any,
+) -> Optional[StorageConnector]:
+    """
+    Create a StorageConnector instance based on configuration.
+
+    This is a factory function that creates the appropriate StorageConnector
+    based on the storage backend type specified in the configuration.
+
+    Args:
+        config: Configuration object, can be:
+            - CacheConfig: FastDeploy configuration object
+            - Dict: Dictionary with 'storage_type' and backend-specific settings
+            - StorageConfig: StorageConfig dataclass instance
+
+    Returns:
+        StorageConnector instance if successful, None otherwise
+
+    Example:
+        # Using CacheConfig
+        connector = create_storage_connector(fd_config)
+
+        # Using dict config
+        config = {
+            'storage_type': 'mooncake',
+            'server_addr': 'localhost:8080',
+            'buffer_size': 1024 * 1024,
+        }
+        connector = create_storage_connector(config)
+    """
+    if config.kvcache_storage_backend is None:
+        return None
+
+    connector: Optional[StorageConnector] = None
+
+    # Create connector based on storage type
+    if config.kvcache_storage_backend == "mooncake":
+        from .mooncake.connector import MooncakeStorageConnector
+
+        connector = MooncakeStorageConnector(config)
+
+    elif config.kvcache_storage_backend == "attention_store":
+        from .attnstore.connector import AttnStoreConnector
+
+        connector = AttnStoreConnector(config)
+
+    else:
+        raise ValueError(
+            f"Unsupported storage type: {config.kvcache_storage_backend}. "
+            f"Supported types: mooncake, attention_store, local"
+        )
+
+    # Attempt connection
+    if connector is not None:
+        if not connector.connect():
+            # Log warning but still return the connector
+            pass
+
+    return connector
+
+
+def _parse_storage_config(config: "CacheConfig") -> tuple:
+    """
+    Parse storage configuration from various input types.
+
+    Args:
+        config: Configuration object (CacheConfig, Dict, or StorageConfig)
+
+    Returns:
+        Tuple of (storage_type, backend_config)
+    """
+    storage_type = None
+    backend_config: Dict[str, Any] = {}
+
+    # Handle CacheConfig
+    if hasattr(config, "cache_config") and config.cache_config is not None:
+        cache_config = config.cache_config
+
+        # Get storage type from cache_config
+        if hasattr(cache_config, "kvcache_storage_backend"):
+            storage_backend = cache_config.kvcache_storage_backend
+            if storage_backend:
+                storage_type = _normalize_storage_type(storage_backend)
+
+        # Extract backend-specific configuration
+        if hasattr(cache_config, "kvcache_storage_config"):
+            backend_config = cache_config.kvcache_storage_config or {}
+
+    # Handle dict config
+    elif isinstance(config, dict):
+        if "storage_type" in config:
+            storage_type = _normalize_storage_type(config["storage_type"])
+            # Copy other keys as backend config
+            backend_config = {k: v for k, v in config.items() if k != "storage_type"}
+        elif "kvcache_storage_backend" in config:
+            storage_type = _normalize_storage_type(config["kvcache_storage_backend"])
+            backend_config = config.get("kvcache_storage_config", {})
+
+    # Handle StorageConfig dataclass
+    elif hasattr(config, "storage_type"):
+        storage_type = config.storage_type
+        backend_config = {
+            "storage_path": getattr(config, "storage_path", ""),
+            "max_size_bytes": getattr(config, "max_size_bytes", 0),
+            "enable_compression": getattr(config, "enable_compression", False),
+            "compression_algorithm": getattr(config, "compression_algorithm", "lz4"),
+            "connection_timeout": getattr(config, "connection_timeout", 30.0),
+            "read_timeout": getattr(config, "read_timeout", 60.0),
+            "write_timeout": getattr(config, "write_timeout", 60.0),
+            "extra_config": getattr(config, "extra_config", {}),
+        }
+
+    return storage_type, backend_config
+
+
+def _normalize_storage_type(storage_type: Any) -> Optional[str]:
+    """
+    Normalize storage type to lowercase string.
+
+    Args:
+        storage_type: Storage type (enum, string, etc.)
+
+    Returns:
+        Normalized storage type string
+    """
+    if storage_type is None:
+        return None
+
+    # Handle enum
+    if isinstance(storage_type, StorageType):
+        return storage_type.value
+
+    # Handle string
+    if isinstance(storage_type, str):
+        return storage_type.lower()
+
+    # Handle other types
+    return str(storage_type).lower()
+
+
+__all__ = [
+    "StorageScheduler",
+    "StorageConnector",
+    "create_storage_scheduler",
+    "create_storage_connector",
+]
diff --git a/fastdeploy/cache_manager/v1/storage/attnstore/__init__.py b/fastdeploy/cache_manager/v1/storage/attnstore/__init__.py
new file mode 100644
index 00000000000..823f89a2d59
--- /dev/null
+++ b/fastdeploy/cache_manager/v1/storage/attnstore/__init__.py
@@ -0,0 +1,22 @@
+"""
+# Copyright (c) 2025  PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+
+from .connector import AttnStoreConnector, AttnStoreScheduler
+
+__all__ = [
+    "AttnStoreScheduler",
+    "AttnStoreConnector",
+]
diff --git a/fastdeploy/cache_manager/v1/storage/attnstore/connector.py b/fastdeploy/cache_manager/v1/storage/attnstore/connector.py
new file mode 100644
index 00000000000..c63f1b74d68
--- /dev/null
+++ b/fastdeploy/cache_manager/v1/storage/attnstore/connector.py
@@ -0,0 +1,140 @@
+"""
+# Copyright (c) 2025  PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+
+from typing import Any, Dict, List, Optional
+
+from ..base import StorageConnector, StorageScheduler
+
+
+class AttnStoreScheduler(StorageScheduler):
+    """
+    AttnStore scheduler for Scheduler process.
+
+    Provides query operations for AttnStore system.
+    """
+
+    def __init__(self, config: Optional[Dict[str, Any]] = None):
+        """
+        Initialize AttnStore scheduler.
+
+        Args:
+            config: Configuration with keys:
+                - store_path: Base path for AttnStore
+                - cache_size: Cache size in bytes
+        """
+        super().__init__(config)
+
+    def connect(self) -> bool:
+        """Connect to AttnStore."""
+        try:
+            # Placeholder implementation
+            self._connected = True
+            return True
+        except Exception:
+            self._connected = False
+            return False
+
+    def disconnect(self) -> None:
+        """Disconnect from AttnStore."""
+        self._connected = False
+
+    def exists(self, key: str) -> bool:
+        """Check if key exists in AttnStore."""
+        if not self._connected:
+            return False
+        # Placeholder implementation
+        return False
+
+    def query(self, keys: List[str]) -> Dict[str, bool]:
+        """Query multiple keys for existence."""
+        if not self._connected:
+            return {k: False for k in keys}
+        # Placeholder implementation
+        return {k: False for k in keys}
+
+    def get_metadata(self, key: str) -> Optional[Dict[str, Any]]:
+        """Get metadata for a key."""
+        if not self._connected:
+            return None
+        # Placeholder implementation
+        return None
+
+    def list_keys(self, prefix: str = "") -> List[str]:
+        """List keys with a given prefix."""
+        if not self._connected:
+            return []
+        # Placeholder implementation
+        return []
+
+
+class AttnStoreConnector(StorageConnector):
+    """
+    AttnStore connector for Worker process.
+
+    Provides data transfer operations for AttnStore system.
+    """
+
+    def __init__(self, config: Optional[Dict[str, Any]] = None):
+        """
+        Initialize AttnStore connector.
+
+        Args:
+            config: Configuration with keys:
+                - store_path: Base path for AttnStore
+                - transfer_threads: Number of transfer threads
+        """
+        super().__init__(config)
+
+    def connect(self) -> bool:
+        """Connect to AttnStore."""
+        try:
+            self._connected = True
+            return True
+        except Exception:
+            self._connected = False
+            return False
+
+    def disconnect(self) -> None:
+        """Disconnect from AttnStore."""
+        self._connected = False
+
+    def get(self, key: str, dst_buffer: Any) -> bool:
+        """Get data from AttnStore."""
+        if not self._connected:
+            return False
+        # Placeholder implementation
+        return False
+
+    def set(self, key: str, src_buffer: Any, size: int) -> bool:
+        """Set data in AttnStore."""
+        if not self._connected:
+            return False
+        # Placeholder implementation
+        return False
+
+    def delete(self, key: str) -> bool:
+        """Delete data from AttnStore."""
+        if not self._connected:
+            return False
+        # Placeholder implementation
+        return False
+
+    def clear(self, prefix: str = "") -> int:
+        """Clear data from AttnStore."""
+        if not self._connected:
+            return 0
+        # Placeholder implementation
+        return 0
diff --git a/fastdeploy/cache_manager/v1/storage/base.py b/fastdeploy/cache_manager/v1/storage/base.py
new file mode 100644
index 00000000000..3ad64480e9d
--- /dev/null
+++ b/fastdeploy/cache_manager/v1/storage/base.py
@@ -0,0 +1,218 @@
+"""
+# Copyright (c) 2025  PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+
+import threading
+from abc import ABC, abstractmethod
+from typing import Any, Dict, List, Optional
+
+
+class StorageScheduler(ABC):
+    """
+    Abstract base class for storage scheduler operations.
+
+    Used by CacheManager (Scheduler process) to query storage
+    existence and metadata without performing actual data transfer.
+    """
+
+    def __init__(self, config: Optional[Dict[str, Any]] = None):
+        """
+        Initialize the storage scheduler.
+
+        Args:
+            config: Storage configuration
+        """
+        self.config = config or {}
+        self._lock = threading.RLock()
+        self._connected = False
+
+    @abstractmethod
+    def connect(self) -> bool:
+        """
+        Connect to the storage backend.
+
+        Returns:
+            True if connection was successful
+        """
+        pass
+
+    @abstractmethod
+    def disconnect(self) -> None:
+        """Disconnect from the storage backend."""
+        pass
+
+    @abstractmethod
+    def exists(self, key: str) -> bool:
+        """
+        Check if a key exists in storage.
+
+        Args:
+            key: Storage key to check
+
+        Returns:
+            True if key exists
+        """
+        pass
+
+    @abstractmethod
+    def query(self, keys: List[str]) -> Dict[str, bool]:
+        """
+        Query multiple keys for existence.
+
+        Args:
+            keys: List of keys to query
+
+        Returns:
+            Dictionary mapping keys to existence status
+        """
+        pass
+
+    @abstractmethod
+    def get_metadata(self, key: str) -> Optional[Dict[str, Any]]:
+        """
+        Get metadata for a key.
+
+        Args:
+            key: Storage key
+
+        Returns:
+            Metadata dictionary or None if not found
+        """
+        pass
+
+    @abstractmethod
+    def list_keys(self, prefix: str = "") -> List[str]:
+        """
+        List keys with a given prefix.
+
+        Args:
+            prefix: Key prefix to filter
+
+        Returns:
+            List of matching keys
+        """
+        pass
+
+    def is_connected(self) -> bool:
+        """Check if connected to storage."""
+        return self._connected
+
+    def get_stats(self) -> Dict[str, Any]:
+        """Get storage statistics."""
+        return {
+            "connected": self._connected,
+            "config": self.config,
+        }
+
+
+class StorageConnector(ABC):
+    """
+    Abstract base class for storage connector operations.
+
+    Used by CacheController (Worker process) to perform actual
+    data transfer operations with the storage backend.
+    """
+
+    def __init__(self, config: Optional[Dict[str, Any]] = None):
+        """
+        Initialize the storage connector.
+
+        Args:
+            config: Storage configuration
+        """
+        self.config = config or {}
+        self._lock = threading.RLock()
+        self._connected = False
+
+    @abstractmethod
+    def connect(self) -> bool:
+        """
+        Connect to the storage backend.
+
+        Returns:
+            True if connection was successful
+        """
+        pass
+
+    @abstractmethod
+    def disconnect(self) -> None:
+        """Disconnect from the storage backend."""
+        pass
+
+    @abstractmethod
+    def get(self, key: str, dst_buffer: Any) -> bool:
+        """
+        Get data from storage.
+
+        Args:
+            key: Storage key
+            dst_buffer: Destination buffer to write data
+
+        Returns:
+            True if get was successful
+        """
+        pass
+
+    @abstractmethod
+    def set(self, key: str, src_buffer: Any, size: int) -> bool:
+        """
+        Set data in storage.
+
+        Args:
+            key: Storage key
+            src_buffer: Source buffer to read data from
+            size: Size of data in bytes
+
+        Returns:
+            True if set was successful
+        """
+        pass
+
+    @abstractmethod
+    def delete(self, key: str) -> bool:
+        """
+        Delete data from storage.
+
+        Args:
+            key: Storage key to delete
+
+        Returns:
+            True if deletion was successful
+        """
+        pass
+
+    @abstractmethod
+    def clear(self, prefix: str = "") -> int:
+        """
+        Clear data from storage.
+
+        Args:
+            prefix: Key prefix to clear (empty for all)
+
+        Returns:
+            Number of keys cleared
+        """
+        pass
+
+    def is_connected(self) -> bool:
+        """Check if connected to storage."""
+        return self._connected
+
+    def get_stats(self) -> Dict[str, Any]:
+        """Get connector statistics."""
+        return {
+            "connected": self._connected,
+            "config": self.config,
+        }
diff --git a/fastdeploy/cache_manager/v1/storage/mooncake/__init__.py b/fastdeploy/cache_manager/v1/storage/mooncake/__init__.py
new file mode 100644
index 00000000000..6268bd6fd6e
--- /dev/null
+++ b/fastdeploy/cache_manager/v1/storage/mooncake/__init__.py
@@ -0,0 +1,22 @@
+"""
+# Copyright (c) 2025  PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+
+from .connector import MooncakeStorageConnector, MooncakeStorageScheduler
+
+__all__ = [
+    "MooncakeStorageScheduler",
+    "MooncakeStorageConnector",
+]
diff --git a/fastdeploy/cache_manager/v1/storage/mooncake/connector.py b/fastdeploy/cache_manager/v1/storage/mooncake/connector.py
new file mode 100644
index 00000000000..a8e0d01010d
--- /dev/null
+++ b/fastdeploy/cache_manager/v1/storage/mooncake/connector.py
@@ -0,0 +1,168 @@
+"""
+# Copyright (c) 2025  PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+
+from typing import Any, Dict, List, Optional
+
+from ..base import StorageConnector, StorageScheduler
+
+
+class MooncakeStorageScheduler(StorageScheduler):
+    """
+    Mooncake storage scheduler for Scheduler process.
+
+    Provides query operations for Mooncake distributed storage.
+    """
+
+    def __init__(self, config: Optional[Dict[str, Any]] = None):
+        """
+        Initialize Mooncake storage scheduler.
+
+        Args:
+            config: Configuration with keys:
+                - server_addr: Mooncake server address
+                - namespace: Storage namespace
+                - timeout: Connection timeout
+        """
+        super().__init__(config)
+        self._client = None
+
+    def connect(self) -> bool:
+        """Connect to Mooncake storage."""
+        try:
+            # Initialize Mooncake client
+            # This would be implemented with actual Mooncake SDK
+            # import mooncake
+            # self._client = mooncake.Client(**self.config)
+            self._connected = True
+            return True
+        except Exception:
+            self._connected = False
+            return False
+
+    def disconnect(self) -> None:
+        """Disconnect from Mooncake storage."""
+        self._client = None
+        self._connected = False
+
+    def exists(self, key: str) -> bool:
+        """Check if key exists in Mooncake storage."""
+        if not self._connected or self._client is None:
+            return False
+
+        # Placeholder implementation
+        # return self._client.exists(key)
+        return False
+
+    def query(self, keys: List[str]) -> Dict[str, bool]:
+        """Query multiple keys for existence."""
+        if not self._connected or self._client is None:
+            return {k: False for k in keys}
+
+        # Placeholder implementation
+        # return self._client.batch_exists(keys)
+        return {k: False for k in keys}
+
+    def get_metadata(self, key: str) -> Optional[Dict[str, Any]]:
+        """Get metadata for a key."""
+        if not self._connected or self._client is None:
+            return None
+
+        # Placeholder implementation
+        # return self._client.get_metadata(key)
+        return None
+
+    def list_keys(self, prefix: str = "") -> List[str]:
+        """List keys with a given prefix."""
+        if not self._connected or self._client is None:
+            return []
+
+        # Placeholder implementation
+        # return self._client.list_keys(prefix)
+        return []
+
+
+class MooncakeStorageConnector(StorageConnector):
+    """
+    Mooncake storage connector for Worker process.
+
+    Provides data transfer operations for Mooncake distributed storage.
+    """
+
+    def __init__(self, config: Optional[Dict[str, Any]] = None):
+        """
+        Initialize Mooncake storage connector.
+
+        Args:
+            config: Configuration with keys:
+                - server_addr: Mooncake server address
+                - namespace: Storage namespace
+                - transfer_timeout: Transfer timeout
+                - buffer_size: Transfer buffer size
+        """
+        super().__init__(config)
+        self._client = None
+
+    def connect(self) -> bool:
+        """Connect to Mooncake storage."""
+        try:
+            # Initialize Mooncake client
+            # This would be implemented with actual Mooncake SDK
+            self._connected = True
+            return True
+        except Exception:
+            self._connected = False
+            return False
+
+    def disconnect(self) -> None:
+        """Disconnect from Mooncake storage."""
+        self._client = None
+        self._connected = False
+
+    def get(self, key: str, dst_buffer: Any) -> bool:
+        """Get data from Mooncake storage."""
+        if not self._connected or self._client is None:
+            return False
+
+        # Placeholder implementation
+        # return self._client.get(key, dst_buffer)
+        return False
+
+    def set(self, key: str, src_buffer: Any, size: int) -> bool:
+        """Set data in Mooncake storage."""
+        if not self._connected or self._client is None:
+            return False
+
+        # Placeholder implementation
+        # return self._client.set(key, src_buffer, size)
+        return False
+
+    def delete(self, key: str) -> bool:
+        """Delete data from Mooncake storage."""
+        if not self._connected or self._client is None:
+            return False
+
+        # Placeholder implementation
+        # return self._client.delete(key)
+        return False
+
+    def clear(self, prefix: str = "") -> int:
+        """Clear data from Mooncake storage."""
+        if not self._connected or self._client is None:
+            return 0
+
+        # Placeholder implementation
+        # return self._client.clear(prefix)
+        return 0
diff --git a/fastdeploy/cache_manager/v1/transfer/__init__.py b/fastdeploy/cache_manager/v1/transfer/__init__.py
new file mode 100644
index 00000000000..eef01a4932a
--- /dev/null
+++ b/fastdeploy/cache_manager/v1/transfer/__init__.py
@@ -0,0 +1,176 @@
+"""
+# Copyright (c) 2025  PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+
+from typing import Any, Dict, Optional
+
+from .base import TransferConnector
+
+
+def create_transfer_connector(
+    config: Any,
+) -> Optional[TransferConnector]:
+    """
+    Create a TransferConnector instance based on configuration.
+
+    This is a factory function that creates the appropriate TransferConnector
+    based on the transfer backend type specified in the configuration.
+
+    Args:
+        config: Configuration object, can be:
+            - CacheConfig: FastDeploy configuration object
+            - Dict: Dictionary with 'transfer_type' and backend-specific settings
+
+    Returns:
+        TransferConnector instance if successful, None otherwise
+
+    Example:
+        # Using CacheConfig
+        connector = create_transfer_connector(fd_config)
+
+        # Using dict config
+        config = {
+            'transfer_type': 'rdma',
+            'device': 'mlx5_0',
+            'port': 1,
+        }
+        connector = create_transfer_connector(config)
+    """
+    transfer_type = _get_transfer_type(config)
+
+    if transfer_type is None:
+        return None
+
+    connector: Optional[TransferConnector] = None
+
+    # Create connector based on transfer type
+    if transfer_type == "rdma":
+        from .rdma.connector import RDMAConnector
+
+        connector = RDMAConnector(_get_backend_config(config))
+
+    elif transfer_type == "ipc":
+        from .ipc.connector import IPCConnector
+
+        connector = IPCConnector(_get_backend_config(config))
+
+    else:
+        raise ValueError(f"Unsupported transfer type: {transfer_type}. " f"Supported types: rdma, ipc")
+
+    # Attempt connection
+    if connector is not None:
+        if not connector.connect():
+            # Log warning but still return the connector
+            pass
+
+    return connector
+
+
+def _get_transfer_type(config: Any) -> Optional[str]:
+    """
+    Get transfer type from configuration.
+
+    Args:
+        config: Configuration object
+
+    Returns:
+        Transfer type string or None
+    """
+    # Handle CacheConfig (from FDConfig)
+    if hasattr(config, "kvcache_transfer_backend"):
+        transfer_backend = config.kvcache_transfer_backend
+        if transfer_backend:
+            return _normalize_transfer_type(transfer_backend)
+
+    # Handle dict config
+    if isinstance(config, dict):
+        if "transfer_type" in config:
+            return _normalize_transfer_type(config["transfer_type"])
+        elif "kvcache_transfer_backend" in config:
+            return _normalize_transfer_type(config["kvcache_transfer_backend"])
+
+    # Handle object with cache_config attribute
+    if hasattr(config, "cache_config") and config.cache_config is not None:
+        cache_config = config.cache_config
+        if hasattr(cache_config, "kvcache_transfer_backend"):
+            transfer_backend = cache_config.kvcache_transfer_backend
+            if transfer_backend:
+                return _normalize_transfer_type(transfer_backend)
+
+    return None
+
+
+def _get_backend_config(config: Any) -> Dict[str, Any]:
+    """
+    Extract backend-specific configuration.
+
+    Args:
+        config: Configuration object
+
+    Returns:
+        Dictionary with backend configuration
+    """
+    backend_config: Dict[str, Any] = {}
+
+    # Handle CacheConfig
+    if hasattr(config, "kvcache_transfer_config"):
+        backend_config = config.kvcache_transfer_config or {}
+
+    # Handle dict config
+    elif isinstance(config, dict):
+        if "transfer_config" in config:
+            backend_config = config["transfer_config"]
+        elif "kvcache_transfer_config" in config:
+            backend_config = config["kvcache_transfer_config"]
+        else:
+            # Copy all keys except transfer_type
+            backend_config = {
+                k: v for k, v in config.items() if k not in ("transfer_type", "kvcache_transfer_backend")
+            }
+
+    # Handle object with cache_config attribute
+    if hasattr(config, "cache_config") and config.cache_config is not None:
+        cache_config = config.cache_config
+        if hasattr(cache_config, "kvcache_transfer_config"):
+            backend_config = cache_config.kvcache_transfer_config or {}
+
+    return backend_config
+
+
+def _normalize_transfer_type(transfer_type: Any) -> Optional[str]:
+    """
+    Normalize transfer type to lowercase string.
+
+    Args:
+        transfer_type: Transfer type (enum, string, etc.)
+
+    Returns:
+        Normalized transfer type string
+    """
+    if transfer_type is None:
+        return None
+
+    # Handle string
+    if isinstance(transfer_type, str):
+        return transfer_type.lower()
+
+    # Handle other types
+    return str(transfer_type).lower()
+
+
+__all__ = [
+    "TransferConnector",
+    "create_transfer_connector",
+]
diff --git a/fastdeploy/cache_manager/v1/transfer/base.py b/fastdeploy/cache_manager/v1/transfer/base.py
new file mode 100644
index 00000000000..1cddc06ab07
--- /dev/null
+++ b/fastdeploy/cache_manager/v1/transfer/base.py
@@ -0,0 +1,194 @@
+"""
+# Copyright (c) 2025  PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+
+import threading
+from abc import ABC, abstractmethod
+from typing import Any, Dict, Optional
+
+
+class TransferConnector(ABC):
+    """
+    Abstract base class for transfer connector operations.
+
+    Used by CacheController (Worker process) to perform cross-node
+    and cross-process data transfer operations.
+    """
+
+    def __init__(self, config: Optional[Dict[str, Any]] = None):
+        """
+        Initialize the transfer connector.
+
+        Args:
+            config: Transfer configuration
+        """
+        self.config = config or {}
+        self._lock = threading.RLock()
+        self._connected = False
+
+    @abstractmethod
+    def connect(self) -> bool:
+        """
+        Connect to the transfer backend.
+
+        Returns:
+            True if connection was successful
+        """
+        pass
+
+    @abstractmethod
+    def disconnect(self) -> None:
+        """Disconnect from the transfer backend."""
+        pass
+
+    @abstractmethod
+    def send(
+        self,
+        dst_addr: str,
+        src_buffer: Any,
+        size: int,
+        dst_offset: int = 0,
+    ) -> bool:
+        """
+        Send data to a remote destination.
+
+        Args:
+            dst_addr: Destination address
+            src_buffer: Source buffer to read data from
+            size: Size of data in bytes
+            dst_offset: Offset at destination
+
+        Returns:
+            True if send was successful
+        """
+        pass
+
+    @abstractmethod
+    def recv(
+        self,
+        src_addr: str,
+        dst_buffer: Any,
+        size: int,
+        src_offset: int = 0,
+    ) -> bool:
+        """
+        Receive data from a remote source.
+
+        Args:
+            src_addr: Source address
+            dst_buffer: Destination buffer to write data
+            size: Size of data in bytes
+            src_offset: Offset at source
+
+        Returns:
+            True if receive was successful
+        """
+        pass
+
+    @abstractmethod
+    def send_async(
+        self,
+        dst_addr: str,
+        src_buffer: Any,
+        size: int,
+        dst_offset: int = 0,
+    ) -> Any:
+        """
+        Asynchronously send data to a remote destination.
+
+        Args:
+            dst_addr: Destination address
+            src_buffer: Source buffer to read data from
+            size: Size of data in bytes
+            dst_offset: Offset at destination
+
+        Returns:
+            Handle for tracking the async operation
+        """
+        pass
+
+    @abstractmethod
+    def recv_async(
+        self,
+        src_addr: str,
+        dst_buffer: Any,
+        size: int,
+        src_offset: int = 0,
+    ) -> Any:
+        """
+        Asynchronously receive data from a remote source.
+
+        Args:
+            src_addr: Source address
+            dst_buffer: Destination buffer to write data
+            size: Size of data in bytes
+            src_offset: Offset at source
+
+        Returns:
+            Handle for tracking the async operation
+        """
+        pass
+
+    @abstractmethod
+    def wait(self, handle: Any, timeout: float = -1) -> bool:
+        """
+        Wait for an async operation to complete.
+
+        Args:
+            handle: Handle from send_async or recv_async
+            timeout: Timeout in seconds (-1 for infinite)
+
+        Returns:
+            True if operation completed successfully
+        """
+        pass
+
+    @abstractmethod
+    def register_buffer(self, buffer: Any, addr: str) -> bool:
+        """
+        Register a buffer for RDMA operations.
+
+        Args:
+            buffer: Buffer to register
+            addr: Address to associate with buffer
+
+        Returns:
+            True if registration was successful
+        """
+        pass
+
+    @abstractmethod
+    def unregister_buffer(self, addr: str) -> bool:
+        """
+        Unregister a buffer.
+
+        Args:
+            addr: Address of buffer to unregister
+
+        Returns:
+            True if unregistration was successful
+        """
+        pass
+
+    def is_connected(self) -> bool:
+        """Check if connected to transfer backend."""
+        return self._connected
+
+    def get_stats(self) -> Dict[str, Any]:
+        """Get connector statistics."""
+        return {
+            "connected": self._connected,
+            "config": self.config,
+        }
diff --git a/fastdeploy/cache_manager/v1/transfer/ipc/__init__.py b/fastdeploy/cache_manager/v1/transfer/ipc/__init__.py
new file mode 100644
index 00000000000..231c16ecbf7
--- /dev/null
+++ b/fastdeploy/cache_manager/v1/transfer/ipc/__init__.py
@@ -0,0 +1,21 @@
+"""
+# Copyright (c) 2025  PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+
+from .connector import IPCConnector
+
+__all__ = [
+    "IPCConnector",
+]
diff --git a/fastdeploy/cache_manager/v1/transfer/ipc/connector.py b/fastdeploy/cache_manager/v1/transfer/ipc/connector.py
new file mode 100644
index 00000000000..18914253b2e
--- /dev/null
+++ b/fastdeploy/cache_manager/v1/transfer/ipc/connector.py
@@ -0,0 +1,201 @@
+"""
+# Copyright (c) 2025  PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+
+import mmap
+import os
+from typing import Any, Dict, Optional
+
+from ..base import TransferConnector
+
+
+class IPCConnector(TransferConnector):
+    """
+    IPC connector for cross-process transfer on same node.
+
+    Uses shared memory for efficient data transfer between
+    processes on the same machine.
+    """
+
+    def __init__(self, config: Optional[Dict[str, Any]] = None):
+        """
+        Initialize IPC connector.
+
+        Args:
+            config: Configuration with keys:
+                - shm_path: Shared memory path prefix
+                - buffer_size: Default buffer size
+                - max_buffers: Maximum number of buffers
+        """
+        super().__init__(config)
+        self._shm_buffers: Dict[str, mmap.mmap] = {}
+        self._shm_paths: Dict[str, str] = {}
+
+    def connect(self) -> bool:
+        """Connect to IPC backend."""
+        try:
+            self._connected = True
+            return True
+        except Exception:
+            self._connected = False
+            return False
+
+    def disconnect(self) -> None:
+        """Disconnect from IPC backend."""
+        # Clean up shared memory
+        for name, shm in self._shm_buffers.items():
+            try:
+                shm.close()
+            except Exception:
+                pass
+
+        # Remove shared memory files
+        for name, path in self._shm_paths.items():
+            try:
+                os.unlink(path)
+            except Exception:
+                pass
+
+        self._shm_buffers.clear()
+        self._shm_paths.clear()
+        self._connected = False
+
+    def send(
+        self,
+        dst_addr: str,
+        src_buffer: Any,
+        size: int,
+        dst_offset: int = 0,
+    ) -> bool:
+        """Send data via shared memory."""
+        if not self._connected:
+            return False
+
+        if dst_addr not in self._shm_buffers:
+            return False
+
+        try:
+            shm = self._shm_buffers[dst_addr]
+            shm.seek(dst_offset)
+            shm.write(src_buffer[:size])
+            return True
+        except Exception:
+            return False
+
+    def recv(
+        self,
+        src_addr: str,
+        dst_buffer: Any,
+        size: int,
+        src_offset: int = 0,
+    ) -> bool:
+        """Receive data via shared memory."""
+        if not self._connected:
+            return False
+
+        if src_addr not in self._shm_buffers:
+            return False
+
+        try:
+            shm = self._shm_buffers[src_addr]
+            shm.seek(src_offset)
+            data = shm.read(size)
+            dst_buffer[:size] = data
+            return True
+        except Exception:
+            return False
+
+    def send_async(
+        self,
+        dst_addr: str,
+        src_buffer: Any,
+        size: int,
+        dst_offset: int = 0,
+    ) -> Any:
+        """Asynchronously send data via shared memory."""
+        # For shared memory, async is similar to sync
+        success = self.send(dst_addr, src_buffer, size, dst_offset)
+        return {"success": success, "addr": dst_addr}
+
+    def recv_async(
+        self,
+        src_addr: str,
+        dst_buffer: Any,
+        size: int,
+        src_offset: int = 0,
+    ) -> Any:
+        """Asynchronously receive data via shared memory."""
+        # For shared memory, async is similar to sync
+        success = self.recv(src_addr, dst_buffer, size, src_offset)
+        return {"success": success, "addr": src_addr}
+
+    def wait(self, handle: Any, timeout: float = -1) -> bool:
+        """Wait for IPC operation completion."""
+        if handle is None:
+            return False
+        return handle.get("success", False)
+
+    def register_buffer(self, buffer: Any, addr: str) -> bool:
+        """Register a shared memory buffer."""
+        if not self._connected:
+            return False
+
+        try:
+            # Create shared memory file
+            shm_path = f"/dev/shm/kv_cache_{addr}"
+            shm_fd = os.open(shm_path, os.O_CREAT | os.O_RDWR, 0o666)
+
+            # Size the file
+            buffer_size = len(buffer) if hasattr(buffer, "__len__") else self.config.get("buffer_size", 1024 * 1024)
+            os.ftruncate(shm_fd, buffer_size)
+
+            # Map the file
+            shm = mmap.mmap(shm_fd, buffer_size)
+            os.close(shm_fd)
+
+            self._shm_buffers[addr] = shm
+            self._shm_paths[addr] = shm_path
+
+            return True
+        except Exception:
+            return False
+
+    def unregister_buffer(self, addr: str) -> bool:
+        """Unregister a shared memory buffer."""
+        if addr not in self._shm_buffers:
+            return False
+
+        try:
+            self._shm_buffers[addr].close()
+            del self._shm_buffers[addr]
+
+            if addr in self._shm_paths:
+                os.unlink(self._shm_paths[addr])
+                del self._shm_paths[addr]
+
+            return True
+        except Exception:
+            return False
+
+    def get_stats(self) -> Dict[str, Any]:
+        """Get IPC connector statistics."""
+        stats = super().get_stats()
+        stats.update(
+            {
+                "registered_buffers": len(self._shm_buffers),
+                "buffer_addresses": list(self._shm_buffers.keys()),
+            }
+        )
+        return stats
diff --git a/fastdeploy/cache_manager/v1/transfer/rdma/__init__.py b/fastdeploy/cache_manager/v1/transfer/rdma/__init__.py
new file mode 100644
index 00000000000..f300bf93bc8
--- /dev/null
+++ b/fastdeploy/cache_manager/v1/transfer/rdma/__init__.py
@@ -0,0 +1,21 @@
+"""
+# Copyright (c) 2025  PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+
+from .connector import RDMAConnector
+
+__all__ = [
+    "RDMAConnector",
+]
diff --git a/fastdeploy/cache_manager/v1/transfer/rdma/connector.py b/fastdeploy/cache_manager/v1/transfer/rdma/connector.py
new file mode 100644
index 00000000000..4f306ad10fe
--- /dev/null
+++ b/fastdeploy/cache_manager/v1/transfer/rdma/connector.py
@@ -0,0 +1,173 @@
+"""
+# Copyright (c) 2025  PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+
+from typing import Any, Dict, Optional
+
+from ..base import TransferConnector
+
+
+class RDMAConnector(TransferConnector):
+    """
+    RDMA connector for high-performance cross-node transfer.
+
+    Uses RDMA for zero-copy, low-latency data transfer between
+    nodes in PD separation deployments.
+    """
+
+    def __init__(self, config: Optional[Dict[str, Any]] = None):
+        """
+        Initialize RDMA connector.
+
+        Args:
+            config: Configuration with keys:
+                - device: RDMA device name
+                - port: RDMA port
+                - max_wr: Maximum work requests
+                - buffer_size: Buffer size for transfers
+        """
+        super().__init__(config)
+        self._pd = None  # Protection domain
+        self._cq = None  # Completion queue
+        self._qp = None  # Queue pair
+        self._mr = None  # Memory region
+        self._buffers: Dict[str, Any] = {}
+
+    def connect(self) -> bool:
+        """Connect to RDMA backend."""
+        try:
+            # Initialize RDMA resources
+            # This would be implemented with actual RDMA libraries
+            # import pyverbs
+            # self._pd = pyverbs.PD(...)
+            # self._cq = pyverbs.CQ(...)
+            # self._qp = pyverbs.QP(...)
+            self._connected = True
+            return True
+        except Exception:
+            self._connected = False
+            return False
+
+    def disconnect(self) -> None:
+        """Disconnect from RDMA backend."""
+        self._buffers.clear()
+        self._mr = None
+        self._qp = None
+        self._cq = None
+        self._pd = None
+        self._connected = False
+
+    def send(
+        self,
+        dst_addr: str,
+        src_buffer: Any,
+        size: int,
+        dst_offset: int = 0,
+    ) -> bool:
+        """Send data via RDMA write."""
+        if not self._connected:
+            return False
+
+        # Placeholder implementation
+        # This would use RDMA write operations
+        # self._qp.post_send(...)
+        # self._cq.poll()
+        return False
+
+    def recv(
+        self,
+        src_addr: str,
+        dst_buffer: Any,
+        size: int,
+        src_offset: int = 0,
+    ) -> bool:
+        """Receive data via RDMA read."""
+        if not self._connected:
+            return False
+
+        # Placeholder implementation
+        # This would use RDMA read operations
+        # self._qp.post_recv(...)
+        # self._cq.poll()
+        return False
+
+    def send_async(
+        self,
+        dst_addr: str,
+        src_buffer: Any,
+        size: int,
+        dst_offset: int = 0,
+    ) -> Any:
+        """Asynchronously send data via RDMA."""
+        if not self._connected:
+            return None
+
+        # Placeholder implementation
+        # Return a work request handle
+        return None
+
+    def recv_async(
+        self,
+        src_addr: str,
+        dst_buffer: Any,
+        size: int,
+        src_offset: int = 0,
+    ) -> Any:
+        """Asynchronously receive data via RDMA."""
+        if not self._connected:
+            return None
+
+        # Placeholder implementation
+        # Return a work request handle
+        return None
+
+    def wait(self, handle: Any, timeout: float = -1) -> bool:
+        """Wait for RDMA operation completion."""
+        if not self._connected:
+            return False
+
+        # Placeholder implementation
+        # Poll completion queue for the work request
+        return False
+
+    def register_buffer(self, buffer: Any, addr: str) -> bool:
+        """Register a buffer for RDMA operations."""
+        if not self._connected:
+            return False
+
+        try:
+            # Register memory region for RDMA
+            # self._mr = pyverbs.MR(self._pd, buffer, ...)
+            self._buffers[addr] = buffer
+            return True
+        except Exception:
+            return False
+
+    def unregister_buffer(self, addr: str) -> bool:
+        """Unregister a buffer."""
+        if addr in self._buffers:
+            del self._buffers[addr]
+            return True
+        return False
+
+    def get_stats(self) -> Dict[str, Any]:
+        """Get RDMA connector statistics."""
+        stats = super().get_stats()
+        stats.update(
+            {
+                "registered_buffers": len(self._buffers),
+            }
+        )
+        return stats
diff --git a/fastdeploy/cache_manager/v1/transfer_manager.py b/fastdeploy/cache_manager/v1/transfer_manager.py
new file mode 100644
index 00000000000..12552da030d
--- /dev/null
+++ b/fastdeploy/cache_manager/v1/transfer_manager.py
@@ -0,0 +1,663 @@
+"""
+# Copyright (c) 2025  PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+
+import threading
+from typing import TYPE_CHECKING, Any, Dict, List, Optional
+
+import paddle
+from paddleformers.utils.log import logger
+
+# Import cupy for independent CUDA stream management
+try:
+    import cupy as cp
+
+    _HAS_CUPY = True
+except ImportError:
+    _HAS_CUPY = False
+    logger.warning("cupy not available, falling back to synchronous transfers")
+
+# Import ops for cache swap
+from fastdeploy.cache_manager.ops import (
+    swap_cache_per_layer,  # sync fallback (used when cupy not available)
+)
+from fastdeploy.cache_manager.ops import (
+    swap_cache_per_layer_async,  # async per-layer op (no cudaStreamSynchronize)
+)
+from fastdeploy.cache_manager.ops import swap_cache_all_layers
+from fastdeploy.cache_manager.v1.storage import create_storage_connector
+from fastdeploy.cache_manager.v1.transfer import create_transfer_connector
+
+if TYPE_CHECKING:
+    from fastdeploy.config import FDConfig
+
+
+class CacheTransferManager:
+    """
+    KV Cache Transfer Manager.
+
+    H2D (load): layer-by-layer on _input_stream, overlaps with forward compute.
+    D2H (evict): all-layers on _output_stream, fire-and-forget.
+
+    Data organization:
+    1. Name-indexed storage (_cache_kvs_map, _host_cache_kvs_map): for building layer indices
+    2. Layer-indexed storage (_device_key_caches, etc.): passed to swap operators
+
+    Attributes:
+        config: FDConfig instance.
+    """
+
+    def __init__(
+        self,
+        config: "FDConfig",
+        local_rank: int = 0,
+        device_id: int = 0,
+    ):
+        """
+        Initialize the transfer manager.
+
+        Args:
+            config: FDConfig instance.
+            local_rank: Local rank for tensor parallel.
+            device_id: Device ID.
+        """
+        self.config = config
+        self.cache_config = config.cache_config
+        self.quant_config = config.quant_config
+
+        self._local_rank = local_rank
+        self._device_id = device_id
+        self._num_layers = config.model_config.num_hidden_layers
+        self._cache_dtype = config.cache_config.cache_dtype
+        self._num_host_blocks = self.cache_config.num_cpu_blocks or 0
+
+        self._lock = threading.RLock()
+
+        # ============ Async Transfer Streams (cupy-based) ============
+        # Two independent CUDA streams for fully async transfer
+        # _input_stream: H2D transfer (load to device, layer-by-layer)
+        # _output_stream: D2H transfer (evict to host, all-layers)
+        # They run in parallel without waiting for each other
+        # Using cupy to avoid affecting Paddle's internal stream state
+        if _HAS_CUPY and paddle.is_compiled_with_cuda():
+            cupy_current_device = cp.cuda.runtime.getDevice()
+            logger.info(
+                f"[TransferManager] Creating streams: local_rank={self._local_rank}, device_id={self._device_id}, "
+                f"cupy_current_device={cupy_current_device}"
+            )
+            with cp.cuda.Device(self._device_id):
+                self._input_stream = cp.cuda.Stream(non_blocking=False)
+                self._output_stream = cp.cuda.Stream(non_blocking=False)
+            logger.info(
+                f"[TransferManager] Using cupy streams: input={id(self._input_stream)}, output={id(self._output_stream)}"
+            )
+        else:
+            self._input_stream = None
+            self._output_stream = None
+            logger.warning("[TransferManager] cupy not available, async transfers disabled")
+
+        # ============ KV Cache Data Storage ============
+        # Name-indexed storage (used to build layer-indexed structures below)
+        self._cache_kvs_map: Dict[str, Any] = {}
+        self._host_cache_kvs_map: Dict[str, Any] = {}
+
+        # Layer-indexed lists (for all-layer transfers, compatible with swap_cache_all_layers operator)
+        # Device cache tensors per layer (GPU)
+        self._device_key_caches: List[Any] = []  # key cache per layer
+        self._device_value_caches: List[Any] = []  # value cache per layer
+        self._device_key_scales: List[Any] = []  # key scales (fp8)
+        self._device_value_scales: List[Any] = []  # value scales (fp8)
+
+        # Host cache pointers per layer (CPU pinned memory)
+        self._host_key_ptrs: List[int] = []  # key host pointers
+        self._host_value_ptrs: List[int] = []  # value host pointers
+        self._host_key_scales_ptrs: List[int] = []  # key scale pointers (fp8)
+        self._host_value_scales_ptrs: List[int] = []  # value scale pointers (fp8)
+
+        # ============ Connectors (for future use) ============
+        self._storage_connector = create_storage_connector(self.cache_config)
+        self._transfer_connector = create_transfer_connector(self.cache_config)
+
+    # ============ Cache Map Setters ============
+
+    @property
+    def cache_kvs_map(self) -> Dict[str, Any]:
+        return self._cache_kvs_map
+
+    def set_cache_kvs_map(self, cache_kvs_map: Dict[str, Any]) -> None:
+        """
+        Share the KV cache tensor map from CacheController.
+
+        Args:
+            cache_kvs_map: Dictionary mapping cache names to tensors.
+                Format: {
+                    "key_caches_{layer_id}_rank{rank}.device{device}": paddle.Tensor,
+                    "value_caches_{layer_id}_rank{rank}.device{device}": paddle.Tensor,
+                    "key_cache_scales_{layer_id}_rank{rank}.device{device}": paddle.Tensor,  # fp8
+                    "value_cache_scales_{layer_id}_rank{rank}.device{device}": paddle.Tensor, # fp8
+                    ...
+                }
+        """
+        with self._lock:
+            self._cache_kvs_map = cache_kvs_map
+            self._build_device_layer_indices()
+
+    def _build_device_layer_indices(self) -> None:
+        """Build layer-indexed Device cache lists from _cache_kvs_map."""
+        if not self._cache_kvs_map:
+            return
+
+        self._device_key_caches = []
+        self._device_value_caches = []
+        self._device_key_scales = []
+        self._device_value_scales = []
+
+        for layer_idx in range(self._num_layers):
+            key_name = f"key_caches_{layer_idx}_rank{self._local_rank}.device{self._device_id}"
+            val_name = f"value_caches_{layer_idx}_rank{self._local_rank}.device{self._device_id}"
+            key_scale_name = f"key_cache_scales_{layer_idx}_rank{self._local_rank}.device{self._device_id}"
+            val_scale_name = f"value_cache_scales_{layer_idx}_rank{self._local_rank}.device{self._device_id}"
+
+            self._device_key_caches.append(self._cache_kvs_map.get(key_name))
+            self._device_value_caches.append(self._cache_kvs_map.get(val_name))
+
+            if self._is_fp8_quantization():
+                self._device_key_scales.append(self._cache_kvs_map.get(key_scale_name))
+                self._device_value_scales.append(self._cache_kvs_map.get(val_scale_name))
+
+    @property
+    def host_cache_kvs_map(self) -> Dict[str, Any]:
+        return self._host_cache_kvs_map
+
+    def set_host_cache_kvs_map(self, host_cache_kvs_map: Dict[str, Any]) -> None:
+        """
+        Share the Host KV cache tensor map from CacheController.
+
+        Args:
+            host_cache_kvs_map: Dictionary mapping cache names to Host pointers (int).
+                Format: {
+                    "key_caches_{layer_id}_rank{rank}.device{device}": pointer (int),
+                    ...
+                }
+        """
+        with self._lock:
+            self._host_cache_kvs_map = host_cache_kvs_map
+            self._build_host_layer_indices()
+
+    def _build_host_layer_indices(self) -> None:
+        """Build layer-indexed Host pointer lists from _host_cache_kvs_map."""
+        if self._num_host_blocks <= 0:
+            return
+        if not self._host_cache_kvs_map:
+            return
+        if self._num_layers == 0:
+            return
+
+        self._host_key_ptrs = []
+        self._host_value_ptrs = []
+        self._host_key_scales_ptrs = []
+        self._host_value_scales_ptrs = []
+
+        for layer_idx in range(self._num_layers):
+            key_name = f"key_caches_{layer_idx}_rank{self._local_rank}.device{self._device_id}"
+            val_name = f"value_caches_{layer_idx}_rank{self._local_rank}.device{self._device_id}"
+            key_scale_name = f"key_cache_scales_{layer_idx}_rank{self._local_rank}.device{self._device_id}"
+            val_scale_name = f"value_cache_scales_{layer_idx}_rank{self._local_rank}.device{self._device_id}"
+
+            self._host_key_ptrs.append(self._host_cache_kvs_map.get(key_name, 0))
+            self._host_value_ptrs.append(self._host_cache_kvs_map.get(val_name, 0))
+
+            if self._is_fp8_quantization():
+                self._host_key_scales_ptrs.append(self._host_cache_kvs_map.get(key_scale_name, 0))
+                self._host_value_scales_ptrs.append(self._host_cache_kvs_map.get(val_scale_name, 0))
+
+    # ============ Metadata Properties ============
+
+    def _get_kv_cache_quant_type(self) -> Optional[str]:
+        """Get KV cache quantization type."""
+        if (
+            self.quant_config
+            and hasattr(self.quant_config, "kv_cache_quant_type")
+            and self.quant_config.kv_cache_quant_type is not None
+        ):
+            return self.quant_config.kv_cache_quant_type
+        return None
+
+    def _is_fp8_quantization(self, quant_type: Optional[str] = None) -> bool:
+        """Check if using fp8 quantization."""
+        if quant_type is None:
+            quant_type = self._get_kv_cache_quant_type()
+        return quant_type == "block_wise_fp8"
+
+    @property
+    def num_layers(self) -> int:
+        return self._num_layers
+
+    @property
+    def local_rank(self) -> int:
+        return self._local_rank
+
+    @property
+    def device_id(self) -> int:
+        return self._device_id
+
+    @property
+    def cache_dtype(self) -> str:
+        return self._cache_dtype
+
+    @property
+    def has_cache_scale(self) -> bool:
+        """Check if cache has scale tensors (fp8)."""
+        return self._is_fp8_quantization()
+
+    @property
+    def num_host_blocks(self) -> int:
+        return self._num_host_blocks
+
+    # ============ Layer Indexed Access ============
+
+    def get_device_key_cache(self, layer_idx: int) -> Optional[Any]:
+        """Get Device key cache tensor for a specific layer."""
+        if 0 <= layer_idx < len(self._device_key_caches):
+            return self._device_key_caches[layer_idx]
+        return None
+
+    def get_device_value_cache(self, layer_idx: int) -> Optional[Any]:
+        """Get Device value cache tensor for a specific layer."""
+        if 0 <= layer_idx < len(self._device_value_caches):
+            return self._device_value_caches[layer_idx]
+        return None
+
+    def get_host_key_ptr(self, layer_idx: int) -> int:
+        """Get Host key cache pointer for a specific layer."""
+        if self._num_host_blocks <= 0:
+            return 0
+        if 0 <= layer_idx < len(self._host_key_ptrs):
+            return self._host_key_ptrs[layer_idx]
+        return 0
+
+    def get_host_value_ptr(self, layer_idx: int) -> int:
+        """Get Host value cache pointer for a specific layer."""
+        if self._num_host_blocks <= 0:
+            return 0
+        if 0 <= layer_idx < len(self._host_value_ptrs):
+            return self._host_value_ptrs[layer_idx]
+        return 0
+
+    # ============ Internal Sync Fallbacks (used when cupy not available) ============
+
+    def _swap_all_layers(
+        self,
+        device_block_ids: List[int],
+        host_block_ids: List[int],
+        mode: int,
+    ) -> bool:
+        """
+        Synchronous all-layer transfer fallback (used when cupy streams unavailable).
+
+        Args:
+            device_block_ids: Device block IDs to swap.
+            host_block_ids: Host block IDs to swap.
+            mode: 0=Device→Host (evict), 1=Host→Device (load).
+        """
+        if self._num_host_blocks <= 0:
+            return False
+
+        try:
+            swap_cache_all_layers(
+                self._device_key_caches,
+                self._host_key_ptrs,
+                self._num_host_blocks,
+                device_block_ids,
+                host_block_ids,
+                self._device_id,
+                mode,
+            )
+            swap_cache_all_layers(
+                self._device_value_caches,
+                self._host_value_ptrs,
+                self._num_host_blocks,
+                device_block_ids,
+                host_block_ids,
+                self._device_id,
+                mode,
+            )
+            if self._is_fp8_quantization() and self._device_key_scales and self._host_key_scales_ptrs:
+                swap_cache_all_layers(
+                    self._device_key_scales,
+                    self._host_key_scales_ptrs,
+                    self._num_host_blocks,
+                    device_block_ids,
+                    host_block_ids,
+                    self._device_id,
+                    mode,
+                )
+                swap_cache_all_layers(
+                    self._device_value_scales,
+                    self._host_value_scales_ptrs,
+                    self._num_host_blocks,
+                    device_block_ids,
+                    host_block_ids,
+                    self._device_id,
+                    mode,
+                )
+            return True
+        except Exception:
+            import traceback
+
+            traceback.print_exc()
+            return False
+
+    def _swap_single_layer(
+        self,
+        layer_idx: int,
+        device_block_ids: List[int],
+        host_block_ids: List[int],
+        mode: int,
+    ) -> bool:
+        """
+        Synchronous single-layer transfer fallback (used when cupy streams unavailable).
+
+        Args:
+            layer_idx: Layer index to transfer.
+            device_block_ids: Device block IDs to swap.
+            host_block_ids: Host block IDs to swap.
+            mode: 0=Device→Host (evict), 1=Host→Device (load).
+        """
+        if self._num_host_blocks <= 0:
+            return False
+        if not device_block_ids or not host_block_ids:
+            return False
+        if len(device_block_ids) != len(host_block_ids):
+            return False
+
+        try:
+            key_cache = self.get_device_key_cache(layer_idx)
+            value_cache = self.get_device_value_cache(layer_idx)
+            if key_cache is None or value_cache is None:
+                return False
+
+            key_ptr = self.get_host_key_ptr(layer_idx)
+            value_ptr = self.get_host_value_ptr(layer_idx)
+            if key_ptr == 0 or value_ptr == 0:
+                return False
+
+            swap_cache_per_layer(
+                key_cache,
+                key_ptr,
+                self._num_host_blocks,
+                device_block_ids,
+                host_block_ids,
+                self._device_id,
+                mode,
+            )
+            swap_cache_per_layer(
+                value_cache,
+                value_ptr,
+                self._num_host_blocks,
+                device_block_ids,
+                host_block_ids,
+                self._device_id,
+                mode,
+            )
+            return True
+        except Exception:
+            import traceback
+
+            traceback.print_exc()
+            return False
+
+    # ============ Async Transfer Methods ============
+
+    def _swap_all_layers_async(
+        self,
+        device_block_ids: List[int],
+        host_block_ids: List[int],
+        mode: int,
+    ) -> bool:
+        """
+        Async all-layer transfer on dedicated stream.
+
+        D2H uses _output_stream (fire-and-forget).
+        H2D uses _input_stream (but H2D always goes through _swap_single_layer_async).
+        Falls back to _swap_all_layers if cupy not available.
+
+        Args:
+            device_block_ids: Device block IDs to swap.
+            host_block_ids: Host block IDs to swap.
+            mode: 0=Device→Host (evict), 1=Host→Device (load).
+        """
+        if self._num_host_blocks <= 0:
+            return False
+
+        if self._input_stream is None or self._output_stream is None:
+            return self._swap_all_layers(device_block_ids, host_block_ids, mode)
+
+        stream = self._output_stream if mode == 0 else self._input_stream
+        try:
+            cupy_current_device = cp.cuda.runtime.getDevice()
+            logger.debug(
+                f"[TransferManager] _swap_all_layers_async: local_rank={self._local_rank}, device_id={self._device_id}, "
+                f"cupy_current_device={cupy_current_device}, stream_device={stream.device_id}, mode={mode}"
+            )
+            with cp.cuda.Device(self._device_id):
+                with stream:
+                    swap_cache_all_layers(
+                        self._device_key_caches,
+                        self._host_key_ptrs,
+                        self._num_host_blocks,
+                        device_block_ids,
+                        host_block_ids,
+                        self._device_id,
+                        mode,
+                    )
+                    swap_cache_all_layers(
+                        self._device_value_caches,
+                        self._host_value_ptrs,
+                        self._num_host_blocks,
+                        device_block_ids,
+                        host_block_ids,
+                        self._device_id,
+                        mode,
+                    )
+                    if self._is_fp8_quantization() and self._device_key_scales and self._host_key_scales_ptrs:
+                        swap_cache_all_layers(
+                            self._device_key_scales,
+                            self._host_key_scales_ptrs,
+                            self._num_host_blocks,
+                            device_block_ids,
+                            host_block_ids,
+                            self._device_id,
+                            mode,
+                        )
+                        swap_cache_all_layers(
+                            self._device_value_scales,
+                            self._host_value_scales_ptrs,
+                            self._num_host_blocks,
+                            device_block_ids,
+                            host_block_ids,
+                            self._device_id,
+                            mode,
+                        )
+            return True
+        except Exception:
+            import traceback
+
+            traceback.print_exc()
+            return False
+
+    def _swap_single_layer_async(
+        self,
+        layer_idx: int,
+        device_block_ids: List[int],
+        host_block_ids: List[int],
+        mode: int,
+    ) -> bool:
+        """
+        Async single-layer transfer on _input_stream (H2D) or _output_stream (D2H).
+
+        Falls back to _swap_single_layer if cupy not available.
+
+        Args:
+            layer_idx: Layer index to transfer.
+            device_block_ids: Device block IDs to swap.
+            host_block_ids: Host block IDs to swap.
+            mode: 0=Device→Host (evict), 1=Host→Device (load).
+        """
+        if self._num_host_blocks <= 0:
+            return False
+
+        if self._input_stream is None or self._output_stream is None:
+            return self._swap_single_layer(layer_idx, device_block_ids, host_block_ids, mode)
+
+        stream = self._output_stream if mode == 0 else self._input_stream
+        key_cache = self.get_device_key_cache(layer_idx)
+        value_cache = self.get_device_value_cache(layer_idx)
+        if key_cache is None or value_cache is None:
+            return False
+
+        key_ptr = self.get_host_key_ptr(layer_idx)
+        value_ptr = self.get_host_value_ptr(layer_idx)
+        if key_ptr == 0 or value_ptr == 0:
+            return False
+
+        try:
+            with cp.cuda.Device(self._device_id):
+                with stream:
+                    swap_cache_per_layer_async(
+                        key_cache,
+                        key_ptr,
+                        self._num_host_blocks,
+                        device_block_ids,
+                        host_block_ids,
+                        self._device_id,
+                        mode,
+                    )
+                    swap_cache_per_layer_async(
+                        value_cache,
+                        value_ptr,
+                        self._num_host_blocks,
+                        device_block_ids,
+                        host_block_ids,
+                        self._device_id,
+                        mode,
+                    )
+            return True
+        except Exception:
+            import traceback
+
+            traceback.print_exc()
+            return False
+
+    # ============ Public Async API ============
+
+    def evict_to_host_async(
+        self,
+        device_block_ids: List[int],
+        host_block_ids: List[int],
+    ) -> bool:
+        """
+        Async evict all layers of KV Cache from Device to Host (D2H).
+
+        Runs on _output_stream, fire-and-forget.
+
+        Args:
+            device_block_ids: Device block IDs to evict.
+            host_block_ids: Host block IDs to receive.
+        """
+        return self._swap_all_layers_async(device_block_ids, host_block_ids, mode=0)
+
+    def load_layers_to_device_async(
+        self,
+        layer_indices: List[int],
+        host_block_ids: List[int],
+        device_block_ids: List[int],
+        on_layer_complete: Optional[callable] = None,
+    ) -> bool:
+        """
+        Async load KV Cache from Host to Device layer-by-layer (H2D).
+
+        Each layer runs on _input_stream. Overlaps with forward compute:
+        the callback is invoked after each layer's kernel is submitted so
+        the forward thread can start using that layer's data once the event fires.
+
+        Args:
+            layer_indices: Layer indices to load.
+            host_block_ids: Host block IDs to load from.
+            device_block_ids: Device block IDs to receive.
+            on_layer_complete: Optional callback(layer_idx) after each layer is submitted.
+        """
+        if self._num_host_blocks <= 0:
+            return False
+
+        all_success = True
+        for layer_idx in layer_indices:
+            success = self._swap_single_layer_async(layer_idx, device_block_ids, host_block_ids, mode=1)
+            if not success:
+                all_success = False
+            if on_layer_complete is not None:
+                try:
+                    on_layer_complete(layer_idx)
+                except Exception:
+                    pass
+        return all_success
+
+    # ============ Stream Utilities ============
+
+    def sync_input_stream(self):
+        """Wait for all pending _input_stream (H2D) transfers to complete."""
+        if self._input_stream is not None:
+            self._input_stream.synchronize()
+
+    def sync_output_stream(self):
+        """Wait for all pending _output_stream (D2H) transfers to complete."""
+        if self._output_stream is not None:
+            self._output_stream.synchronize()
+
+    def record_input_stream_event(self) -> Any:
+        """
+        Record a CUDA event on _input_stream and return it.
+
+        Used by _on_layer_complete callback in CacheController so that
+        LayerDoneCounter.wait_for_layer() can synchronize on the actual
+        H2D transfer stream rather than Paddle's default stream.
+
+        Returns:
+            cupy.cuda.Event if cupy streams are available, else None.
+        """
+        if not _HAS_CUPY or self._input_stream is None:
+            return None
+        try:
+            with cp.cuda.Device(self._device_id):
+                event = cp.cuda.Event()
+                with self._input_stream:
+                    event.record()
+            return event
+        except Exception as e:
+            logger.warning(f"[TransferManager] Failed to record input_stream event: {e}")
+            return None
+
+    def get_stats(self) -> Dict[str, Any]:
+        """Get transfer manager statistics."""
+        return {
+            "num_layers": self._num_layers,
+            "local_rank": self._local_rank,
+            "device_id": self._device_id,
+            "cache_dtype": self._cache_dtype,
+            "num_host_blocks": self._num_host_blocks,
+            "has_device_cache": len(self._device_key_caches) > 0,
+            "has_host_cache": len(self._host_key_ptrs) > 0,
+            "is_fp8": self._is_fp8_quantization(),
+        }
diff --git a/fastdeploy/config.py b/fastdeploy/config.py
index b15a6dc824b..b2568eb919f 100644
--- a/fastdeploy/config.py
+++ b/fastdeploy/config.py
@@ -1576,7 +1576,8 @@ def __init__(self, args):
         self.enable_output_caching = False
         self.disable_chunked_mm_input = False
         self.kvcache_storage_backend = None
-        self.write_policy = None
+        self.write_policy = "write_through_selective"
+        self.write_through_threshold = 2
         self.num_cpu_blocks = None
         self.use_mla_cache = envs.FD_ATTENTION_BACKEND == "MLA_ATTN"
 
@@ -1584,6 +1585,10 @@ def __init__(self, args):
             if hasattr(self, key):
                 setattr(self, key, value)
 
+        # ENABLE_V1_KVCACHE_MANAGER=0 uses the old cache_transfer_manager subprocess which only supports write_through.
+        if not envs.ENABLE_V1_KVCACHE_MANAGER:
+            self.write_policy = "write_through"
+
         self.cache_queue_port = parse_ports(self.cache_queue_port)
         self.rdma_comm_ports = parse_ports(self.rdma_comm_ports)
         self.pd_comm_port = parse_ports(self.pd_comm_port)
@@ -1639,6 +1644,15 @@ def _verify_args(self):
         if self.kv_cache_ratio > 1.0:
             raise ValueError("KV cache ratio must be less than 1.0. Got " f"{self.kv_cache_ratio}.")
 
+        if envs.ENABLE_V1_KVCACHE_MANAGER:
+            allowed_write_policies = ["write_through_selective", "write_back", "write_through"]
+        else:
+            allowed_write_policies = ["write_through"]
+        if self.write_policy not in allowed_write_policies:
+            raise ValueError(
+                f"Invalid write_policy: {self.write_policy!r}. " f"Expected one of {allowed_write_policies}."
+            )
+
     def postprocess(self, num_total_tokens, number_of_tasks):
         """
         calculate block num
@@ -2110,6 +2124,21 @@ def postprocess(self):
                 "Static Graph does not support to be started together with RL Training, and automatically switch to dynamic graph!"
             )
 
+        # Layer-by-layer swap (H2D) is always incompatible with CUDA Graph prefill capture.
+        # Force only decode to use CUDA Graph when host cache is configured.
+        if (
+            self.cache_config is not None
+            and self.cache_config.num_cpu_blocks
+            and self.graph_opt_config.cudagraph_only_prefill
+        ):
+            original_value = self.graph_opt_config.cudagraph_only_prefill
+            self.graph_opt_config.cudagraph_only_prefill = False
+            logger.warning(
+                f"[CacheConfig] Layer-by-layer swap-in is incompatible "
+                f"with CUDA Graph prefill capture. Forcing cudagraph_only_prefill=False "
+                f"(only decode will use CUDA Graph). Original cudagraph_only_prefill={original_value}"
+            )
+
         if (
             not current_platform.is_cuda()
             and not current_platform.is_maca()
diff --git a/fastdeploy/engine/args_utils.py b/fastdeploy/engine/args_utils.py
index afb7095a449..bc9a0369f16 100644
--- a/fastdeploy/engine/args_utils.py
+++ b/fastdeploy/engine/args_utils.py
@@ -250,9 +250,13 @@ class EngineArgs:
     """
     The storage backend for kvcache storage. If set, it will use the kvcache storage backend.
     """
-    write_policy: str = "write_through"
+    write_policy: str = "write_through_selective"
     """
-    The policy of write cache to storage.
+    The policy of write cache to storage. Options: write_through (alias for write_through_selective with threshold=1), write_through_selective, write_back.
+    """
+    write_through_threshold: int = 2
+    """
+    The threshold of hit count for write_through_selective policy. Only effective when write_policy is write_through_selective.
     """
 
     # System configuration parameters
@@ -1131,11 +1135,18 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
         cache_group.add_argument(
             "--write-policy",
             type=str,
-            choices=["write_through"],
+            choices=["write_through", "write_through_selective", "write_back"],
             default=EngineArgs.write_policy,
             help="KVCache write policy",
         )
 
+        cache_group.add_argument(
+            "--write-through-threshold",
+            type=int,
+            default=EngineArgs.write_through_threshold,
+            help="Hit count threshold for write_through_selective policy. Only effective when write_policy is write_through_selective.",
+        )
+
         # Cluster system parameters group
         system_group = parser.add_argument_group("System Configuration")
         system_group.add_argument(
diff --git a/fastdeploy/engine/common_engine.py b/fastdeploy/engine/common_engine.py
index fdc160735b7..e8e710839ad 100644
--- a/fastdeploy/engine/common_engine.py
+++ b/fastdeploy/engine/common_engine.py
@@ -209,6 +209,11 @@ def __init__(self, cfg: FDConfig, start_queue=True, use_async_llm=False):
             self.ipc_signal_suffix = None
             self.cache_manager_processes = None
 
+        if envs.ENABLE_V1_KVCACHE_MANAGER:
+            from fastdeploy.cache_manager.v1.cache_utils import get_request_block_hasher
+
+            self._block_hasher = get_request_block_hasher(block_size=self.cfg.cache_config.block_size)
+
         self._finalizer = weakref.finalize(self, self._exit_sub_services)
 
     def start(self, async_llm_pid=None):
@@ -245,7 +250,11 @@ def start_worker_service(self, async_llm_pid=None):
         self.launch_components()
 
         # If block number is specified and model is deployed in splitwise mode, start cache manager first
-        if not self.do_profile and self.cfg.scheduler_config.splitwise_role != "mixed":
+        if (
+            not self.do_profile
+            and self.cfg.scheduler_config.splitwise_role != "mixed"
+            and not envs.ENABLE_V1_KVCACHE_MANAGER
+        ):
             device_ids = self.cfg.parallel_config.device_ids.split(",")
             self.cache_manager_processes = self.start_cache_service(device_ids, self.ipc_signal_suffix)
 
@@ -277,7 +286,11 @@ def check_worker_initialize_status_func(res: dict):
         # and then start the cache manager
         if self.do_profile:
             self._stop_profile()
-        elif self.cfg.scheduler_config.splitwise_role == "mixed" and self.cfg.cache_config.enable_prefix_caching:
+        elif (
+            self.cfg.scheduler_config.splitwise_role == "mixed"
+            and self.cfg.cache_config.enable_prefix_caching
+            and not envs.ENABLE_V1_KVCACHE_MANAGER
+        ):
             device_ids = self.cfg.parallel_config.device_ids.split(",")
             self.cache_manager_processes = self.start_cache_service(device_ids, self.ipc_signal_suffix)
 
@@ -452,19 +465,20 @@ def start_worker_queue_service(self, start_queue):
                         self.cfg.parallel_config.local_engine_worker_queue_port,
                     )
 
-            if self.cfg.cache_config.enable_prefix_caching or self.cfg.scheduler_config.splitwise_role != "mixed":
-                self.llm_logger.info(
-                    f"Starting engine cache queue server service at {self.cfg.cache_config.local_cache_queue_port}"
-                )
-                self.cache_task_queue = EngineCacheQueue(
-                    address=(self.cfg.master_ip, self.cfg.cache_config.local_cache_queue_port),
-                    authkey=b"cache_queue_service",
-                    is_server=True,
-                    num_client=self.cfg.parallel_config.tensor_parallel_size,
-                    client_id=-1,
-                    local_data_parallel_size=self.cfg.parallel_config.data_parallel_size,
-                )
-                self.cfg.cache_config.local_cache_queue_port = self.cache_task_queue.get_server_port()
+            if not envs.ENABLE_V1_KVCACHE_MANAGER:
+                if self.cfg.cache_config.enable_prefix_caching or self.cfg.scheduler_config.splitwise_role != "mixed":
+                    self.llm_logger.info(
+                        f"Starting engine cache queue server service at {self.cfg.cache_config.local_cache_queue_port}"
+                    )
+                    self.cache_task_queue = EngineCacheQueue(
+                        address=(self.cfg.master_ip, self.cfg.cache_config.local_cache_queue_port),
+                        authkey=b"cache_queue_service",
+                        is_server=True,
+                        num_client=self.cfg.parallel_config.tensor_parallel_size,
+                        client_id=-1,
+                        local_data_parallel_size=self.cfg.parallel_config.data_parallel_size,
+                    )
+                    self.cfg.cache_config.local_cache_queue_port = self.cache_task_queue.get_server_port()
 
         self.engine_worker_queue = EngineWorkerQueue(
             address=address,
@@ -880,6 +894,10 @@ def _fetch_request():
                     task.metrics.engine_get_req_time = time.time()
                     trace_print(LoggingEventName.REQUEST_QUEUE_END, task.request_id, getattr(task, "user", ""))
 
+                    # cache_manager_v1 set block_hasher to request
+                    if hasattr(self, "_block_hasher"):
+                        task.set_block_hasher(self._block_hasher)
+
                 if self.cfg.scheduler_config.splitwise_role == "decode":
                     # TODO: refine scheduler to remove this limitation
                     # Decode will process and schedule the request sent by prefill to engine,
@@ -1044,12 +1062,12 @@ def _fetch_request():
                 if hasattr(self.resource_manager, "scheduler_unhandled_request_num"):
                     self.resource_manager.scheduler_unhandled_request_num = self._get_scheduler_unhandled_request_num()
                 # 2. Schedule requests
-                tasks, error_tasks = self.resource_manager.schedule()
+                batch_request, error_tasks = self.resource_manager.schedule()
 
                 # 3. Send to engine
-                if tasks:
+                if len(batch_request) > 0:
                     if self.cfg.scheduler_config.splitwise_role == "decode":
-                        for task in tasks:
+                        for task in batch_request:
                             if task.task_type == RequestType.PREEMPTED:
                                 msg = f"{task.request_id} decode not enough blocks, need to be rescheduled."
                                 self.llm_logger.error(msg)
@@ -1064,7 +1082,7 @@ def _fetch_request():
                                     ]
                                 )
                     self.resource_manager.get_real_bsz()
-                    for task in tasks:
+                    for task in batch_request:
                         if task.task_type == RequestType.PREFILL:
                             rid = task.request_id.split("_")[0]
                             if isinstance(task, Request) and task.has_been_preempted_before:
@@ -1099,13 +1117,13 @@ def _fetch_request():
                                 task.metrics.decode_inference_start_time = time.time()
                             elif not task.has_been_preempted_before:
                                 task.metrics.inference_start_time = time.time()
-                    self.engine_worker_queue.put_tasks((tasks, self.resource_manager.real_bsz))
+                    self.engine_worker_queue.put_tasks((batch_request, self.resource_manager.real_bsz))
                 else:
                     # When there are no actual tasks to schedule, send an empty task batch to EP workers.
                     # This helps EP workers barrier for syncing tasks not hang.
                     if self.cfg.parallel_config.enable_expert_parallel:
                         self.engine_worker_queue.put_tasks(
-                            ([], self.resource_manager.real_bsz)
+                            (batch_request, self.resource_manager.real_bsz)
                         )  # Empty (as idle tasks for ep)
 
                 # 4. Response error tasks
@@ -1116,7 +1134,7 @@ def _fetch_request():
                             continue
                         self._send_error_response(request_id, failed)
 
-                if not tasks and not error_tasks:
+                if len(batch_request) <= 0 and not error_tasks:
                     time.sleep(0.005)
 
             except RuntimeError as e:
@@ -2493,6 +2511,8 @@ def _stop_profile(self):
         self.cfg.cache_config.reset(num_gpu_blocks)
         self.resource_manager.reset_cache_config(self.cfg.cache_config)
         if self.cfg.cache_config.enable_prefix_caching or self.cfg.scheduler_config.splitwise_role != "mixed":
+            if envs.ENABLE_V1_KVCACHE_MANAGER:
+                return
             device_ids = self.cfg.parallel_config.device_ids.split(",")
             self.cache_manager_processes = self.start_cache_service(device_ids, self.ipc_signal_suffix)
 
diff --git a/fastdeploy/engine/engine.py b/fastdeploy/engine/engine.py
index 9f78f8584ac..a0a66468e53 100644
--- a/fastdeploy/engine/engine.py
+++ b/fastdeploy/engine/engine.py
@@ -177,7 +177,7 @@ def check_worker_initialize_status_func(res: dict):
         if self.do_profile:
             self._stop_profile()
         elif self.cfg.scheduler_config.splitwise_role == "mixed" and self.cfg.cache_config.enable_prefix_caching:
-            if not current_platform.is_intel_hpu():
+            if not current_platform.is_intel_hpu() and not envs.ENABLE_V1_KVCACHE_MANAGER:
                 device_ids = self.cfg.parallel_config.device_ids.split(",")
                 self.cache_manager_processes = self.engine.start_cache_service(device_ids, self.ipc_signal_suffix)
 
@@ -758,7 +758,7 @@ def _stop_profile(self):
         self.cfg.cache_config.reset(num_gpu_blocks)
         self.engine.resource_manager.reset_cache_config(self.cfg.cache_config)
         if self.cfg.cache_config.enable_prefix_caching or self.cfg.scheduler_config.splitwise_role != "mixed":
-            if not current_platform.is_intel_hpu():
+            if not current_platform.is_intel_hpu() and not envs.ENABLE_V1_KVCACHE_MANAGER:
                 device_ids = self.cfg.parallel_config.device_ids.split(",")
                 self.cache_manager_processes = self.engine.start_cache_service(device_ids, self.ipc_signal_suffix)
 
diff --git a/fastdeploy/engine/request.py b/fastdeploy/engine/request.py
index 0e95cd5e1fb..dcdbc33a522 100644
--- a/fastdeploy/engine/request.py
+++ b/fastdeploy/engine/request.py
@@ -21,16 +21,20 @@
 import traceback
 from dataclasses import asdict, dataclass, fields
 from enum import Enum
-from typing import Any, Dict, Generic, Optional
+from typing import TYPE_CHECKING, Any, Dict, Generic, List, Optional
 from typing import TypeVar as TypingTypeVar
 from typing import Union
 
+if TYPE_CHECKING:
+    from fastdeploy.cache_manager.v1.metadata import MatchResult
+
 import numpy as np
 from fastapi.responses import JSONResponse
 from pydantic import BaseModel
 from typing_extensions import TypeVar
 
 from fastdeploy import envs
+from fastdeploy.cache_manager.v1.metadata import CacheSwapMetadata
 from fastdeploy.engine.pooling_params import PoolingParams
 from fastdeploy.engine.sampling_params import SamplingParams
 from fastdeploy.entrypoints.openai.protocol import (
@@ -130,6 +134,8 @@ def __init__(
         # from PoolingRequest
         add_special_tokens: Optional[bool] = False,
         zmq_worker_pid: Optional[int] = None,
+        # block hasher for dynamic hash computation
+        block_hasher: Optional[callable] = None,
     ) -> None:
         self.request_id = request_id
         self.prompt = prompt
@@ -143,11 +149,18 @@ def __init__(
         self.tools = tools
         # model specific token ids: end of sentence token ids
         self.eos_token_ids = eos_token_ids
-        self.num_cached_tokens = 0
-        self.num_cached_blocks = 0
         self.disable_chat_template = disable_chat_template
         self.disaggregate_info = disaggregate_info
 
+        # prefix caching related
+        self.num_cached_tokens = 0
+        self.num_cached_blocks = 0
+        self._prompt_hashes: list[str] = []
+        self._block_hasher = block_hasher
+        self._match_result: Optional[MatchResult] = None
+        self.cache_swap_metadata: list[CacheSwapMetadata] = []
+        self.cache_evict_metadata: list[CacheSwapMetadata] = []
+
         # speculative method in disaggregate-mode
         self.draft_token_ids = draft_token_ids
 
@@ -220,6 +233,38 @@ def __init__(
         self.add_special_tokens = add_special_tokens
         self.zmq_worker_pid = zmq_worker_pid
 
+    @property
+    def prompt_hashes(self) -> list[str]:
+        """
+        Dynamically get prompt_hashes, automatically computing new block hashes.
+
+        When accessing this property, it checks if there are new complete blocks
+        that need hash computation, and if so, computes and appends them.
+        """
+        if self._block_hasher is not None:
+            new_hashes = self._block_hasher(self)
+            if new_hashes:
+                self._prompt_hashes.extend(new_hashes)
+        return self._prompt_hashes
+
+    @property
+    def match_result(self) -> MatchResult:
+        return self._match_result
+
+    def set_block_hasher(self, block_hasher: callable):
+        """Set the block hasher for dynamic hash computation."""
+        self._block_hasher = block_hasher
+
+    def pop_cache_swap_metadata(self) -> list[CacheSwapMetadata]:
+        result = self.cache_swap_metadata
+        self.cache_swap_metadata = []
+        return result
+
+    def pop_cache_evict_metadata(self) -> list[CacheSwapMetadata]:
+        result = self.cache_evict_metadata
+        self.cache_evict_metadata = []
+        return result
+
     @classmethod
     def _process_guided_json(cls, r: T):
         guided_json_object = None
@@ -414,6 +459,9 @@ def __getstate__(self):
             # Skip attributes that are known to contain unpicklable objects
             if key == "async_process_futures":
                 filtered_dict[key] = []
+            elif key == "_block_hasher":
+                # Skip _block_hasher (closure function, cannot be pickled)
+                continue
             else:
                 filtered_dict[key] = value
 
@@ -548,6 +596,127 @@ def __contains__(self, key: str) -> bool:
         return hasattr(self, key)
 
 
+class BatchRequest:
+    def __init__(self):
+        self.requests: list[Request] = []
+
+        self.cache_swap_metadata: Optional[CacheSwapMetadata] = None
+        self.cache_evict_metadata: Optional[CacheSwapMetadata] = None
+
+    def add_request(self, request):
+        if hasattr(request, "cache_swap_metadata") and request.cache_swap_metadata:
+            self.append_swap_metadata(request.pop_cache_swap_metadata())
+            request.cache_swap_metadata = []
+        if hasattr(request, "cache_evict_metadata") and request.cache_evict_metadata:
+            self.append_evict_metadata(request.pop_cache_evict_metadata())
+            request.cache_evict_metadata = []
+
+        self.requests.append(request)
+
+    def append_swap_metadata(self, metadata: List[CacheSwapMetadata]):
+        for meta in metadata:
+            if self.cache_swap_metadata:
+                self.cache_swap_metadata.src_block_ids.extend(meta.src_block_ids)
+                self.cache_swap_metadata.dst_block_ids.extend(meta.dst_block_ids)
+                self.cache_swap_metadata.hash_values.extend(meta.hash_values)
+            else:
+                self.cache_swap_metadata = CacheSwapMetadata(
+                    src_block_ids=meta.src_block_ids,
+                    dst_block_ids=meta.dst_block_ids,
+                    src_type="host",
+                    dst_type="device",
+                    hash_values=meta.hash_values,
+                )
+
+    def append_evict_metadata(self, metadata: List[CacheSwapMetadata]):
+        for meta in metadata:
+            if self.cache_evict_metadata:
+                self.cache_evict_metadata.src_block_ids.extend(meta.src_block_ids)
+                self.cache_evict_metadata.dst_block_ids.extend(meta.dst_block_ids)
+                self.cache_evict_metadata.hash_values.extend(meta.hash_values)
+            else:
+                self.cache_evict_metadata = CacheSwapMetadata(
+                    src_block_ids=meta.src_block_ids,
+                    dst_block_ids=meta.dst_block_ids,
+                    src_type="device",
+                    dst_type="host",
+                    hash_values=meta.hash_values,
+                )
+
+    def __repr__(self):
+        requests_repr = repr(self.requests)
+        return f"BatchRequest(requests={requests_repr}, swap_metadata={self.cache_swap_metadata}, evict_metadata={self.cache_evict_metadata})"
+
+    def __getstate__(self):
+        state = self.__dict__.copy()
+        state["requests"] = [req.__getstate__() if hasattr(req, "__getstate__") else req for req in state["requests"]]
+        return state
+
+    def __setstate__(self, state):
+        self.__dict__.update(state)
+        restored_requests = []
+        for req_data in self.requests:
+            if isinstance(req_data, dict):
+                req = Request.__new__(Request)
+                req.__dict__.update(req_data)
+                restored_requests.append(req)
+            else:
+                restored_requests.append(req_data)
+        self.requests = restored_requests
+
+    def __iter__(self):
+        for req in self.requests:
+            yield req
+
+    def __getitem__(self, index):
+        return self.requests[index]
+
+    def __len__(self):
+        return len(self.requests)
+
+    def append(self, batch_request: "BatchRequest"):
+        self.requests.extend(batch_request.requests)
+        if batch_request.cache_swap_metadata:
+            self.append_swap_metadata([batch_request.cache_swap_metadata])
+        if batch_request.cache_evict_metadata:
+            self.append_evict_metadata([batch_request.cache_evict_metadata])
+
+    def extend(self, batch_requests: list["BatchRequest"]):
+        for br in batch_requests:
+            self.append(br)
+
+    @classmethod
+    def from_tasks(cls, tasks: list) -> tuple["BatchRequest", list, int]:
+        """Classify tasks from the engine worker queue into inference requests and control requests.
+
+        Args:
+            tasks: List of (payload, real_bsz) tuples from task_queue.get_tasks().
+                   payload is one of: BatchRequest, List[Request], or [ControlRequest].
+
+        Returns:
+            (batch_request, control_reqs, max_occupied_batch_index)
+              - batch_request: merged BatchRequest containing all inference requests
+              - control_reqs: list of ControlRequest objects
+              - max_occupied_batch_index: real_bsz of the last inference task batch
+        """
+        batch_request = cls()
+        control_reqs = []
+        max_occupied_batch_index = 0
+
+        for payload, bsz in tasks:
+            if len(payload) > 0 and isinstance(payload[0], ControlRequest):
+                control_reqs.append(payload[0])
+            else:
+                max_occupied_batch_index = int(bsz)
+                if isinstance(payload, cls):
+                    batch_request.append(payload)
+                else:
+                    for req in payload:
+                        batch_request.add_request(req)
+
+        return batch_request, control_reqs, max_occupied_batch_index
+
+
 class ControlRequest:
     """A generic control request that supports method and args for control operations.
 
diff --git a/fastdeploy/engine/resource_manager.py b/fastdeploy/engine/resource_manager.py
index 609c88533bd..452a699b131 100644
--- a/fastdeploy/engine/resource_manager.py
+++ b/fastdeploy/engine/resource_manager.py
@@ -20,7 +20,7 @@
 
 import numpy as np
 
-from fastdeploy.cache_manager.prefix_cache_manager import PrefixCacheManager
+from fastdeploy import envs
 from fastdeploy.metrics.metrics import main_process_metrics
 from fastdeploy.utils import llm_logger
 
@@ -53,7 +53,17 @@ def __init__(
         self.max_num_seqs = max_num_seqs
         self.stop_flags = [True] * max_num_seqs  # flag set to true if the slot has not been taken
         self.enable_prefix_cache = config.cache_config.enable_prefix_caching
-        self.cache_manager = PrefixCacheManager(config, tensor_parallel_size, splitwise_role, local_data_parallel_id)
+        self.enable_cache_manager_v1 = envs.ENABLE_V1_KVCACHE_MANAGER
+        if self.enable_cache_manager_v1:
+            from fastdeploy.cache_manager.v1 import CacheManager
+
+            self.cache_manager = CacheManager(config)
+        else:
+            from fastdeploy.cache_manager.prefix_cache_manager import PrefixCacheManager
+
+            self.cache_manager = PrefixCacheManager(
+                config, tensor_parallel_size, splitwise_role, local_data_parallel_id
+            )
         self.tasks_list = [None] * max_num_seqs  # task slots
         self.req_dict = dict()
         # current batch status of the engine
diff --git a/fastdeploy/engine/sched/resource_manager_v1.py b/fastdeploy/engine/sched/resource_manager_v1.py
index 632c6672345..34f4626b92f 100644
--- a/fastdeploy/engine/sched/resource_manager_v1.py
+++ b/fastdeploy/engine/sched/resource_manager_v1.py
@@ -21,8 +21,8 @@
 from collections import deque
 from collections.abc import Iterable
 from concurrent.futures import ThreadPoolExecutor
-from dataclasses import dataclass
-from typing import Union
+from dataclasses import dataclass, field
+from typing import List, Union
 
 import numpy as np
 import paddle
@@ -32,8 +32,10 @@
     EncoderCacheManager,
     ProcessorCacheManager,
 )
+from fastdeploy.cache_manager.v1.metadata import CacheSwapMetadata
 from fastdeploy.config import ErnieArchitectures
 from fastdeploy.engine.request import (
+    BatchRequest,
     ImagePosition,
     Request,
     RequestOutput,
@@ -53,46 +55,61 @@
 
 
 @dataclass
-class ScheduledDecodeTask:
+class ScheduledTaskBase:
     """
-    Task for allocating new blocks to decode.
+    Task for Scheduled.
     """
 
     idx: int
     request_id: str
-    block_tables: list[int]
     task_type: RequestType = RequestType.DECODE
 
+    cache_swap_metadata: list[CacheSwapMetadata] = field(default_factory=list)
+    cache_evict_metadata: list[CacheSwapMetadata] = field(default_factory=list)
+
+    def pop_cache_swap_metadata(self) -> list[CacheSwapMetadata]:
+        result = self.cache_swap_metadata
+        self.cache_swap_metadata = []
+        return result
+
+    def pop_cache_evict_metadata(self) -> list[CacheSwapMetadata]:
+        result = self.cache_evict_metadata
+        self.cache_evict_metadata = []
+        return result
+
 
 @dataclass
-class ScheduledPreemptTask:
+class ScheduledDecodeTask(ScheduledTaskBase):
+    """
+    Task for allocating new blocks to decode.
+    """
+
+    block_tables: list[int] = field(default_factory=list)
+
+
+@dataclass
+class ScheduledPreemptTask(ScheduledTaskBase):
     """
     Task for terminating inference to recycle resource.
     """
 
-    idx: int
-    request_id: str
     task_type: RequestType = RequestType.PREEMPTED
 
 
 @dataclass
-class ScheduledExtendBlocksTask:
+class ScheduledExtendBlocksTask(ScheduledTaskBase):
     """
     Task for allocating new blocks to extend.
     """
 
-    idx: int
-    request_id: str
-    extend_block_tables: list[int]
     task_type: RequestType = RequestType.EXTEND
+    extend_block_tables: list[int] = field(default_factory=list)
 
 
 @dataclass
-class ScheduledAbortTask:
+class ScheduledAbortTask(ScheduledTaskBase):
     """Task for allocating new blocks to skip."""
 
-    idx: int
-    request_id: str
     task_type: RequestType = RequestType.ABORT
 
 
@@ -243,6 +260,7 @@ def get_new_block_nums(self, request: Request, num_new_tokens: int):
             block_num = min(block_num + 1, self.config.cache_config.max_block_num_per_seq)
         else:
             block_num = min(block_num, self.config.cache_config.max_block_num_per_seq)
+
         return block_num
 
     def _prepare_prefill_task(self, request, new_token_num):
@@ -252,13 +270,29 @@ def _prepare_prefill_task(self, request, new_token_num):
         return request
 
     def _prepare_decode_task(self, request):
-        return ScheduledDecodeTask(idx=request.idx, request_id=request.request_id, block_tables=request.block_tables)
+        return ScheduledDecodeTask(
+            idx=request.idx,
+            request_id=request.request_id,
+            block_tables=request.block_tables,
+            cache_swap_metadata=request.pop_cache_swap_metadata(),
+            cache_evict_metadata=request.pop_cache_evict_metadata(),
+        )
 
     def _prepare_preempt_task(self, request):
-        return ScheduledPreemptTask(idx=request.idx, request_id=request.request_id)
+        return ScheduledPreemptTask(
+            idx=request.idx,
+            request_id=request.request_id,
+            cache_swap_metadata=request.pop_cache_swap_metadata(),
+            cache_evict_metadata=request.pop_cache_evict_metadata(),
+        )
 
     def _prepare_abort_task(self, request):
-        return ScheduledAbortTask(idx=request.idx, request_id=request.request_id)
+        return ScheduledAbortTask(
+            idx=request.idx,
+            request_id=request.request_id,
+            cache_swap_metadata=request.pop_cache_swap_metadata(),
+            cache_evict_metadata=request.pop_cache_evict_metadata(),
+        )
 
     def reschedule_preempt_task(self, request_id, process_func=None):
         with self.lock:
@@ -284,14 +318,14 @@ def recycle_abort_task(self, request_id):
                 self.to_be_aborted_req_id_set.remove(request_id)
         self.update_metrics()
 
-    def _trigger_abort(self, request_id, scheduled_reqs):
+    def _trigger_abort(self, request_id, batch_request):
         if request_id in self.requests:
             abort_request = self.requests[request_id]
             abort_request.status = RequestStatus.PREEMPTED
             abort_request.num_computed_tokens = 0
             self._free_blocks(abort_request)  # 释放KV cache blocks
             abort_request.cached_block_num = 0
-            scheduled_reqs.append(self._prepare_abort_task(abort_request))
+            batch_request.add_request(self._prepare_abort_task(abort_request))
             self.to_be_aborted_req_id_set.add(request_id)
             self.waiting_abort_req_id_set.remove(request_id)
 
@@ -347,7 +381,7 @@ def wait_worker_inflight_requests_finish(self, timeout=60):
                 f"still {len(self.to_be_rescheduled_request_id_set)} requests running"
             )
 
-    def _trigger_preempt(self, request, num_new_blocks, preempted_reqs, scheduled_reqs):
+    def _trigger_preempt(self, request, num_new_blocks, preempted_reqs, batch_request):
         """
         If the request cannot be scheduled, preempt the running request one by one until it can be scheduled. Last in, first out.
         """
@@ -378,7 +412,7 @@ def _trigger_preempt(self, request, num_new_blocks, preempted_reqs, scheduled_re
                     )
                     llm_logger.info(f"Preemption is triggered! Preempted request id: {preempted_req.request_id}")
                 preempted_reqs.append(preempted_req)
-                scheduled_reqs.append(self._prepare_preempt_task(preempted_req))
+                batch_request.add_request(self._prepare_preempt_task(preempted_req))
 
                 llm_logger.debug(
                     f"preempt {preempted_req.request_id} in idx {preempted_req.idx} with generated ids {preempted_req.output_token_ids}"
@@ -717,18 +751,12 @@ def _compute_audio_prefix_count(end_idx, end_patch_idx):
         # Compatible with scenarios without images and videos.
         return num_new_tokens
 
-    def exist_mm_prefill(self, scheduled_reqs):
-        for request in scheduled_reqs:
+    def exist_mm_prefill(self, batch_request):
+        for request in batch_request:
             if request.task_type == RequestType.PREFILL and self._is_mm_request(request):
                 return True
         return False
 
-    def exist_prefill(self, scheduled_reqs):
-        for request in scheduled_reqs:
-            if request.task_type == RequestType.PREFILL:
-                return True
-        return False
-
     def add_abort_req_ids(self, req_ids):
         with self.lock:
             if isinstance(req_ids, list):
@@ -751,19 +779,19 @@ def schedule(self):
         Try to pull a batch of requests from the waiting queue and schedule them.
         """
 
-        def get_enough_request(request, scheduled_reqs):
+        def get_enough_request(request, batch_request):
             return (
                 ErnieArchitectures.is_ernie5_arch(self.config.model_config.architectures)
                 and self._is_mm_request(request)
-                and self.exist_mm_prefill(scheduled_reqs)
+                and self.exist_mm_prefill(batch_request)
             )
 
         with self.lock:
-            scheduled_reqs: list[Request] = []
             preempted_reqs: list[Request] = []
             error_reqs: list[tuple[str, str]] = []
             token_budget = self.config.scheduler_config.max_num_batched_tokens
             need_abort_requests = []  # users trigger abortion
+            batch_request = BatchRequest()
 
             # First, schedule the RUNNING requests.
             req_index = 0
@@ -785,7 +813,7 @@ def get_enough_request(request, scheduled_reqs):
                         request.num_computed_tokens = request.num_total_tokens - 1
 
                     if request.request_id in self.waiting_abort_req_id_set:
-                        self._trigger_abort(request.request_id, scheduled_reqs)
+                        self._trigger_abort(request.request_id, batch_request)
                         req_index += 1
                         need_abort_requests.append(request)
                         continue
@@ -800,27 +828,23 @@ def get_enough_request(request, scheduled_reqs):
                                 f"schedule decoding task: {request} request.num_total_tokens {request.num_total_tokens} request.num_computed_tokens {request.num_computed_tokens}"
                             )
                             request.block_tables.extend(
-                                self.cache_manager.allocate_gpu_blocks(
-                                    self.config.cache_config.enc_dec_block_num, request.request_id
-                                )
+                                self._allocate_gpu_blocks(request, self.config.cache_config.enc_dec_block_num)
                             )
                             # Prepare decoding task
-                            scheduled_reqs.append(self._prepare_decode_task(request))
+                            batch_request.add_request(self._prepare_decode_task(request))
                         else:
                             # Not enough blocks to allocate, trigger preemption
                             can_schedule = self._trigger_preempt(
-                                request, self.config.cache_config.enc_dec_block_num, preempted_reqs, scheduled_reqs
+                                request, self.config.cache_config.enc_dec_block_num, preempted_reqs, batch_request
                             )
                             if not can_schedule:
                                 break
                             # Allocation for next decoding blocks
                             request.block_tables.extend(
-                                self.cache_manager.allocate_gpu_blocks(
-                                    self.config.cache_config.enc_dec_block_num, request.request_id
-                                )
+                                self._allocate_gpu_blocks(request, self.config.cache_config.enc_dec_block_num)
                             )
                             # Prepare decoding task
-                            scheduled_reqs.append(self._prepare_decode_task(request))
+                            batch_request.add_request(self._prepare_decode_task(request))
                         num_decoding_req_nums += 1
                     token_budget -= 1
                     if (
@@ -832,10 +856,8 @@ def get_enough_request(request, scheduled_reqs):
                         def _allocate_decode_and_extend():
                             allocate_block_num = self.need_block_num_map[request.request_id].consume()
                             # Prepare decoding task
-                            request.block_tables.extend(
-                                self.cache_manager.allocate_gpu_blocks(allocate_block_num, request.request_id)
-                            )
-                            scheduled_reqs.append(self._prepare_decode_task(request))
+                            request.block_tables.extend(self._allocate_gpu_blocks(request, allocate_block_num))
+                            batch_request.add_request(self._prepare_decode_task(request))
 
                             # Prepare extend task
                             reuse_block_num = request.num_total_tokens // self.config.cache_config.block_size
@@ -847,14 +869,14 @@ def _allocate_decode_and_extend():
                             self.reuse_block_num_map[request.request_id] = reuse_block_num
 
                             request.extend_block_tables = request.block_tables[:reuse_block_num]  # copy prompt cache
-                            request.extend_block_tables.extend(
-                                self.cache_manager.allocate_gpu_blocks(allocate_block_num, request.request_id)
-                            )
-                            scheduled_reqs.append(
+                            request.extend_block_tables.extend(self._allocate_gpu_blocks(request, allocate_block_num))
+                            batch_request.add_request(
                                 ScheduledExtendBlocksTask(
                                     idx=request.idx,
                                     request_id=request.request_id,
                                     extend_block_tables=request.extend_block_tables,
+                                    cache_swap_metadata=request.pop_cache_swap_metadata(),
+                                    cache_evict_metadata=request.pop_cache_evict_metadata(),
                                 )
                             )
                             llm_logger.debug(f"extend blocks is {request.extend_block_tables}")
@@ -871,7 +893,7 @@ def _allocate_decode_and_extend():
                                 request,
                                 2 * self.need_block_num_map[request.request_id].watch(),
                                 preempted_reqs,
-                                scheduled_reqs,
+                                batch_request,
                             )
 
                             if can_schedule:
@@ -892,7 +914,7 @@ def _allocate_decode_and_extend():
                     ):
                         req_index += 1
                         continue
-                    if get_enough_request(request, scheduled_reqs):
+                    if get_enough_request(request, batch_request):
                         req_index += 1
                         continue
                     num_new_tokens = self._get_num_new_tokens(request, token_budget)
@@ -902,25 +924,22 @@ def _allocate_decode_and_extend():
                     num_new_block = self.get_new_block_nums(request, num_new_tokens)
                     # Allocate blocks to prefill
                     if self.cache_manager.can_allocate_gpu_blocks(num_new_block):
-                        request.block_tables.extend(
-                            self.cache_manager.allocate_gpu_blocks(num_new_block, request.request_id)
-                        )
+                        request.block_tables.extend(self._allocate_gpu_blocks(request, num_new_block))
                         # Prepare prefill task
-                        scheduled_reqs.append(self._prepare_prefill_task(request, num_new_tokens))
+                        batch_request.add_request(self._prepare_prefill_task(request, num_new_tokens))
                     else:  # Not enough blocks to allocate, trigger preemption
-                        can_schedule = self._trigger_preempt(request, num_new_block, preempted_reqs, scheduled_reqs)
+                        can_schedule = self._trigger_preempt(request, num_new_block, preempted_reqs, batch_request)
                         if not can_schedule:
                             break
-                        request.block_tables.extend(
-                            self.cache_manager.allocate_gpu_blocks(num_new_block, request.request_id)
-                        )
+                        request.block_tables.extend(self._allocate_gpu_blocks(request, num_new_block))
                         # Prepare prefill task
-                        scheduled_reqs.append(self._prepare_prefill_task(request, num_new_tokens))
+                        batch_request.add_request(self._prepare_prefill_task(request, num_new_tokens))
                     token_budget -= num_new_tokens
                     request.num_computed_tokens += num_new_tokens
                     if (
                         self.config.cache_config.enable_prefix_caching
                         and self.config.scheduler_config.splitwise_role != "decode"
+                        and not self.enable_cache_manager_v1
                     ):
                         self.cache_manager.update_cache_blocks(
                             request, self.config.cache_config.block_size, request.num_computed_tokens
@@ -939,7 +958,7 @@ def _allocate_decode_and_extend():
                         break
 
                     request = self.waiting[0]
-                    if get_enough_request(request, scheduled_reqs):
+                    if get_enough_request(request, batch_request):
                         break
                     if request.status == RequestStatus.WAITING:
                         result = self.waiting_async_process(request)
@@ -956,15 +975,16 @@ def _allocate_decode_and_extend():
                         self._update_mm_hashes(request)
                         # Enable prefix caching
                         if self.config.cache_config.enable_prefix_caching:
-                            if (
-                                self.cache_manager.num_cpu_blocks > 0
-                                or self.config.cache_config.kvcache_storage_backend
-                            ):
-                                if not self.cache_manager.can_allocate_gpu_blocks(
-                                    (request.need_prefill_tokens + self.config.cache_config.block_size - 1)
-                                    // self.config.cache_config.block_size
-                                ):  # to prevent block allocation for matching in hierarchical cache and cause dead lock
-                                    break
+                            if not self.enable_cache_manager_v1:
+                                if (
+                                    self.cache_manager.num_cpu_blocks > 0
+                                    or self.config.cache_config.kvcache_storage_backend
+                                ):
+                                    if not self.cache_manager.can_allocate_gpu_blocks(
+                                        (request.need_prefill_tokens + self.config.cache_config.block_size - 1)
+                                        // self.config.cache_config.block_size
+                                    ):  # to prevent block allocation for matching in hierarchical cache and cause dead lock
+                                        break
                             success = self.get_prefix_cached_blocks(request)
                             if not success:
                                 self._free_blocks(request)
@@ -986,24 +1006,27 @@ def _allocate_decode_and_extend():
                             self.waiting.popleft()
                             continue
                         num_new_block = self.get_new_block_nums(request, num_new_tokens)
+
+                        llm_logger.debug(
+                            f"request.request_id {request.request_id} num_new_block {num_new_block}, request.need_prefill_tokens {request.need_prefill_tokens}, request.num_computed_tokens {request.num_computed_tokens}, token_budget {token_budget}"
+                        )
                         can_schedule_block_num_threshold = self._get_can_schedule_prefill_threshold_block(
                             request, num_new_block
                         )
                         # Allocate blocks to prefill
                         if self.cache_manager.can_allocate_gpu_blocks(can_schedule_block_num_threshold):
                             if num_new_block > 0:
-                                extra_gpu_block_ids = self.cache_manager.allocate_gpu_blocks(
-                                    num_new_block, request.request_id
-                                )
+                                extra_gpu_block_ids = self._allocate_gpu_blocks(request, num_new_block)
                                 request.block_tables.extend(extra_gpu_block_ids)
                             self.waiting.popleft()
                             self.running.append(request)
-                            scheduled_reqs.append(self._prepare_prefill_task(request, num_new_tokens))
+                            batch_request.add_request(self._prepare_prefill_task(request, num_new_tokens))
                             token_budget -= num_new_tokens
                             request.num_computed_tokens += num_new_tokens
                             if (
                                 self.config.cache_config.enable_prefix_caching
                                 and self.config.scheduler_config.splitwise_role != "decode"
+                                and not self.enable_cache_manager_v1
                             ):
                                 self.cache_manager.update_cache_blocks(
                                     request, self.config.cache_config.block_size, request.num_computed_tokens
@@ -1028,15 +1051,16 @@ def _allocate_decode_and_extend():
                             self.config.cache_config.enable_prefix_caching
                             and self.config.scheduler_config.splitwise_role != "decode"
                         ):
-                            if (
-                                self.cache_manager.num_cpu_blocks > 0
-                                or self.config.cache_config.kvcache_storage_backend
-                            ):
-                                if not self.cache_manager.can_allocate_gpu_blocks(
-                                    (request.need_prefill_tokens + self.config.cache_config.block_size - 1)
-                                    // self.config.cache_config.block_size
-                                ):  # to prevent block allocation for matching in hierarchical cache and cause dead lock
-                                    break
+                            if not self.enable_cache_manager_v1:
+                                if (
+                                    self.cache_manager.num_cpu_blocks > 0
+                                    or self.config.cache_config.kvcache_storage_backend
+                                ):
+                                    if not self.cache_manager.can_allocate_gpu_blocks(
+                                        (request.need_prefill_tokens + self.config.cache_config.block_size - 1)
+                                        // self.config.cache_config.block_size
+                                    ):  # to prevent block allocation for matching in hierarchical cache and cause dead lock
+                                        break
                             success = self.get_prefix_cached_blocks(request)
                             if not success:
                                 self._free_blocks(request)
@@ -1057,18 +1081,17 @@ def _allocate_decode_and_extend():
                         # Allocate blocks to prefill
                         if self.cache_manager.can_allocate_gpu_blocks(can_schedule_block_num_threshold):
                             if num_new_block > 0:
-                                extra_gpu_block_ids = self.cache_manager.allocate_gpu_blocks(
-                                    num_new_block, request.request_id
-                                )
+                                extra_gpu_block_ids = self._allocate_gpu_blocks(request, num_new_block)
                                 request.block_tables.extend(extra_gpu_block_ids)
                             self.waiting.popleft()
                             self.running.append(request)
-                            scheduled_reqs.append(self._prepare_prefill_task(request, num_new_tokens))
+                            batch_request.add_request(self._prepare_prefill_task(request, num_new_tokens))
                             token_budget -= num_new_tokens
                             request.num_computed_tokens += num_new_tokens
                             if (
                                 self.config.cache_config.enable_prefix_caching
                                 and self.config.scheduler_config.splitwise_role != "decode"
+                                and not self.enable_cache_manager_v1
                             ):
                                 self.cache_manager.update_cache_blocks(
                                     request, self.config.cache_config.block_size, request.num_computed_tokens
@@ -1085,8 +1108,8 @@ def _allocate_decode_and_extend():
                     # move waiting request to end of the deque
                     self.waiting.append(req)
 
-            if scheduled_reqs:
-                llm_logger.debug(f"schedued_reqs: {scheduled_reqs}")
+            if len(batch_request) > 0:
+                llm_logger.debug(f"schedued_reqs: {batch_request}")
                 self.current_reserve_output_block_num_float -= self.decay_output_block_num
                 self.current_reserve_output_block_num = max(
                     int(self.current_reserve_output_block_num_float),
@@ -1096,11 +1119,22 @@ def _allocate_decode_and_extend():
                 if self.current_reserve_output_block_num == 0:
                     self.can_relax_prefill_strategy = True
 
-            self._log_console_scheduler_metrics(scheduled_reqs)
+            self._log_console_scheduler_metrics(batch_request)
 
             self.update_metrics()
 
-            return scheduled_reqs, error_reqs
+            # Issue pending backup tasks to batch_request
+            # This handles write_through_selective policy by attaching backup tasks
+            # to the batch request, which will be processed by the worker
+            if self.enable_cache_manager_v1 and len(batch_request) > 0:
+                evict_metadata = self.cache_manager.issue_pending_backup_to_batch_request()
+                if evict_metadata:
+                    batch_request.append_evict_metadata([evict_metadata])
+
+            if self.enable_cache_manager_v1:
+                self.cache_manager.check_and_add_pending_backup()
+
+            return batch_request, error_reqs
 
     def waiting_async_process(self, request: Request) -> None:
         """
@@ -1226,11 +1260,45 @@ def get_real_bsz(self) -> int:
                 break
         return self.real_bsz
 
-    def get_prefix_cached_blocks(self, request: Request):
+    def _allocate_gpu_blocks(self, request: Request, num_blocks: int) -> List[int]:
+        llm_logger.info(f"[DEBUG allocate_gpu_blocks] request_id={request.request_id}, num_blocks={num_blocks}")
+        if self.enable_cache_manager_v1:
+            return self.cache_manager.allocate_gpu_blocks(request, num_blocks)
+        else:
+            return self.cache_manager.allocate_gpu_blocks(num_blocks, request.request_id)
+
+    def _request_match_blocks(self, request: Request, skip_storage: bool = True):
         """
-        Match and fetch cache for a task.
+        Prefixed cache manager v1 will match blocks for request and return common_block_ids.
         """
-        try:
+        if self.enable_cache_manager_v1:
+            self.cache_manager.match_prefix(request, skip_storage)
+            match_result = request.match_result
+
+            if skip_storage:
+                common_block_ids = match_result.device_block_ids
+                matched_token_num = match_result.total_matched_blocks * self.config.cache_config.block_size
+                metrics = {
+                    "gpu_match_token_num": match_result.matched_device_nums * self.config.cache_config.block_size,
+                    "cpu_match_token_num": match_result.matched_host_nums * self.config.cache_config.block_size,
+                    "storage_match_token_num": match_result.matched_storage_nums * self.config.cache_config.block_size,
+                    "match_gpu_block_ids": common_block_ids,
+                    "gpu_recv_block_ids": [],
+                    "match_storage_block_ids": [],
+                    "cpu_cache_prepare_time": 0,
+                    "storage_cache_prepare_time": 0,
+                }
+
+                no_cache_block_num = (
+                    request.need_prefill_tokens - matched_token_num + self.config.cache_config.block_size - 1
+                ) // self.config.cache_config.block_size
+                request.cache_info = [len(common_block_ids), no_cache_block_num]
+
+                return (common_block_ids, matched_token_num, metrics)
+            else:
+                # Prefetch cache from storage
+                pass
+        else:
             (common_block_ids, matched_token_num, metrics) = self.cache_manager.request_match_blocks(
                 request, self.config.cache_config.block_size
             )
@@ -1242,6 +1310,18 @@ def get_prefix_cached_blocks(self, request: Request):
             )
 
             request.cache_info = [matched_block_num, no_cache_block_num]
+
+            return (common_block_ids, matched_token_num, metrics)
+
+    def get_prefix_cached_blocks(self, request: Request):
+        """
+        Match and fetch cache for a task.
+        """
+        try:
+            (common_block_ids, matched_token_num, metrics) = self._request_match_blocks(
+                request  # skip_storage 使用默认值 True
+            )
+
             request.block_tables = common_block_ids
             request.num_cached_tokens = matched_token_num
             if self.config.cache_config.disable_chunked_mm_input:
@@ -1344,9 +1424,7 @@ def preallocate_resource_in_p(self, request: Request):
 
                 need_extra_prefill_blocks = need_prealloc_prefill_blocks - request.cache_info[0]
                 if self.cache_manager.can_allocate_gpu_blocks(need_extra_prefill_blocks):
-                    extra_gpu_block_ids = self.cache_manager.allocate_gpu_blocks(
-                        need_extra_prefill_blocks, request.request_id
-                    )
+                    extra_gpu_block_ids = self._allocate_gpu_blocks(request, need_extra_prefill_blocks)
                     request.block_tables.extend(extra_gpu_block_ids)
                     allocated_position = self.get_available_position()
                     request.idx = allocated_position
@@ -1361,9 +1439,7 @@ def preallocate_resource_in_p(self, request: Request):
 
             else:
                 if self.cache_manager.can_allocate_gpu_blocks(need_prealloc_prefill_blocks):
-                    request.block_tables.extend(
-                        self.cache_manager.allocate_gpu_blocks(need_prealloc_prefill_blocks, request.request_id)
-                    )
+                    request.block_tables.extend(self._allocate_gpu_blocks(request, need_prealloc_prefill_blocks))
                     request.num_computed_tokens = 0
                     allocated_position = self.get_available_position()
                     request.idx = allocated_position
@@ -1395,9 +1471,7 @@ def preallocate_resource_in_d(self, request: Request):
             if not self.cache_manager.can_allocate_gpu_blocks(need_prealloc_prefill_blocks):
                 return False
 
-            request.block_tables = self.cache_manager.allocate_gpu_blocks(
-                need_prealloc_prefill_blocks, request.request_id
-            )
+            request.block_tables = self._allocate_gpu_blocks(request, need_prealloc_prefill_blocks)
             request.num_computed_tokens = request.need_prefill_tokens
             request.disaggregate_info["block_tables"] = request.block_tables
             allocated_position = self.get_available_position()
@@ -1449,13 +1523,23 @@ def add_prefilled_request(self, request_output: RequestOutput):
             self.running.append(request)
 
     def _free_blocks(self, request: Request):
-        if self.config.cache_config.enable_prefix_caching and self.config.scheduler_config.splitwise_role != "decode":
+        if self.enable_cache_manager_v1:
+            self.cache_manager.request_finish(request)
+        elif (
+            self.config.cache_config.enable_prefix_caching and self.config.scheduler_config.splitwise_role != "decode"
+        ):
             self.cache_manager.release_block_ids(request)
             self.cache_manager.recycle_gpu_blocks(
                 request.block_tables[request.num_cached_blocks :], request.request_id
             )
         else:
-            self.cache_manager.recycle_gpu_blocks(request.block_tables, request.request_id)
+            if self.config.cache_config.enable_prefix_caching:
+                self.cache_manager.release_block_ids(request)
+                self.cache_manager.recycle_gpu_blocks(
+                    request.block_tables[request.num_cached_blocks :], request.request_id
+                )
+            else:
+                self.cache_manager.recycle_gpu_blocks(request.block_tables, request.request_id)
         request.block_tables = []
 
         if request.request_id in self.using_extend_tables_req_id:
@@ -1563,7 +1647,7 @@ def log_status(self):
             f")"
         )
 
-    def _log_console_scheduler_metrics(self, scheduled_reqs: list[Request | ScheduledDecodeTask]) -> None:
+    def _log_console_scheduler_metrics(self, batch_request: BatchRequest) -> None:
         if not (
             hasattr(self, "scheduler_metrics_logger")
             and self.scheduler_metrics_logger is not None
@@ -1580,8 +1664,8 @@ def _log_console_scheduler_metrics(self, scheduled_reqs: list[Request | Schedule
         scheduler_queue_cnt = max(int(getattr(self, "scheduler_unhandled_request_num", 0) or 0), 0)
         queue_cnt = len(self.waiting) + scheduler_queue_cnt
 
-        prefill_reqs = [r for r in scheduled_reqs if isinstance(r, Request) and r.task_type == RequestType.PREFILL]
-        has_decode = any(getattr(r, "task_type", None) == RequestType.DECODE for r in scheduled_reqs)
+        prefill_reqs = [r for r in batch_request if isinstance(r, Request) and r.task_type == RequestType.PREFILL]
+        has_decode = any(getattr(r, "task_type", None) == RequestType.DECODE for r in batch_request)
 
         self.scheduler_metrics_logger.log_prefill_batch(
             prefill_reqs=prefill_reqs,
diff --git a/fastdeploy/envs.py b/fastdeploy/envs.py
index 5e1fd372304..4cdded61711 100644
--- a/fastdeploy/envs.py
+++ b/fastdeploy/envs.py
@@ -252,6 +252,8 @@ def _validate_split_kv_size(value: int) -> int:
     # When v1 is enabled, the legacy /clear_load_weight and /update_model_weight
     # will adopt this new communication pattern.
     "FD_ENABLE_V1_UPDATE_WEIGHTS": lambda: bool(int(os.getenv("FD_ENABLE_V1_UPDATE_WEIGHTS", "0"))),
+    # enable kv cache manager v1
+    "ENABLE_V1_KVCACHE_MANAGER": lambda: int(os.getenv("ENABLE_V1_KVCACHE_MANAGER", "0")),
 }
 
 
diff --git a/fastdeploy/model_executor/forward_meta.py b/fastdeploy/model_executor/forward_meta.py
index 9e512f32355..a9dc1577d48 100644
--- a/fastdeploy/model_executor/forward_meta.py
+++ b/fastdeploy/model_executor/forward_meta.py
@@ -17,7 +17,7 @@
 import logging
 from dataclasses import dataclass, fields
 from enum import IntEnum, auto
-from typing import TYPE_CHECKING, Dict, Optional
+from typing import TYPE_CHECKING, Any, Dict, Optional
 
 import paddle
 
@@ -149,6 +149,10 @@ class ForwardMeta:
     # Routing Replay table buffer
     routing_replay_table: Optional[paddle.Tensor] = None
 
+    # ============ V1 KVCACHE Manager: Swap-in waiting info ============
+    # LayerDoneCounter for layer-by-layer swap waiting (set by submit_swap_tasks return value)
+    layer_done_counter: Optional[Any] = None
+
     # chunked MoE related
     moe_num_chunk: int = 1
     max_moe_num_chunk: int = 1
diff --git a/fastdeploy/model_executor/layers/attention/attention.py b/fastdeploy/model_executor/layers/attention/attention.py
index 49d14ef33c8..96897317684 100644
--- a/fastdeploy/model_executor/layers/attention/attention.py
+++ b/fastdeploy/model_executor/layers/attention/attention.py
@@ -272,6 +272,11 @@ def forward(
             compressed_kv: optional compressed key-value cache (for MLA)
             k_pe: optional key positional encoding (for MLA)
         """
+        # ============ V1 KVCACHE Manager: Layer-by-layer swap wait ============
+        # Wait for swap-in of current layer before using cache
+        if forward_meta.layer_done_counter is not None:
+            forward_meta.layer_done_counter.wait_for_layer(self.layer_id)
+
         return forward_meta.attn_backend.forward(
             q,
             k,
diff --git a/fastdeploy/output/token_processor.py b/fastdeploy/output/token_processor.py
index 1ab0b48f350..02da28865fa 100644
--- a/fastdeploy/output/token_processor.py
+++ b/fastdeploy/output/token_processor.py
@@ -951,6 +951,7 @@ def _process_batch_output(self):
                         envs.ENABLE_V1_KVCACHE_SCHEDULER
                         and self.cfg.cache_config.enable_prefix_caching
                         and self.cfg.cache_config.enable_output_caching
+                        and not envs.ENABLE_V1_KVCACHE_MANAGER
                     ):
                         self.resource_manager.cache_output_tokens(
                             task
diff --git a/fastdeploy/spec_decode/mtp.py b/fastdeploy/spec_decode/mtp.py
index 88c1bbc5614..b41eb2c6980 100644
--- a/fastdeploy/spec_decode/mtp.py
+++ b/fastdeploy/spec_decode/mtp.py
@@ -435,13 +435,20 @@ def clear_mtp_cache(self, profile=False):
         if self.forward_meta is not None:
             del self.forward_meta.caches
 
-    def update_mtp_block_num(self, num_gpu_blocks) -> None:
+    def update_mtp_block_num(self, num_gpu_blocks, skip_cache_init: bool = False) -> None:
         """
         Update MTP block num by theoretical calculation
+
+        Args:
+            num_gpu_blocks: Main model GPU block count.
+            skip_cache_init: When True, skip internal initialize_kv_cache call.
+                Set this when the caller (e.g. gpu_model_runner with enable_cache_manager_v1)
+                has already re-created MTP cache via cache_controller.
         """
         # Reset block table and kv cache with global block num
         self.main_model_num_gpu_blocks = num_gpu_blocks
-        self.initialize_kv_cache(main_model_num_blocks=self.main_model_num_gpu_blocks)
+        if not skip_cache_init:
+            self.initialize_kv_cache(main_model_num_blocks=self.main_model_num_gpu_blocks)
 
         # Reset free list
         free_list = list(
diff --git a/fastdeploy/worker/gpu_model_runner.py b/fastdeploy/worker/gpu_model_runner.py
index 2db362488f3..7f654a2193f 100644
--- a/fastdeploy/worker/gpu_model_runner.py
+++ b/fastdeploy/worker/gpu_model_runner.py
@@ -29,7 +29,7 @@
 
 from fastdeploy.config import FDConfig
 from fastdeploy.engine.pooling_params import PoolingParams
-from fastdeploy.engine.request import ImagePosition, Request, RequestType
+from fastdeploy.engine.request import BatchRequest, ImagePosition, Request, RequestType
 from fastdeploy.model_executor.graph_optimization.utils import (
     profile_run_guard,
     sot_warmup_guard,
@@ -83,6 +83,7 @@
 import zmq
 
 from fastdeploy import envs
+from fastdeploy.cache_manager.v1 import CacheController
 from fastdeploy.engine.tasks import PoolingTask
 from fastdeploy.input.ernie4_5_vl_processor import DataProcessor
 from fastdeploy.inter_communicator import IPCSignal, ZmqIpcClient
@@ -263,6 +264,19 @@ def __init__(
             create=False,
         )
 
+        # NOTE:(changwenbin) Determine whether it is Multi-Head Latent Attention,
+        # To rationalize the allocation of kvcache.
+        self.mla_cache = envs.FD_ATTENTION_BACKEND == "MLA_ATTN"
+        self.dsa_cache = envs.FD_ATTENTION_BACKEND == "DSA_ATTN"
+
+        self.enable_cache_manager_v1 = envs.ENABLE_V1_KVCACHE_MANAGER
+        if self.enable_cache_manager_v1:
+            self.cache_controller = CacheController(
+                fd_config,
+                self.local_rank,
+                self.device_id,
+            )
+
         # for overlap
         self._cached_model_output_data = None
         self._cached_sampler_output = None
@@ -713,7 +727,7 @@ def _get_feature_positions(
         )
         return feature_positions
 
-    def insert_tasks_v1(self, req_dicts: List[Request], num_running_requests: int = None):
+    def insert_tasks_v1(self, req_dicts: BatchRequest, num_running_requests: int = None):
         """
         Process scheduler output tasks, used when ENABLE_V1_KVCACHE_SCHEDULER=1
         req_dict: A list of Request dict
@@ -730,6 +744,13 @@ def insert_tasks_v1(self, req_dicts: List[Request], num_running_requests: int =
             "position_ids_offset": [0],
             "max_tokens_lst": [],
         }
+        if self.enable_cache_manager_v1:
+            # submit_swap_tasks handles:
+            # 1. Waiting for pending evict handlers before submitting new evict
+            # 2. write_back policy: waiting for evict to complete before submitting swap-in
+            # 3. Adding handlers to pending lists appropriately
+            self.cache_controller.submit_swap_tasks(req_dicts.cache_evict_metadata, req_dicts.cache_swap_metadata)
+
         for i in range(req_len):
             request = req_dicts[i]
             idx = self.share_inputs.get_index_by_batch_id(request.idx)
@@ -1349,10 +1370,35 @@ def initialize_forward_meta(self, is_dummy_or_profile_run=False):
         self.forward_meta.is_zero_size = self.forward_meta.ids_remove_padding.shape[0] == 0
         self.forward_meta.exist_prefill = self.exist_prefill()
 
+        # ============ V1 KVCACHE Manager: Swap-in waiting config ============
+        if self.enable_cache_manager_v1:
+            self.forward_meta.layer_done_counter = self.cache_controller.swap_layer_done_counter
+        else:
+            self.forward_meta.layer_done_counter = None
+
     def initialize_kv_cache(self, profile: bool = False) -> None:
         """
         Initialize kv cache
         """
+        if self.enable_cache_manager_v1:
+            self.share_inputs["caches"] = self.cache_controller.initialize_kv_cache(
+                attn_backend=self.attn_backends[0],
+                num_gpu_blocks=self.num_gpu_blocks,
+            )
+            self.cache_kvs_map = self.cache_controller.get_kv_caches()
+            if self.spec_method == SpecMethod.MTP:
+                mtp_num_blocks = int(self.num_gpu_blocks * self.proposer.speculative_config.num_gpu_block_expand_ratio)
+                mtp_cache_list = self.cache_controller.initialize_mtp_kv_cache(
+                    attn_backend=self.proposer.attn_backends[0],
+                    num_gpu_blocks=mtp_num_blocks,
+                    num_mtp_layers=self.proposer.model_config.num_hidden_layers,
+                    layer_offset=self.proposer.num_main_model_layers,
+                )
+                self.proposer.num_gpu_blocks = mtp_num_blocks
+                self.proposer.cache_kvs_map = self.cache_controller.get_kv_caches()
+                self.proposer.model_inputs["caches"] = mtp_cache_list
+            return
+
         # cache_kvs = {}
         max_block_num = self.num_gpu_blocks
 
@@ -1360,13 +1406,6 @@ def initialize_kv_cache(self, profile: bool = False) -> None:
         cache_type = self.model_config.dtype
         kv_cache_quant_type = None
 
-        # NOTE:(changwenbin) Determine whether it is Multi-Head Latent Attention,
-        # To rationalize the allocation of kvcache.
-        from fastdeploy import envs
-
-        self.mla_cache = envs.FD_ATTENTION_BACKEND == "MLA_ATTN"
-        self.dsa_cache = envs.FD_ATTENTION_BACKEND == "DSA_ATTN"
-
         if (
             self.quant_config
             and hasattr(self.quant_config, "kv_cache_quant_type")
@@ -2153,15 +2192,16 @@ def _preprocess(
         return model_inputs, p_done_idxs, token_num_event
 
     def _execute(self, model_inputs: Dict[str, paddle.Tensor]) -> None:
+        model_output = None
         if model_inputs is not None and len(model_inputs) > 0:
             model_output = self.model(
                 model_inputs,
                 self.forward_meta,
             )
+
             if self.use_cudagraph:
                 model_output = model_output[: self.real_token_num]
-        else:
-            model_output = None
+
         return model_output
 
     def _postprocess(
@@ -2515,7 +2555,8 @@ def profile_run(self) -> None:
         self.num_gpu_blocks = self.cache_config.total_block_num
         self.initialize_kv_cache(profile=True)
         if self.spec_method == SpecMethod.MTP:
-            self.proposer.initialize_kv_cache(main_model_num_blocks=self.num_gpu_blocks, profile=True)
+            if not self.enable_cache_manager_v1:
+                self.proposer.initialize_kv_cache(main_model_num_blocks=self.num_gpu_blocks, profile=True)
 
         # 1. Profile with multimodal encoder & encoder cache
 
@@ -2562,7 +2603,7 @@ def update_share_input_block_num(self, num_gpu_blocks: int) -> None:
         )
 
         if self.spec_method == SpecMethod.MTP:
-            self.proposer.update_mtp_block_num(num_gpu_blocks)
+            self.proposer.update_mtp_block_num(num_gpu_blocks, skip_cache_init=self.enable_cache_manager_v1)
 
     def cal_theortical_kvcache(self):
         """
@@ -2625,17 +2666,21 @@ def cal_theortical_kvcache(self):
 
     def clear_cache(self, profile=False):
         """Clear cached data from shared inputs and forward metadata"""
-        create_cache_tensor = profile or not (
-            self.fd_config.cache_config.num_cpu_blocks > 0
-            or self.fd_config.cache_config.kvcache_storage_backend
-            or self.fd_config.scheduler_config.splitwise_role != "mixed"
-        )
-        local_rank = self.local_rank % self.parallel_config.tensor_parallel_size
+        if self.enable_cache_manager_v1:
+            self.cache_controller.free_gpu_cache()
+        else:
+            create_cache_tensor = profile or not (
+                self.fd_config.cache_config.num_cpu_blocks > 0
+                or self.fd_config.cache_config.kvcache_storage_backend
+                or self.fd_config.scheduler_config.splitwise_role != "mixed"
+            )
+            local_rank = self.local_rank % self.parallel_config.tensor_parallel_size
+
+            if not create_cache_tensor:
+                for name, tensor in self.cache_kvs_map.items():
+                    unset_data_ipc(tensor, name, True, False)
+                self.cache_ready_signal.value[local_rank] = 0
 
-        if not create_cache_tensor:
-            for name, tensor in self.cache_kvs_map.items():
-                unset_data_ipc(tensor, name, True, False)
-            self.cache_ready_signal.value[local_rank] = 0
         self.cache_kvs_map.clear()
         self.share_inputs.pop("caches", None)
         if self.forward_meta is not None:
@@ -2682,7 +2727,8 @@ def update_parameters(self, pid):
         self.share_inputs.reset_share_inputs()
         if self.spec_method == SpecMethod.MTP:
             self.proposer.model_inputs.reset_model_inputs()
-            self.proposer.initialize_kv_cache(main_model_num_blocks=self.num_gpu_blocks)
+            if not self.enable_cache_manager_v1:
+                self.proposer.initialize_kv_cache(main_model_num_blocks=self.num_gpu_blocks)
         self.initialize_kv_cache()
         # Recapture CUDAGraph
         if self.use_cudagraph:
@@ -2751,7 +2797,8 @@ def wakeup(self, tags):
                 logger.info("GPU model runner's kv cache is not sleeping, no need to wakeup!")
                 return
             if self.spec_method == SpecMethod.MTP:
-                self.proposer.initialize_kv_cache(main_model_num_blocks=self.num_gpu_blocks)
+                if not self.enable_cache_manager_v1:
+                    self.proposer.initialize_kv_cache(main_model_num_blocks=self.num_gpu_blocks)
             self.initialize_kv_cache()
             self.is_kvcache_sleeping = False
 
diff --git a/fastdeploy/worker/gpu_worker.py b/fastdeploy/worker/gpu_worker.py
index aebf3f21111..ed2af91b574 100644
--- a/fastdeploy/worker/gpu_worker.py
+++ b/fastdeploy/worker/gpu_worker.py
@@ -24,7 +24,7 @@
 
 from fastdeploy import envs
 from fastdeploy.config import FDConfig
-from fastdeploy.engine.request import Request
+from fastdeploy.engine.request import BatchRequest, Request
 from fastdeploy.plugins.model_runner import load_model_runner_plugins
 from fastdeploy.usage.usage_lib import report_usage_stats
 from fastdeploy.utils import get_logger, set_random_seed
@@ -213,7 +213,7 @@ def execute_model(
         output = self.model_runner.execute_model(model_forward_batch, num_running_request)
         return output
 
-    def preprocess_new_task(self, req_dicts: List[Request], num_running_requests: int) -> None:
+    def preprocess_new_task(self, req_dicts: BatchRequest, num_running_requests: int) -> None:
         """Process new requests and then start the decode loop
         TODO(gongshaotian):The scheduler should schedule the handling of prefill,
         and workers and modelrunners should not perceive it.
diff --git a/fastdeploy/worker/worker_process.py b/fastdeploy/worker/worker_process.py
index 8182e06990b..a2f66d1593c 100644
--- a/fastdeploy/worker/worker_process.py
+++ b/fastdeploy/worker/worker_process.py
@@ -49,7 +49,12 @@
     SpeculativeConfig,
     StructuredOutputsConfig,
 )
-from fastdeploy.engine.request import ControlRequest, ControlResponse, RequestType
+from fastdeploy.engine.request import (
+    BatchRequest,
+    ControlRequest,
+    ControlResponse,
+    RequestType,
+)
 from fastdeploy.eplb.async_expert_loader import (
     MODEL_MAIN_NAME,
     REARRANGE_EXPERT_MAGIC_NUM,
@@ -586,39 +591,27 @@ def event_loop_normal(self) -> None:
                 if self.parallel_config.use_ep and self.scheduler_config.splitwise_role == "prefill":
                     paddle.distributed.barrier(self.parallel_config.ep_group)
 
-                req_dicts, control_reqs = [], []
                 assert (
                     len(tasks) > 0
                 ), f"task_queue.get_tasks() should contain at least one tuple, [([req1, ...] ,real_bsz)], but got len(tasks)={len(tasks)}"
-                # In EP + DP prefill, empty task ([]) is delived in worker to barrier. For empty task, just skip and continue.
-                # tasks[0] contains two part, ([req1, ...] ,real_bsz)
-                # tasks[0][0] is [req1, ...]
-                # if empty batch is delived, eval(tasks[0][0]) should be False ([]),
-                # if batch with requests is delived, eval(tasks[0][0]) should be True, then to be processed as below.
-                if tasks[0][0]:
-                    for req_dict, bsz in tasks:
-                        if len(req_dict) > 0 and isinstance(req_dict[0], ControlRequest):
-                            control_reqs.append(req_dict[0])
+
+                batch_request, control_reqs, max_occupied_batch_index = BatchRequest.from_tasks(tasks)
+
+                if len(control_reqs) > 0:
+                    logger.info(f"Rank: {self.local_rank} received {len(control_reqs)} control request.")
+                    for control_req in control_reqs:
+                        if self.parallel_config.use_ep:
+                            self.cached_control_reqs.append(control_req)
+                            logger.info(f"Rank: {self.local_rank} cached ep control request: {control_req}")
                         else:
-                            max_occupied_batch_index = int(bsz)
-                            req_dicts.extend(req_dict)
-
-                    # todo: run control request async
-                    if len(control_reqs) > 0:
-                        logger.info(f"Rank: {self.local_rank} received {len(control_reqs)} control request.")
-                        for control_req in control_reqs:
-                            if self.parallel_config.use_ep:
-                                self.cached_control_reqs.append(control_req)
-                                logger.info(f"Rank: {self.local_rank} cached ep control request: {control_req}")
-                            else:
-                                self.run_control_method(control_req)
-                                self._tp_barrier_wait() if tp_size > 1 else None
-
-                if len(req_dicts) > 0:
+                            self.run_control_method(control_req)
+                            self._tp_barrier_wait() if tp_size > 1 else None
+
+                if len(batch_request) > 0:
                     # Count prefill requests in current batch
-                    num_prefill_requests = sum(1 for req in req_dicts if req.task_type == RequestType.PREFILL)
-                    num_scheduled_requests = len(req_dicts)
-                    scheduled_request_ids = [req.request_id for req in req_dicts]
+                    num_prefill_requests = sum(1 for req in batch_request if req.task_type == RequestType.PREFILL)
+                    num_scheduled_requests = len(batch_request)
+                    scheduled_request_ids = [req.request_id for req in batch_request]
                     logger.info(
                         f"Rank: {self.local_rank}, num_prefill_requests: {num_prefill_requests}, "
                         f"max_occupied_batch_index: {max_occupied_batch_index}, "
@@ -627,7 +620,7 @@ def event_loop_normal(self) -> None:
                     )
 
                     # Process prefill inputs
-                    self.worker.preprocess_new_task(req_dicts, max_occupied_batch_index)
+                    self.worker.preprocess_new_task(batch_request, max_occupied_batch_index)
             else:
                 if self.scheduler_config.splitwise_role == "prefill":
                     if tp_size > 1:
diff --git a/tests/cache_manager/v1/__init__.py b/tests/cache_manager/v1/__init__.py
new file mode 100644
index 00000000000..a9cc79cc9d7
--- /dev/null
+++ b/tests/cache_manager/v1/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/tests/cache_manager/v1/test_cache_controller.py b/tests/cache_manager/v1/test_cache_controller.py
new file mode 100644
index 00000000000..858dbf69b56
--- /dev/null
+++ b/tests/cache_manager/v1/test_cache_controller.py
@@ -0,0 +1,727 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Unit tests for CacheController class with the new LayerDoneCounter design.
+
+Tests cover:
+- Initialization
+- load_host_to_device returns LayerDoneCounter
+- evict_device_to_host returns LayerDoneCounter
+- submit_swap_tasks returns LayerDoneCounter
+- LayerDoneCounter methods: wait_for_layer, wait_all, mark_layer_done, mark_all_done
+- Statistics
+- Edge cases (empty metadata, failed transfers)
+"""
+
+import time
+import unittest
+from unittest.mock import MagicMock, patch
+
+from utils import get_default_test_fd_config
+
+from fastdeploy.cache_manager.v1.metadata import CacheSwapMetadata
+
+
+def create_cache_controller(
+    enable_prefix_caching: bool = True,
+    num_host_blocks: int = 50,
+    num_layers: int = 4,
+):
+    """Helper to create CacheController with test config."""
+    from fastdeploy.cache_manager.v1.cache_controller import CacheController
+
+    config = get_default_test_fd_config()
+    config.cache_config.enable_prefix_caching = enable_prefix_caching
+    config.cache_config.num_cpu_blocks = num_host_blocks
+    config.cache_config.cache_dtype = "bfloat16"
+    config.model_config.num_hidden_layers = num_layers
+    config.model_config.dtype = "bfloat16"
+
+    return CacheController(config, local_rank=0, device_id=0)
+
+
+def create_mock_device_cache_kvs_map(
+    num_layers: int = 4,
+    local_rank: int = 0,
+    device_id: int = 0,
+    num_blocks: int = 100,
+    num_heads: int = 32,
+    block_size: int = 64,
+    head_dim: int = 128,
+    dtype: str = "bfloat16",
+):
+    """Helper to create mock device cache_kvs_map."""
+    import paddle
+
+    cache_kvs_map = {}
+
+    for layer_idx in range(num_layers):
+        key_name = f"key_caches_{layer_idx}_rank{local_rank}.device{device_id}"
+        val_name = f"value_caches_{layer_idx}_rank{local_rank}.device{device_id}"
+
+        key_tensor = paddle.zeros([num_blocks, num_heads, block_size, head_dim], dtype=dtype)
+        val_tensor = paddle.zeros([num_blocks, num_heads, block_size, head_dim], dtype=dtype)
+
+        cache_kvs_map[key_name] = key_tensor
+        cache_kvs_map[val_name] = val_tensor
+
+    return cache_kvs_map
+
+
+def create_mock_host_cache_kvs_map(
+    num_layers: int = 4,
+    local_rank: int = 0,
+    device_id: int = 0,
+    base_ptr: int = 1000000,
+):
+    """Helper to create mock host cache_kvs_map (with int pointers)."""
+    cache_kvs_map = {}
+
+    for layer_idx in range(num_layers):
+        key_name = f"key_caches_{layer_idx}_rank{local_rank}.device{device_id}"
+        val_name = f"value_caches_{layer_idx}_rank{local_rank}.device{device_id}"
+
+        cache_kvs_map[key_name] = base_ptr + layer_idx * 10000
+        cache_kvs_map[val_name] = base_ptr + layer_idx * 10000 + 5000
+
+    return cache_kvs_map
+
+
+def setup_transfer_env(controller, num_layers=4):
+    """Helper to set up device and host cache for transfer tests."""
+    device_cache = create_mock_device_cache_kvs_map(num_layers=num_layers)
+    controller._transfer_manager.set_cache_kvs_map(device_cache)
+    host_cache = create_mock_host_cache_kvs_map(num_layers=num_layers)
+    controller._transfer_manager.set_host_cache_kvs_map(host_cache)
+
+
+# ============================================================================
+# Initialization Tests
+# ============================================================================
+
+
+class TestCacheControllerInit(unittest.TestCase):
+    """Test CacheController initialization."""
+
+    def test_init_creates_executor(self):
+        """Test that ThreadPoolExecutor is created on init."""
+        from concurrent.futures import ThreadPoolExecutor
+
+        controller = create_cache_controller()
+        self.assertIsNotNone(controller._executor)
+        self.assertIsInstance(controller._executor, ThreadPoolExecutor)
+
+    def test_init_creates_transfer_manager(self):
+        """Test that TransferManager is created on init."""
+        controller = create_cache_controller()
+        self.assertIsNotNone(controller._transfer_manager)
+
+    def test_init_no_singleton_layer_counter(self):
+        """Test that LayerDoneCounter is NOT created as singleton on init (per-transfer design)."""
+        controller = create_cache_controller(num_layers=4)
+        # In the new design, _layer_counter is None initially, set per transfer
+        self.assertIsNone(controller._layer_done_counter)
+
+    def test_init_empty_pending_evict_counters(self):
+        """Test that pending evict counters list is empty on init."""
+        controller = create_cache_controller()
+        self.assertEqual(len(controller._pending_evict_counters), 0)
+
+
+# ============================================================================
+# load_host_to_device Tests
+# ============================================================================
+
+
+def make_done_counter(num_layers=4):
+    """Create a pre-completed LayerDoneCounter for use in mocks."""
+    from fastdeploy.cache_manager.v1.cache_utils import LayerDoneCounter
+
+    counter = LayerDoneCounter(num_layers)
+    counter.mark_all_done()
+    return counter
+
+
+class TestLoadHostToDevice(unittest.TestCase):
+    """Test load_host_to_device returns LayerDoneCounter."""
+
+    def setUp(self):
+        self.controller = create_cache_controller(num_layers=4)
+        setup_transfer_env(self.controller, num_layers=4)
+
+    @patch("fastdeploy.cache_manager.v1.cache_controller.CacheController._submit_swap_task")
+    def test_returns_layer_done_counter(self, mock_submit):
+        """Test that load_host_to_device returns LayerDoneCounter."""
+        from fastdeploy.cache_manager.v1.cache_utils import LayerDoneCounter
+
+        mock_submit.return_value = make_done_counter()
+
+        meta = CacheSwapMetadata(
+            src_block_ids=[10, 11, 12],
+            dst_block_ids=[0, 1, 2],
+            src_type="host",
+            dst_type="device",
+        )
+        counter = self.controller.load_host_to_device(meta)
+
+        self.assertIsNotNone(counter)
+        self.assertIsInstance(counter, LayerDoneCounter)
+
+    @patch("fastdeploy.cache_manager.v1.cache_controller.CacheController._submit_swap_task")
+    def test_single_metadata_completes_successfully(self, mock_submit):
+        """Test that single metadata task completes with success."""
+
+        def fake_submit(meta, **kwargs):
+            meta.success = True
+            return make_done_counter()
+
+        mock_submit.side_effect = fake_submit
+
+        meta = CacheSwapMetadata(src_block_ids=[10], dst_block_ids=[0])
+        counter = self.controller.load_host_to_device(meta)
+
+        # Counter is already done (pre-completed)
+        self.assertTrue(counter.is_all_done())
+        self.assertTrue(meta.success)
+
+    @patch("fastdeploy.cache_manager.v1.cache_controller.CacheController._submit_swap_task")
+    def test_wait_for_layer(self, mock_submit):
+        """Test wait_for_layer returns when layer is done."""
+        mock_submit.return_value = make_done_counter()
+
+        meta = CacheSwapMetadata(src_block_ids=[10], dst_block_ids=[0])
+        counter = self.controller.load_host_to_device(meta)
+
+        # Counter is pre-completed, wait_for_layer should return True immediately
+        result = counter.wait_for_layer(0, timeout=5.0)
+        self.assertTrue(result)
+        self.assertTrue(counter.is_layer_done(0))
+
+    @patch("fastdeploy.cache_manager.v1.cache_controller.CacheController._submit_swap_task")
+    def test_multiple_metadata_creates_separate_counters(self, mock_submit):
+        """Test that multiple CacheSwapMetadatas create separate counters."""
+        mock_submit.side_effect = lambda *a, **kw: make_done_counter()
+
+        meta1 = CacheSwapMetadata(src_block_ids=[10], dst_block_ids=[0])
+        meta2 = CacheSwapMetadata(src_block_ids=[11], dst_block_ids=[1])
+
+        counter1 = self.controller.load_host_to_device(meta1)
+        counter2 = self.controller.load_host_to_device(meta2)
+
+        # Each should have its own counter
+        self.assertIsNot(counter1, counter2)
+
+    def test_empty_src_block_ids_sets_error(self):
+        """Test that empty src block IDs set error."""
+        meta = CacheSwapMetadata(src_block_ids=[], dst_block_ids=[0])
+        self.controller.load_host_to_device(meta)
+
+        self.assertFalse(meta.success)
+        self.assertIsNotNone(meta.error_message)
+
+    def test_empty_dst_block_ids_sets_error(self):
+        """Test that empty dst block IDs set error."""
+        meta = CacheSwapMetadata(src_block_ids=[10], dst_block_ids=[])
+        self.controller.load_host_to_device(meta)
+
+        self.assertFalse(meta.success)
+        self.assertIsNotNone(meta.error_message)
+
+    @patch("fastdeploy.cache_manager.v1.cache_controller.CacheController._submit_swap_task")
+    def test_returns_immediately_non_blocking(self, mock_submit):
+        """Test that load_host_to_device returns without blocking."""
+
+        def slow_submit(*args, **kwargs):
+            time.sleep(0.5)
+            return make_done_counter()
+
+        mock_submit.side_effect = slow_submit
+
+        meta = CacheSwapMetadata(src_block_ids=[10], dst_block_ids=[0])
+
+        start = time.time()
+        self.controller.load_host_to_device(meta)
+        elapsed = time.time() - start
+
+        # load_host_to_device calls _submit_swap_task synchronously (submit to executor),
+        # so elapsed includes the mock's 0.5s sleep. Assert it completes within 1s.
+        self.assertLess(elapsed, 1.0)
+
+
+# ============================================================================
+# evict_device_to_host Tests
+# ============================================================================
+
+
+class TestEvictDeviceToHost(unittest.TestCase):
+    """Test evict_device_to_host returns LayerDoneCounter."""
+
+    def setUp(self):
+        self.controller = create_cache_controller(num_layers=4)
+        setup_transfer_env(self.controller, num_layers=4)
+
+    @patch("fastdeploy.cache_manager.v1.cache_controller.CacheController._submit_swap_task")
+    def test_returns_layer_done_counter(self, mock_submit):
+        """Test that evict_device_to_host returns LayerDoneCounter."""
+        from fastdeploy.cache_manager.v1.cache_utils import LayerDoneCounter
+
+        mock_submit.return_value = make_done_counter()
+
+        meta = CacheSwapMetadata(src_block_ids=[0, 1], dst_block_ids=[10, 11])
+        counter = self.controller.evict_device_to_host(meta)
+
+        self.assertIsNotNone(counter)
+        self.assertIsInstance(counter, LayerDoneCounter)
+
+    @patch("fastdeploy.cache_manager.v1.cache_controller.CacheController._submit_swap_task")
+    def test_single_metadata_completes(self, mock_submit):
+        """Test that eviction completes successfully."""
+
+        def fake_submit(meta, **kwargs):
+            meta.success = True
+            return make_done_counter()
+
+        mock_submit.side_effect = fake_submit
+
+        meta = CacheSwapMetadata(src_block_ids=[0, 1], dst_block_ids=[10, 11])
+        counter = self.controller.evict_device_to_host(meta)
+
+        self.assertTrue(counter.is_all_done())
+        self.assertTrue(meta.success)
+
+
+# ============================================================================
+# submit_swap_tasks Tests
+# ============================================================================
+
+
+class TestSubmitSwapTasks(unittest.TestCase):
+    """Test submit_swap_tasks method returns LayerDoneCounter."""
+
+    def setUp(self):
+        self.controller = create_cache_controller(num_layers=4)
+        setup_transfer_env(self.controller, num_layers=4)
+
+    @patch("fastdeploy.cache_manager.v1.cache_controller.CacheController._submit_swap_task")
+    def test_submit_swap_tasks_returns_layer_done_counter(self, mock_submit):
+        """Test submit_swap_tasks returns LayerDoneCounter for swap_in."""
+        from fastdeploy.cache_manager.v1.cache_utils import LayerDoneCounter
+
+        mock_submit.return_value = make_done_counter()
+
+        evict_meta = CacheSwapMetadata(src_block_ids=[0], dst_block_ids=[10])
+        swap_in_meta = CacheSwapMetadata(src_block_ids=[10], dst_block_ids=[0])
+
+        counter = self.controller.submit_swap_tasks(evict_meta, swap_in_meta)
+
+        self.assertIsNotNone(counter)
+        self.assertIsInstance(counter, LayerDoneCounter)
+
+    @patch("fastdeploy.cache_manager.v1.cache_controller.CacheController._submit_swap_task")
+    def test_submit_swap_tasks_evict_only_returns_none(self, mock_submit):
+        """Test submit_swap_tasks with only evict metadata returns None."""
+        mock_submit.return_value = make_done_counter()
+
+        evict_meta = CacheSwapMetadata(src_block_ids=[0], dst_block_ids=[10])
+
+        counter = self.controller.submit_swap_tasks(evict_meta, None)
+
+        # Evict-only returns None (no swap-in counter)
+        self.assertIsNone(counter)
+
+    @patch("fastdeploy.cache_manager.v1.cache_controller.CacheController._submit_swap_task")
+    def test_submit_swap_tasks_sets_swap_layer_done_counter(self, mock_submit):
+        """Test submit_swap_tasks sets swap_layer_done_counter property."""
+        expected_counter = make_done_counter()
+        mock_submit.return_value = expected_counter
+
+        evict_meta = CacheSwapMetadata(src_block_ids=[0], dst_block_ids=[10])
+        swap_in_meta = CacheSwapMetadata(src_block_ids=[10], dst_block_ids=[0])
+
+        counter = self.controller.submit_swap_tasks(evict_meta, swap_in_meta)
+
+        # swap_layer_done_counter should be set
+        self.assertIs(self.controller.swap_layer_done_counter, counter)
+
+
+# ============================================================================
+# LayerDoneCounter Tests
+# ============================================================================
+
+
+class TestLayerDoneCounter(unittest.TestCase):
+    """Test LayerDoneCounter independent sync primitive."""
+
+    def test_layer_done_counter_basic(self):
+        """Test basic LayerDoneCounter functionality."""
+        from fastdeploy.cache_manager.v1.cache_utils import LayerDoneCounter
+
+        counter = LayerDoneCounter(num_layers=4)
+
+        # Initially not done
+        self.assertFalse(counter.is_all_done())
+        self.assertEqual(counter.get_completed_count(), 0)
+
+        # Mark one layer done
+        counter.mark_layer_done(0)
+        self.assertTrue(counter.is_layer_done(0))
+        self.assertFalse(counter.is_layer_done(1))
+        self.assertEqual(counter.get_completed_count(), 1)
+        self.assertFalse(counter.is_all_done())
+
+    def test_layer_done_counter_mark_all_done(self):
+        """Test mark_all_done marks all layers."""
+        from fastdeploy.cache_manager.v1.cache_utils import LayerDoneCounter
+
+        counter = LayerDoneCounter(num_layers=4)
+
+        counter.mark_all_done()
+
+        self.assertTrue(counter.is_all_done())
+        self.assertEqual(counter.get_completed_count(), 4)
+        self.assertTrue(counter.is_layer_done(0))
+        self.assertTrue(counter.is_layer_done(3))
+
+    def test_layer_done_counter_wait_for_layer_immediate(self):
+        """Test wait_for_layer returns immediately if done."""
+        from fastdeploy.cache_manager.v1.cache_utils import LayerDoneCounter
+
+        counter = LayerDoneCounter(num_layers=4)
+        counter.mark_all_done()
+
+        result = counter.wait_for_layer(0, timeout=1.0)
+        self.assertTrue(result)
+
+    def test_layer_done_counter_wait_all(self):
+        """Test wait_all waits for all layers."""
+        from fastdeploy.cache_manager.v1.cache_utils import LayerDoneCounter
+
+        counter = LayerDoneCounter(num_layers=4)
+
+        # Mark all done
+        counter.mark_all_done()
+
+        result = counter.wait_all(timeout=1.0)
+        self.assertTrue(result)
+        self.assertTrue(counter.is_all_done())
+
+    def test_layer_done_counter_get_pending_layers(self):
+        """Test get_pending_layers returns correct list."""
+        from fastdeploy.cache_manager.v1.cache_utils import LayerDoneCounter
+
+        counter = LayerDoneCounter(num_layers=4)
+        counter.mark_layer_done(1)
+
+        pending = counter.get_pending_layers()
+        self.assertEqual(pending, [0, 2, 3])
+
+    def test_layer_done_counter_callback(self):
+        """Test callback is called on layer complete."""
+        from fastdeploy.cache_manager.v1.cache_utils import LayerDoneCounter
+
+        counter = LayerDoneCounter(num_layers=4)
+        callback_layers = []
+
+        def callback(layer_idx):
+            callback_layers.append(layer_idx)
+
+        counter.register_callback(callback)
+        counter.mark_layer_done(2)
+
+        self.assertEqual(callback_layers, [2])
+
+    def test_layer_done_counter_stats(self):
+        """Test get_stats returns correct stats."""
+        from fastdeploy.cache_manager.v1.cache_utils import LayerDoneCounter
+
+        counter = LayerDoneCounter(num_layers=4)
+        counter.mark_layer_done(0)
+        counter.mark_layer_done(1)
+
+        stats = counter.get_stats()
+        self.assertEqual(stats["num_layers"], 4)
+        self.assertEqual(stats["completed_layers"], 2)
+        self.assertEqual(stats["pending_layers"], 2)
+
+
+# ============================================================================
+# Statistics Tests
+# ============================================================================
+
+
+class TestStats(unittest.TestCase):
+    """Test statistics functionality."""
+
+    def test_get_stats_returns_expected_keys(self):
+        """Test get_stats returns expected keys."""
+        controller = create_cache_controller(num_layers=4)
+        stats = controller.get_stats()
+
+        self.assertIn("initialized", stats)
+        self.assertIn("num_layers", stats)
+        self.assertTrue(stats["initialized"])
+        self.assertEqual(stats["num_layers"], 4)
+
+
+# ============================================================================
+# Reset Tests
+# ============================================================================
+
+
+class TestReset(unittest.TestCase):
+    """Test reset_cache method."""
+
+    def setUp(self):
+        self.controller = create_cache_controller(num_layers=4)
+        setup_transfer_env(self.controller, num_layers=4)
+
+    @patch("fastdeploy.cache_manager.v1.cache_controller.CacheController._submit_swap_task")
+    def test_reset_cache_clears_pending_evict_counters(self, mock_submit):
+        """Test reset_cache clears pending evict counters."""
+        mock_submit.return_value = make_done_counter()
+
+        evict_meta = CacheSwapMetadata(src_block_ids=[0], dst_block_ids=[10])
+        counter = self.controller.evict_device_to_host(evict_meta)
+
+        # Manually add counter to pending evict counters (simulating what submit_swap_tasks does)
+        self.controller._pending_evict_counters.append(counter)
+
+        self.assertEqual(len(self.controller._pending_evict_counters), 1)
+
+        result = self.controller.reset_cache()
+        self.assertTrue(result)
+        self.assertEqual(len(self.controller._pending_evict_counters), 0)
+
+
+# ============================================================================
+# KV Cache Management Tests
+# ============================================================================
+
+
+class TestKVCacheManagement(unittest.TestCase):
+    """Test KV cache initialization and retrieval."""
+
+    def test_get_kv_caches_without_init(self):
+        """Test get_kv_caches returns empty dict when not initialized."""
+        controller = create_cache_controller()
+        result = controller.get_kv_caches()
+        self.assertIsNotNone(result)
+
+    def test_get_host_cache_kvs_map_without_init(self):
+        """Test get_host_cache_kvs_map returns empty dict when not initialized."""
+        controller = create_cache_controller()
+        result = controller.get_host_cache_kvs_map()
+        self.assertEqual(len(result), 0)
+
+
+# ============================================================================
+# Transfer Failure Tests
+# ============================================================================
+
+
+class TestTransferFailure(unittest.TestCase):
+    """Test behavior when transfer fails."""
+
+    def setUp(self):
+        self.controller = create_cache_controller(num_layers=4)
+        setup_transfer_env(self.controller, num_layers=4)
+
+    @patch("fastdeploy.cache_manager.v1.cache_controller.CacheController._submit_swap_task")
+    def test_layer_by_layer_transfer_failure(self, mock_submit):
+        """Test that transfer failure is properly reported via _submit_swap_task exception."""
+
+        def failing_submit(meta, **kwargs):
+            meta.success = False
+            meta.error_message = "CUDA error"
+            counter = make_done_counter()
+            return counter
+
+        mock_submit.side_effect = failing_submit
+
+        meta = CacheSwapMetadata(src_block_ids=[10], dst_block_ids=[0])
+        self.controller.load_host_to_device(meta)
+
+        # The error should be stored in meta.error_message
+        self.assertFalse(meta.success)
+        self.assertIsNotNone(meta.error_message)
+        self.assertIn("CUDA error", meta.error_message)
+
+
+# ============================================================================
+# Storage Placeholder Tests
+# ============================================================================
+
+
+class TestStoragePlaceholders(unittest.TestCase):
+    """Test storage placeholder methods."""
+
+    def setUp(self):
+        self.controller = create_cache_controller(num_layers=4)
+
+    def test_prefetch_from_storage_returns_error_handler(self):
+        """Test prefetch_from_storage returns error handler (not implemented)."""
+        from fastdeploy.cache_manager.v1.metadata import StorageMetadata
+
+        mock_metadata = MagicMock(spec=StorageMetadata)
+        handler = self.controller.prefetch_from_storage(mock_metadata)
+
+        self.assertIsNotNone(handler)
+        self.assertIsNotNone(handler.error)
+
+    def test_backup_device_to_storage_returns_error_handler(self):
+        """Test backup_device_to_storage returns error handler (not implemented)."""
+        from fastdeploy.cache_manager.v1.metadata import StorageMetadata
+
+        mock_metadata = MagicMock(spec=StorageMetadata)
+        handler = self.controller.backup_device_to_storage([0, 1], mock_metadata)
+
+        self.assertIsNotNone(handler)
+        self.assertIsNotNone(handler.error)
+
+    def test_backup_host_to_storage_returns_error_handler(self):
+        """Test backup_host_to_storage returns error handler (not implemented)."""
+        from fastdeploy.cache_manager.v1.metadata import StorageMetadata
+
+        mock_metadata = MagicMock(spec=StorageMetadata)
+        handler = self.controller.backup_host_to_storage([0, 1], mock_metadata)
+
+        self.assertIsNotNone(handler)
+        self.assertIsNotNone(handler.error)
+
+
+class TestPDTransferPlaceholders(unittest.TestCase):
+    """Test PD transfer placeholder methods."""
+
+    def setUp(self):
+        self.controller = create_cache_controller(num_layers=4)
+
+    def test_send_to_node_returns_error_handler(self):
+        """Test send_to_node returns error handler (not implemented)."""
+        from fastdeploy.cache_manager.v1.metadata import PDTransferMetadata
+
+        mock_metadata = MagicMock(spec=PDTransferMetadata)
+        handler = self.controller.send_to_node(mock_metadata)
+
+        self.assertIsNotNone(handler)
+        self.assertIsNotNone(handler.error)
+
+    def test_wait_for_transfer_from_node_returns_error_handler(self):
+        """Test wait_for_transfer_from_node returns error handler (not implemented)."""
+        from fastdeploy.cache_manager.v1.metadata import PDTransferMetadata
+
+        mock_metadata = MagicMock(spec=PDTransferMetadata)
+        handler = self.controller.wait_for_transfer_from_node(mock_metadata)
+
+        self.assertIsNotNone(handler)
+        self.assertIsNotNone(handler.error)
+
+
+# ============================================================================
+# CacheSwapMetadata Mapping Tests
+# ============================================================================
+
+
+class TestCacheSwapMetadataMapping(unittest.TestCase):
+    """Test CacheSwapMetadata mapping property."""
+
+    def test_mapping_empty_when_not_success(self):
+        meta = CacheSwapMetadata(src_block_ids=[1, 2], dst_block_ids=[10, 11])
+        self.assertEqual(meta.mapping, {})
+
+    def test_mapping_returns_dict_after_success(self):
+        meta = CacheSwapMetadata(src_block_ids=[1, 2], dst_block_ids=[10, 11])
+        meta.success = True
+        expected = {1: 10, 2: 11}
+        self.assertEqual(meta.mapping, expected)
+
+
+# ============================================================================
+# write_policy Property Tests
+# ============================================================================
+
+
+class TestWritePolicy(unittest.TestCase):
+    """Test write_policy property and related behavior."""
+
+    def test_write_policy_default(self):
+        """Test write_policy reads from config."""
+        controller = create_cache_controller()
+        # Default config has write_policy set; just verify it's accessible
+        policy = controller.write_policy
+        self.assertIsInstance(policy, (str, type(None)))
+
+    def test_should_wait_for_swap_out_write_back(self):
+        """Test _should_wait_for_swap_out returns True for write_back policy."""
+        from fastdeploy.cache_manager.v1.cache_controller import CacheController
+
+        config = get_default_test_fd_config()
+        config.cache_config.num_cpu_blocks = 50
+        config.model_config.num_hidden_layers = 4
+        config.cache_config.write_policy = "write_back"
+
+        controller = CacheController(config, local_rank=0, device_id=0)
+        self.assertTrue(controller._should_wait_for_swap_out())
+
+    def test_should_wait_for_swap_out_write_through(self):
+        """Test _should_wait_for_swap_out returns False for write_through policy."""
+        from fastdeploy.cache_manager.v1.cache_controller import CacheController
+
+        config = get_default_test_fd_config()
+        config.cache_config.num_cpu_blocks = 50
+        config.model_config.num_hidden_layers = 4
+        config.cache_config.write_policy = "write_through"
+
+        controller = CacheController(config, local_rank=0, device_id=0)
+        self.assertFalse(controller._should_wait_for_swap_out())
+
+
+# ============================================================================
+# free_cache / free_gpu_cache Tests
+# ============================================================================
+
+
+class TestFreeCacheMethods(unittest.TestCase):
+    """Test free_cache and free_gpu_cache methods."""
+
+    def setUp(self):
+        self.controller = create_cache_controller(num_layers=4)
+        setup_transfer_env(self.controller, num_layers=4)
+
+    def test_free_gpu_cache_clears_map(self):
+        """Test free_gpu_cache clears the cache_kvs_map."""
+        device_cache = create_mock_device_cache_kvs_map(num_layers=4)
+        self.controller.cache_kvs_map = device_cache
+
+        self.assertGreater(len(self.controller.cache_kvs_map), 0)
+
+        self.controller.free_gpu_cache()
+
+        self.assertEqual(len(self.controller.cache_kvs_map), 0)
+
+    def test_free_cache_returns_true(self):
+        """Test free_cache returns True on success."""
+        result = self.controller.free_cache()
+        self.assertTrue(result)
+
+    def test_free_gpu_cache_noop_when_empty(self):
+        """Test free_gpu_cache is a no-op when cache_kvs_map is already empty."""
+        self.controller.cache_kvs_map = {}
+        # Should not raise
+        self.controller.free_gpu_cache()
+        self.assertEqual(len(self.controller.cache_kvs_map), 0)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/tests/cache_manager/v1/test_cache_manager.py b/tests/cache_manager/v1/test_cache_manager.py
new file mode 100644
index 00000000000..61953cb6540
--- /dev/null
+++ b/tests/cache_manager/v1/test_cache_manager.py
@@ -0,0 +1,713 @@
+"""
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+Unit tests for CacheManager class.
+
+Tests cover:
+- Block allocation (device/host)
+- Block release (device/host)
+- Resource checking (can_allocate_*)
+- Free block counting (num_free_*_blocks)
+- Reset functionality
+- Request lifecycle management with RadixTree integration
+- Multi-method workflow tests
+"""
+
+import unittest
+from dataclasses import dataclass, field
+from typing import List
+
+from utils import get_default_test_fd_config
+
+
+def create_cache_manager(
+    total_block_num: int = 100,
+    num_cpu_blocks: int = 50,
+    block_size: int = 64,
+    enable_prefix_caching: bool = True,
+):
+    """Helper to create CacheManager with test config."""
+    from fastdeploy.cache_manager.v1.cache_manager import CacheManager
+
+    config = get_default_test_fd_config()
+    config.cache_config.total_block_num = total_block_num
+    config.cache_config.num_cpu_blocks = num_cpu_blocks
+    config.cache_config.block_size = block_size
+    config.cache_config.enable_prefix_caching = enable_prefix_caching
+
+    return CacheManager(config)
+
+
+@dataclass
+class MockMatchResult:
+    """Mock MatchResult for testing."""
+
+    device_nodes: List = field(default_factory=list)
+    host_nodes: List = field(default_factory=list)
+    storage_nodes: List = field(default_factory=list)
+    uncached_block_ids: List = field(default_factory=list)
+
+    @property
+    def matched_device_nums(self) -> int:
+        return len(self.device_nodes)
+
+    @property
+    def matched_host_nums(self) -> int:
+        return len(self.host_nodes)
+
+    @property
+    def matched_storage_nums(self) -> int:
+        return len(self.storage_nodes)
+
+    @property
+    def total_matched_blocks(self) -> int:
+        return self.matched_device_nums + self.matched_host_nums + self.matched_storage_nums
+
+    @property
+    def device_block_ids(self) -> List[int]:
+        return [node.block_id for node in self.device_nodes]
+
+
+@dataclass
+class MockRequest:
+    """Mock Request for testing CacheManager."""
+
+    request_id: str
+    prompt_hashes: List[str]
+    block_tables: List[int] = field(default_factory=list)
+    match_result: MockMatchResult = field(default_factory=MockMatchResult)
+    cache_evict_metadata: List = field(default_factory=list)
+    cache_swap_metadata: List = field(default_factory=list)
+
+
+class TestCacheManagerAllocation(unittest.TestCase):
+    """Test CacheManager block allocation functionality."""
+
+    def test_allocate_device_blocks_with_request(self):
+        """Test device block allocation with mock request."""
+        cache_manager = create_cache_manager()
+        request = MockRequest(
+            request_id="test_req_1",
+            prompt_hashes=["h1", "h2", "h3", "h4", "h5"],
+            block_tables=[],
+        )
+
+        allocated = cache_manager.allocate_device_blocks(request, 5)
+
+        self.assertIsNotNone(allocated)
+        self.assertEqual(len(allocated), 5)
+        self.assertEqual(cache_manager.num_free_device_blocks, 95)
+
+    def test_allocate_device_blocks_insufficient(self):
+        """Test device block allocation when not enough blocks after eviction."""
+        cache_manager = create_cache_manager()
+        # Exhaust device blocks
+        for _ in range(10):
+            cache_manager.allocate_device_blocks(MockRequest(request_id="req", prompt_hashes=[], block_tables=[]), 10)
+
+        # Next allocation should fail (no evictable blocks and no free blocks)
+        request = MockRequest(request_id="test", prompt_hashes=["h1"], block_tables=[])
+        result = cache_manager.allocate_device_blocks(request, 10)
+        self.assertEqual(result, [])
+
+    def test_allocate_host_blocks_success(self):
+        """Test successful host block allocation."""
+        cache_manager = create_cache_manager()
+        allocated = cache_manager.allocate_host_blocks(10)
+
+        self.assertIsNotNone(allocated)
+        self.assertEqual(len(allocated), 10)
+        self.assertEqual(cache_manager.num_free_host_blocks, 40)
+
+    def test_allocate_host_blocks_insufficient(self):
+        """Test host block allocation returns empty when not enough blocks."""
+        cache_manager = create_cache_manager(num_cpu_blocks=5)
+        allocated = cache_manager.allocate_host_blocks(10)
+
+        self.assertEqual(allocated, [])
+
+
+class TestCacheManagerRelease(unittest.TestCase):
+    """Test CacheManager block release functionality."""
+
+    def test_free_device_blocks(self):
+        """Test freeing device blocks."""
+        cache_manager = create_cache_manager()
+        request = MockRequest(request_id="req", prompt_hashes=[], block_tables=[])
+        allocated = cache_manager.allocate_device_blocks(request, 10)
+        initial_free = cache_manager.num_free_device_blocks
+
+        cache_manager.free_device_blocks(allocated)
+
+        self.assertEqual(cache_manager.num_free_device_blocks, initial_free + 10)
+
+    def test_free_host_blocks(self):
+        """Test freeing host blocks."""
+        cache_manager = create_cache_manager()
+        allocated = cache_manager.allocate_host_blocks(10)
+        initial_free = cache_manager.num_free_host_blocks
+
+        cache_manager.free_host_blocks(allocated)
+
+        self.assertEqual(cache_manager.num_free_host_blocks, initial_free + 10)
+
+    def test_free_all_device_blocks(self):
+        """Test freeing all device blocks."""
+        cache_manager = create_cache_manager()
+        req = MockRequest(request_id="req", prompt_hashes=[], block_tables=[])
+        cache_manager.allocate_device_blocks(req, 50)
+
+        freed = cache_manager.free_all_device_blocks()
+
+        self.assertEqual(freed, 50)
+        self.assertEqual(cache_manager.num_free_device_blocks, 100)
+
+    def test_free_all_host_blocks(self):
+        """Test freeing all host blocks."""
+        cache_manager = create_cache_manager()
+        cache_manager.allocate_host_blocks(25)
+
+        freed = cache_manager.free_all_host_blocks()
+
+        self.assertEqual(freed, 25)
+        self.assertEqual(cache_manager.num_free_host_blocks, 50)
+
+
+class TestCacheManagerReset(unittest.TestCase):
+    """Test CacheManager reset functionality."""
+
+    def test_reset_cache(self):
+        """Test cache reset functionality."""
+        cache_manager = create_cache_manager()
+        req = MockRequest(request_id="req", prompt_hashes=[], block_tables=[])
+        cache_manager.allocate_device_blocks(req, 50)
+        cache_manager.allocate_host_blocks(25)
+
+        result = cache_manager.reset_cache()
+
+        self.assertTrue(result)
+        self.assertEqual(cache_manager.num_free_device_blocks, 100)
+        self.assertEqual(cache_manager.num_free_host_blocks, 50)
+
+
+class TestCacheManagerResize(unittest.TestCase):
+    """Test CacheManager resize functionality."""
+
+    def test_resize_device_pool_expand(self):
+        """Test expanding device pool."""
+        cache_manager = create_cache_manager(total_block_num=100)
+
+        result = cache_manager.resize_device_pool(150)
+
+        self.assertTrue(result)
+        self.assertEqual(cache_manager.num_gpu_blocks, 150)
+        self.assertEqual(cache_manager.num_free_device_blocks, 150)
+
+    def test_resize_device_pool_shrink_with_used_blocks(self):
+        """Test shrinking device pool fails when used blocks exceed new size."""
+        cache_manager = create_cache_manager(total_block_num=100)
+        req = MockRequest(request_id="req", prompt_hashes=[], block_tables=[])
+        cache_manager.allocate_device_blocks(req, 60)
+
+        result = cache_manager.resize_device_pool(50)
+
+        self.assertFalse(result)
+        self.assertEqual(cache_manager.num_gpu_blocks, 100)
+
+    def test_resize_device_pool_allocate_after_expand(self):
+        """Test allocating blocks after expanding pool."""
+        cache_manager = create_cache_manager(total_block_num=100)
+        cache_manager.resize_device_pool(150)
+
+        req = MockRequest(request_id="req", prompt_hashes=[], block_tables=[])
+        allocated = cache_manager.allocate_device_blocks(req, 120)
+
+        self.assertIsNotNone(allocated)
+        self.assertEqual(len(allocated), 120)
+
+
+class TestCacheManagerWorkflow(unittest.TestCase):
+    """Test CacheManager multi-method workflow scenarios."""
+
+    def test_request_lifecycle_full(self):
+        """Test complete request lifecycle: match -> allocate -> finish."""
+        cache_manager = create_cache_manager()
+
+        # Step 1: Request comes in, match prefix (no existing cache)
+        request1 = MockRequest(
+            request_id="req_1",
+            prompt_hashes=["hash1", "hash2", "hash3"],
+            block_tables=[],
+        )
+        cache_manager.match_prefix(request1)
+
+        self.assertEqual(request1.match_result.total_matched_blocks, 0)
+
+        # Step 2: Allocate blocks for the request
+        allocated = cache_manager.allocate_device_blocks(request1, 3)
+        self.assertIsNotNone(allocated)
+        self.assertEqual(len(allocated), 3)
+
+        # Step 3: Request finishes, cache the blocks
+        request1.block_tables = allocated
+        cache_manager.request_finish(request1)
+
+        # Verify blocks are cached
+        self.assertEqual(cache_manager.num_free_device_blocks, 97)
+
+    def test_request_lifecycle_with_prefix_reuse(self):
+        """Test request reusing cached prefix."""
+        cache_manager = create_cache_manager()
+
+        # First request: insert [h1, h2, h3]
+        req1 = MockRequest(
+            request_id="req_1",
+            prompt_hashes=["h1", "h2", "h3"],
+            block_tables=[],
+        )
+        cache_manager.match_prefix(req1)
+        allocated1 = cache_manager.allocate_device_blocks(req1, 3)
+        req1.block_tables = allocated1
+        cache_manager.request_finish(req1)
+
+        # Second request: same prefix [h1, h2], then new [h4]
+        req2 = MockRequest(
+            request_id="req_2",
+            prompt_hashes=["h1", "h2", "h4"],
+            block_tables=[],
+        )
+        cache_manager.match_prefix(req2)
+
+        # Should match h1, h2 (result stored in _match_result)
+        self.assertEqual(req2._match_result.matched_device_nums, 2)
+        self.assertEqual(req2._match_result.matched_host_nums, 0)
+
+        # Allocate only for h4 (1 new block needed)
+        allocated2 = cache_manager.allocate_device_blocks(req2, 1)
+        self.assertIsNotNone(allocated2)
+
+        matched_ids = req2._match_result.device_block_ids
+        req2.block_tables = matched_ids + allocated2
+        cache_manager.request_finish(req2)
+
+    def test_shared_prefix_multiple_requests(self):
+        """Test multiple requests sharing prefix."""
+        cache_manager = create_cache_manager()
+
+        # Insert base prefix [A, B]
+        req1 = MockRequest(
+            request_id="req_1",
+            prompt_hashes=["A", "B", "C1"],
+            block_tables=[],
+        )
+        cache_manager.match_prefix(req1)
+        allocated1 = cache_manager.allocate_device_blocks(req1, 3)
+        req1.block_tables = allocated1
+        cache_manager.request_finish(req1)
+
+        # Check radix tree state
+        stats = cache_manager.radix_tree.get_stats()
+        self.assertEqual(stats.node_count, 4)  # root + A + B + C1
+
+        # Second request with different suffix
+        req2 = MockRequest(
+            request_id="req_2",
+            prompt_hashes=["A", "B", "C2"],
+            block_tables=[],
+        )
+        cache_manager.match_prefix(req2)
+        self.assertEqual(req2._match_result.matched_device_nums, 2)  # A, B
+
+        allocated2 = cache_manager.allocate_device_blocks(req2, 1)
+        req2.block_tables = req2._match_result.device_block_ids + allocated2
+        cache_manager.request_finish(req2)
+
+        stats = cache_manager.radix_tree.get_stats()
+        self.assertEqual(stats.node_count, 5)  # root + A + B + C1 + C2
+
+    def test_eviction_workflow(self):
+        """Test eviction when device memory is full."""
+        cache_manager = create_cache_manager(num_cpu_blocks=50)
+
+        # Exhaust device memory
+        requests = []
+        for i in range(10):
+            req = MockRequest(
+                request_id=f"req_{i}",
+                prompt_hashes=[f"h{i}_{j}" for j in range(10)],
+                block_tables=[],
+            )
+            cache_manager.match_prefix(req)
+            allocated = cache_manager.allocate_device_blocks(req, 10)
+            req.block_tables = allocated
+            cache_manager.request_finish(req)
+            requests.append(req)
+
+        self.assertEqual(cache_manager.num_free_device_blocks, 0)
+
+        # Verify evictable blocks exist
+        stats = cache_manager.radix_tree.get_stats()
+        self.assertEqual(stats.evictable_device_count, 100)
+
+        # New request should trigger eviction
+        new_req = MockRequest(
+            request_id="new_req",
+            prompt_hashes=["new1", "new2", "new3"],
+            block_tables=[],
+        )
+        cache_manager.match_prefix(new_req)
+        allocated = cache_manager.allocate_device_blocks(new_req, 3)
+
+        self.assertIsNotNone(allocated)
+        self.assertEqual(len(allocated), 3)
+
+    def test_host_cache_eviction_workflow(self):
+        """Test device -> host eviction workflow when memory is full."""
+        cache_manager = create_cache_manager(num_cpu_blocks=30)
+
+        # Exhaust device memory with different hashes (no prefix sharing)
+        for i in range(10):
+            req = MockRequest(
+                request_id=f"req_{i}",
+                prompt_hashes=[f"h{i}_{j}" for j in range(10)],
+                block_tables=[],
+            )
+            cache_manager.match_prefix(req)
+            allocated = cache_manager.allocate_device_blocks(req, 10)
+            req.block_tables = allocated
+            cache_manager.request_finish(req)
+
+        # Device should be full
+        self.assertEqual(cache_manager.num_free_device_blocks, 0)
+
+        # New request should still work (eviction should occur)
+        new_req = MockRequest(
+            request_id="new_req",
+            prompt_hashes=["new1", "new2", "new3"],
+            block_tables=[],
+        )
+        cache_manager.match_prefix(new_req)
+        allocated = cache_manager.allocate_device_blocks(new_req, 3)
+
+        self.assertIsNotNone(allocated)
+        self.assertEqual(len(allocated), 3)
+
+
+class TestCacheManagerRadixTreeIntegration(unittest.TestCase):
+    """Test CacheManager RadixTree integration."""
+
+    def test_match_prefix_updates_ref_count(self):
+        """Test that match_prefix increments ref count."""
+        cache_manager = create_cache_manager()
+
+        # Insert some blocks
+        req1 = MockRequest(
+            request_id="req_1",
+            prompt_hashes=["h1", "h2"],
+            block_tables=[],
+        )
+        cache_manager.match_prefix(req1)
+        allocated1 = cache_manager.allocate_device_blocks(req1, 2)
+        req1.block_tables = allocated1
+        cache_manager.request_finish(req1)
+
+        # Check initial evictable count (should be 2 after finish)
+        stats1 = cache_manager.radix_tree.get_stats()
+        self.assertEqual(stats1.evictable_device_count, 2)
+
+        # Match same prefix - should increment ref
+        req2 = MockRequest(
+            request_id="req_2",
+            prompt_hashes=["h1", "h2"],
+            block_tables=[],
+        )
+        cache_manager.match_prefix(req2)
+
+        # Ref count should be incremented, nodes not evictable
+        stats2 = cache_manager.radix_tree.get_stats()
+        self.assertEqual(stats2.evictable_device_count, 0)
+
+    def test_insert_and_find_prefix(self):
+        """Test inserting blocks and finding prefix."""
+        cache_manager = create_cache_manager()
+
+        # Insert blocks
+        req1 = MockRequest(
+            request_id="req_1",
+            prompt_hashes=["hash_a", "hash_b", "hash_c"],
+            block_tables=[],
+        )
+        cache_manager.match_prefix(req1)
+        allocated = cache_manager.allocate_device_blocks(req1, 3)
+        req1.block_tables = allocated
+        cache_manager.request_finish(req1)
+
+        # Find prefix
+        req2 = MockRequest(
+            request_id="req_2",
+            prompt_hashes=["hash_a", "hash_b"],
+            block_tables=[],
+        )
+        cache_manager.match_prefix(req2)
+
+        self.assertEqual(req2._match_result.matched_device_nums, 2)
+        # Block IDs depend on allocation order; verify count and that they are valid ints
+        block_ids = req2._match_result.device_block_ids
+        self.assertEqual(len(block_ids), 2)
+        self.assertTrue(all(isinstance(bid, int) for bid in block_ids))
+
+
+class TestCacheManagerWithDisabledPrefixCaching(unittest.TestCase):
+    """Test CacheManager with prefix caching disabled."""
+
+    def test_radix_tree_none_when_disabled(self):
+        """Test radix_tree is None when prefix caching disabled."""
+        cache_manager = create_cache_manager(enable_prefix_caching=False)
+        self.assertIsNone(cache_manager.radix_tree)
+
+    def test_allocation_works_without_prefix_caching(self):
+        """Test block allocation still works without prefix caching."""
+        cache_manager = create_cache_manager(enable_prefix_caching=False)
+        req = MockRequest(request_id="req", prompt_hashes=[], block_tables=[])
+        allocated = cache_manager.allocate_device_blocks(req, 10)
+
+        self.assertIsNotNone(allocated)
+        self.assertEqual(len(allocated), 10)
+
+
+class TestCacheManagerWithNoHostCache(unittest.TestCase):
+    """Test CacheManager with no host cache."""
+
+    def test_host_cache_disabled(self):
+        """Test host cache is disabled."""
+        cache_manager = create_cache_manager(num_cpu_blocks=0)
+        self.assertFalse(cache_manager.enable_host_cache)
+
+    def test_no_free_host_blocks(self):
+        """Test no free host blocks when disabled."""
+        cache_manager = create_cache_manager(num_cpu_blocks=0)
+        self.assertEqual(cache_manager.num_free_host_blocks, 0)
+
+
+class TestCacheManagerProperties(unittest.TestCase):
+    """Test CacheManager properties."""
+
+    def test_device_pool_property(self):
+        """Test device_pool property returns correct pool."""
+        from fastdeploy.cache_manager.v1.block_pool import DeviceBlockPool
+
+        cache_manager = create_cache_manager()
+        self.assertIsInstance(cache_manager.device_pool, DeviceBlockPool)
+
+    def test_host_pool_property(self):
+        """Test host_pool property returns correct pool."""
+        from fastdeploy.cache_manager.v1.block_pool import HostBlockPool
+
+        cache_manager = create_cache_manager()
+        self.assertIsInstance(cache_manager.host_pool, HostBlockPool)
+
+    def test_radix_tree_property(self):
+        """Test radix_tree property returns correct tree."""
+        from fastdeploy.cache_manager.v1.radix_tree import RadixTree
+
+        cache_manager = create_cache_manager()
+        self.assertIsInstance(cache_manager.radix_tree, RadixTree)
+
+
+class TestCacheManagerStats(unittest.TestCase):
+    """Test CacheManager statistics methods."""
+
+    def test_get_stats(self):
+        """Test get_stats returns correct structure."""
+        cache_manager = create_cache_manager()
+        stats = cache_manager.get_stats()
+
+        self.assertIn("initialized", stats)
+        self.assertIn("num_gpu_blocks", stats)
+        self.assertIn("num_cpu_blocks", stats)
+        self.assertIn("block_size", stats)
+        self.assertIn("device_pool", stats)
+        self.assertIn("host_pool", stats)
+        self.assertIn("num_free_device_blocks", stats)
+        self.assertIn("num_free_host_blocks", stats)
+        self.assertIn("radix_tree", stats)
+
+        self.assertTrue(stats["initialized"])
+        self.assertEqual(stats["num_gpu_blocks"], 100)
+        self.assertEqual(stats["num_cpu_blocks"], 50)
+
+    def test_get_memory_usage(self):
+        """Test get_memory_usage returns correct structure."""
+        cache_manager = create_cache_manager()
+        usage = cache_manager.get_memory_usage()
+
+        self.assertIn("device", usage)
+        self.assertIn("host", usage)
+        self.assertIn("total_blocks", usage["device"])
+        self.assertIn("used_blocks", usage["device"])
+        self.assertIn("free_blocks", usage["device"])
+        self.assertIn("usage_percent", usage["device"])
+
+
+class TestCacheManagerEdgeCases(unittest.TestCase):
+    """Test CacheManager edge cases."""
+
+    def test_empty_prompt_hashes(self):
+        """Test request with empty prompt hashes."""
+        cache_manager = create_cache_manager()
+        req = MockRequest(request_id="req", prompt_hashes=[], block_tables=[])
+
+        cache_manager.match_prefix(req)
+        self.assertEqual(req.match_result.total_matched_blocks, 0)
+
+        allocated = cache_manager.allocate_device_blocks(req, 0)
+        self.assertEqual(allocated, [])
+
+    def test_allocation_with_matched_host_blocks(self):
+        """Test allocation when host cache has matched blocks."""
+        cache_manager = create_cache_manager(num_cpu_blocks=50)
+
+        # Insert blocks and evict some to host
+        req1 = MockRequest(
+            request_id="req_1",
+            prompt_hashes=["h1", "h2", "h3"],
+            block_tables=[],
+        )
+        cache_manager.match_prefix(req1)
+        allocated1 = cache_manager.allocate_device_blocks(req1, 3)
+        req1.block_tables = allocated1
+        cache_manager.request_finish(req1)
+
+        # Exhaust device, evict to host
+        for i in range(10):
+            req = MockRequest(
+                request_id=f"req_{i}",
+                prompt_hashes=[f"other_{i}_{j}" for j in range(10)],
+                block_tables=[],
+            )
+            cache_manager.match_prefix(req)
+            allocated = cache_manager.allocate_device_blocks(req, 10)
+            req.block_tables = allocated
+            cache_manager.request_finish(req)
+
+        # Now request h1, h2 - should find them in host cache
+        req2 = MockRequest(
+            request_id="req_2",
+            prompt_hashes=["h1", "h2"],
+            block_tables=[],
+        )
+        cache_manager.match_prefix(req2)
+
+        # After device is full, h1 and h2 may be evicted to host (write_through policy)
+        # Total matched should be non-negative regardless of eviction policy
+        total_matched = req2._match_result.total_matched_blocks
+        self.assertGreaterEqual(total_matched, 0)
+        # If found in host, matched_host_nums > 0
+        if req2._match_result.matched_host_nums > 0:
+            self.assertGreater(req2._match_result.matched_host_nums, 0)
+
+
+class TestCacheManagerCanAllocate(unittest.TestCase):
+    """Test CacheManager can_allocate_* methods."""
+
+    def test_can_allocate_device_blocks_enough(self):
+        """Test can_allocate_device_blocks returns True when enough free blocks."""
+        cache_manager = create_cache_manager(total_block_num=100)
+        self.assertTrue(cache_manager.can_allocate_device_blocks(50))
+
+    def test_can_allocate_device_blocks_exact(self):
+        """Test can_allocate_device_blocks returns True for exact count."""
+        cache_manager = create_cache_manager(total_block_num=100)
+        self.assertTrue(cache_manager.can_allocate_device_blocks(100))
+
+    def test_can_allocate_device_blocks_too_many(self):
+        """Test can_allocate_device_blocks returns False when not enough blocks."""
+        cache_manager = create_cache_manager(total_block_num=100, enable_prefix_caching=False)
+        self.assertFalse(cache_manager.can_allocate_device_blocks(101))
+
+    def test_can_allocate_host_blocks_enough(self):
+        """Test can_allocate_host_blocks returns True when enough free blocks."""
+        cache_manager = create_cache_manager(num_cpu_blocks=50)
+        self.assertTrue(cache_manager.can_allocate_host_blocks(30))
+
+    def test_can_allocate_host_blocks_too_many(self):
+        """Test can_allocate_host_blocks returns False when not enough blocks."""
+        cache_manager = create_cache_manager(num_cpu_blocks=10, enable_prefix_caching=False)
+        self.assertFalse(cache_manager.can_allocate_host_blocks(20))
+
+    def test_can_allocate_gpu_blocks_alias(self):
+        """Test can_allocate_gpu_blocks is alias for can_allocate_device_blocks."""
+        cache_manager = create_cache_manager(total_block_num=100)
+        self.assertEqual(
+            cache_manager.can_allocate_device_blocks(50),
+            cache_manager.can_allocate_gpu_blocks(50),
+        )
+
+
+class TestCacheManagerLegacyMethods(unittest.TestCase):
+    """Test CacheManager legacy compatibility methods."""
+
+    def test_allocate_gpu_blocks_alias(self):
+        """Test allocate_gpu_blocks delegates to allocate_device_blocks."""
+        cache_manager = create_cache_manager()
+        req = MockRequest(request_id="req", prompt_hashes=[], block_tables=[])
+        allocated = cache_manager.allocate_gpu_blocks(req, 5)
+
+        self.assertIsNotNone(allocated)
+        self.assertEqual(len(allocated), 5)
+
+    def test_gpu_free_block_list_property(self):
+        """Test gpu_free_block_list returns a list."""
+        cache_manager = create_cache_manager(total_block_num=100)
+        free_list = cache_manager.gpu_free_block_list
+        self.assertIsInstance(free_list, list)
+
+    def test_available_gpu_resource_full(self):
+        """Test available_gpu_resource is 1.0 when no blocks used."""
+        cache_manager = create_cache_manager(total_block_num=100)
+        self.assertAlmostEqual(cache_manager.available_gpu_resource, 1.0)
+
+    def test_available_gpu_resource_after_allocation(self):
+        """Test available_gpu_resource decreases after allocation."""
+        cache_manager = create_cache_manager(total_block_num=100, enable_prefix_caching=False)
+        req = MockRequest(request_id="req", prompt_hashes=[], block_tables=[])
+        cache_manager.allocate_device_blocks(req, 50)
+        self.assertAlmostEqual(cache_manager.available_gpu_resource, 0.5)
+
+    def test_update_cache_config(self):
+        """Test update_cache_config resizes device pool when total_block_num changes."""
+        cache_manager = create_cache_manager(total_block_num=100)
+
+        new_cfg = cache_manager.cache_config
+        new_cfg.total_block_num = 150
+        cache_manager.update_cache_config(new_cfg)
+
+        self.assertEqual(cache_manager.num_gpu_blocks, 150)
+
+
+class TestCacheManagerStorageScheduler(unittest.TestCase):
+    """Test CacheManager storage_scheduler property."""
+
+    def test_storage_scheduler_none_by_default(self):
+        """Test storage_scheduler is None when not configured."""
+        cache_manager = create_cache_manager()
+        # Default config has no storage backend, so scheduler should be None
+        # (behavior depends on create_storage_scheduler implementation)
+        # Just verify it's accessible without error
+        _ = cache_manager.storage_scheduler
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/tests/cache_manager/v1/test_cache_utils.py b/tests/cache_manager/v1/test_cache_utils.py
new file mode 100644
index 00000000000..06de020cd0c
--- /dev/null
+++ b/tests/cache_manager/v1/test_cache_utils.py
@@ -0,0 +1,389 @@
+"""
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+Unit tests for get_block_hash_extra_keys in
+fastdeploy/cache_manager/v1/cache_utils.py.
+
+Tests mirror the style used in
+tests/cache_manager/test_prefix_cache_manager.py and cover:
+
+- Early return paths (None input, missing keys, empty mm_positions)
+- Fast-exit path (last item ends before block start)
+- Image entirely before the block  (skip via continue)
+- Image entirely after the block   (stop via return)
+- Image fully contained in block
+- Image spanning the right block boundary
+- Image spanning the entire block (starts before, ends after)
+- Multiple images: only overlapping ones included
+- Sequential multi-block scan using the returned mm_idx
+- Single-token block and single-token image edge cases
+"""
+
+import unittest
+from types import SimpleNamespace
+
+from fastdeploy.cache_manager.v1.cache_utils import get_block_hash_extra_keys
+
+
+def _req(mm_positions, mm_hashes):
+    """Build a minimal request-like object with multimodal_inputs."""
+    return SimpleNamespace(
+        multimodal_inputs={
+            "mm_positions": [SimpleNamespace(offset=o, length=l) for o, l in mm_positions],
+            "mm_hashes": list(mm_hashes),
+        }
+    )
+
+
+class TestGetBlockHashExtraKeysEarlyReturn(unittest.TestCase):
+    """Tests for the guard / early-return paths at the top of the function."""
+
+    def test_multimodal_inputs_none(self):
+        """multimodal_inputs=None → (mm_idx, []) unchanged."""
+        req = SimpleNamespace(multimodal_inputs=None)
+        mm_idx, keys = get_block_hash_extra_keys(req, start_idx=0, end_idx=4, mm_idx=0)
+        self.assertEqual((mm_idx, keys), (0, []))
+
+    def test_multimodal_inputs_attribute_missing(self):
+        """Object without multimodal_inputs attribute → treated as None."""
+        req = SimpleNamespace()
+        mm_idx, keys = get_block_hash_extra_keys(req, start_idx=0, end_idx=4, mm_idx=0)
+        self.assertEqual((mm_idx, keys), (0, []))
+
+    def test_mm_positions_key_missing(self):
+        """mm_positions key absent → early return."""
+        req = SimpleNamespace(multimodal_inputs={"mm_hashes": ["h"]})
+        mm_idx, keys = get_block_hash_extra_keys(req, start_idx=0, end_idx=4, mm_idx=0)
+        self.assertEqual((mm_idx, keys), (0, []))
+
+    def test_mm_hashes_key_missing(self):
+        """mm_hashes key absent → early return."""
+        req = SimpleNamespace(multimodal_inputs={"mm_positions": [SimpleNamespace(offset=0, length=2)]})
+        mm_idx, keys = get_block_hash_extra_keys(req, start_idx=0, end_idx=4, mm_idx=0)
+        self.assertEqual((mm_idx, keys), (0, []))
+
+    def test_mm_positions_empty_list(self):
+        """mm_positions=[] → early return."""
+        req = SimpleNamespace(multimodal_inputs={"mm_positions": [], "mm_hashes": []})
+        mm_idx, keys = get_block_hash_extra_keys(req, start_idx=0, end_idx=4, mm_idx=0)
+        self.assertEqual((mm_idx, keys), (0, []))
+
+    def test_fast_exit_last_item_ends_exactly_at_block_start(self):
+        """
+        Fast-exit: last item offset+length == start_idx
+        (item ends exactly where block begins → no overlap).
+        """
+        # img [0,4), block [4,8)  →  4 <= 4  → fast exit
+        req = _req([(0, 4)], ["h"])
+        mm_idx, keys = get_block_hash_extra_keys(req, start_idx=4, end_idx=8, mm_idx=0)
+        self.assertEqual((mm_idx, keys), (0, []))
+
+    def test_fast_exit_last_item_ends_before_block_start(self):
+        """Fast-exit: all items end strictly before block start."""
+        # img [0,3), block [4,8)
+        req = _req([(0, 3)], ["h"])
+        mm_idx, keys = get_block_hash_extra_keys(req, start_idx=4, end_idx=8, mm_idx=0)
+        self.assertEqual((mm_idx, keys), (0, []))
+
+    def test_fast_exit_preserves_mm_idx(self):
+        """Fast-exit returns the original mm_idx unchanged."""
+        req = _req([(0, 2)], ["h"])
+        mm_idx, keys = get_block_hash_extra_keys(req, start_idx=5, end_idx=9, mm_idx=0)
+        self.assertEqual(mm_idx, 0)
+        self.assertEqual(keys, [])
+
+
+class TestGetBlockHashExtraKeysSingleImage(unittest.TestCase):
+    """Tests with exactly one multimodal item and one block."""
+
+    # ------------------------------------------------------------------
+    # Item entirely before block → skip (continue), reaches end of loop
+    # ------------------------------------------------------------------
+
+    def test_item_ends_before_block_start(self):
+        """img [0,2) is entirely before block [3,7)."""
+        req = _req([(0, 2)], ["h"])
+        mm_idx, keys = get_block_hash_extra_keys(req, start_idx=3, end_idx=7, mm_idx=0)
+        self.assertEqual((mm_idx, keys), (0, []))
+
+    def test_item_ends_exactly_at_block_start(self):
+        """img [0,3) ends exactly at block start 3 → 3<=3 → skip."""
+        req = _req([(0, 3)], ["h"])
+        mm_idx, keys = get_block_hash_extra_keys(req, start_idx=3, end_idx=7, mm_idx=0)
+        self.assertEqual((mm_idx, keys), (0, []))
+
+    # ------------------------------------------------------------------
+    # Item entirely after block → stop (return img_idx, [])
+    # ------------------------------------------------------------------
+
+    def test_item_starts_at_block_end(self):
+        """img [8,10) starts exactly at block end 8 → offset>=end_idx → stop."""
+        req = _req([(8, 2)], ["h"])
+        mm_idx, keys = get_block_hash_extra_keys(req, start_idx=4, end_idx=8, mm_idx=0)
+        self.assertEqual((mm_idx, keys), (0, []))
+
+    def test_item_starts_after_block_end(self):
+        """img [10,3) starts strictly after block [4,8)."""
+        req = _req([(10, 3)], ["h"])
+        mm_idx, keys = get_block_hash_extra_keys(req, start_idx=4, end_idx=8, mm_idx=0)
+        self.assertEqual((mm_idx, keys), (0, []))
+
+    # ------------------------------------------------------------------
+    # Item spans beyond block right boundary
+    # ------------------------------------------------------------------
+
+    def test_item_spans_right_boundary(self):
+        """img [6,4) → [6,10) spans block [4,8) right boundary."""
+        req = _req([(6, 4)], ["hash-cross"])
+        mm_idx, keys = get_block_hash_extra_keys(req, start_idx=4, end_idx=8, mm_idx=0)
+        self.assertEqual((mm_idx, keys), (0, ["hash-cross"]))
+
+    def test_item_spans_entire_block(self):
+        """img [3,6) → [3,9) wraps the whole block [4,8)."""
+        req = _req([(3, 6)], ["hash-span"])
+        mm_idx, keys = get_block_hash_extra_keys(req, start_idx=4, end_idx=8, mm_idx=0)
+        self.assertEqual((mm_idx, keys), (0, ["hash-span"]))
+
+    def test_item_starts_at_block_start_spans_right(self):
+        """img starts at block start, extends past block end."""
+        req = _req([(4, 6)], ["h"])
+        mm_idx, keys = get_block_hash_extra_keys(req, start_idx=4, end_idx=8, mm_idx=0)
+        self.assertEqual((mm_idx, keys), (0, ["h"]))
+
+    # ------------------------------------------------------------------
+    # Item fully contained within block
+    # ------------------------------------------------------------------
+
+    def test_item_fully_inside_block(self):
+        """img [2,2) → [2,4) fully inside block [0,8)."""
+        req = _req([(2, 2)], ["hash-inside"])
+        mm_idx, keys = get_block_hash_extra_keys(req, start_idx=0, end_idx=8, mm_idx=0)
+        self.assertIn("hash-inside", keys)
+
+    def test_item_fills_block_exactly(self):
+        """img occupies exactly the block [4,8)."""
+        req = _req([(4, 4)], ["h-exact"])
+        mm_idx, keys = get_block_hash_extra_keys(req, start_idx=4, end_idx=8, mm_idx=0)
+        self.assertEqual((mm_idx, keys), (0, ["h-exact"]))
+
+    # ------------------------------------------------------------------
+    # Single-token edge cases
+    # ------------------------------------------------------------------
+
+    def test_single_token_block_single_token_item_inside(self):
+        """Block [5,6), img [5,1) → item fills the single-token block."""
+        req = _req([(5, 1)], ["h1"])
+        mm_idx, keys = get_block_hash_extra_keys(req, start_idx=5, end_idx=6, mm_idx=0)
+        self.assertIn("h1", keys)
+
+    def test_single_token_block_item_starts_after(self):
+        """Block [5,6), img [6,1) → starts at block end, not included."""
+        req = _req([(6, 1)], ["h1"])
+        mm_idx, keys = get_block_hash_extra_keys(req, start_idx=5, end_idx=6, mm_idx=0)
+        self.assertEqual(keys, [])
+
+
+class TestGetBlockHashExtraKeysMultipleImages(unittest.TestCase):
+    """Tests with multiple multimodal items."""
+
+    def test_only_overlapping_items_included(self):
+        """
+        3 images; only the one overlapping the block should be in hash_keys.
+          img0: [0,2) → before block [4,8)
+          img1: [5,2) → inside  block [4,8)
+          img2: [9,2) → after   block [4,8)
+        """
+        req = _req([(0, 2), (5, 2), (9, 2)], ["h0", "h1", "h2"])
+        mm_idx, keys = get_block_hash_extra_keys(req, start_idx=4, end_idx=8, mm_idx=0)
+        self.assertNotIn("h0", keys)
+        self.assertIn("h1", keys)
+        self.assertNotIn("h2", keys)
+
+    def test_multiple_items_all_inside_block(self):
+        """Two images both inside the block → both hashes collected."""
+        req = _req([(1, 2), (4, 2)], ["hA", "hB"])
+        mm_idx, keys = get_block_hash_extra_keys(req, start_idx=0, end_idx=8, mm_idx=0)
+        self.assertEqual(keys, ["hA", "hB"])
+
+    def test_no_item_overlaps_block(self):
+        """All images are before the block → empty keys."""
+        req = _req([(0, 2), (2, 1)], ["h0", "h1"])
+        mm_idx, keys = get_block_hash_extra_keys(req, start_idx=5, end_idx=9, mm_idx=0)
+        self.assertEqual(keys, [])
+
+    def test_mm_idx_skips_already_processed_items(self):
+        """
+        When mm_idx=1, item at index 0 is not scanned at all.
+        """
+        req = _req([(0, 2), (5, 2)], ["h0", "h1"])
+        # Start scanning from mm_idx=1, so h0 must never appear
+        mm_idx, keys = get_block_hash_extra_keys(req, start_idx=4, end_idx=8, mm_idx=1)
+        self.assertNotIn("h0", keys)
+        self.assertIn("h1", keys)
+
+    def test_returned_mm_idx_points_to_spanning_item(self):
+        """
+        When an item spans the block right boundary, returned mm_idx points
+        to that item (so the next block can re-examine it).
+
+        img0 [2,7): offset+length=9 > end_idx=8 → spans right boundary
+        → include hA, return img_idx=0 immediately (img1 never reached).
+        """
+        # img0 offset=2, length=7 → end=9 > end_idx=8 → spans right boundary
+        req = _req([(2, 7), (10, 2)], ["hA", "hB"])
+        mm_idx, keys = get_block_hash_extra_keys(req, start_idx=4, end_idx=8, mm_idx=0)
+        self.assertEqual(mm_idx, 0)  # still points to img0 (not fully consumed)
+        self.assertIn("hA", keys)
+        self.assertNotIn("hB", keys)
+
+    def test_returned_mm_idx_stops_at_after_item(self):
+        """
+        When an item starts after the block, returned mm_idx points to it
+        so the next block can start scanning from there.
+        """
+        req = _req([(2, 2), (9, 1)], ["hA", "hB"])
+        # img1 [9,10) is after block [4,8)
+        mm_idx, keys = get_block_hash_extra_keys(req, start_idx=4, end_idx=8, mm_idx=1)
+        self.assertEqual(mm_idx, 1)
+        self.assertEqual(keys, [])
+
+
+class TestGetBlockHashExtraKeysSequentialScan(unittest.TestCase):
+    """
+    Simulates a full multi-block scan, reusing the returned mm_idx as the
+    next call's mm_idx – mirroring the exact pattern used in
+    test_prefix_cache_manager.py.
+
+    Data layout (block_size=4):
+      tokens: 0  1  2  3  4  5  6  7  8  9  10 11 12 13 14 15
+      img0:          [=====]                      [2,5)   hash-0
+      img1:                         [========]    [8,12)  hash-1
+      img2:                                  [==] [14,16) hash-2
+      blocks:  [0,4)  [4,8)  [8,12) [12,16)
+    """
+
+    def setUp(self):
+        self.req = SimpleNamespace(
+            multimodal_inputs={
+                "mm_positions": [
+                    SimpleNamespace(offset=2, length=3),  # [2,5)
+                    SimpleNamespace(offset=8, length=4),  # [8,12)
+                    SimpleNamespace(offset=14, length=2),  # [14,16)
+                ],
+                "mm_hashes": ["hash-0", "hash-1", "hash-2"],
+            }
+        )
+
+    def test_block_0_4(self):
+        """Block [0,4): img0 [2,5) spans right boundary → hash-0, mm_idx=0."""
+        mm_idx, keys = get_block_hash_extra_keys(self.req, start_idx=0, end_idx=4, mm_idx=0)
+        self.assertEqual((mm_idx, keys), (0, ["hash-0"]))
+
+    def test_block_4_8_using_returned_mm_idx(self):
+        """Block [4,8): carry mm_idx=0 from previous call → img0 tail, then img1 stops."""
+        mm_idx, keys = get_block_hash_extra_keys(self.req, start_idx=4, end_idx=8, mm_idx=0)
+        self.assertEqual((mm_idx, keys), (1, ["hash-0"]))
+
+    def test_block_8_12_using_returned_mm_idx(self):
+        """Block [8,12): img1 [8,12) exactly fills block → hash-1, mm_idx advances."""
+        mm_idx, keys = get_block_hash_extra_keys(self.req, start_idx=8, end_idx=12, mm_idx=1)
+        self.assertEqual((mm_idx, keys), (2, ["hash-1"]))
+
+    def test_block_12_16_using_returned_mm_idx(self):
+        """Block [12,16): img2 [14,16) fully inside → hash-2."""
+        mm_idx, keys = get_block_hash_extra_keys(self.req, start_idx=12, end_idx=16, mm_idx=2)
+        self.assertEqual((mm_idx, keys), (2, ["hash-2"]))
+
+    def test_full_sequential_scan(self):
+        """Run all four blocks sequentially, feeding mm_idx forward."""
+        mm_idx = 0
+        expected = [
+            ((0, 4), (0, ["hash-0"])),
+            ((4, 8), (1, ["hash-0"])),
+            ((8, 12), (2, ["hash-1"])),
+            ((12, 16), (2, ["hash-2"])),
+        ]
+        for (s, e), (exp_mm_idx, exp_keys) in expected:
+            mm_idx, keys = get_block_hash_extra_keys(self.req, start_idx=s, end_idx=e, mm_idx=mm_idx)
+            self.assertEqual((mm_idx, keys), (exp_mm_idx, exp_keys), msg=f"block [{s},{e})")
+
+
+class TestGetBlockHashExtraKeysBoundaryPrecision(unittest.TestCase):
+    """Exact boundary conditions: <= vs < matters at edges."""
+
+    def test_item_end_equals_start_idx_not_included(self):
+        """
+        offset+length == start_idx  →  item ends exactly where block starts
+        →  condition `<= start_idx` is True  →  skip (not included).
+        """
+        # img [0,4), block [4,8): 0+4=4 == start_idx=4 → skip
+        req = SimpleNamespace(
+            multimodal_inputs={
+                "mm_positions": [SimpleNamespace(offset=0, length=4), SimpleNamespace(offset=10, length=1)],
+                "mm_hashes": ["h-boundary", "h-other"],
+            }
+        )
+        _, keys = get_block_hash_extra_keys(req, start_idx=4, end_idx=8, mm_idx=0)
+        self.assertNotIn("h-boundary", keys)
+
+    def test_item_offset_equals_end_idx_not_included(self):
+        """
+        offset == end_idx  →  item starts exactly where block ends
+        →  condition `>= end_idx` is True  →  stop (not included).
+        """
+        # img [8,2), block [4,8): offset=8 == end_idx=8 → stop
+        req = SimpleNamespace(
+            multimodal_inputs={
+                "mm_positions": [SimpleNamespace(offset=8, length=2)],
+                "mm_hashes": ["h-boundary"],
+            }
+        )
+        _, keys = get_block_hash_extra_keys(req, start_idx=4, end_idx=8, mm_idx=0)
+        self.assertNotIn("h-boundary", keys)
+
+    def test_item_end_one_past_block_end_included(self):
+        """
+        offset+length == end_idx+1  →  item end is 1 past block end
+        →  condition `> end_idx` is True  →  included and mm_idx stays.
+        """
+        # img [6,3) → [6,9), block [4,8): 6+3=9 > 8 → spans right boundary
+        req = SimpleNamespace(
+            multimodal_inputs={
+                "mm_positions": [SimpleNamespace(offset=6, length=3)],
+                "mm_hashes": ["h-one-past"],
+            }
+        )
+        mm_idx, keys = get_block_hash_extra_keys(req, start_idx=4, end_idx=8, mm_idx=0)
+        self.assertIn("h-one-past", keys)
+        self.assertEqual(mm_idx, 0)
+
+    def test_item_end_equals_end_idx_fully_contained(self):
+        """
+        offset+length == end_idx  →  item ends exactly at block end
+        →  condition `> end_idx` is False  →  fully contained, included.
+        """
+        # img [4,4) → [4,8), block [4,8): 4+4=8 == end_idx=8 → contained
+        req = SimpleNamespace(
+            multimodal_inputs={
+                "mm_positions": [SimpleNamespace(offset=4, length=4)],
+                "mm_hashes": ["h-exact-end"],
+            }
+        )
+        _, keys = get_block_hash_extra_keys(req, start_idx=4, end_idx=8, mm_idx=0)
+        self.assertIn("h-exact-end", keys)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/tests/cache_manager/v1/test_radix_tree.py b/tests/cache_manager/v1/test_radix_tree.py
new file mode 100644
index 00000000000..3694d3192d3
--- /dev/null
+++ b/tests/cache_manager/v1/test_radix_tree.py
@@ -0,0 +1,1329 @@
+# Copyright (c) 2025  PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Unit tests for RadixTree in cache_manager/v1.
+
+Tests cover:
+- Basic operations: insert, find_prefix, increment_ref_nodes, decrement_ref_nodes
+- Eviction: evict_host_nodes, evict_device_to_host
+- Edge cases and error handling
+
+Run with:
+    source .venv/py310/bin/activate
+    pytest tests/cache_manager/v1/test_radix_tree.py -v
+"""
+
+import time
+
+from fastdeploy.cache_manager.v1.metadata import CacheStatus
+from fastdeploy.cache_manager.v1.radix_tree import RadixTree
+
+
+class TestRadixTreeInit:
+    """Tests for RadixTree initialization."""
+
+    def test_init_default(self):
+        """Test default initialization."""
+        tree = RadixTree()
+        assert tree.node_count() == 1  # Only root
+        assert tree._enable_host_cache is False
+
+    def test_init_with_host_cache(self):
+        """Test initialization with host cache enabled."""
+        tree = RadixTree(enable_host_cache=True)
+        assert tree._enable_host_cache is True
+
+    def test_get_stats(self):
+        """Test get_stats returns correct structure."""
+        tree = RadixTree()
+        stats = tree.get_stats()
+        assert stats.node_count == 1
+        assert stats.evictable_device_count == 0
+        assert stats.evictable_host_count == 0
+        assert stats.evictable_count == 0
+        # Test to_dict
+        stats_dict = stats.to_dict()
+        assert "node_count" in stats_dict
+        assert "evictable_device_count" in stats_dict
+        assert "evictable_host_count" in stats_dict
+        assert "evictable_count" in stats_dict
+
+
+class TestRadixTreeInsert:
+    """Tests for insert operation."""
+
+    def test_insert_single_block(self):
+        """Test inserting a single block."""
+        tree = RadixTree()
+        result, _ = tree.insert([("hash1", 1)])
+        assert len(result) == 1  # Returns list of nodes
+        assert tree.node_count() == 2  # root + 1 node
+
+    def test_insert_multiple_blocks(self):
+        """Test inserting multiple blocks in sequence."""
+        tree = RadixTree()
+        result, _ = tree.insert([("hash1", 1), ("hash2", 2), ("hash3", 3)])
+        assert len(result) == 3
+        assert tree.node_count() == 4  # root + 3 nodes
+
+    def test_insert_empty_list(self):
+        """Test inserting empty list returns empty list."""
+        tree = RadixTree()
+        result, _ = tree.insert([])
+        assert result == []
+        assert tree.node_count() == 1
+
+    def test_insert_shared_prefix(self):
+        """Test inserting sequences with shared prefix."""
+        tree = RadixTree()
+        # Insert first sequence
+        tree.insert([("hash1", 1), ("hash2", 2)])
+        # Insert second sequence sharing first block
+        tree.insert([("hash1", 1), ("hash3", 3)])
+
+        # Should reuse the first node, only add one new node
+        assert tree.node_count() == 4  # root + 3 unique nodes (hash1, hash2, hash3)
+
+    def test_insert_same_sequence_twice(self):
+        """Test inserting the same sequence twice increases ref_count."""
+        tree = RadixTree()
+        tree.insert([("hash1", 1), ("hash2", 2)])
+        tree.insert([("hash1", 1), ("hash2", 2)])
+
+        # Should reuse nodes, not create new ones
+        assert tree.node_count() == 3  # root + 2 nodes
+
+
+class TestRadixTreeFindPrefix:
+    """Tests for find_prefix operation."""
+
+    def test_find_prefix_full_match(self):
+        """Test finding a full prefix match."""
+        tree = RadixTree()
+        tree.insert([("hash1", 1), ("hash2", 2), ("hash3", 3)])
+
+        nodes = tree.find_prefix(["hash1", "hash2", "hash3"])
+        assert len(nodes) == 3
+        block_ids = [node.block_id for node in nodes]
+        assert block_ids == [1, 2, 3]
+
+    def test_find_prefix_partial_match(self):
+        """Test finding a partial prefix match."""
+        tree = RadixTree()
+        tree.insert([("hash1", 1), ("hash2", 2), ("hash3", 3)])
+
+        nodes = tree.find_prefix(["hash1", "hash2", "hash4"])
+        assert len(nodes) == 2
+        block_ids = [node.block_id for node in nodes]
+        assert block_ids == [1, 2]
+
+    def test_find_prefix_no_match(self):
+        """Test finding no prefix match."""
+        tree = RadixTree()
+        tree.insert([("hash1", 1), ("hash2", 2)])
+
+        nodes = tree.find_prefix(["hash3", "hash4"])
+        assert len(nodes) == 0
+
+    def test_find_prefix_empty_query(self):
+        """Test finding prefix with empty query."""
+        tree = RadixTree()
+        tree.insert([("hash1", 1)])
+
+        nodes = tree.find_prefix([])
+        assert len(nodes) == 0
+
+
+class TestRadixTreeRefCount:
+    """Tests for reference count operations."""
+
+    def test_increment_ref_nodes(self):
+        """Test incrementing reference count for nodes."""
+        tree = RadixTree()
+        nodes, _ = tree.insert([("hash1", 1), ("hash2", 2)])
+
+        # Release nodes first
+        tree.decrement_ref_nodes(nodes)
+        assert len(tree._evictable_device) == 2
+
+        # Increment again - should remove from evictable
+        tree.increment_ref_nodes(nodes)
+        assert len(tree._evictable_device) == 0
+
+    def test_decrement_ref_nodes(self):
+        """Test decrementing reference count for nodes."""
+        tree = RadixTree()
+        nodes, _ = tree.insert([("hash1", 1), ("hash2", 2)])
+
+        assert len(tree._evictable_device) == 0
+
+        # Decrement ref count
+        tree.decrement_ref_nodes(nodes)
+        assert len(tree._evictable_device) == 2
+
+    def test_decrement_ref_nodes_shared_prefix(self):
+        """Test decrementing with shared prefix."""
+        tree = RadixTree()
+        nodes1, _ = tree.insert([("hash1", 1), ("hash2", 2)])
+        nodes2, _ = tree.insert([("hash1", 1), ("hash3", 3)])
+
+        # Release first sequence
+        tree.decrement_ref_nodes(nodes1)
+        # hash2 should be evictable, hash1 still has ref=1
+        assert len(tree._evictable_device) == 1
+
+        # Release second sequence
+        tree.decrement_ref_nodes(nodes2)
+        # Now hash1 and hash3 should be evictable (hash2 already was)
+        assert len(tree._evictable_device) == 3
+
+
+class TestEvictDeviceToHost:
+    """Tests for evict_device_to_host method."""
+
+    def test_basic_evict_to_host(self):
+        """Test basic device-to-host eviction."""
+        tree = RadixTree(enable_host_cache=True)
+        nodes, _ = tree.insert([("h1", 10), ("h2", 20), ("h3", 30)])
+        tree.decrement_ref_nodes(nodes)
+
+        result = tree.evict_device_to_host(3, [100, 101, 102])
+        assert sorted(result) == [10, 20, 30]
+
+        stats = tree.get_stats()
+        assert stats.evictable_device_count == 0
+        assert stats.evictable_host_count == 3
+
+        # Verify nodes now have HOST status and new block_ids
+        for node in nodes:
+            assert node.cache_status == CacheStatus.HOST
+            assert node.block_id in [100, 101, 102]
+
+    def test_evict_partial(self):
+        """Test evicting only part of the evictable nodes."""
+        tree = RadixTree(enable_host_cache=True)
+        nodes, _ = tree.insert([("h1", 1), ("h2", 2), ("h3", 3)])
+        tree.decrement_ref_nodes(nodes)
+
+        # Evict only 1 out of 3
+        result = tree.evict_device_to_host(1, [100])
+        assert result == [1]
+
+        stats = tree.get_stats()
+        assert stats.evictable_device_count == 2
+        assert stats.evictable_host_count == 1
+
+    def test_evict_with_shared_prefix_non_evictable(self):
+        """Test eviction skips non-evictable nodes (ref_count > 0)."""
+        tree = RadixTree(enable_host_cache=True)
+
+        # Insert two sequences sharing prefix: h1->h2
+        nodes_a, _ = tree.insert([("h1", 1), ("h2", 2), ("h3", 3)])
+        tree.insert([("h1", 1), ("h2", 2), ("h4", 4)])
+
+        # Release only sequence A: h3 evictable, h1 and h2 still ref=2
+        tree.decrement_ref_nodes(nodes_a)
+
+        stats = tree.get_stats()
+        assert stats.evictable_device_count == 1  # only h3
+
+        # Evict h3 to host
+        result = tree.evict_device_to_host(1, [100])
+        assert result == [3]
+
+        # h3 should now be on host
+        for node in nodes_a:
+            if node.hash_value == "h3":
+                assert node.cache_status == CacheStatus.HOST
+                assert node.block_id == 100
+
+    def test_evict_skips_host_nodes_in_heap(self):
+        """Test that HOST nodes already in heap are skipped."""
+        tree = RadixTree(enable_host_cache=True)
+
+        # Insert and release sequence A
+        nodes_a, _ = tree.insert([("h1", 1), ("h2", 2)])
+        tree.decrement_ref_nodes(nodes_a)
+
+        # Evict A to host
+        tree.evict_device_to_host(2, [100, 101])
+
+        # Insert and release sequence B
+        nodes_b, _ = tree.insert([("h3", 3), ("h4", 4)])
+        tree.decrement_ref_nodes(nodes_b)
+
+        # Now heap has: host(h1), host(h2), device(h3), device(h4)
+        # Try to evict 2 device blocks - should skip host nodes
+        result = tree.evict_device_to_host(2, [200, 201])
+        assert sorted(result) == [3, 4]
+
+        stats = tree.get_stats()
+        assert stats.evictable_device_count == 0
+        assert stats.evictable_host_count == 4
+
+    def test_evict_to_host_then_reuse_in_find_prefix(self):
+        """Test that evicted HOST nodes can still be found by find_prefix."""
+        tree = RadixTree(enable_host_cache=True)
+
+        nodes, _ = tree.insert([("h1", 1), ("h2", 2)])
+        tree.decrement_ref_nodes(nodes)
+
+        # Evict to host
+        tree.evict_device_to_host(2, [100, 101])
+
+        # find_prefix should still match (HOST nodes are not skipped)
+        matched = tree.find_prefix(["h1", "h2"])
+        assert len(matched) == 2
+        block_ids = [n.block_id for n in matched]
+        assert block_ids == [100, 101]
+
+    def test_evict_to_host_then_swap_back_to_device(self):
+        """Test full cycle: insert -> evict to host -> swap back to device."""
+        tree = RadixTree(enable_host_cache=True)
+
+        nodes, _ = tree.insert([("h1", 1), ("h2", 2)])
+        tree.decrement_ref_nodes(nodes)
+
+        # Evict to host
+        tree.evict_device_to_host(2, [100, 101])
+        for node in nodes:
+            assert node.cache_status == CacheStatus.HOST
+
+        # Swap back to device: swap_to_device sets status directly to DEVICE (not SWAP_TO_DEVICE)
+        original_host_ids = tree.swap_to_device(nodes, [1, 2])
+        assert sorted(original_host_ids) == [100, 101]
+        for node in nodes:
+            assert node.cache_status == CacheStatus.DEVICE
+
+        # Complete swap (idempotent when already DEVICE)
+        tree.complete_swap_to_device(nodes)
+        for node in nodes:
+            assert node.cache_status == CacheStatus.DEVICE
+
+    def test_evict_precheck_insufficient_evictable(self):
+        """Test pre-check returns None when not enough evictable DEVICE nodes."""
+        tree = RadixTree(enable_host_cache=True)
+
+        # Insert but do NOT decrement (ref_count=1, not evictable)
+        tree.insert([("h1", 1)])
+
+        stats = tree.get_stats()
+        assert stats.evictable_device_count == 0
+
+        result = tree.evict_device_to_host(1, [100])
+        assert result is None
+
+    def test_evict_to_host_preserves_tree_structure(self):
+        """Test that eviction preserves tree parent-child relationships."""
+        tree = RadixTree(enable_host_cache=True)
+
+        nodes, _ = tree.insert([("h1", 1), ("h2", 2), ("h3", 3)])
+        tree.decrement_ref_nodes(nodes)
+
+        # Evict all to host
+        tree.evict_device_to_host(3, [100, 101, 102])
+
+        # Verify tree structure is intact
+        assert tree.node_count() == 4  # root + 3 nodes
+
+        root = tree._root
+        assert "h1" in root.children
+        assert "h2" in root.children["h1"].children
+        assert "h3" in root.children["h1"].children["h2"].children
+
+    def test_evict_to_host_multiple_times(self):
+        """Test evicting in multiple rounds."""
+        tree = RadixTree(enable_host_cache=True)
+
+        nodes, _ = tree.insert([("h1", 1), ("h2", 2), ("h3", 3), ("h4", 4)])
+        tree.decrement_ref_nodes(nodes)
+
+        # Round 1: evict 2 blocks
+        result1 = tree.evict_device_to_host(2, [100, 101])
+        assert sorted(result1) == [1, 2]
+
+        stats = tree.get_stats()
+        assert stats.evictable_device_count == 2
+        assert stats.evictable_host_count == 2
+
+        # Round 2: evict remaining 2 blocks
+        result2 = tree.evict_device_to_host(2, [102, 103])
+        assert sorted(result2) == [3, 4]
+
+        stats = tree.get_stats()
+        assert stats.evictable_device_count == 0
+        assert stats.evictable_host_count == 4
+
+
+class TestRadixTreeEviction:
+    """Tests for eviction operations."""
+
+    def test_evict_host_nodes(self):
+        """Test evicting HOST nodes."""
+        tree = RadixTree(enable_host_cache=True)
+        nodes, _ = tree.insert([("hash1", 1), ("hash2", 2)])
+        tree.decrement_ref_nodes(nodes)
+
+        # First, evict device to host
+        device_ids = tree.evict_device_to_host(2, [101, 102])
+        assert sorted(device_ids) == [1, 2]
+
+        # Now nodes are on host, evict them
+        host_ids = tree.evict_host_nodes(2)
+        assert sorted(host_ids) == [101, 102]
+        assert tree.node_count() == 1  # Only root
+
+    def test_evict_device_to_host(self):
+        """Test evicting DEVICE nodes to host."""
+        tree = RadixTree(enable_host_cache=True)
+        nodes, _ = tree.insert([("hash1", 1), ("hash2", 2)])
+        tree.decrement_ref_nodes(nodes)
+
+        device_ids = tree.evict_device_to_host(2, [101, 102])
+        assert sorted(device_ids) == [1, 2]
+
+        # Check nodes are now on host
+        stats = tree.get_stats()
+        assert stats.evictable_host_count == 2
+        assert stats.evictable_device_count == 0
+
+    def test_evict_device_to_host_not_enough_blocks(self):
+        """Test eviction when not enough evictable blocks."""
+        tree = RadixTree(enable_host_cache=True)
+        nodes, _ = tree.insert([("hash1", 1)])
+        tree.decrement_ref_nodes(nodes)
+
+        # Try to evict more than available
+        result = tree.evict_device_to_host(5, [101, 102, 103, 104, 105])
+        assert result is None
+
+    def test_evict_device_to_host_mismatched_host_ids(self):
+        """Test eviction with insufficient host_block_ids."""
+        tree = RadixTree(enable_host_cache=True)
+        nodes, _ = tree.insert([("hash1", 1), ("hash2", 2)])
+        tree.decrement_ref_nodes(nodes)
+
+        # Not enough host block ids
+        result = tree.evict_device_to_host(2, [101])  # Only 1 host id
+        assert result is None
+
+    def test_evict_host_nodes_empty(self):
+        """Test evicting when no host nodes available."""
+        tree = RadixTree()
+
+        result = tree.evict_host_nodes(1)
+        assert result is None
+
+    def test_evict_zero_blocks(self):
+        """Test evicting zero blocks returns empty list."""
+        tree = RadixTree()
+
+        result = tree.evict_host_nodes(0)
+        assert result == []
+
+        result = tree.evict_device_to_host(0, [])
+        assert result == []
+
+
+class TestRadixTreeReset:
+    """Tests for reset operation."""
+
+    def test_reset_clears_all(self):
+        """Test reset clears all data."""
+        tree = RadixTree()
+        nodes, _ = tree.insert([("hash1", 1), ("hash2", 2)])
+        tree.decrement_ref_nodes(nodes)
+
+        tree.reset()
+
+        assert tree.node_count() == 1
+        assert len(tree._evictable_device) == 0
+        assert len(tree._evictable_host) == 0
+
+
+class TestRadixTreeFullWorkflow:
+    """Tests for complete workflow scenarios."""
+
+    def test_workflow_shared_prefix_eviction(self):
+        """Test complete workflow with shared prefix and eviction."""
+        tree = RadixTree(enable_host_cache=True)
+
+        # Insert two sequences sharing a prefix
+        nodes_a, _ = tree.insert([("h1", 1), ("h2", 2), ("h3", 3)])  # Sequence A
+        _ = tree.insert([("h1", 1), ("h2", 2), ("h4", 4)])  # Sequence B
+
+        # Release sequence A
+        tree.decrement_ref_nodes(nodes_a)
+
+        # h3 should be evictable, but h1 and h2 still have ref_count=1
+        assert len(tree._evictable_device) == 1
+
+        # Find prefix for new sequence should still match h1, h2
+        matched_nodes = tree.find_prefix(["h1", "h2", "h5"])
+        assert len(matched_nodes) == 2
+        block_ids = [node.block_id for node in matched_nodes]
+        assert block_ids == [1, 2]
+
+    def test_workflow_evict_device_to_host_then_remove(self):
+        """Test workflow: evict to host, then remove from host."""
+        tree = RadixTree(enable_host_cache=True)
+
+        # Insert and release
+        nodes, _ = tree.insert([("h1", 1), ("h2", 2)])
+        tree.decrement_ref_nodes(nodes)
+
+        # Evict device to host
+        device_ids = tree.evict_device_to_host(2, [101, 102])
+        assert sorted(device_ids) == [1, 2]
+
+        # Nodes should be on host now and evictable again
+        stats = tree.get_stats()
+        assert stats.evictable_host_count == 2
+
+        # Now remove from host
+        host_ids = tree.evict_host_nodes(2)
+        assert sorted(host_ids) == [101, 102]
+        assert tree.node_count() == 1
+
+
+class TestRadixTreeEdgeCases:
+    """Tests for edge cases and error handling."""
+
+    def test_evict_not_enough_blocks(self):
+        """Test eviction when not enough evictable blocks."""
+        tree = RadixTree(enable_host_cache=True)
+        nodes, _ = tree.insert([("h1", 1)])
+        tree.decrement_ref_nodes(nodes)
+
+        # Try to evict more than available
+        result = tree.evict_device_to_host(5, [101, 102, 103, 104, 105])
+        assert result is None
+
+        # Node should still be evictable
+        assert len(tree._evictable_device) == 1
+
+    def test_node_id_uniqueness(self):
+        """Test that each node has a unique node_id."""
+        tree = RadixTree()
+        nodes, _ = tree.insert([("h1", 1), ("h2", 2), ("h3", 3)])
+
+        # Collect node_ids from the tree structure
+        node_ids = set()
+
+        def traverse(node):
+            if node.hash_value:  # Skip root
+                node_ids.add(node.node_id)
+            for child in node.children.values():
+                traverse(child)
+
+        traverse(tree._root)
+        assert len(node_ids) == 3  # All unique
+
+    def test_eviction_order_lru(self):
+        """Test that eviction follows LRU order."""
+        tree = RadixTree(enable_host_cache=True)
+
+        # Insert multiple blocks
+        nodes, _ = tree.insert([("h1", 1), ("h2", 2), ("h3", 3)])
+        tree.decrement_ref_nodes(nodes)
+
+        # Wait a bit and access h2
+        time.sleep(0.01)
+        _ = tree.find_prefix(["h1", "h2"])
+        # h2 is now more recently accessed
+
+        # Evict - should start with least recently used
+        device_ids = tree.evict_device_to_host(3, [101, 102, 103])
+        assert len(device_ids) == 3
+        # h1 should be evicted first (least recently accessed after find_prefix)
+        assert device_ids[0] == 1
+
+
+class TestRadixTreeMultiSequenceWorkflow:
+    """Tests for multi-sequence workflows simulating real usage patterns."""
+
+    def test_multi_sequence_shared_prefix_reuse(self):
+        """
+        Test multiple sequences sharing a common prefix.
+
+        Simulates CacheManager usage:
+        1. Request A: [h1, h2, h3] -> cached
+        2. Request B: [h1, h2, h4] -> finds prefix match for [h1, h2], inserts new [h4]
+        3. Request C: [h1, h2] -> finds full prefix match
+        """
+        tree = RadixTree(enable_host_cache=True)
+
+        # Request A: Insert full sequence
+        nodes_a, _ = tree.insert([("h1", 1), ("h2", 2), ("h3", 3)])
+        assert len(nodes_a) == 3
+
+        # After insert, h1 has ref_count=1
+        h1_node = tree._root.children["h1"]
+        assert h1_node.ref_count == 1
+
+        # Simulate request finish - decrement ref
+        tree.decrement_ref_nodes(nodes_a)
+
+        # Now h1, h2, h3 are all evictable (ref_count=0)
+        stats = tree.get_stats()
+        assert stats.evictable_device_count == 3
+
+        # Request B: Share prefix, insert new suffix
+        nodes_b, wasted = tree.insert([("h1", 1), ("h2", 2), ("h4", 4)])
+        assert len(nodes_b) == 3
+        # h1 and h2 should be reused (not incremented), h4 is new
+        # h1 and h2 still have ref_count=0, h4 has ref_count=1
+        assert tree.node_count() == 5  # root + h1, h2, h3, h4
+
+        h4_node = h1_node.children["h2"].children["h4"]
+        assert h4_node.ref_count == 1
+
+        # Decrement B's refs
+        tree.decrement_ref_nodes(nodes_b)
+
+        # Request C: Find prefix for [h1, h2]
+        matched = tree.find_prefix(["h1", "h2"])
+        assert len(matched) == 2
+
+        # Increment ref for matched nodes to prevent eviction
+        tree.increment_ref_nodes(matched)
+        assert h1_node.ref_count == 1
+        assert h1_node.children["h2"].ref_count == 1
+
+        # Decrement when done
+        tree.decrement_ref_nodes(matched)
+
+    def test_incremental_insert_after_prefix_match(self):
+        """
+        Test incremental insertion from a matched prefix node.
+
+        Simulates CacheManager usage where:
+        1. Insert [h1, h2] and cache it
+        2. Later request comes with [h1, h2, h3, h4]
+        3. find_prefix returns [h1, h2]
+        4. insert remaining [h3, h4] starting from matched node
+        """
+        tree = RadixTree()
+
+        # Initial sequence
+        nodes1, _ = tree.insert([("h1", 1), ("h2", 2)])
+        tree.decrement_ref_nodes(nodes1)
+
+        # Later request with longer sequence
+        matched = tree.find_prefix(["h1", "h2"])
+        assert len(matched) == 2
+
+        # Incremental insert starting from last matched node
+        last_node = matched[-1]
+        nodes2, wasted = tree.insert([("h3", 3), ("h4", 4)], start_node=last_node)
+        assert len(nodes2) == 2
+        assert len(wasted) == 0
+
+        # Verify complete sequence
+        full_match = tree.find_prefix(["h1", "h2", "h3", "h4"])
+        assert len(full_match) == 4
+
+    def test_three_request_caching_cycle(self):
+        """
+        Test complete caching cycle with three sequential requests.
+
+        Workflow:
+        1. Request 1: Insert [A, B, C], finish
+        2. Request 2: Find [A, B], gets match, continue with [X, Y], finish
+        3. Request 3: Find [A, B], gets full match
+
+        Note: Request 3 finds [A, B] but NOT [X] because X is under A, not B.
+        """
+        tree = RadixTree(enable_host_cache=True)
+
+        # Request 1: Insert and cache
+        req1_nodes, _ = tree.insert([("A", 1), ("B", 2), ("C", 3)])
+        tree.decrement_ref_nodes(req1_nodes)
+
+        # Request 2: Find prefix, add new blocks
+        matched = tree.find_prefix(["A", "B"])
+        assert len(matched) == 2
+        tree.increment_ref_nodes(matched)
+
+        req2_new, wasted = tree.insert([("X", 10), ("Y", 11)])
+        assert len(req2_new) == 2
+
+        tree.decrement_ref_nodes(matched)
+        tree.decrement_ref_nodes(req2_new)
+
+        # Request 3: Find [A, B] - should get full match
+        # X is NOT under B, so we can only match A, B
+        matched3 = tree.find_prefix(["A", "B"])
+        assert len(matched3) == 2
+
+        # Stats should show correct state
+        stats = tree.get_stats()
+        # Tree has: root, A, B, C (from req1), X, Y (from req2)
+        assert stats.node_count == 6
+
+
+class TestRadixTreeCompleteEvictionCycle:
+    """Tests for complete eviction cycles (DEVICE -> HOST -> Removed)."""
+
+    def test_full_eviction_cycle_single_sequence(self):
+        """
+        Test complete eviction cycle for a single sequence.
+
+        Cycle: Insert -> Decrement -> Evict to Host -> Remove from Host
+        """
+        tree = RadixTree(enable_host_cache=True)
+
+        # Step 1: Insert
+        nodes, _ = tree.insert([("h1", 1), ("h2", 2), ("h3", 3)])
+        assert tree.node_count() == 4
+
+        # Step 2: Decrement refs to make evictable
+        tree.decrement_ref_nodes(nodes)
+        stats = tree.get_stats()
+        assert stats.evictable_device_count == 3
+
+        # Step 3: Evict to host
+        released = tree.evict_device_to_host(3, [100, 101, 102])
+        assert sorted(released) == [1, 2, 3]
+        stats = tree.get_stats()
+        assert stats.evictable_device_count == 0
+        assert stats.evictable_host_count == 3
+
+        # Verify nodes are now HOST
+        for node in nodes:
+            assert node.cache_status == CacheStatus.HOST
+            assert node.block_id in [100, 101, 102]
+
+        # Step 4: Remove from host
+        evicted = tree.evict_host_nodes(3)
+        assert sorted(evicted) == [100, 101, 102]
+        assert tree.node_count() == 1  # Only root remains
+
+    def test_full_eviction_cycle_multiple_rounds(self):
+        """
+        Test eviction in multiple rounds.
+
+        Insert 10 blocks, evict 3, then evict remaining 7.
+        """
+        tree = RadixTree(enable_host_cache=True)
+
+        nodes, _ = tree.insert([(f"h{i}", i) for i in range(10)])
+        tree.decrement_ref_nodes(nodes)
+
+        # Round 1: Evict 3
+        released1 = tree.evict_device_to_host(3, [100, 101, 102])
+        assert len(released1) == 3
+
+        stats = tree.get_stats()
+        assert stats.evictable_device_count == 7
+        assert stats.evictable_host_count == 3
+
+        # Round 2: Evict remaining 7
+        released2 = tree.evict_device_to_host(7, [200, 201, 202, 203, 204, 205, 206])
+        assert len(released2) == 7
+
+        stats = tree.get_stats()
+        assert stats.evictable_device_count == 0
+        assert stats.evictable_host_count == 10
+
+        # Now remove all from host
+        evicted = tree.evict_host_nodes(10)
+        assert len(evicted) == 10
+        assert tree.node_count() == 1
+
+    def test_eviction_with_shared_prefix_multiple_refs(self):
+        """
+        Test eviction when nodes have shared prefixes with active references.
+
+        Tree structure:
+            root
+            └── h1 (ref=2) - shared by both sequences, incremented each insert
+                ├── h2 (evicted to HOST)
+                └── h3 (ref=1 after decrement)
+
+        After seq1 finishes: h1 stays (ref=1), h2 is evicted to HOST (still in tree)
+        """
+        tree = RadixTree(enable_host_cache=True)
+
+        # Insert seq1: h1 -> h2
+        nodes1, _ = tree.insert([("h1", 1), ("h2", 2)])
+        # Insert seq2: h1 -> h3 (shares h1)
+        nodes2, _ = tree.insert([("h1", 1), ("h3", 3)])
+
+        # Shared h1 has ref_count=2 (incremented on each insert traversal)
+        h1_node = tree._root.children["h1"]
+        assert h1_node.ref_count == 2
+
+        # Seq1 finishes - decrement its refs
+        tree.decrement_ref_nodes(nodes1)
+
+        # h1 still has ref=1, h2 should be evictable
+        stats = tree.get_stats()
+        assert stats.evictable_device_count == 1
+
+        # Evict h2 to host (changes status, node stays in tree until evict_host_nodes)
+        released = tree.evict_device_to_host(1, [100])
+        assert released == [2]
+
+        # h2 is now on host but still in tree
+        assert "h1" in tree._root.children
+        # evict_device_to_host only changes status, doesn't remove from tree
+        assert tree.node_count() == 4  # root + h1 + h2 + h3
+
+        # h2 is now on host with ref=0 (evictable in host heap)
+        h2_node = h1_node.children["h2"]
+        assert h2_node.cache_status == CacheStatus.HOST
+        assert h2_node.ref_count == 0
+
+
+class TestRadixTreeSwapWorkflow:
+    """Tests for HOST -> DEVICE swap workflow."""
+
+    def test_swap_host_to_device_complete_cycle(self):
+        """
+        Test full swap cycle: DEVICE -> HOST -> SWAP_TO_DEVICE -> DEVICE.
+
+        This simulates loading cached blocks back to GPU.
+        """
+        tree = RadixTree(enable_host_cache=True)
+
+        # Step 1: Insert and evict to host
+        nodes, _ = tree.insert([("h1", 1), ("h2", 2)])
+        tree.decrement_ref_nodes(nodes)
+        tree.evict_device_to_host(2, [100, 101])
+
+        # Verify nodes are on host
+        for node in nodes:
+            assert node.cache_status == CacheStatus.HOST
+            assert node.block_id in [100, 101]
+
+        # Step 2: Swap back to device
+        # swap_to_device() sets status directly to DEVICE (not SWAP_TO_DEVICE intermediate)
+        original_ids = tree.swap_to_device(nodes, [50, 51])
+        assert sorted(original_ids) == [100, 101]
+
+        # Verify status is DEVICE after swap_to_device
+        for node in nodes:
+            assert node.cache_status == CacheStatus.DEVICE
+            assert node.block_id in [50, 51]
+
+        # Step 3: complete_swap_to_device is idempotent when already DEVICE
+        gpu_ids = tree.complete_swap_to_device(nodes)
+        assert sorted(gpu_ids) == [50, 51]
+
+        for node in nodes:
+            assert node.cache_status == CacheStatus.DEVICE
+            assert node.block_id in [50, 51]
+
+    def test_swap_after_find_prefix(self):
+        """
+        Test that swapped blocks can still be found via find_prefix.
+
+        After swap_to_device, nodes should be findable again.
+        """
+        tree = RadixTree(enable_host_cache=True)
+
+        # Insert and evict
+        nodes, _ = tree.insert([("h1", 1), ("h2", 2)])
+        tree.decrement_ref_nodes(nodes)
+        tree.evict_device_to_host(2, [100, 101])
+
+        # Find prefix (should find HOST nodes)
+        matched = tree.find_prefix(["h1", "h2"])
+        assert len(matched) == 2
+
+        # Increment refs to prevent eviction during swap
+        tree.increment_ref_nodes(matched)
+
+        # Swap to device
+        original_ids = tree.swap_to_device(matched, [50, 51])
+        assert sorted(original_ids) == [100, 101]
+
+        # Find should still work
+        matched2 = tree.find_prefix(["h1", "h2"])
+        assert len(matched2) == 2
+        block_ids = [n.block_id for n in matched2]
+        assert sorted(block_ids) == [50, 51]
+
+        tree.decrement_ref_nodes(matched2)
+
+
+class TestRadixTreeConcurrencySafety:
+    """Tests for thread safety and concurrent access patterns."""
+
+    def test_concurrent_insert_and_find(self):
+        """Test concurrent insert and find_prefix operations."""
+        import threading
+
+        tree = RadixTree(enable_host_cache=True)
+
+        def insert_sequence(prefix, start_id, count):
+            for i in range(count):
+                blocks = [(f"{prefix}_{j}", start_id + j) for j in range(5)]
+                tree.insert(blocks)
+
+        def find_sequence(prefix, results):
+            for _ in range(10):
+                matched = tree.find_prefix([f"{prefix}_0", f"{prefix}_1"])
+                results.append(len(matched))
+
+        threads = []
+        results = []
+
+        # Create 5 threads doing inserts
+        for i in range(5):
+            t = threading.Thread(target=insert_sequence, args=(f"P{i}", i * 10, 10))
+            threads.append(t)
+
+        # Create 5 threads doing finds
+        for i in range(5):
+            t = threading.Thread(target=find_sequence, args=(f"P{i}", results))
+            threads.append(t)
+
+        for t in threads:
+            t.start()
+
+        for t in threads:
+            t.join()
+
+        # All find operations should complete without error
+        assert len(results) == 50
+        # Find results may vary depending on timing, but should be valid
+        for r in results:
+            assert 0 <= r <= 2
+
+    def test_concurrent_eviction_and_access(self):
+        """Test concurrent eviction and find_prefix operations."""
+        import threading
+
+        tree = RadixTree(enable_host_cache=True)
+
+        # Setup: Insert and make evictable
+        nodes, _ = tree.insert([(f"h{i}", i) for i in range(20)])
+        tree.decrement_ref_nodes(nodes)
+
+        results = []
+        errors = []
+
+        def evict_blocks():
+            try:
+                for _ in range(5):
+                    released = tree.evict_device_to_host(2, [1000, 1001])
+                    if released:
+                        results.append(("evict", len(released)))
+            except Exception as e:
+                errors.append(e)
+
+        def access_blocks():
+            try:
+                for _ in range(10):
+                    matched = tree.find_prefix(["h0", "h1"])
+                    results.append(("access", len(matched)))
+            except Exception as e:
+                errors.append(e)
+
+        threads = [
+            threading.Thread(target=evict_blocks),
+            threading.Thread(target=access_blocks),
+            threading.Thread(target=access_blocks),
+        ]
+
+        for t in threads:
+            t.start()
+        for t in threads:
+            t.join()
+
+        # Should have completed without error
+        assert len(errors) == 0
+        # Should have results from all operations
+        assert len(results) > 0
+        # Access results should be valid (0, 1, or 2 blocks matched)
+        for op, count in results:
+            if op == "access":
+                assert 0 <= count <= 2
+
+
+class TestRadixTreeMemoryManagement:
+    """Tests for proper memory management and reference counting."""
+
+    def test_node_reuse_different_block_ids(self):
+        """
+        Test that reusing a node with different block_id tracks wasted blocks.
+
+        When inserting a sequence that partially reuses existing nodes
+        but with different block_ids, the conflicting block_ids should
+        be tracked as wasted.
+
+        In this case:
+        - h1 already exists with block_id=1, new block_id=100 -> wasted
+        - h2 already exists with block_id=2, new block_id=200 -> wasted
+        """
+        tree = RadixTree()
+
+        # Insert first sequence
+        nodes1, wasted1 = tree.insert([("h1", 1), ("h2", 2)])
+        assert len(wasted1) == 0
+
+        # Insert same hashes but different block_ids - both are wasted
+        nodes2, wasted2 = tree.insert([("h1", 100), ("h2", 200)])
+        # Both h1 and h2 already exist, so both new block_ids are wasted
+        assert len(wasted2) == 2
+        assert sorted(wasted2) == [100, 200]
+
+        # Verify nodes still have original block_ids
+        h1_node = tree._root.children["h1"]
+        h2_node = h1_node.children["h2"]
+        assert h1_node.block_id == 1
+        assert h2_node.block_id == 2
+
+    def test_multiple_insert_same_node_tracking(self):
+        """
+        Test that multiple inserts of the same path correctly track refs.
+
+        Insert the same sequence 5 times, then decrement 5 times.
+        Node should become evictable only after all decrements.
+        """
+        tree = RadixTree()
+
+        # Insert same sequence 5 times
+        all_nodes = []
+        for i in range(5):
+            nodes, _ = tree.insert([("h1", 1), ("h2", 2)])
+            all_nodes.append(nodes)
+
+        h1_node = tree._root.children["h1"]
+        assert h1_node.ref_count == 5
+
+        # Decrement refs one by one
+        for i in range(5):
+            tree.decrement_ref_nodes(all_nodes[i])
+            expected_ref = 5 - i - 1
+            assert h1_node.ref_count == expected_ref
+
+        # Now h1 should be evictable
+        assert h1_node.ref_count == 0
+        stats = tree.get_stats()
+        assert stats.evictable_device_count == 2  # h1 and h2
+
+    def test_reset_clears_all_tracking(self):
+        """Test that reset properly clears all tracking structures."""
+        tree = RadixTree(enable_host_cache=True)
+
+        nodes, _ = tree.insert([("h1", 1), ("h2", 2), ("h3", 3)])
+        tree.decrement_ref_nodes(nodes)
+        tree.evict_device_to_host(3, [100, 101, 102])
+
+        assert tree.node_count() == 4
+        stats = tree.get_stats()
+        assert stats.evictable_host_count == 3
+
+        # Reset
+        tree.reset()
+
+        assert tree.node_count() == 1
+        assert len(tree._evictable_device) == 0
+        assert len(tree._evictable_host) == 0
+
+
+class TestRadixTreeComplexScenarios:
+    """Tests for complex real-world scenarios."""
+
+    def test_batched_requests_with_partial_match(self):
+        """
+        Test handling multiple batched requests with partial prefix matches.
+
+        Simulates a batch of 3 requests:
+        - Req1: [sys, user1] -> insert both
+        - Req2: [sys, user2] -> prefix match [sys], insert [user2]
+        - Req3: [sys, user1] -> full prefix match
+        """
+        tree = RadixTree(enable_host_cache=True)
+
+        # Request 1: Full insert
+        req1_nodes, _ = tree.insert([("sys", 0), ("user1", 1)])
+        tree.decrement_ref_nodes(req1_nodes)
+
+        # Request 2: Partial match (sys), new suffix (user2)
+        matched = tree.find_prefix(["sys"])
+        assert len(matched) == 1
+        tree.increment_ref_nodes(matched)
+
+        req2_nodes, wasted = tree.insert([("user2", 2)])
+        assert len(wasted) == 0
+
+        tree.decrement_ref_nodes(matched)
+        tree.decrement_ref_nodes(req2_nodes)
+
+        # Request 3: Full match
+        matched3 = tree.find_prefix(["sys", "user1"])
+        assert len(matched3) == 2
+
+        # Stats check
+        stats = tree.get_stats()
+        assert stats.node_count == 4  # sys, user1, user2 + root
+
+    def test_deep_chain_insertion(self):
+        """
+        Test insertion and access of deep node chains.
+
+        Insert a chain of 20 blocks, verify find_prefix works at various depths.
+        """
+        tree = RadixTree()
+
+        # Insert deep chain
+        depth = 20
+        blocks = [(f"h{i}", i) for i in range(depth)]
+        nodes, _ = tree.insert(blocks)
+
+        assert len(nodes) == depth
+        assert tree.node_count() == depth + 1
+
+        # Find at various depths
+        for d in [5, 10, 15, 20]:
+            matched = tree.find_prefix([f"h{i}" for i in range(d)])
+            assert len(matched) == d
+
+        # Decrement and verify all become evictable
+        tree.decrement_ref_nodes(nodes)
+        stats = tree.get_stats()
+        assert stats.evictable_device_count == depth
+
+    def test_wide_tree_with_shared_prefix(self):
+        """
+        Test tree with many branches sharing a common prefix.
+
+        Structure:
+            root
+            └── shared (ref=100) - incremented each insert
+                ├── branch_0 (ref=0 after release)
+                ├── branch_1 (ref=0 after release)
+                ... (50 branches released, 50 still held)
+        """
+        tree = RadixTree(enable_host_cache=True)
+        num_branches = 100
+
+        # Insert 100 sequences, all sharing "shared" prefix
+        all_branch_nodes = []
+        for i in range(num_branches):
+            nodes, _ = tree.insert([("shared", 0), (f"branch_{i}", i)])
+            all_branch_nodes.append(nodes)
+
+        # shared has ref_count=100 (incremented on each insert traversal)
+        shared_node = tree._root.children["shared"]
+        assert shared_node.ref_count == 100
+
+        # Release half the branches
+        for i in range(num_branches // 2):
+            tree.decrement_ref_nodes(all_branch_nodes[i])
+
+        stats = tree.get_stats()
+        # 50 branch nodes become evictable, shared stays at ref=50
+        assert stats.evictable_device_count == num_branches // 2  # 50
+
+        # shared node should still have ref=50 (not evictable)
+        assert shared_node.ref_count == num_branches // 2
+
+        # Verify one remaining branch is still findable
+        matched = tree.find_prefix(["shared", f"branch_{num_branches // 2}"])
+        assert len(matched) == 2
+
+
+class TestEvictDeviceNodes:
+    """Tests for evict_device_nodes (no host cache mode)."""
+
+    def test_evict_device_nodes_basic(self):
+        """Test evicting DEVICE nodes directly (no host cache)."""
+        tree = RadixTree(enable_host_cache=False)
+        nodes, _ = tree.insert([("h1", 1), ("h2", 2), ("h3", 3)])
+        tree.decrement_ref_nodes(nodes)
+
+        result = tree.evict_device_nodes(2)
+        assert result is not None
+        assert len(result) == 2
+        # Returned block_ids must be from original insert
+        assert all(bid in [1, 2, 3] for bid in result)
+
+    def test_evict_device_nodes_not_enough(self):
+        """Test eviction fails when not enough evictable DEVICE nodes."""
+        tree = RadixTree(enable_host_cache=False)
+        nodes, _ = tree.insert([("h1", 1)])
+        tree.decrement_ref_nodes(nodes)
+
+        result = tree.evict_device_nodes(5)
+        assert result is None
+
+    def test_evict_device_nodes_zero(self):
+        """Test evicting zero DEVICE nodes returns empty list."""
+        tree = RadixTree()
+        result = tree.evict_device_nodes(0)
+        assert result == []
+
+    def test_evict_device_nodes_removes_from_tree(self):
+        """Test that evicted DEVICE nodes are removed from tree."""
+        tree = RadixTree(enable_host_cache=False)
+        nodes, _ = tree.insert([("h1", 1)])
+        tree.decrement_ref_nodes(nodes)
+
+        assert tree.node_count() == 2  # root + h1
+
+        tree.evict_device_nodes(1)
+
+        assert tree.node_count() == 1  # only root
+        assert "h1" not in tree._root.children
+
+
+class TestBackupBlocks:
+    """Tests for backup_blocks method."""
+
+    def test_backup_blocks_basic(self):
+        """Test marking blocks as backed up."""
+        tree = RadixTree(write_policy="write_through_selective")
+        nodes, _ = tree.insert([("h1", 1), ("h2", 2)])
+        tree.decrement_ref_nodes(nodes)
+
+        backed_ids = tree.backup_blocks(nodes, [100, 101])
+
+        assert sorted(backed_ids) == [1, 2]
+        for node in nodes:
+            assert node.backuped is True
+            assert node.host_block_id in [100, 101]
+
+    def test_backup_blocks_mismatched_length(self):
+        """Test backup_blocks returns empty for mismatched lengths."""
+        tree = RadixTree()
+        nodes, _ = tree.insert([("h1", 1), ("h2", 2)])
+        tree.decrement_ref_nodes(nodes)
+
+        result = tree.backup_blocks(nodes, [100])  # Only 1 host_block_id for 2 nodes
+        assert result == []
+
+    def test_backup_blocks_empty(self):
+        """Test backup_blocks with empty lists."""
+        tree = RadixTree()
+        result = tree.backup_blocks([], [])
+        assert result == []
+
+
+class TestGetCandidatesForBackup:
+    """Tests for get_candidates_for_backup method."""
+
+    def test_get_candidates_basic(self):
+        """Test get_candidates_for_backup returns eligible nodes."""
+        tree = RadixTree(write_policy="write_through_selective")
+        nodes, _ = tree.insert([("h1", 1), ("h2", 2)])
+        # Simulate hit_count >= threshold
+        tree.decrement_ref_nodes(nodes)
+        # Manually set hit_count so they qualify
+        for node in nodes:
+            node.hit_count = 3
+
+        candidates = tree.get_candidates_for_backup(threshold=2)
+
+        assert len(candidates) == 2
+
+    def test_get_candidates_excludes_already_backed_up(self):
+        """Test that already backed-up nodes are excluded."""
+        tree = RadixTree(write_policy="write_through_selective")
+        nodes, _ = tree.insert([("h1", 1), ("h2", 2)])
+        tree.decrement_ref_nodes(nodes)
+
+        for node in nodes:
+            node.hit_count = 5
+
+        # Mark first node as backed up
+        nodes[0].backuped = True
+
+        candidates = tree.get_candidates_for_backup(threshold=1)
+        assert len(candidates) == 1
+        assert candidates[0] is nodes[1]
+
+    def test_get_candidates_wrong_policy_returns_empty(self):
+        """Test that non-write_through_selective policy returns empty."""
+        tree = RadixTree(write_policy="write_through")
+        nodes, _ = tree.insert([("h1", 1)])
+        tree.decrement_ref_nodes(nodes)
+        nodes[0].hit_count = 10
+
+        candidates = tree.get_candidates_for_backup(threshold=1)
+        assert candidates == []
+
+    def test_get_candidates_excludes_pending_block_ids(self):
+        """Test that nodes with block_ids in pending list are excluded."""
+        tree = RadixTree(write_policy="write_through_selective")
+        nodes, _ = tree.insert([("h1", 1), ("h2", 2)])
+        tree.decrement_ref_nodes(nodes)
+
+        for node in nodes:
+            node.hit_count = 5
+
+        # Exclude block_id=1 from candidates
+        candidates = tree.get_candidates_for_backup(threshold=1, pending_block_ids=[1])
+
+        assert len(candidates) == 1
+        assert candidates[0].block_id == 2
+
+
+class TestEvictNodesSelective:
+    """Tests for evict_nodes_selective (write_through_selective policy)."""
+
+    def test_evict_nodes_selective_without_backup(self):
+        """Test eviction of nodes without backup removes from tree."""
+        tree = RadixTree(write_policy="write_through_selective")
+        nodes, _ = tree.insert([("h1", 1), ("h2", 2)])
+        tree.decrement_ref_nodes(nodes)
+
+        # Nodes have no backup
+        result = tree.evict_nodes_selective(2)
+
+        assert sorted(result) == [1, 2]
+        # Nodes should be removed from tree (no backup, so deleted)
+        assert tree.node_count() == 1
+
+    def test_evict_nodes_selective_with_backup(self):
+        """Test eviction of backed-up nodes transitions to HOST state."""
+        tree = RadixTree(write_policy="write_through_selective", enable_host_cache=True)
+        nodes, _ = tree.insert([("h1", 1), ("h2", 2)])
+        tree.decrement_ref_nodes(nodes)
+
+        # Mark nodes as backed up with host block IDs
+        tree.backup_blocks(nodes, [100, 101])
+
+        result = tree.evict_nodes_selective(2)
+
+        assert sorted(result) == [1, 2]
+        # Nodes should now be in HOST state (not removed from tree)
+        for node in nodes:
+            assert node.cache_status == CacheStatus.HOST
+            assert node.block_id in [100, 101]
+
+        # Nodes should be evictable from host
+        stats = tree.get_stats()
+        assert stats.evictable_host_count == 2
+
+    def test_evict_nodes_selective_zero_blocks(self):
+        """Test evicting zero blocks returns empty list."""
+        tree = RadixTree(write_policy="write_through_selective")
+        result = tree.evict_nodes_selective(0)
+        assert result == []
+
+    def test_evict_nodes_selective_not_enough_blocks(self):
+        """Test eviction returns empty list when not enough evictable blocks."""
+        tree = RadixTree(write_policy="write_through_selective")
+        nodes, _ = tree.insert([("h1", 1)])
+        tree.decrement_ref_nodes(nodes)
+
+        # Request more than available
+        result = tree.evict_nodes_selective(5)
+        assert result == []
diff --git a/tests/cache_manager/v1/test_swap_cache_ops.py b/tests/cache_manager/v1/test_swap_cache_ops.py
new file mode 100644
index 00000000000..bc9fc24bcaf
--- /dev/null
+++ b/tests/cache_manager/v1/test_swap_cache_ops.py
@@ -0,0 +1,774 @@
+"""
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+Unit tests for swap_cache_all_layers operator.
+
+Tests cover:
+- Data correctness verification (MD5 checksum before and after transfer)
+- Transfer speed benchmark
+- Both CPU->GPU (load) and GPU->CPU (evict) modes
+"""
+
+import ctypes
+import hashlib
+import random
+import statistics
+import unittest
+from dataclasses import dataclass
+
+import numpy as np
+import paddle
+
+# Import the ops under test
+from fastdeploy.cache_manager.ops import cuda_host_alloc, swap_cache_all_layers
+
+
+@dataclass
+class TestConfig:
+    """Test configuration for KV cache transfer."""
+
+    num_layers: int = 4
+    num_heads: int = 16
+    head_dim: int = 128
+    block_size: int = 64
+    total_block_num: int = 128
+    dtype: paddle.dtype = paddle.bfloat16
+
+    @property
+    def kv_shape(self):
+        """KV cache shape: [total_block_num, num_heads, block_size, head_dim]"""
+        return (self.total_block_num, self.num_heads, self.block_size, self.head_dim)
+
+    @property
+    def kv_cache_dim(self):
+        """Single block K or V cache dimension size."""
+        return self.head_dim * self.num_heads * self.block_size
+
+    @property
+    def element_size(self):
+        """Size of each element in bytes."""
+        dummy = paddle.zeros([], dtype=self.dtype)
+        return dummy.element_size()
+
+    @property
+    def block_bytes(self):
+        """Single block K or V size in bytes."""
+        return self.kv_cache_dim * self.element_size
+
+    @property
+    def layer_bytes(self):
+        """Single layer K+V total size in bytes."""
+        return self.block_bytes * self.total_block_num * 2
+
+
+def compute_md5(data: np.ndarray) -> str:
+    """Compute MD5 checksum of numpy array data.
+
+    Note: For bfloat16 data, we need to handle the fact that numpy
+    doesn't have native bfloat16 support. We convert to uint16 to get
+    the raw bytes for MD5 computation.
+    """
+    if data.dtype == np.float32:
+        # Already float32, use directly
+        return hashlib.md5(data.tobytes()).hexdigest()
+    elif data.dtype == np.uint16 or str(data.dtype) == "bfloat16":
+        # bfloat16 stored as uint16 in numpy, use raw bytes
+        return hashlib.md5(data.tobytes()).hexdigest()
+    else:
+        # For other dtypes, convert to float32 for consistent comparison
+        return hashlib.md5(data.astype(np.float32).tobytes()).hexdigest()
+
+
+def init_test_data(
+    config: TestConfig,
+    num_blocks_to_transfer: int,
+    use_random: bool = False,
+    shuffle_blocks: bool = False,
+    seed: int = 42,
+):
+    """
+    Initialize test data for transfer.
+
+    Args:
+        config: Test configuration for KV cache transfer.
+        num_blocks_to_transfer: Number of blocks to transfer.
+        use_random: If True, use random tensor values instead of constant per-layer values.
+        shuffle_blocks: If True, use randomly sampled non-consecutive block IDs.
+        seed: Random seed for reproducibility.
+
+    Returns:
+        Tuple of (gpu_k_tensors, gpu_v_tensors, k_ptrs, v_ptrs, src_k_data, src_v_data, md5_sums)
+    """
+    device = "cuda"
+    rng = random.Random(seed)
+
+    if shuffle_blocks:
+        # Non-consecutive GPU block IDs: randomly sample from the full GPU block pool
+        # CPU block IDs must stay in [0, num_blocks_to_transfer) as CPU pinned memory
+        # is allocated for exactly num_blocks_to_transfer contiguous slots.
+        all_ids = list(range(config.total_block_num))
+        gpu_block_ids = sorted(rng.sample(all_ids, num_blocks_to_transfer))
+        cpu_block_ids = list(range(num_blocks_to_transfer))
+    else:
+        # Consecutive: 0, 1, 2, ..., num_blocks_to_transfer-1
+        gpu_block_ids = list(range(num_blocks_to_transfer))
+        cpu_block_ids = list(range(num_blocks_to_transfer))
+
+    gpu_k_tensors = []
+    gpu_v_tensors = []
+    k_ptrs = []
+    v_ptrs = []
+    src_k_data = []
+    src_v_data = []
+    md5_sums = []
+
+    bytes_per_block = config.kv_cache_dim * config.element_size
+
+    for layer_idx in range(config.num_layers):
+        if use_random:
+            # Random values: use float32 seed-based generation then cast to target dtype
+            paddle.seed(seed + layer_idx)
+            src_k = paddle.randn(config.kv_shape, dtype=paddle.float32).cast(config.dtype)
+            src_v = paddle.randn(config.kv_shape, dtype=paddle.float32).cast(config.dtype)
+        else:
+            # Constant values per layer for easier visual verification
+            src_k = paddle.ones(config.kv_shape, dtype=config.dtype) * (layer_idx + 1)
+            src_v = paddle.ones(config.kv_shape, dtype=config.dtype) * (layer_idx + 2)
+        src_k_data.append(src_k)
+        src_v_data.append(src_v)
+
+        # Compute MD5 for verification (only for the cpu_block_ids blocks in source)
+        # cpu_block_ids indicates which source blocks get copied into CPU pinned memory
+        k_np = np.array(src_k)[cpu_block_ids]
+        v_np = np.array(src_v)[cpu_block_ids]
+        md5_sums.append((compute_md5(k_np), compute_md5(v_np)))
+
+        # GPU tensors (destination for H2D, source for D2H)
+        dst_k = paddle.zeros(config.kv_shape, dtype=config.dtype).to(device)
+        dst_v = paddle.zeros(config.kv_shape, dtype=config.dtype).to(device)
+        gpu_k_tensors.append(dst_k)
+        gpu_v_tensors.append(dst_v)
+
+        # Allocate CPU pinned memory
+        k_ptr = cuda_host_alloc(bytes_per_block * num_blocks_to_transfer)
+        v_ptr = cuda_host_alloc(bytes_per_block * num_blocks_to_transfer)
+
+        # Fill CPU memory: pack the cpu_block_ids blocks contiguously
+        k_np_full = np.array(src_k)
+        v_np_full = np.array(src_v)
+        k_np_flat = k_np_full[cpu_block_ids].flatten()
+        v_np_flat = v_np_full[cpu_block_ids].flatten()
+        ctypes.memmove(k_ptr, k_np_flat.ctypes.data, bytes_per_block * num_blocks_to_transfer)
+        ctypes.memmove(v_ptr, v_np_flat.ctypes.data, bytes_per_block * num_blocks_to_transfer)
+
+        k_ptrs.append(k_ptr)
+        v_ptrs.append(v_ptr)
+
+    total_transfer_bytes = num_blocks_to_transfer * config.block_bytes * config.num_layers * 2
+
+    return (
+        gpu_k_tensors,
+        gpu_v_tensors,
+        k_ptrs,
+        v_ptrs,
+        src_k_data,
+        src_v_data,
+        md5_sums,
+        total_transfer_bytes,
+        gpu_block_ids,
+        cpu_block_ids,
+    )
+
+
+def verify_transfer_correctness(
+    gpu_tensors,
+    src_data_list,
+    md5_sums,
+    num_blocks_to_check,
+    config: TestConfig,
+    atol=1e-2,
+    rtol=1e-2,
+    gpu_block_ids=None,
+    src_block_ids=None,
+):
+    """
+    Verify transfer correctness by comparing data and MD5 checksums.
+
+    Args:
+        gpu_block_ids: indices of blocks on GPU that were written (H2D destination).
+                       If None, defaults to 0..num_blocks_to_check-1 (consecutive).
+        src_block_ids: indices into src_data_list tensors that correspond to the
+                       source blocks (i.e. what was in CPU memory).
+                       If None, defaults to 0..num_blocks_to_check-1 (consecutive).
+
+    Returns:
+        Tuple of (md5_passed, data_passed)
+    """
+    if gpu_block_ids is None:
+        gpu_block_ids = list(range(num_blocks_to_check))
+    if src_block_ids is None:
+        src_block_ids = list(range(num_blocks_to_check))
+
+    md5_passed = True
+    data_passed = True
+
+    for layer_idx in range(config.num_layers):
+        gpu_data = gpu_tensors[layer_idx].cpu().numpy()
+        # Only check the transferred blocks (by gpu_block_ids)
+        gpu_data = gpu_data[gpu_block_ids]
+        src_np = np.array(src_data_list[layer_idx])[src_block_ids]
+
+        # Check MD5 checksum
+        actual_md5 = compute_md5(gpu_data)
+        expected_md5 = md5_sums[layer_idx]
+        if actual_md5 != expected_md5:
+            md5_passed = False
+
+        # Check numerical correctness
+        if not np.allclose(gpu_data, src_np, rtol=rtol, atol=atol):
+            data_passed = False
+
+    return md5_passed, data_passed
+
+
+def benchmark_transfer(
+    op_func,
+    gpu_k_tensors,
+    gpu_v_tensors,
+    k_ptrs,
+    v_ptrs,
+    num_blocks,
+    gpu_block_ids,
+    cpu_block_ids,
+    device_id,
+    mode,
+    num_warmup=2,
+    num_iterations=5,
+):
+    """
+    Benchmark transfer operation.
+
+    Returns:
+        Tuple of (avg_time_ms, all_times_ms)
+    """
+    # Warmup
+    for _ in range(num_warmup):
+        op_func(
+            gpu_k_tensors,
+            k_ptrs,
+            num_blocks,
+            gpu_block_ids,
+            cpu_block_ids,
+            device_id,
+            mode,
+        )
+        op_func(
+            gpu_v_tensors,
+            v_ptrs,
+            num_blocks,
+            gpu_block_ids,
+            cpu_block_ids,
+            device_id,
+            mode,
+        )
+    paddle.device.cuda.synchronize()
+
+    # Benchmark
+    times = []
+    for _ in range(num_iterations):
+        start = paddle.device.cuda.Event(enable_timing=True)
+        end = paddle.device.cuda.Event(enable_timing=True)
+
+        start.record()
+        op_func(
+            gpu_k_tensors,
+            k_ptrs,
+            num_blocks,
+            gpu_block_ids,
+            cpu_block_ids,
+            device_id,
+            mode,
+        )
+        op_func(
+            gpu_v_tensors,
+            v_ptrs,
+            num_blocks,
+            gpu_block_ids,
+            cpu_block_ids,
+            device_id,
+            mode,
+        )
+        end.record()
+        paddle.device.cuda.synchronize()
+
+        times.append(start.elapsed_time(end))
+
+    avg_time = statistics.mean(times)
+    return avg_time, times
+
+
+class TestSwapCacheAllLayersCorrectness(unittest.TestCase):
+    """Test correctness of swap_cache_all_layers operator."""
+
+    @classmethod
+    def setUpClass(cls):
+        raise unittest.SkipTest("Swap cache ops test temporarily skipped")
+        """Set up test environment."""
+        if not paddle.is_compiled_with_cuda():
+            raise unittest.SkipTest("CUDA not available, skipping GPU tests")
+
+    def setUp(self):
+        """Set up each test."""
+        self.config = TestConfig(
+            num_layers=64,
+            num_heads=16,
+            head_dim=128,
+            block_size=64,
+            total_block_num=256,
+        )
+        self.device_id = 0
+        self.num_blocks = 256  # Number of blocks to transfer in each test
+
+    def test_h2d_transfer_correctness(self):
+        """Test Host->Device (load) transfer correctness with MD5 verification."""
+        (
+            gpu_k_tensors,
+            gpu_v_tensors,
+            k_ptrs,
+            v_ptrs,
+            src_k_data,
+            src_v_data,
+            md5_sums,
+            _,
+            gpu_block_ids,
+            cpu_block_ids,
+        ) = init_test_data(self.config, self.num_blocks)
+
+        # Perform H2D transfer
+        swap_cache_all_layers(
+            gpu_k_tensors,
+            k_ptrs,
+            self.config.total_block_num,
+            gpu_block_ids,
+            cpu_block_ids,
+            self.device_id,
+            mode=1,  # Host->Device
+        )
+        swap_cache_all_layers(
+            gpu_v_tensors,
+            v_ptrs,
+            self.config.total_block_num,
+            gpu_block_ids,
+            cpu_block_ids,
+            self.device_id,
+            mode=1,
+        )
+        paddle.device.cuda.synchronize()
+
+        # Verify correctness
+        k_md5_ok, k_data_ok = verify_transfer_correctness(
+            gpu_k_tensors, src_k_data, [m[0] for m in md5_sums], self.num_blocks, self.config
+        )
+        v_md5_ok, v_data_ok = verify_transfer_correctness(
+            gpu_v_tensors, src_v_data, [m[1] for m in md5_sums], self.num_blocks, self.config
+        )
+
+        self.assertTrue(k_md5_ok, "K cache MD5 mismatch after H2D transfer")
+        self.assertTrue(v_md5_ok, "V cache MD5 mismatch after H2D transfer")
+        self.assertTrue(k_data_ok, "K cache data mismatch after H2D transfer")
+        self.assertTrue(v_data_ok, "V cache data mismatch after H2D transfer")
+
+    def test_d2h_transfer_correctness(self):
+        """Test Device->Host (evict) transfer correctness."""
+        (
+            gpu_k_tensors,
+            gpu_v_tensors,
+            k_ptrs,
+            v_ptrs,
+            src_k_data,
+            src_v_data,
+            md5_sums,
+            _,
+            gpu_block_ids,
+            cpu_block_ids,
+        ) = init_test_data(self.config, self.num_blocks)
+
+        # First H2D to fill GPU
+        swap_cache_all_layers(
+            gpu_k_tensors,
+            k_ptrs,
+            self.config.total_block_num,
+            gpu_block_ids,
+            cpu_block_ids,
+            self.device_id,
+            mode=1,
+        )
+        swap_cache_all_layers(
+            gpu_v_tensors,
+            v_ptrs,
+            self.config.total_block_num,
+            gpu_block_ids,
+            cpu_block_ids,
+            self.device_id,
+            mode=1,
+        )
+        paddle.device.cuda.synchronize()
+
+        # Clear CPU memory (use uint16 to match bfloat16 storage)
+        bytes_per_block = self.config.kv_cache_dim * self.config.element_size
+        zero_data = np.zeros(self.num_blocks * self.config.kv_cache_dim, dtype=np.uint16)
+        for k_ptr, v_ptr in zip(k_ptrs, v_ptrs):
+            ctypes.memmove(k_ptr, zero_data.ctypes.data, bytes_per_block * self.num_blocks)
+            ctypes.memmove(v_ptr, zero_data.ctypes.data, bytes_per_block * self.num_blocks)
+
+        # Perform D2H transfer
+        swap_cache_all_layers(
+            gpu_k_tensors,
+            k_ptrs,
+            self.config.total_block_num,
+            gpu_block_ids,
+            cpu_block_ids,
+            self.device_id,
+            mode=0,  # Device->Host
+        )
+        swap_cache_all_layers(
+            gpu_v_tensors,
+            v_ptrs,
+            self.config.total_block_num,
+            gpu_block_ids,
+            cpu_block_ids,
+            self.device_id,
+            mode=0,
+        )
+        paddle.device.cuda.synchronize()
+
+        # Verify data in CPU memory
+        bytes_per_layer = bytes_per_block * self.num_blocks
+        k_md5_ok = True
+        v_md5_ok = True
+
+        for layer_idx in range(self.config.num_layers):
+            # Read back from CPU memory (use uint16 to match bfloat16 storage)
+            k_np = np.zeros(self.num_blocks * self.config.kv_cache_dim, dtype=np.uint16)
+            v_np = np.zeros(self.num_blocks * self.config.kv_cache_dim, dtype=np.uint16)
+            ctypes.memmove(k_np.ctypes.data, k_ptrs[layer_idx], bytes_per_layer)
+            ctypes.memmove(v_np.ctypes.data, v_ptrs[layer_idx], bytes_per_layer)
+
+            # Reshape to compare
+            k_np = k_np.reshape(self.num_blocks, self.config.num_heads, self.config.block_size, self.config.head_dim)
+            v_np = v_np.reshape(self.num_blocks, self.config.num_heads, self.config.block_size, self.config.head_dim)
+
+            # Check MD5
+            if compute_md5(k_np) != md5_sums[layer_idx][0]:
+                k_md5_ok = False
+            if compute_md5(v_np) != md5_sums[layer_idx][1]:
+                v_md5_ok = False
+
+        self.assertTrue(k_md5_ok, "K cache MD5 mismatch after D2H transfer")
+        self.assertTrue(v_md5_ok, "V cache MD5 mismatch after D2H transfer")
+
+
+class TestSwapCacheAllLayersPerformance(unittest.TestCase):
+    """Test performance of swap_cache_all_layers operator."""
+
+    @classmethod
+    def setUpClass(cls):
+        raise unittest.SkipTest("Swap cache ops test temporarily skipped")
+
+    def setUp(self):
+        """Set up each test."""
+        self.config = TestConfig(
+            num_layers=64,
+            num_heads=16,
+            head_dim=128,
+            block_size=64,
+            total_block_num=256,
+        )
+        self.device_id = 0
+        self.num_blocks = 256
+
+    def test_h2d_bandwidth(self):
+        """Test H2D transfer bandwidth."""
+        (
+            gpu_k_tensors,
+            gpu_v_tensors,
+            k_ptrs,
+            v_ptrs,
+            _,
+            _,
+            _,
+            total_bytes,
+            gpu_block_ids,
+            cpu_block_ids,
+        ) = init_test_data(self.config, self.num_blocks)
+
+        avg_time, _ = benchmark_transfer(
+            swap_cache_all_layers,
+            gpu_k_tensors,
+            gpu_v_tensors,
+            k_ptrs,
+            v_ptrs,
+            self.config.total_block_num,
+            gpu_block_ids,
+            cpu_block_ids,
+            self.device_id,
+            mode=1,
+            num_warmup=2,
+            num_iterations=5,
+        )
+
+        bandwidth_gbps = (total_bytes / (1024**3)) / (avg_time / 1000)
+
+        print("\n swap_cache_all_layers H2D Performance:")
+        print(f"  Data size: {total_bytes / (1024**3):.2f} GB")
+        print(f"  Avg time: {avg_time:.2f} ms")
+        print(f"  Bandwidth: {bandwidth_gbps:.2f} GB/s")
+
+        # Sanity check: bandwidth should be > 1 GB/s
+        self.assertGreater(bandwidth_gbps, 1.0)
+
+    def test_d2h_bandwidth(self):
+        """Test D2H transfer bandwidth."""
+        (
+            gpu_k_tensors,
+            gpu_v_tensors,
+            k_ptrs,
+            v_ptrs,
+            _,
+            _,
+            _,
+            total_bytes,
+            gpu_block_ids,
+            cpu_block_ids,
+        ) = init_test_data(self.config, self.num_blocks)
+
+        # First H2D to fill GPU
+        swap_cache_all_layers(
+            gpu_k_tensors,
+            k_ptrs,
+            self.config.total_block_num,
+            gpu_block_ids,
+            cpu_block_ids,
+            self.device_id,
+            mode=1,
+        )
+        swap_cache_all_layers(
+            gpu_v_tensors,
+            v_ptrs,
+            self.config.total_block_num,
+            gpu_block_ids,
+            cpu_block_ids,
+            self.device_id,
+            mode=1,
+        )
+        paddle.device.cuda.synchronize()
+
+        avg_time, _ = benchmark_transfer(
+            swap_cache_all_layers,
+            gpu_k_tensors,
+            gpu_v_tensors,
+            k_ptrs,
+            v_ptrs,
+            self.config.total_block_num,
+            gpu_block_ids,
+            cpu_block_ids,
+            self.device_id,
+            mode=0,
+            num_warmup=2,
+            num_iterations=5,
+        )
+
+        bandwidth_gbps = (total_bytes / (1024**3)) / (avg_time / 1000)
+
+        print("\n swap_cache_all_layers D2H Performance:")
+        print(f"  Data size: {total_bytes / (1024**3):.2f} GB")
+        print(f"  Avg time: {avg_time:.2f} ms")
+        print(f"  Bandwidth: {bandwidth_gbps:.2f} GB/s")
+
+        self.assertGreater(bandwidth_gbps, 1.0)
+
+
+@unittest.skip("Swap cache ops test temporarily skipped")
+class TestSwapCacheRandomBlockIndices(unittest.TestCase):
+    """
+    Test swap operations with random, varying block indices per round.
+
+    Simulates real-world cache eviction/loading patterns:
+    - Each round picks a different random subset of blocks
+    - Block count varies per round (e.g. 4~64 out of 128 total)
+    - Verifies both swapped blocks (MD5 + allclose) and non-swapped blocks
+    - Tests swap_cache_all_layers
+    """
+
+    @classmethod
+    def setUpClass(cls):
+        if not paddle.is_compiled_with_cuda():
+            raise unittest.SkipTest("CUDA not available, skipping GPU tests")
+
+    def setUp(self):
+        self.config = TestConfig(
+            num_layers=64,
+            num_heads=16,
+            head_dim=128,
+            block_size=64,
+            total_block_num=256,
+        )
+        self.device_id = 0
+        self.num_rounds = 10
+        self.min_blocks = 32
+        self.max_blocks = 128
+        self.seed = 2025
+
+    def _init_all_gpu_blocks(self):
+        """Initialize ALL GPU blocks with unique random data. Returns ground truth numpy arrays."""
+        config = self.config
+        gpu_k, gpu_v, gt_k, gt_v = [], [], [], []
+        for li in range(config.num_layers):
+            paddle.seed(self.seed + li * 1000)
+            k = paddle.randn(config.kv_shape, dtype=paddle.float32).cast(config.dtype)
+            v = paddle.randn(config.kv_shape, dtype=paddle.float32).cast(config.dtype)
+            gt_k.append(np.array(k).copy())
+            gt_v.append(np.array(v).copy())
+            gpu_k.append(k.to("cuda"))
+            gpu_v.append(v.to("cuda"))
+        paddle.device.cuda.synchronize()
+        return gpu_k, gpu_v, gt_k, gt_v
+
+    def _snapshot_non_swap_blocks(self, gpu_k, gpu_v, swap_ids, rng):
+        """Snapshot a few non-swapped blocks for later corruption check."""
+        non_swap = [i for i in range(self.config.total_block_num) if i not in set(swap_ids)]
+        check_ids = sorted(rng.sample(non_swap, min(5, len(non_swap))))
+        snapshots = {}
+        for name, tensors in [("k", gpu_k), ("v", gpu_v)]:
+            for li in range(self.config.num_layers):
+                data = tensors[li].cpu().numpy()
+                for bid in check_ids:
+                    snapshots[(name, li, bid)] = data[bid].copy()
+        return snapshots
+
+    def _zero_gpu_blocks(self, gpu_k, gpu_v, block_ids):
+        """Zero out specific blocks on GPU via numpy round-trip."""
+        for t in gpu_k + gpu_v:
+            arr = t.cpu().numpy().copy()
+            for bid in block_ids:
+                arr[bid] = 0
+            t.copy_(paddle.to_tensor(arr, place=t.place))
+        paddle.device.cuda.synchronize()
+
+    def _verify_cpu_against_gt(self, k_ptrs, v_ptrs, gt_k, gt_v, swap_ids, num_blocks, label):
+        """Read CPU pinned memory and compare MD5 with ground truth."""
+        config = self.config
+        bytes_per_block = config.kv_cache_dim * config.element_size
+        total_bytes = bytes_per_block * num_blocks
+        for li in range(config.num_layers):
+            for ptrs, gt_list, kv_name in [(k_ptrs, gt_k, "K"), (v_ptrs, gt_v, "V")]:
+                buf = np.zeros(num_blocks * config.kv_cache_dim, dtype=np.uint16)
+                ctypes.memmove(buf.ctypes.data, ptrs[li], total_bytes)
+                buf = buf.reshape(num_blocks, config.num_heads, config.block_size, config.head_dim)
+                expected = gt_list[li][swap_ids]
+                self.assertEqual(
+                    compute_md5(buf),
+                    compute_md5(expected),
+                    f"{label} Layer {li} {kv_name}: MD5 mismatch in CPU memory after D2H",
+                )
+
+    def _verify_gpu_against_gt(self, gpu_k, gpu_v, gt_k, gt_v, swap_ids, label):
+        """Read GPU tensors and compare with ground truth at swap_ids."""
+        for li in range(self.config.num_layers):
+            for tensors, gt_list, kv_name in [(gpu_k, gt_k, "K"), (gpu_v, gt_v, "V")]:
+                actual = tensors[li].cpu().numpy()[swap_ids]
+                expected = gt_list[li][swap_ids]
+                self.assertEqual(
+                    compute_md5(actual),
+                    compute_md5(expected),
+                    f"{label} Layer {li} {kv_name}: MD5 mismatch on GPU after H2D",
+                )
+                self.assertTrue(
+                    np.allclose(actual, expected, rtol=1e-2, atol=1e-2),
+                    f"{label} Layer {li} {kv_name}: data mismatch on GPU after H2D",
+                )
+
+    def _verify_non_swap_unchanged(self, gpu_k, gpu_v, snapshots, label):
+        """Verify that non-swapped blocks were not corrupted by swap operations."""
+        for (name, li, bid), expected_data in snapshots.items():
+            tensors = gpu_k if name == "k" else gpu_v
+            actual = tensors[li].cpu().numpy()[bid]
+            self.assertTrue(
+                np.array_equal(actual, expected_data),
+                f"{label} {name.upper()} layer {li} block {bid}: non-swapped block corrupted!",
+            )
+
+    def _run_multi_round(self, op_func, op_name):
+        """
+        Core multi-round test logic:
+        Each round picks a different random subset of blocks, does D2H then H2D,
+        and verifies: CPU correctness after D2H, GPU correctness after H2D,
+        and non-swapped blocks are not corrupted.
+        """
+        rng = random.Random(self.seed)
+        config = self.config
+        bytes_per_block = config.kv_cache_dim * config.element_size
+
+        gpu_k, gpu_v, gt_k, gt_v = self._init_all_gpu_blocks()
+
+        for round_idx in range(self.num_rounds):
+            num_swap = rng.randint(self.min_blocks, self.max_blocks)
+            swap_ids = sorted(rng.sample(range(config.total_block_num), num_swap))
+            cpu_ids = list(range(num_swap))
+            label = f"[{op_name} Round {round_idx + 1}/{self.num_rounds}, {num_swap} blocks]"
+
+            print(f"\n{label}")
+            print(f"  swap_ids (first 8): {swap_ids[:8]}...")
+
+            # Snapshot non-swapped blocks before swap
+            snapshots = self._snapshot_non_swap_blocks(gpu_k, gpu_v, swap_ids, rng)
+
+            # Allocate CPU pinned memory for this round
+            k_ptrs, v_ptrs = [], []
+            for li in range(config.num_layers):
+                k_ptrs.append(cuda_host_alloc(bytes_per_block * num_swap))
+                v_ptrs.append(cuda_host_alloc(bytes_per_block * num_swap))
+
+            # === D2H: evict GPU -> CPU ===
+            op_func(gpu_k, k_ptrs, num_swap, swap_ids, cpu_ids, self.device_id, mode=0)
+            op_func(gpu_v, v_ptrs, num_swap, swap_ids, cpu_ids, self.device_id, mode=0)
+            paddle.device.cuda.synchronize()
+            self._verify_cpu_against_gt(k_ptrs, v_ptrs, gt_k, gt_v, swap_ids, num_swap, f"{label} D2H")
+            print("  D2H CPU verify: PASS")
+
+            # Zero swapped blocks on GPU to ensure H2D must write correct data
+            self._zero_gpu_blocks(gpu_k, gpu_v, swap_ids)
+
+            # === H2D: load CPU -> GPU ===
+            op_func(gpu_k, k_ptrs, num_swap, swap_ids, cpu_ids, self.device_id, mode=1)
+            op_func(gpu_v, v_ptrs, num_swap, swap_ids, cpu_ids, self.device_id, mode=1)
+            paddle.device.cuda.synchronize()
+            self._verify_gpu_against_gt(gpu_k, gpu_v, gt_k, gt_v, swap_ids, f"{label} H2D")
+            print("  H2D GPU verify: PASS")
+
+            # Verify non-swapped blocks were not corrupted
+            self._verify_non_swap_unchanged(gpu_k, gpu_v, snapshots, label)
+            print("  Non-swap corruption check: PASS")
+
+        print(f"\nAll {self.num_rounds} rounds passed ({op_name}).")
+
+    def test_random_indices_multi_round_non_batch(self):
+        """Multi-round swap with varying random block indices using non-batch operator."""
+        self._run_multi_round(swap_cache_all_layers, "non-batch")
+
+
+if __name__ == "__main__":
+    paddle.device.set_device("cuda:0")
+    unittest.main()
diff --git a/tests/cache_manager/v1/test_transfer_manager.py b/tests/cache_manager/v1/test_transfer_manager.py
new file mode 100644
index 00000000000..5cbafb98bf9
--- /dev/null
+++ b/tests/cache_manager/v1/test_transfer_manager.py
@@ -0,0 +1,651 @@
+"""
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+Unit tests for CacheTransferManager class.
+
+Tests cover:
+- Device cache map sharing (set_device_cache_kvs_map)
+- Host cache map sharing (set_host_cache_kvs_map)
+- Layer indices building (_build_device_layer_indices, _build_host_layer_indices)
+- Metadata properties (num_layers, local_rank, device_id, etc.)
+- Layer indexed access methods
+- Host<->Device swap methods (evict/load)
+- Parameter validation
+"""
+
+import unittest
+from unittest.mock import Mock, patch
+
+import paddle
+from utils import get_default_test_fd_config
+
+
+def create_transfer_manager(
+    enable_prefix_caching: bool = True,
+    num_host_blocks: int = 50,
+):
+    """Helper to create CacheTransferManager with test config."""
+    from fastdeploy.cache_manager.v1.transfer_manager import CacheTransferManager
+
+    config = get_default_test_fd_config()
+    config.cache_config.enable_prefix_caching = enable_prefix_caching
+    config.cache_config.num_cpu_blocks = num_host_blocks
+    config.cache_config.cache_dtype = "bfloat16"
+
+    return CacheTransferManager(config)
+
+
+def create_mock_device_cache_kvs_map(
+    num_layers: int = 4,
+    local_rank: int = 0,
+    device_id: int = 0,
+    include_scales: bool = False,
+    dtype: str = "bfloat16",
+    num_blocks: int = 100,
+    num_heads: int = 32,
+    block_size: int = 64,
+    head_dim: int = 128,
+):
+    """
+    Helper to create mock device cache_kvs_map.
+
+    Device cache stores paddle.Tensor objects on GPU.
+    """
+    cache_kvs_map = {}
+
+    for layer_idx in range(num_layers):
+        key_name = f"key_caches_{layer_idx}_rank{local_rank}.device{device_id}"
+        val_name = f"value_caches_{layer_idx}_rank{local_rank}.device{device_id}"
+
+        # Create real tensors on GPU
+        key_tensor = paddle.zeros([num_blocks, num_heads, block_size, head_dim], dtype=dtype)
+        val_tensor = paddle.zeros([num_blocks, num_heads, block_size, head_dim], dtype=dtype)
+
+        cache_kvs_map[key_name] = key_tensor
+        cache_kvs_map[val_name] = val_tensor
+
+        if include_scales:
+            key_scale_name = f"key_cache_scales_{layer_idx}_rank{local_rank}.device{device_id}"
+            val_scale_name = f"value_cache_scales_{layer_idx}_rank{local_rank}.device{device_id}"
+
+            key_scale_tensor = paddle.ones([num_blocks, num_heads, block_size], dtype="float32")
+            val_scale_tensor = paddle.ones([num_blocks, num_heads, block_size], dtype="float32")
+
+            cache_kvs_map[key_scale_name] = key_scale_tensor
+            cache_kvs_map[val_scale_name] = val_scale_tensor
+
+    return cache_kvs_map
+
+
+def create_mock_host_cache_kvs_map(
+    num_layers: int = 4,
+    local_rank: int = 0,
+    device_id: int = 0,
+    include_scales: bool = False,
+    base_ptr: int = 1000000,
+):
+    """
+    Helper to create mock host cache_kvs_map (with int pointers).
+
+    Host cache stores pinned memory pointers (int) on CPU.
+    """
+    cache_kvs_map = {}
+
+    for layer_idx in range(num_layers):
+        key_name = f"key_caches_{layer_idx}_rank{local_rank}.device{device_id}"
+        val_name = f"value_caches_{layer_idx}_rank{local_rank}.device{device_id}"
+
+        # Use int pointers (simulating cuda_host_alloc result)
+        cache_kvs_map[key_name] = base_ptr + layer_idx * 10000
+        cache_kvs_map[val_name] = base_ptr + layer_idx * 10000 + 5000
+
+        if include_scales:
+            key_scale_name = f"key_cache_scales_{layer_idx}_rank{local_rank}.device{device_id}"
+            val_scale_name = f"value_cache_scales_{layer_idx}_rank{local_rank}.device{device_id}"
+
+            cache_kvs_map[key_scale_name] = base_ptr + layer_idx * 10000 + 20000
+            cache_kvs_map[val_scale_name] = base_ptr + layer_idx * 10000 + 25000
+
+    return cache_kvs_map
+
+
+# ============================================================================
+# Initialization Tests
+# ============================================================================
+
+
+class TestCacheTransferManagerInit(unittest.TestCase):
+    """Test CacheTransferManager initialization."""
+
+    def test_init_basic(self):
+        """Test basic initialization."""
+        manager = create_transfer_manager()
+
+        self.assertIsNotNone(manager)
+        # Device cache storage
+        self.assertEqual(manager._cache_kvs_map, {})
+        self.assertEqual(manager._device_key_caches, [])
+        self.assertEqual(manager._device_value_caches, [])
+
+        # Host cache storage
+        self.assertEqual(manager._host_cache_kvs_map, {})
+        self.assertEqual(manager._host_key_ptrs, [])
+        self.assertEqual(manager._host_value_ptrs, [])
+
+    def test_init_metadata_defaults(self):
+        """Test default metadata values from config."""
+        manager = create_transfer_manager()
+
+        # These values are read from config, not defaults
+        self.assertEqual(manager._local_rank, 0)
+        self.assertEqual(manager._device_id, 0)
+        self.assertEqual(manager._cache_dtype, "bfloat16")
+        self.assertEqual(manager._num_host_blocks, 50)  # from create_transfer_manager
+        # num_layers comes from config, check it's set
+        self.assertGreater(manager._num_layers, 0)
+
+
+# ============================================================================
+# Device Cache Map Sharing Tests
+# ============================================================================
+
+
+class TestSetDeviceCacheKvsMap(unittest.TestCase):
+    """Test set_cache_kvs_map for device cache."""
+
+    def test_set_device_cache_kvs_map_basic(self):
+        """Test setting device cache_kvs_map."""
+        manager = create_transfer_manager()
+        num_layers = manager._num_layers  # Use actual num_layers from config
+        device_cache = create_mock_device_cache_kvs_map(num_layers=num_layers)
+
+        manager.set_cache_kvs_map(device_cache)
+
+        self.assertEqual(manager._cache_kvs_map, device_cache)
+
+    def test_set_device_cache_kvs_map_builds_layer_indices(self):
+        """Test that device layer indices are built correctly."""
+        manager = create_transfer_manager()
+        num_layers = manager._num_layers  # Use actual num_layers from config
+        device_cache = create_mock_device_cache_kvs_map(num_layers=num_layers)
+
+        manager.set_cache_kvs_map(device_cache)
+
+        self.assertEqual(len(manager._device_key_caches), num_layers)
+        self.assertEqual(len(manager._device_value_caches), num_layers)
+
+        # Verify each layer has correct tensor (compare by identity)
+        for i in range(num_layers):
+            key_name = f"key_caches_{i}_rank0.device0"
+            val_name = f"value_caches_{i}_rank0.device0"
+            self.assertIs(manager._device_key_caches[i], device_cache[key_name])
+            self.assertIs(manager._device_value_caches[i], device_cache[val_name])
+
+    def test_set_device_cache_kvs_map_with_scales(self):
+        """Test setting device cache_kvs_map with fp8 scales."""
+        from fastdeploy.cache_manager.v1.transfer_manager import CacheTransferManager
+
+        config = get_default_test_fd_config()
+        # Enable fp8 quantization to store scales
+        config.quant_config = Mock()
+        config.quant_config.kv_cache_quant_type = "block_wise_fp8"
+        config.cache_config.num_cpu_blocks = 50
+        config.cache_config.cache_dtype = "bfloat16"
+
+        manager = CacheTransferManager(config)
+        num_layers = manager._num_layers
+        device_cache = create_mock_device_cache_kvs_map(num_layers=num_layers, include_scales=True)
+
+        manager.set_cache_kvs_map(device_cache)
+
+        # Scales should be stored when fp8 quantization is enabled
+        self.assertEqual(len(manager._device_key_scales), num_layers)
+        self.assertEqual(len(manager._device_value_scales), num_layers)
+
+    def test_set_device_cache_kvs_map_empty(self):
+        """Test setting empty cache_kvs_map."""
+        manager = create_transfer_manager()
+        num_layers = manager._num_layers  # num_layers is still from config
+
+        manager.set_cache_kvs_map({})
+
+        # num_layers stays the same (from config)
+        self.assertEqual(manager._num_layers, num_layers)
+        # layer indices should be empty since no cache provided
+        self.assertEqual(len(manager._device_key_caches), 0)
+
+    def test_set_device_cache_kvs_map_different_rank_device(self):
+        """Test setting cache_kvs_map with different rank and device names."""
+        manager = create_transfer_manager()
+        num_layers = manager._num_layers
+        # Create cache with different rank/device names - should not match
+        device_cache = create_mock_device_cache_kvs_map(num_layers=num_layers, local_rank=2, device_id=3)
+
+        manager.set_cache_kvs_map(device_cache)
+
+        # The layer indices should have None values since names don't match
+        # (local_rank=0, device_id=0 in manager, but cache has rank=2, device=3)
+        self.assertTrue(all(c is None for c in manager._device_key_caches))
+
+
+# ============================================================================
+# Host Cache Map Sharing Tests
+# ============================================================================
+
+
+class TestSetHostCacheKvsMap(unittest.TestCase):
+    """Test set_host_cache_kvs_map for host cache."""
+
+    def test_set_host_cache_kvs_map_basic(self):
+        """Test setting host cache_kvs_map."""
+        manager = create_transfer_manager()
+        num_layers = manager._num_layers
+
+        # First set device cache to initialize layer indices
+        device_cache = create_mock_device_cache_kvs_map(num_layers=num_layers)
+        manager.set_cache_kvs_map(device_cache)
+
+        host_cache = create_mock_host_cache_kvs_map(num_layers=num_layers)
+        manager.set_host_cache_kvs_map(host_cache)
+
+        self.assertEqual(manager._host_cache_kvs_map, host_cache)
+
+    def test_set_host_cache_kvs_map_builds_layer_indices(self):
+        """Test that host layer indices are built correctly."""
+        manager = create_transfer_manager()
+        num_layers = manager._num_layers
+
+        device_cache = create_mock_device_cache_kvs_map(num_layers=num_layers)
+        manager.set_cache_kvs_map(device_cache)
+
+        host_cache = create_mock_host_cache_kvs_map(num_layers=num_layers)
+        manager.set_host_cache_kvs_map(host_cache)
+
+        self.assertEqual(len(manager._host_key_ptrs), num_layers)
+        self.assertEqual(len(manager._host_value_ptrs), num_layers)
+
+        # Verify pointers are integers
+        for i in range(num_layers):
+            self.assertIsInstance(manager._host_key_ptrs[i], int)
+            self.assertIsInstance(manager._host_value_ptrs[i], int)
+            self.assertGreater(manager._host_key_ptrs[i], 0)
+            self.assertGreater(manager._host_value_ptrs[i], 0)
+
+    def test_set_host_cache_kvs_map_with_scales(self):
+        """Test setting host cache_kvs_map with fp8 scales."""
+        from fastdeploy.cache_manager.v1.transfer_manager import CacheTransferManager
+
+        config = get_default_test_fd_config()
+        # Enable fp8 quantization to store scales
+        config.quant_config = Mock()
+        config.quant_config.kv_cache_quant_type = "block_wise_fp8"
+        config.cache_config.num_cpu_blocks = 50
+        config.cache_config.cache_dtype = "bfloat16"
+
+        manager = CacheTransferManager(config)
+        num_layers = manager._num_layers
+
+        device_cache = create_mock_device_cache_kvs_map(num_layers=num_layers, include_scales=True)
+        manager.set_cache_kvs_map(device_cache)
+
+        host_cache = create_mock_host_cache_kvs_map(num_layers=num_layers, include_scales=True)
+        manager.set_host_cache_kvs_map(host_cache)
+
+        # Scales should be stored when fp8 quantization is enabled
+        self.assertEqual(len(manager._host_key_scales_ptrs), num_layers)
+        self.assertEqual(len(manager._host_value_scales_ptrs), num_layers)
+
+
+# ============================================================================
+# Metadata Properties Tests
+# ============================================================================
+
+
+class TestMetadataProperties(unittest.TestCase):
+    """Test metadata properties."""
+
+    def setUp(self):
+        """Set up test fixtures."""
+        self.manager = create_transfer_manager()
+        self.num_layers = self.manager._num_layers
+        device_cache = create_mock_device_cache_kvs_map(num_layers=self.num_layers)
+        self.manager.set_cache_kvs_map(device_cache)
+
+    def test_num_layers_property(self):
+        """Test num_layers property."""
+        self.assertEqual(self.manager.num_layers, self.num_layers)
+
+    def test_local_rank_property(self):
+        """Test local_rank property."""
+        self.assertEqual(self.manager.local_rank, 0)
+
+    def test_device_id_property(self):
+        """Test device_id property."""
+        self.assertEqual(self.manager.device_id, 0)
+
+    def test_cache_dtype_property(self):
+        """Test cache_dtype property."""
+        self.assertEqual(self.manager.cache_dtype, "bfloat16")
+
+    def test_has_cache_scale_property_false(self):
+        """Test has_cache_scale property when no scales."""
+        self.assertFalse(self.manager.has_cache_scale)
+
+    def test_has_cache_scale_property_true(self):
+        """Test has_cache_scale property with fp8 quantization config."""
+        from fastdeploy.cache_manager.v1.transfer_manager import CacheTransferManager
+
+        config = get_default_test_fd_config()
+        # Mock quant_config to have kv_cache_quant_type
+        config.quant_config = Mock()
+        config.quant_config.kv_cache_quant_type = "block_wise_fp8"
+
+        manager = CacheTransferManager(config)
+        self.assertTrue(manager.has_cache_scale)
+
+    def test_num_host_blocks_property(self):
+        """Test num_host_blocks property."""
+        # num_host_blocks is set from config (50 in create_transfer_manager)
+        self.assertEqual(self.manager.num_host_blocks, 50)
+
+
+# ============================================================================
+# Layer Indexed Access Tests
+# ============================================================================
+
+
+class TestLayerIndexedAccess(unittest.TestCase):
+    """Test layer-indexed access methods."""
+
+    def setUp(self):
+        """Set up test fixtures."""
+        self.manager = create_transfer_manager()
+        self.num_layers = self.manager._num_layers
+        self.device_cache = create_mock_device_cache_kvs_map(num_layers=self.num_layers)
+        self.manager.set_cache_kvs_map(self.device_cache)
+
+        self.host_cache = create_mock_host_cache_kvs_map(num_layers=self.num_layers)
+        self.manager.set_host_cache_kvs_map(self.host_cache)
+
+    # --- Device cache access ---
+
+    def test_get_device_key_cache_valid(self):
+        """Test get_device_key_cache with valid index."""
+        for i in range(self.num_layers):
+            cache = self.manager.get_device_key_cache(i)
+            self.assertIsNotNone(cache)
+            key_name = f"key_caches_{i}_rank0.device0"
+            self.assertIs(cache, self.device_cache[key_name])
+
+    def test_get_device_key_cache_invalid(self):
+        """Test get_device_key_cache with invalid index."""
+        self.assertIsNone(self.manager.get_device_key_cache(-1))
+        self.assertIsNone(self.manager.get_device_key_cache(100))
+
+    def test_get_device_value_cache_valid(self):
+        """Test get_device_value_cache with valid index."""
+        for i in range(self.num_layers):
+            cache = self.manager.get_device_value_cache(i)
+            self.assertIsNotNone(cache)
+
+    # --- Host cache access ---
+
+    def test_get_host_key_ptr_valid(self):
+        """Test get_host_key_ptr with valid index."""
+        for i in range(self.num_layers):
+            ptr = self.manager.get_host_key_ptr(i)
+            self.assertIsInstance(ptr, int)
+            self.assertGreater(ptr, 0)
+
+    def test_get_host_key_ptr_invalid(self):
+        """Test get_host_key_ptr with invalid index."""
+        self.assertEqual(self.manager.get_host_key_ptr(-1), 0)
+        self.assertEqual(self.manager.get_host_key_ptr(100), 0)
+
+    def test_get_host_value_ptr_valid(self):
+        """Test get_host_value_ptr with valid index."""
+        for i in range(self.num_layers):
+            ptr = self.manager.get_host_value_ptr(i)
+            self.assertIsInstance(ptr, int)
+
+
+# ============================================================================
+# Swap Parameter Validation Tests
+# ============================================================================
+
+
+class TestValidateSwapParams(unittest.TestCase):
+    """Test _swap_all_layers behavior with various parameter conditions."""
+
+    def setUp(self):
+        """Set up test fixtures."""
+        self.manager = create_transfer_manager()
+        self.num_layers = self.manager._num_layers
+        device_cache = create_mock_device_cache_kvs_map(num_layers=self.num_layers)
+        self.manager.set_cache_kvs_map(device_cache)
+
+        host_cache = create_mock_host_cache_kvs_map(num_layers=self.num_layers)
+        self.manager.set_host_cache_kvs_map(host_cache)
+
+    @patch("fastdeploy.cache_manager.v1.transfer_manager.swap_cache_all_layers")
+    def test_swap_returns_false_when_no_host_blocks(self, mock_swap):
+        """Test _swap_all_layers returns False when num_host_blocks is 0."""
+        manager = create_transfer_manager(num_host_blocks=0)
+        device_cache = create_mock_device_cache_kvs_map(num_layers=manager._num_layers)
+        manager.set_cache_kvs_map(device_cache)
+
+        result = manager._swap_all_layers([0, 1], [10, 11], mode=0)
+        self.assertFalse(result)
+        mock_swap.assert_not_called()
+
+    @patch("fastdeploy.cache_manager.v1.transfer_manager.swap_cache_all_layers")
+    def test_swap_with_valid_params_calls_operator(self, mock_swap):
+        """Test _swap_all_layers calls operator with valid params."""
+        mock_swap.return_value = None
+
+        result = self.manager._swap_all_layers([0, 1, 2], [10, 11, 12], mode=0)
+        self.assertTrue(result)
+        self.assertGreaterEqual(mock_swap.call_count, 2)  # key + value
+
+    @patch("fastdeploy.cache_manager.v1.transfer_manager.swap_cache_all_layers")
+    def test_swap_with_empty_block_ids(self, mock_swap):
+        """Test _swap_all_layers with empty block id lists."""
+        mock_swap.return_value = None
+
+        result = self.manager._swap_all_layers([], [], mode=0)
+        self.assertTrue(result)
+        # Operator is still called (empty lists are passed through)
+        self.assertEqual(mock_swap.call_count, 2)  # key + value
+
+    @patch("fastdeploy.cache_manager.v1.transfer_manager.swap_cache_all_layers")
+    def test_swap_no_device_caches_skipped(self, mock_swap):
+        """Test _swap_all_layers returns False when device caches not initialized."""
+        manager = create_transfer_manager()
+        # Do NOT set device cache
+
+        result = manager._swap_all_layers([0, 1], [10, 11], mode=0)
+        # With no device caches loaded, num_host_blocks check passes but caches are empty
+        # The operator receives empty lists for key/value caches
+        # Actual behavior: returns True since num_host_blocks > 0
+        # (operator is called with empty layer lists)
+        self.assertIsInstance(result, bool)
+
+
+# ============================================================================
+# Swap All Layers Tests
+# ============================================================================
+
+
+class TestSwapAllLayers(unittest.TestCase):
+    """Test _swap_all_layers and related methods."""
+
+    def setUp(self):
+        """Set up test fixtures."""
+        self.manager = create_transfer_manager()
+        self.num_layers = self.manager._num_layers
+        device_cache = create_mock_device_cache_kvs_map(num_layers=self.num_layers)
+        self.manager.set_cache_kvs_map(device_cache)
+
+        host_cache = create_mock_host_cache_kvs_map(num_layers=self.num_layers)
+        self.manager.set_host_cache_kvs_map(host_cache)
+
+    @patch("fastdeploy.cache_manager.v1.transfer_manager.swap_cache_all_layers")
+    def test_swap_all_layers_evict_device_to_host(self, mock_swap):
+        """Test _swap_all_layers in evict mode (Device->Host)."""
+        mock_swap.return_value = None
+
+        result = self.manager._swap_all_layers(
+            device_block_ids=[0, 1, 2],
+            host_block_ids=[10, 11, 12],
+            mode=0,  # Device->Host
+        )
+
+        self.assertTrue(result)
+        # Should be called for key and value caches
+        self.assertGreaterEqual(mock_swap.call_count, 2)
+
+    @patch("fastdeploy.cache_manager.v1.transfer_manager.swap_cache_all_layers")
+    def test_swap_all_layers_load_host_to_device(self, mock_swap):
+        """Test _swap_all_layers in load mode (Host->Device)."""
+        mock_swap.return_value = None
+
+        result = self.manager._swap_all_layers(
+            device_block_ids=[0, 1, 2],
+            host_block_ids=[10, 11, 12],
+            mode=1,  # Host->Device
+        )
+
+        self.assertTrue(result)
+        self.assertGreaterEqual(mock_swap.call_count, 2)
+
+    @patch("fastdeploy.cache_manager.v1.transfer_manager.swap_cache_all_layers")
+    def test_swap_all_layers_with_fp8_scales(self, mock_swap):
+        """Test _swap_all_layers with fp8 scales."""
+        from fastdeploy.cache_manager.v1.transfer_manager import CacheTransferManager
+
+        config = get_default_test_fd_config()
+        # Mock quant_config to have kv_cache_quant_type for fp8
+        config.quant_config = Mock()
+        config.quant_config.kv_cache_quant_type = "block_wise_fp8"
+        config.cache_config.num_cpu_blocks = 50
+
+        manager = CacheTransferManager(config)
+        num_layers = manager._num_layers
+        device_cache = create_mock_device_cache_kvs_map(num_layers=num_layers, include_scales=True)
+        manager.set_cache_kvs_map(device_cache)
+
+        host_cache = create_mock_host_cache_kvs_map(num_layers=num_layers, include_scales=True)
+        manager.set_host_cache_kvs_map(host_cache)
+
+        mock_swap.return_value = None
+
+        result = manager._swap_all_layers(
+            device_block_ids=[0, 1],
+            host_block_ids=[10, 11],
+            mode=0,
+        )
+
+        self.assertTrue(result)
+        # 2 for key/value + 2 for scales = 4 calls
+        self.assertEqual(mock_swap.call_count, 4)
+
+    @patch("fastdeploy.cache_manager.v1.transfer_manager.swap_cache_all_layers")
+    def test_swap_all_layers_invalid_params(self, mock_swap):
+        """Test _swap_all_layers with empty params."""
+        mock_swap.return_value = None
+
+        result = self.manager._swap_all_layers(
+            device_block_ids=[],
+            host_block_ids=[],
+            mode=0,
+        )
+        # Empty lists should still call the operator and return True
+        self.assertTrue(result)
+        self.assertEqual(mock_swap.call_count, 2)  # key + value
+
+
+# ============================================================================
+# Cache Map Getters Tests
+# ============================================================================
+
+
+class TestCacheKvsMapGetters(unittest.TestCase):
+    """Test cache_kvs_map and host_cache_kvs_map getter properties."""
+
+    def setUp(self):
+        """Set up test fixtures."""
+        self.manager = create_transfer_manager()
+        self.num_layers = self.manager._num_layers
+        self.device_cache = create_mock_device_cache_kvs_map(num_layers=self.num_layers)
+        self.manager.set_cache_kvs_map(self.device_cache)
+
+        self.host_cache = create_mock_host_cache_kvs_map(num_layers=self.num_layers)
+        self.manager.set_host_cache_kvs_map(self.host_cache)
+
+    def test_device_cache_kvs_map_property(self):
+        """Test device cache_kvs_map property returns the set map."""
+        self.assertEqual(self.manager.cache_kvs_map, self.device_cache)
+
+    def test_host_cache_kvs_map_property(self):
+        """Test host cache_kvs_map property returns the set map."""
+        self.assertEqual(self.manager.host_cache_kvs_map, self.host_cache)
+
+    def test_device_key_cache_per_layer_accessible(self):
+        """Test get_device_key_cache returns correct tensor for each layer."""
+        for i in range(self.num_layers):
+            cache = self.manager.get_device_key_cache(i)
+            expected_name = f"key_caches_{i}_rank0.device0"
+            self.assertIs(cache, self.device_cache[expected_name])
+
+    def test_device_value_cache_per_layer_accessible(self):
+        """Test get_device_value_cache returns correct tensor for each layer."""
+        for i in range(self.num_layers):
+            cache = self.manager.get_device_value_cache(i)
+            expected_name = f"value_caches_{i}_rank0.device0"
+            self.assertIs(cache, self.device_cache[expected_name])
+
+    def test_host_key_ptr_per_layer_accessible(self):
+        """Test get_host_key_ptr returns correct pointer for each layer."""
+        for i in range(self.num_layers):
+            ptr = self.manager.get_host_key_ptr(i)
+            expected_name = f"key_caches_{i}_rank0.device0"
+            self.assertEqual(ptr, self.host_cache[expected_name])
+
+    def test_host_value_ptr_per_layer_accessible(self):
+        """Test get_host_value_ptr returns correct pointer for each layer."""
+        for i in range(self.num_layers):
+            ptr = self.manager.get_host_value_ptr(i)
+            expected_name = f"value_caches_{i}_rank0.device0"
+            self.assertEqual(ptr, self.host_cache[expected_name])
+
+    def test_get_stats_includes_expected_keys(self):
+        """Test get_stats returns dict with all expected keys."""
+        stats = self.manager.get_stats()
+
+        self.assertIn("num_layers", stats)
+        self.assertIn("local_rank", stats)
+        self.assertIn("device_id", stats)
+        self.assertIn("cache_dtype", stats)
+        self.assertIn("num_host_blocks", stats)
+        self.assertIn("has_device_cache", stats)
+        self.assertIn("has_host_cache", stats)
+        self.assertIn("is_fp8", stats)
+
+        self.assertTrue(stats["has_device_cache"])
+        self.assertTrue(stats["has_host_cache"])
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/tests/distributed/chunked_moe.py b/tests/distributed/chunked_moe.py
index fee1582f3c7..78e13d7c8aa 100644
--- a/tests/distributed/chunked_moe.py
+++ b/tests/distributed/chunked_moe.py
@@ -148,6 +148,7 @@ def setup_model_runner(self):
         model_runner.share_inputs["caches"] = None
         model_runner.routing_replay_manager = None
         model_runner.exist_prefill_flag = False
+        model_runner.enable_cache_manager_v1 = False
 
         if dist.get_rank() == 0:
             model_runner.share_inputs["ids_remove_padding"] = paddle.ones([10])
diff --git a/tests/engine/test_request.py b/tests/engine/test_request.py
index 9a1f0bc31cf..c18a0e0b0af 100644
--- a/tests/engine/test_request.py
+++ b/tests/engine/test_request.py
@@ -15,12 +15,15 @@
 """
 
 import json
+import pickle
 import unittest
 from unittest.mock import Mock
 
 import numpy as np
 
+from fastdeploy.cache_manager.v1.metadata import CacheSwapMetadata
 from fastdeploy.engine.request import (
+    BatchRequest,
     CompletionOutput,
     ImagePosition,
     PoolingParams,
@@ -35,6 +38,17 @@
 from fastdeploy.entrypoints.openai.protocol import ResponseFormat, StructuralTag
 
 
+def _make_swap_meta(src_ids, dst_ids, hash_values=None):
+    """Helper: create a CacheSwapMetadata instance."""
+    return CacheSwapMetadata(
+        src_block_ids=list(src_ids),
+        dst_block_ids=list(dst_ids),
+        src_type="host",
+        dst_type="device",
+        hash_values=list(hash_values) if hash_values else [],
+    )
+
+
 class TestRequestInit(unittest.TestCase):
     """Test cases for Request initialization"""
 
@@ -692,5 +706,674 @@ def test_contains_method(self):
         self.assertFalse("non_existent" in self.request_output)
 
 
+class TestRequestCacheFields(unittest.TestCase):
+    """Tests for _block_hasher, _prompt_hashes, cache_swap_metadata, cache_evict_metadata."""
+
+    # ------------------------------------------------------------------
+    # _block_hasher / _prompt_hashes initialization
+    # ------------------------------------------------------------------
+
+    def test_default_block_hasher_and_prompt_hashes(self):
+        """Default values: _block_hasher is None, _prompt_hashes is empty list."""
+        req = Request(request_id="cache_defaults")
+        self.assertIsNone(req._block_hasher)
+        self.assertEqual(req._prompt_hashes, [])
+
+    def test_block_hasher_init_via_constructor(self):
+        """block_hasher passed to constructor is stored in _block_hasher."""
+        hasher = Mock(return_value=[])
+        req = Request(request_id="bh_init", block_hasher=hasher)
+        self.assertIs(req._block_hasher, hasher)
+
+    def test_set_block_hasher(self):
+        """set_block_hasher replaces _block_hasher."""
+        req = Request(request_id="set_bh")
+        self.assertIsNone(req._block_hasher)
+        hasher = Mock(return_value=[])
+        req.set_block_hasher(hasher)
+        self.assertIs(req._block_hasher, hasher)
+
+    # ------------------------------------------------------------------
+    # prompt_hashes property
+    # ------------------------------------------------------------------
+
+    def test_prompt_hashes_no_hasher(self):
+        """prompt_hashes returns _prompt_hashes as-is when no hasher is set."""
+        req = Request(request_id="ph_no_hasher")
+        req._prompt_hashes = ["h1", "h2"]
+        self.assertEqual(req.prompt_hashes, ["h1", "h2"])
+
+    def test_prompt_hashes_hasher_returns_new_hashes(self):
+        """prompt_hashes appends new hashes returned by _block_hasher."""
+        req = Request(request_id="ph_new_hashes")
+        req._prompt_hashes = ["h1"]
+        req._block_hasher = Mock(return_value=["h2", "h3"])
+        result = req.prompt_hashes
+        # hasher is called with req
+        req._block_hasher.assert_called_once_with(req)
+        self.assertEqual(result, ["h1", "h2", "h3"])
+        # underlying list is mutated
+        self.assertEqual(req._prompt_hashes, ["h1", "h2", "h3"])
+
+    def test_prompt_hashes_hasher_returns_empty(self):
+        """When hasher returns empty list, _prompt_hashes is unchanged."""
+        req = Request(request_id="ph_empty")
+        req._prompt_hashes = ["h1"]
+        req._block_hasher = Mock(return_value=[])
+        result = req.prompt_hashes
+        self.assertEqual(result, ["h1"])
+        self.assertEqual(req._prompt_hashes, ["h1"])
+
+    def test_prompt_hashes_hasher_returns_none(self):
+        """When hasher returns None (falsy), _prompt_hashes is unchanged."""
+        req = Request(request_id="ph_none")
+        req._prompt_hashes = ["h1"]
+        req._block_hasher = Mock(return_value=None)
+        result = req.prompt_hashes
+        self.assertEqual(result, ["h1"])
+
+    def test_prompt_hashes_accumulates_across_multiple_accesses(self):
+        """Each access may add more hashes (simulates incremental computation)."""
+        call_count = {"n": 0}
+
+        def incremental_hasher(r):
+            call_count["n"] += 1
+            return [f"h{call_count['n']}"]
+
+        req = Request(request_id="ph_incremental")
+        req._block_hasher = incremental_hasher
+        _ = req.prompt_hashes  # first access → adds "h1"
+        _ = req.prompt_hashes  # second access → adds "h2"
+        self.assertEqual(req._prompt_hashes, ["h1", "h2"])
+
+    # ------------------------------------------------------------------
+    # cache_swap_metadata / cache_evict_metadata initialization
+    # ------------------------------------------------------------------
+
+    def test_default_cache_metadata_are_empty_lists(self):
+        """cache_swap_metadata and cache_evict_metadata default to empty lists."""
+        req = Request(request_id="meta_defaults")
+        self.assertEqual(req.cache_swap_metadata, [])
+        self.assertEqual(req.cache_evict_metadata, [])
+
+    # ------------------------------------------------------------------
+    # pop_cache_swap_metadata / pop_cache_evict_metadata
+    # ------------------------------------------------------------------
+
+    def test_pop_cache_swap_metadata_returns_and_clears(self):
+        """pop_cache_swap_metadata returns current list and resets to []."""
+        req = Request(request_id="pop_swap")
+        meta = _make_swap_meta([1], [2], ["hash_a"])
+        req.cache_swap_metadata = [meta]
+        result = req.pop_cache_swap_metadata()
+        self.assertEqual(result, [meta])
+        self.assertEqual(req.cache_swap_metadata, [])
+
+    def test_pop_cache_evict_metadata_returns_and_clears(self):
+        """pop_cache_evict_metadata returns current list and resets to []."""
+        req = Request(request_id="pop_evict")
+        meta = _make_swap_meta([3], [4], ["hash_b"])
+        req.cache_evict_metadata = [meta]
+        result = req.pop_cache_evict_metadata()
+        self.assertEqual(result, [meta])
+        self.assertEqual(req.cache_evict_metadata, [])
+
+    def test_pop_empty_cache_metadata(self):
+        """pop on empty list returns [] and leaves field as []."""
+        req = Request(request_id="pop_empty")
+        self.assertEqual(req.pop_cache_swap_metadata(), [])
+        self.assertEqual(req.pop_cache_evict_metadata(), [])
+
+    # ------------------------------------------------------------------
+    # __getstate__ skips _block_hasher
+    # ------------------------------------------------------------------
+
+    def test_getstate_excludes_block_hasher(self):
+        """__getstate__ must not include _block_hasher (cannot be pickled)."""
+        req = Request(request_id="getstate_bh", block_hasher=lambda r: [])
+        state = req.__getstate__()
+        self.assertNotIn("_block_hasher", state)
+
+    def test_getstate_preserves_prompt_hashes(self):
+        """__getstate__ preserves _prompt_hashes."""
+        req = Request(request_id="getstate_ph")
+        req._prompt_hashes = ["h1", "h2"]
+        state = req.__getstate__()
+        self.assertEqual(state["_prompt_hashes"], ["h1", "h2"])
+
+
+class TestBatchRequestInit(unittest.TestCase):
+    """Tests for BatchRequest initialization."""
+
+    def test_default_init(self):
+        """BatchRequest starts with empty requests and no metadata."""
+        br = BatchRequest()
+        self.assertEqual(br.requests, [])
+        self.assertIsNone(br.cache_swap_metadata)
+        self.assertIsNone(br.cache_evict_metadata)
+
+    def test_len_empty(self):
+        self.assertEqual(len(BatchRequest()), 0)
+
+
+class TestBatchRequestAddRequest(unittest.TestCase):
+    """Tests for BatchRequest.add_request."""
+
+    def _make_request(self, rid):
+        return Request(request_id=rid)
+
+    def test_add_request_appends_to_requests(self):
+        """add_request stores request in .requests list."""
+        br = BatchRequest()
+        req = self._make_request("r1")
+        br.add_request(req)
+        self.assertIn(req, br.requests)
+        self.assertEqual(len(br), 1)
+
+    def test_add_request_without_metadata(self):
+        """When request has no pending metadata, batch metadata stays None."""
+        br = BatchRequest()
+        req = self._make_request("r_no_meta")
+        br.add_request(req)
+        self.assertIsNone(br.cache_swap_metadata)
+        self.assertIsNone(br.cache_evict_metadata)
+
+    def test_add_request_with_swap_metadata(self):
+        """add_request moves swap metadata from request to batch."""
+        br = BatchRequest()
+        req = self._make_request("r_swap")
+        meta = _make_swap_meta([10, 11], [20, 21], ["hA", "hB"])
+        req.cache_swap_metadata = [meta]
+
+        br.add_request(req)
+
+        # Request's swap list should be cleared
+        self.assertEqual(req.cache_swap_metadata, [])
+        # Batch should aggregate the metadata
+        self.assertIsNotNone(br.cache_swap_metadata)
+        self.assertEqual(br.cache_swap_metadata.src_block_ids, [10, 11])
+        self.assertEqual(br.cache_swap_metadata.dst_block_ids, [20, 21])
+        self.assertEqual(br.cache_swap_metadata.hash_values, ["hA", "hB"])
+
+    def test_add_request_with_evict_metadata(self):
+        """add_request moves evict metadata from request to batch."""
+        br = BatchRequest()
+        req = self._make_request("r_evict")
+        meta = _make_swap_meta([5], [6], ["hE"])
+        req.cache_evict_metadata = [meta]
+
+        br.add_request(req)
+
+        self.assertEqual(req.cache_evict_metadata, [])
+        self.assertIsNotNone(br.cache_evict_metadata)
+        self.assertEqual(br.cache_evict_metadata.src_block_ids, [5])
+        self.assertEqual(br.cache_evict_metadata.dst_block_ids, [6])
+
+    def test_add_multiple_requests_merges_swap_metadata(self):
+        """Swap metadata from multiple requests is merged into one."""
+        br = BatchRequest()
+        for i, (src, dst, h) in enumerate([([1], [2], ["h1"]), ([3], [4], ["h2"])]):
+            req = self._make_request(f"r{i}")
+            req.cache_swap_metadata = [_make_swap_meta(src, dst, h)]
+            br.add_request(req)
+
+        self.assertEqual(br.cache_swap_metadata.src_block_ids, [1, 3])
+        self.assertEqual(br.cache_swap_metadata.dst_block_ids, [2, 4])
+        self.assertEqual(br.cache_swap_metadata.hash_values, ["h1", "h2"])
+
+    def test_add_multiple_requests_merges_evict_metadata(self):
+        """Evict metadata from multiple requests is merged into one."""
+        br = BatchRequest()
+        for i, (src, dst, h) in enumerate([([7], [8], ["e1"]), ([9], [10], ["e2"])]):
+            req = self._make_request(f"re{i}")
+            req.cache_evict_metadata = [_make_swap_meta(src, dst, h)]
+            br.add_request(req)
+
+        self.assertEqual(br.cache_evict_metadata.src_block_ids, [7, 9])
+        self.assertEqual(br.cache_evict_metadata.dst_block_ids, [8, 10])
+        self.assertEqual(br.cache_evict_metadata.hash_values, ["e1", "e2"])
+
+
+class TestBatchRequestAppendSwapEvictMetadata(unittest.TestCase):
+    """Unit tests for append_swap_metadata and append_evict_metadata."""
+
+    def test_append_swap_metadata_first_time(self):
+        """append_swap_metadata creates CacheSwapMetadata when None."""
+        br = BatchRequest()
+        meta = _make_swap_meta([1, 2], [3, 4], ["h1", "h2"])
+        br.append_swap_metadata([meta])
+        self.assertIsNotNone(br.cache_swap_metadata)
+        self.assertEqual(br.cache_swap_metadata.src_block_ids, [1, 2])
+        self.assertEqual(br.cache_swap_metadata.dst_block_ids, [3, 4])
+        self.assertEqual(br.cache_swap_metadata.hash_values, ["h1", "h2"])
+        self.assertEqual(br.cache_swap_metadata.src_type, "host")
+        self.assertEqual(br.cache_swap_metadata.dst_type, "device")
+
+    def test_append_swap_metadata_merges(self):
+        """Subsequent append_swap_metadata extends existing lists."""
+        br = BatchRequest()
+        br.append_swap_metadata([_make_swap_meta([1], [2], ["hA"])])
+        br.append_swap_metadata([_make_swap_meta([3], [4], ["hB"])])
+        self.assertEqual(br.cache_swap_metadata.src_block_ids, [1, 3])
+        self.assertEqual(br.cache_swap_metadata.dst_block_ids, [2, 4])
+        self.assertEqual(br.cache_swap_metadata.hash_values, ["hA", "hB"])
+
+    def test_append_evict_metadata_first_time(self):
+        """append_evict_metadata creates CacheSwapMetadata when None."""
+        br = BatchRequest()
+        meta = _make_swap_meta([5], [6], ["he"])
+        br.append_evict_metadata([meta])
+        self.assertIsNotNone(br.cache_evict_metadata)
+        self.assertEqual(br.cache_evict_metadata.src_block_ids, [5])
+        self.assertEqual(br.cache_evict_metadata.dst_block_ids, [6])
+        self.assertEqual(br.cache_evict_metadata.dst_type, "host")
+
+    def test_append_evict_metadata_merges(self):
+        """Subsequent append_evict_metadata extends existing lists."""
+        br = BatchRequest()
+        br.append_evict_metadata([_make_swap_meta([1], [2], ["e1"])])
+        br.append_evict_metadata([_make_swap_meta([3], [4], ["e2"])])
+        self.assertEqual(br.cache_evict_metadata.src_block_ids, [1, 3])
+        self.assertEqual(br.cache_evict_metadata.dst_block_ids, [2, 4])
+        self.assertEqual(br.cache_evict_metadata.hash_values, ["e1", "e2"])
+
+    def test_append_empty_list_is_noop(self):
+        """append_swap_metadata / append_evict_metadata with empty list is a no-op."""
+        br = BatchRequest()
+        br.append_swap_metadata([])
+        br.append_evict_metadata([])
+        self.assertIsNone(br.cache_swap_metadata)
+        self.assertIsNone(br.cache_evict_metadata)
+
+
+class TestBatchRequestAppendAndExtend(unittest.TestCase):
+    """Tests for BatchRequest.append and BatchRequest.extend."""
+
+    def _br_with_swap(self, src, dst, hashes=None):
+        br = BatchRequest()
+        br.append_swap_metadata([_make_swap_meta(src, dst, hashes or [])])
+        return br
+
+    def _br_with_evict(self, src, dst, hashes=None):
+        br = BatchRequest()
+        br.append_evict_metadata([_make_swap_meta(src, dst, hashes or [])])
+        return br
+
+    def test_append_merges_requests(self):
+        br1 = BatchRequest()
+        br1.add_request(Request(request_id="a"))
+        br2 = BatchRequest()
+        br2.add_request(Request(request_id="b"))
+        br1.append(br2)
+        self.assertEqual(len(br1), 2)
+
+    def test_append_merges_swap_metadata(self):
+        br1 = self._br_with_swap([1], [2], ["h1"])
+        br2 = self._br_with_swap([3], [4], ["h2"])
+        br1.append(br2)
+        self.assertEqual(br1.cache_swap_metadata.src_block_ids, [1, 3])
+        self.assertEqual(br1.cache_swap_metadata.hash_values, ["h1", "h2"])
+
+    def test_append_merges_evict_metadata(self):
+        br1 = self._br_with_evict([5], [6], ["e1"])
+        br2 = self._br_with_evict([7], [8], ["e2"])
+        br1.append(br2)
+        self.assertEqual(br1.cache_evict_metadata.src_block_ids, [5, 7])
+
+    def test_append_batch_without_metadata_does_not_create_metadata(self):
+        br1 = BatchRequest()
+        br1.add_request(Request(request_id="x"))
+        br2 = BatchRequest()
+        br2.add_request(Request(request_id="y"))
+        br1.append(br2)
+        self.assertIsNone(br1.cache_swap_metadata)
+        self.assertIsNone(br1.cache_evict_metadata)
+
+    def test_extend_multiple_batches(self):
+        br_main = BatchRequest()
+        sub1 = self._br_with_swap([1], [2], ["h1"])
+        sub1.add_request(Request(request_id="s1"))
+        sub2 = self._br_with_swap([3], [4], ["h2"])
+        sub2.add_request(Request(request_id="s2"))
+        br_main.extend([sub1, sub2])
+        self.assertEqual(len(br_main), 2)
+        self.assertEqual(br_main.cache_swap_metadata.src_block_ids, [1, 3])
+
+
+class TestBatchRequestIterAndAccess(unittest.TestCase):
+    """Tests for __iter__, __getitem__, __len__, __repr__."""
+
+    def _populated_br(self):
+        br = BatchRequest()
+        for i in range(3):
+            br.add_request(Request(request_id=f"r{i}"))
+        return br
+
+    def test_iter(self):
+        br = self._populated_br()
+        ids = [req.request_id for req in br]
+        self.assertEqual(ids, ["r0", "r1", "r2"])
+
+    def test_getitem(self):
+        br = self._populated_br()
+        self.assertEqual(br[0].request_id, "r0")
+        self.assertEqual(br[2].request_id, "r2")
+
+    def test_len(self):
+        br = self._populated_br()
+        self.assertEqual(len(br), 3)
+
+    def test_repr_contains_swap_and_evict(self):
+        br = BatchRequest()
+        br.append_swap_metadata([_make_swap_meta([1], [2], ["hR"])])
+        r = repr(br)
+        self.assertIn("BatchRequest", r)
+        self.assertIn("swap_metadata", r)
+        self.assertIn("evict_metadata", r)
+
+
+class TestBatchRequestPickle(unittest.TestCase):
+    """Ensure BatchRequest can be serialized / deserialized via pickle."""
+
+    def test_pickle_without_block_hasher(self):
+        """BatchRequest with plain Requests (no block_hasher) round-trips via pickle."""
+        br = BatchRequest()
+        req = Request(request_id="pk1", prompt="hello")
+        req._prompt_hashes = ["h1"]
+        br.add_request(req)
+        br.append_swap_metadata([_make_swap_meta([10], [20], ["hP"])])
+
+        data = pickle.dumps(br)
+        br2 = pickle.loads(data)
+
+        self.assertEqual(len(br2), 1)
+        self.assertEqual(br2[0].request_id, "pk1")
+        self.assertEqual(br2.cache_swap_metadata.src_block_ids, [10])
+
+    def test_getstate_skips_block_hasher_in_requests(self):
+        """__getstate__ of BatchRequest serializes requests without _block_hasher."""
+        br = BatchRequest()
+        req = Request(request_id="gs1", block_hasher=lambda r: ["h_new"])
+        br.add_request(req)
+        state = br.__getstate__()
+        # Each request dict must not contain _block_hasher
+        for req_state in state["requests"]:
+            self.assertNotIn("_block_hasher", req_state)
+
+
+from fastdeploy.cache_manager.v1.cache_utils import (
+    get_block_hash_extra_keys as _get_block_hash_extra_keys,
+)
+from fastdeploy.cache_manager.v1.cache_utils import (
+    get_request_block_hasher as _get_request_block_hasher,
+)
+from fastdeploy.cache_manager.v1.cache_utils import (
+    hash_block_tokens as _hash_block_tokens,
+)
+
+
+class TestPromptHashesWithRealHasher(unittest.TestCase):
+    """
+    Test Request.prompt_hashes together with the real get_request_block_hasher
+    and get_block_hash_extra_keys implementations.
+
+    These tests do NOT use mock hashers, so they exercise the full hash
+    computation path (hash_block_tokens → SHA-256 chained hash).
+    """
+
+    BLOCK_SIZE = 4  # small block size makes tests easy to reason about
+
+    get_request_block_hasher = staticmethod(_get_request_block_hasher)
+    get_block_hash_extra_keys = staticmethod(_get_block_hash_extra_keys)
+    hash_block_tokens = staticmethod(_hash_block_tokens)
+
+    def _hasher(self):
+        return _get_request_block_hasher(self.BLOCK_SIZE)
+
+    # ------------------------------------------------------------------
+    # Basic hash computation
+    # ------------------------------------------------------------------
+
+    def test_no_complete_block_returns_empty(self):
+        """Fewer tokens than one block → prompt_hashes returns []."""
+        req = Request(
+            request_id="real_partial", prompt_token_ids=[1, 2, 3], block_hasher=self._hasher()  # < BLOCK_SIZE=4
+        )
+        self.assertEqual(req.prompt_hashes, [])
+
+    def test_exactly_one_block(self):
+        """Exactly block_size tokens → one hash produced."""
+        tokens = [10, 20, 30, 40]  # 4 tokens == BLOCK_SIZE
+        req = Request(request_id="real_one_block", prompt_token_ids=tokens, block_hasher=self._hasher())
+        hashes = req.prompt_hashes
+        self.assertEqual(len(hashes), 1)
+
+        # Verify hash value matches hash_block_tokens directly
+        expected = self.hash_block_tokens(tokens, None, None)
+        self.assertEqual(hashes[0], expected)
+
+    def test_two_complete_blocks(self):
+        """Two full blocks → two chained hashes."""
+        tokens = list(range(8))  # 8 tokens = 2 blocks of 4
+        req = Request(request_id="real_two_blocks", prompt_token_ids=tokens, block_hasher=self._hasher())
+        hashes = req.prompt_hashes
+        self.assertEqual(len(hashes), 2)
+
+        h0 = self.hash_block_tokens(tokens[:4], None, None)
+        h1 = self.hash_block_tokens(tokens[4:8], h0, None)
+        self.assertEqual(hashes[0], h0)
+        self.assertEqual(hashes[1], h1)
+
+    def test_partial_tail_not_hashed(self):
+        """9 tokens with block_size=4 → only 2 complete blocks hashed."""
+        tokens = list(range(9))
+        req = Request(request_id="real_tail", prompt_token_ids=tokens, block_hasher=self._hasher())
+        self.assertEqual(len(req.prompt_hashes), 2)
+
+    def test_hash_is_deterministic(self):
+        """Same tokens always produce the same hash."""
+        tokens = [1, 2, 3, 4]
+        req1 = Request(request_id="det1", prompt_token_ids=tokens, block_hasher=self._hasher())
+        req2 = Request(request_id="det2", prompt_token_ids=tokens, block_hasher=self._hasher())
+        self.assertEqual(req1.prompt_hashes, req2.prompt_hashes)
+
+    def test_different_tokens_different_hash(self):
+        """Different token sequences yield different hashes."""
+        req1 = Request(request_id="diff1", prompt_token_ids=[1, 2, 3, 4], block_hasher=self._hasher())
+        req2 = Request(request_id="diff2", prompt_token_ids=[5, 6, 7, 8], block_hasher=self._hasher())
+        self.assertNotEqual(req1.prompt_hashes, req2.prompt_hashes)
+
+    # ------------------------------------------------------------------
+    # Incremental (multi-access) behaviour
+    # ------------------------------------------------------------------
+
+    def test_incremental_hashing_does_not_recompute(self):
+        """
+        If existing hashes already cover N blocks, prompt_hashes only computes
+        the next block – not all blocks from scratch.
+        """
+        tokens = list(range(12))  # 3 blocks of 4
+        req = Request(request_id="incremental", prompt_token_ids=tokens, block_hasher=self._hasher())
+
+        # First access: all three blocks computed
+        h_all = req.prompt_hashes[:]  # copy
+        self.assertEqual(len(h_all), 3)
+
+        # If we artificially reset and call again, hasher sees existing 3 hashes
+        # and returns [] because start_token_idx = 3*4 = 12 = num_tokens → no new block
+        result2 = req.prompt_hashes
+        self.assertEqual(len(result2), 3)  # no duplicates
+
+    def test_new_output_tokens_trigger_additional_hashes(self):
+        """
+        After output tokens are appended, a second call to prompt_hashes
+        produces more hashes (because the combined token sequence now has
+        more complete blocks).
+        """
+        # Start with exactly 1 block of prompt tokens
+        tokens = list(range(4))
+        req = Request(request_id="out_tokens", prompt_token_ids=tokens, block_hasher=self._hasher())
+        req.output_token_ids = []
+
+        first = req.prompt_hashes[:]
+        self.assertEqual(len(first), 1)
+
+        # Append 4 output tokens → now 2 complete blocks total
+        req.output_token_ids = list(range(4, 8))
+        second = req.prompt_hashes[:]
+        self.assertEqual(len(second), 2)
+        self.assertEqual(second[0], first[0])  # first hash unchanged
+
+    # ------------------------------------------------------------------
+    # get_block_hash_extra_keys via prompt_hashes (multimodal path)
+    # ------------------------------------------------------------------
+
+    def test_prompt_hashes_no_multimodal_inputs(self):
+        """
+        With no multimodal_inputs, get_block_hash_extra_keys returns empty
+        extra_keys → hash equals plain hash_block_tokens with extra_keys=None.
+        """
+        tokens = [1, 2, 3, 4]
+        req = Request(request_id="mm_none", prompt_token_ids=tokens, block_hasher=self._hasher())
+        req.multimodal_inputs = None
+
+        hashes = req.prompt_hashes
+        expected = self.hash_block_tokens(tokens, None, None)
+        self.assertEqual(hashes[0], expected)
+
+    def test_prompt_hashes_with_multimodal_fully_within_block(self):
+        """
+        A multimodal item fully within the block contributes its hash as
+        extra_keys, changing the computed block hash.
+        """
+        tokens = [1, 2, 3, 4]
+        mm_hash = "img_hash_abc"
+        # Image fully within block [0, 4)
+        req = Request(request_id="mm_within", prompt_token_ids=tokens, block_hasher=self._hasher())
+        req.multimodal_inputs = {
+            "mm_positions": [ImagePosition(offset=1, length=2)],
+            "mm_hashes": [mm_hash],
+        }
+
+        hashes = req.prompt_hashes
+        # Expected: extra_keys = (mm_hash,)
+        expected = self.hash_block_tokens(tokens, None, (mm_hash,))
+        self.assertEqual(hashes[0], expected)
+
+    def test_prompt_hashes_multimodal_outside_block_not_included(self):
+        """
+        A multimodal item that starts after the block end must NOT be included
+        in extra_keys for that block.
+        """
+        tokens = list(range(8))  # 2 blocks: [0,4) and [4,8)
+        mm_hash = "img_hash_xyz"
+        # Image sits in the second block [4, 8)
+        req = Request(request_id="mm_outside", prompt_token_ids=tokens, block_hasher=self._hasher())
+        req.multimodal_inputs = {
+            "mm_positions": [ImagePosition(offset=4, length=2)],
+            "mm_hashes": [mm_hash],
+        }
+
+        hashes = req.prompt_hashes
+
+        # First block has no multimodal item → extra_keys = None
+        h0_expected = self.hash_block_tokens(list(range(4)), None, None)
+        self.assertEqual(hashes[0], h0_expected)
+
+        # Second block contains the image
+        h1_expected = self.hash_block_tokens(list(range(4, 8)), h0_expected, (mm_hash,))
+        self.assertEqual(hashes[1], h1_expected)
+
+    def test_prompt_hashes_multimodal_spanning_two_blocks(self):
+        """
+        A multimodal item spanning two blocks contributes its hash to each block.
+        """
+        tokens = list(range(8))
+        mm_hash = "span_hash"
+        # Image [2, 6) spans both block [0,4) and [4,8)
+        req = Request(request_id="mm_span", prompt_token_ids=tokens, block_hasher=self._hasher())
+        req.multimodal_inputs = {
+            "mm_positions": [ImagePosition(offset=2, length=4)],
+            "mm_hashes": [mm_hash],
+        }
+
+        hashes = req.prompt_hashes
+        self.assertEqual(len(hashes), 2)
+        # Both blocks include the mm hash as extra_keys
+        h0_expected = self.hash_block_tokens(list(range(4)), None, (mm_hash,))
+        self.assertEqual(hashes[0], h0_expected)
+        h1_expected = self.hash_block_tokens(list(range(4, 8)), h0_expected, (mm_hash,))
+        self.assertEqual(hashes[1], h1_expected)
+
+    # ------------------------------------------------------------------
+    # get_block_hash_extra_keys direct unit tests
+    # ------------------------------------------------------------------
+
+    def test_extra_keys_no_multimodal(self):
+        """No multimodal_inputs → empty extra keys."""
+        req = Request(request_id="ek_none")
+        req.multimodal_inputs = None
+        next_idx, keys = self.get_block_hash_extra_keys(req, 0, 4, 0)
+        self.assertEqual(keys, [])
+        self.assertEqual(next_idx, 0)
+
+    def test_extra_keys_item_fully_inside_block(self):
+        """Multimodal item fully inside [start, end) → its hash is collected."""
+        req = Request(request_id="ek_inside")
+        req.multimodal_inputs = {
+            "mm_positions": [ImagePosition(offset=1, length=2)],  # [1, 3)
+            "mm_hashes": ["hash_inside"],
+        }
+        next_idx, keys = self.get_block_hash_extra_keys(req, 0, 4, 0)
+        self.assertIn("hash_inside", keys)
+
+    def test_extra_keys_item_starts_after_block(self):
+        """Multimodal item starts after block end → not included."""
+        req = Request(request_id="ek_after")
+        req.multimodal_inputs = {
+            "mm_positions": [ImagePosition(offset=5, length=2)],  # after block [0,4)
+            "mm_hashes": ["hash_after"],
+        }
+        _, keys = self.get_block_hash_extra_keys(req, 0, 4, 0)
+        self.assertEqual(keys, [])
+
+    def test_extra_keys_item_ends_before_block(self):
+        """Multimodal item ends before block start → fast-exit, not included."""
+        req = Request(request_id="ek_before")
+        req.multimodal_inputs = {
+            "mm_positions": [ImagePosition(offset=0, length=1)],  # [0,1) ends before block [2,6)
+            "mm_hashes": ["hash_before"],
+        }
+        _, keys = self.get_block_hash_extra_keys(req, 2, 6, 0)
+        self.assertEqual(keys, [])
+
+    def test_extra_keys_item_spans_beyond_block(self):
+        """Multimodal item spanning beyond block end → included, and mm_idx points to it."""
+        req = Request(request_id="ek_span")
+        req.multimodal_inputs = {
+            "mm_positions": [ImagePosition(offset=2, length=4)],  # [2, 6) spans [0,4) end
+            "mm_hashes": ["hash_span"],
+        }
+        next_idx, keys = self.get_block_hash_extra_keys(req, 0, 4, 0)
+        self.assertIn("hash_span", keys)
+        self.assertEqual(next_idx, 0)  # mm_idx points back at the spanning item
+
+    def test_extra_keys_multiple_items_only_overlapping_included(self):
+        """Only multimodal items that overlap [start, end) are included."""
+        req = Request(request_id="ek_multi")
+        req.multimodal_inputs = {
+            "mm_positions": [
+                ImagePosition(offset=0, length=2),  # [0,2) → in block [0,4): YES
+                ImagePosition(offset=2, length=2),  # [2,4) → in block [0,4): YES
+                ImagePosition(offset=5, length=2),  # [5,7) → after block [0,4): NO
+            ],
+            "mm_hashes": ["hA", "hB", "hC"],
+        }
+        _, keys = self.get_block_hash_extra_keys(req, 0, 4, 0)
+        self.assertIn("hA", keys)
+        self.assertIn("hB", keys)
+        self.assertNotIn("hC", keys)
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/tests/engine/test_resource_manager.py b/tests/engine/test_resource_manager.py
index e3eb5942c86..5dadb9861a6 100644
--- a/tests/engine/test_resource_manager.py
+++ b/tests/engine/test_resource_manager.py
@@ -124,7 +124,7 @@ def _stub_metrics():
 def rm_factory():
     """Yield a factory that creates ResourceManagers with stubbed deps."""
     with (
-        patch("fastdeploy.engine.resource_manager.PrefixCacheManager", _StubCacheManager),
+        patch("fastdeploy.cache_manager.prefix_cache_manager.PrefixCacheManager", _StubCacheManager),
         patch("fastdeploy.engine.resource_manager.main_process_metrics", _stub_metrics()),
         patch("fastdeploy.engine.resource_manager.llm_logger", _noop_logger()),
     ):