diff --git a/src/native/ascend/ops/flash_attention/kernel.h b/src/native/ascend/ops/flash_attention/kernel.h
new file mode 100644
index 000000000..22091e017
--- /dev/null
+++ b/src/native/ascend/ops/flash_attention/kernel.h
@@ -0,0 +1,374 @@
+#ifndef INFINI_OPS_ASCEND_FLASH_ATTENTION_KERNEL_H_
+#define INFINI_OPS_ASCEND_FLASH_ATTENTION_KERNEL_H_
+
+#include <cassert>
+#include <cstddef>
+#include <vector>
+
+#include "acl/acl.h"
+#include "aclnn/aclnn_base.h"
+#include "aclnnop/aclnn_fused_infer_attention_score_v4.h"
+#include "base/flash_attention.h"
+#include "native/ascend/common.h"
+#include "native/ascend/ops/graph_cleanup_.h"
+#include "native/ascend/workspace_pool_.h"
+#include "operator.h"
+
+namespace infini::ops {
+
+namespace detail {
+
+// Convert cumulative sequence offsets into per-sequence lengths.
+// cu_seqlens = [0, s1, s1+s2, ...] -> per_seq_lens = [s1, s2, ...].
+// Used by paged decode (actualSeqLengthsKv = per-sequence KV lengths).
+//
+// A kCpu tensor is read in place; any other device is copied to the host on
+// `stream`, which is synchronized before the values are read.  Fewer than two
+// entries yields an empty array (the unsigned `n - 1` must never wrap).
+// The caller owns the result and releases it with aclDestroyIntArray.
+inline aclIntArray* extractSeqLengths(const Tensor& cu_seqlens,
+                                      aclrtStream stream) {
+  const size_t n = static_cast<size_t>(cu_seqlens.numel());
+  const size_t count = (n > 0) ? n - 1 : 0;
+
+  const int64_t* cu_host_ptr = nullptr;
+  std::vector<int64_t> cu_host_buf;
+  if (cu_seqlens.device().type() == Device::Type::kCpu) {
+    cu_host_ptr = static_cast<const int64_t*>(cu_seqlens.data());
+  } else if (n > 0) {
+    cu_host_buf.resize(n);
+    aclrtMemcpyAsync(cu_host_buf.data(), n * sizeof(int64_t), cu_seqlens.data(),
+                     n * sizeof(int64_t), ACL_MEMCPY_DEVICE_TO_HOST, stream);
+    aclrtSynchronizeStream(stream);
+    cu_host_ptr = cu_host_buf.data();
+  }
+
+  std::vector<int64_t> lengths(count);
+  for (size_t i = 0; i < count; ++i) {
+    lengths[i] = cu_host_ptr[i + 1] - cu_host_ptr[i];
+  }
+  return aclCreateIntArray(lengths.data(), static_cast<int64_t>(count));
+}
+
+// Extract cumulative end positions from cu_seqlens to a host aclIntArray.
+// cu_seqlens = [0, s1, s1+s2, ...] -> cum_lens = [s1, s1+s2, ...].
+// FIA V4 TND varlen uses cumulative end positions, matching the vllm-ascend
+// convention for npu_fused_infer_attention_score actual_seq_lengths.
+// A kCpu tensor is read in place; any other device is copied to the host on
+// `stream`, which is then synchronized.  An empty tensor yields an empty
+// array rather than a wrapped `n - 1` count.  The caller owns the result.
+inline aclIntArray* cumSeqLengths(const Tensor& cu_seqlens,
+                                  aclrtStream stream) {
+  const size_t n = static_cast<size_t>(cu_seqlens.numel());
+  const int64_t no_data = 0;  // valid address to pass along with size 0
+  if (n == 0) return aclCreateIntArray(&no_data, 0);
+  const int64_t* cu_host_ptr = nullptr;
+  std::vector<int64_t> cu_host_buf;
+  if (cu_seqlens.device().type() == Device::Type::kCpu) {
+    cu_host_ptr = static_cast<const int64_t*>(cu_seqlens.data());
+  } else {
+    cu_host_buf.resize(n);
+    aclrtMemcpyAsync(cu_host_buf.data(), n * sizeof(int64_t), cu_seqlens.data(),
+                     n * sizeof(int64_t), ACL_MEMCPY_DEVICE_TO_HOST, stream);
+    aclrtSynchronizeStream(stream);
+    cu_host_ptr = cu_host_buf.data();
+  }
+  // Skip the leading 0; return [s1, s1+s2, ...].
+  return aclCreateIntArray(cu_host_ptr + 1, static_cast<int64_t>(n - 1));
+}
+
+// Allocate a 2048x2048 lower-triangular UINT8 causal mask on device
+// (required for `sparseMode` >= 2).  The buffer is handed back through
+// `*mask_buf`; a failed allocation returns nullptr and nulls `*mask_buf`.
+inline aclTensor* makeCausalMask(void** mask_buf, aclrtStream stream) {
+  constexpr int64_t kMaskDim = 2048;
+  constexpr size_t kMaskBytes = kMaskDim * kMaskDim;  // one uint8_t per cell
+
+  if (aclrtMalloc(mask_buf, kMaskBytes, ACL_MEM_MALLOC_NORMAL_ONLY) !=
+      ACL_SUCCESS) {
+    *mask_buf = nullptr;  // keep the caller's "free if non-null" check safe
+    return nullptr;
+  }
+  std::vector<uint8_t> host_mask(kMaskBytes, 0);
+  for (int64_t r = 0; r < kMaskDim; ++r) {
+    // 1 = masked out (upper triangle); 0 = attend (lower triangle).
+    for (int64_t c = r + 1; c < kMaskDim; ++c) host_mask[r * kMaskDim + c] = 1;
+  }
+  aclrtMemcpyAsync(*mask_buf, kMaskBytes, host_mask.data(), kMaskBytes,
+                   ACL_MEMCPY_HOST_TO_DEVICE, stream);
+  aclrtSynchronizeStream(stream);
+  std::vector<int64_t> mask_shape = {kMaskDim, kMaskDim};
+  std::vector<int64_t> mask_strides = {kMaskDim, 1};
+  std::vector<int64_t> mask_storage = {static_cast<int64_t>(kMaskBytes)};
+  return aclCreateTensor(mask_shape.data(), 2, ACL_UINT8, mask_strides.data(),
+                         0, ACL_FORMAT_ND, mask_storage.data(), 1, *mask_buf);
+}
+
+}  // namespace detail
+
+template <>
+class Operator<FlashAttention, Device::Type::kAscend> : public FlashAttention {
+ public:
+  // Caches everything that stays fixed between calls: the Q/output
+  // descriptors plus either the causal mask (prefill) or the block-table
+  // descriptor and flattened KV-cache geometry (paged decode).
+  Operator(const Tensor query, const Tensor key, const Tensor value,
+           std::optional<Tensor> cu_seqlens_q,
+           std::optional<Tensor> cu_seqlens_kv,
+           std::optional<Tensor> block_table, int64_t num_heads,
+           int64_t num_kv_heads, int64_t head_size, double scale, bool causal,
+           int64_t window_left, int64_t window_right, int64_t block_size,
+           Tensor output)
+      : FlashAttention(query, key, value, cu_seqlens_q, cu_seqlens_kv,
+                       block_table, num_heads, num_kv_heads, head_size, scale,
+                       causal, window_left, window_right, block_size, output) {
+    paged_ = block_table.has_value() && block_size > 0;
+    aclDataType acl_dt = ascend::ToAclDtype(query.dtype());
+
+    if (!paged_) {
+      // Prefill: cache Q and output (TND layout).
+      prefill_q_cache_ = ascend::AclTensorCache(query);
+      prefill_out_cache_ = ascend::AclTensorCache(output);
+
+      // operator() picks sparse_mode 3 or 4 whenever `causal` is set and
+      // passes this mask in both cases, so build it once up front.
+      if (causal) {
+        causal_mask_ = detail::makeCausalMask(&causal_mask_buf_, nullptr);
+      }
+    } else {
+      // Decode: cache Q/output (BNSD), block_table.
+      const int64_t B = query.size(0);
+      const int64_t N = query.size(1);
+      const int64_t D = query.size(2);
+
+      decode_q_cache_ = ascend::AclTensorCache({B, N, 1, D}, acl_dt,
+                                               const_cast<void*>(query.data()));
+      decode_out_cache_ =
+          ascend::AclTensorCache({B, N, 1, D}, acl_dt, output.data());
+      block_table_cache_ = ascend::AclTensorCache(block_table.value());
+
+      // Pre-compute KV reshape metadata.
+      const int64_t nb = key.size(0);
+      const int64_t bsz = key.size(1);
+      const int64_t NkvD = key.size(2) * key.size(3);
+      kv_shape_ = {nb, bsz, NkvD};
+      kv_strides_ = {bsz * NkvD, NkvD, 1};
+      kv_storage_shape_ = {nb * bsz * NkvD};
+      kv_acl_dt_ = acl_dt;
+    }
+  }
+
+  ~Operator() {
+    // Only release ACL objects while the ACL runtime is still alive.
+    if (!ascend::IsAclRuntimeAlive()) return;
+    if (causal_mask_) aclDestroyTensor(causal_mask_);
+    if (causal_mask_buf_) aclrtFree(causal_mask_buf_);
+  }
+
+  void operator()(const Tensor query, const Tensor key, const Tensor value,
+                  std::optional<Tensor> cu_seqlens_q,
+                  std::optional<Tensor> cu_seqlens_kv,
+                  std::optional<Tensor> block_table, int64_t num_heads,
+                  int64_t num_kv_heads, int64_t head_size, double scale,
+                  bool causal, int64_t window_left, int64_t window_right,
+                  int64_t block_size, Tensor output) const override {
+    auto stream = static_cast<aclrtStream>(stream_);
+    // INT32_MAX, passed wherever no pre/next token bound applies.
+    constexpr int64_t kNoLimit = 2147483647;
+
+    // The base class stored the window pair in `window_left_` /
+    // `window_right_` at construction; prefer those over the call-site args.
+    const int64_t wl = window_left_;
+    const int64_t wr = window_right_;
+    (void)window_left;
+    (void)window_right;
+
+    int64_t sparse_mode = 0;
+    int64_t pre_tokens = kNoLimit;
+    int64_t next_tokens = kNoLimit;
+    if (causal) {
+      // Causal: no look-ahead; a left window (mode 4) also bounds look-back.
+      sparse_mode = (wl >= 0) ? 4 : 3;
+      if (wl >= 0) pre_tokens = wl;
+      next_tokens = 0;
+    } else {
+      if (wl >= 0) pre_tokens = wl;
+      if (wr >= 0) next_tokens = wr;
+    }
+
+    if (!paged_) {
+      // --- Prefill ---
+      // With no cu_seqlens the input is a single sequence whose cumulative
+      // end is the token count: Q's for actualSeqLengths, K's for the KV one.
+      int64_t total_q = query.size(0);
+      int64_t total_kv = key.size(0);
+
+      // cumSeqLengths / extractSeqLengths automatically skip D2H when
+      // cu_seqlens is a CPU tensor (see detail:: helpers above).
+      aclIntArray* seq_q =
+          cu_seqlens_q.has_value()
+              ? detail::cumSeqLengths(cu_seqlens_q.value(), stream)
+              : aclCreateIntArray(&total_q, 1);
+      aclIntArray* seq_kv =
+          cu_seqlens_kv.has_value()
+              ? detail::cumSeqLengths(cu_seqlens_kv.value(), stream)
+              : aclCreateIntArray(&total_kv, 1);
+
+      aclTensor* t_q = prefill_q_cache_.get(const_cast<void*>(query.data()));
+      // K/V descriptors go into TensorList which takes ownership — must be
+      // per-call (cannot cache).
+      aclTensor* t_k = ascend::BuildAclTensor(key);
+      aclTensor* t_v = ascend::BuildAclTensor(value);
+      aclTensor* t_out = prefill_out_cache_.get(output.data());
+
+      const aclTensor* k_arr[] = {t_k};
+      const aclTensor* v_arr[] = {t_v};
+      aclTensorList* key_list = aclCreateTensorList(k_arr, 1);
+      aclTensorList* val_list = aclCreateTensorList(v_arr, 1);
+
+      uint64_t ws_needed = 0;
+      aclOpExecutor* executor = nullptr;
+      aclError gws = aclnnFusedInferAttentionScoreV4GetWorkspaceSize(
+          t_q, key_list, val_list,
+          nullptr,       // pseShift
+          causal_mask_,  // attenMask (pre-computed, or nullptr)
+          seq_q,         // actualSeqLengths
+          seq_kv,        // actualSeqLengthsKv
+          nullptr, nullptr, nullptr, nullptr,
+          nullptr,           // deqScale1..quantOffset2
+          nullptr, nullptr,  // antiquantScale, antiquantOffset
+          nullptr,           // blockTable
+          nullptr, nullptr,  // queryPaddingSize, kvPaddingSize
+          nullptr, nullptr, nullptr,
+          nullptr,  // key/value antiquant scale/offset
+          nullptr, nullptr,
+          nullptr,  // keySharedPrefix, valueSharedPrefix, actualSharedPrefixLen
+          nullptr, nullptr,
+          nullptr,           // queryRope, keyRope, keyRopeAntiquantScale
+          nullptr, nullptr,  // dequantScaleQuery, learnableSink
+          num_heads, scale, pre_tokens, next_tokens, const_cast<char*>("TND"),
+          num_kv_heads, sparse_mode,
+          0,         // innerPrecise
+          0,         // blockSize (unused for prefill)
+          0, false,  // antiquantMode, softmaxLseFlag
+          0, 0, 0,   // keyAntiquantMode, valueAntiquantMode, queryQuantMode
+          t_out, nullptr, &ws_needed, &executor);
+      assert(
+          gws == ACL_SUCCESS &&
+          "aclnnFusedInferAttentionScoreV4GetWorkspaceSize failed (prefill)");
+      (void)gws;  // read only by the assert; avoids an NDEBUG unused warning
+
+      auto& arena = ascend::GetWorkspacePool().Ensure(stream, ws_needed);
+      aclError ret = aclnnFusedInferAttentionScoreV4(arena.buf, ws_needed,
+                                                     executor, stream);
+      assert(ret == ACL_SUCCESS &&
+             "aclnnFusedInferAttentionScoreV4 failed (prefill)");
+      (void)ret;  // read only by the assert
+
+      // t_q and t_out are owned by caches — do NOT destroy.
+      // t_k and t_v are owned by TensorLists.
+      ascend::DeferOrRunAclCleanup([key_list, val_list, seq_q, seq_kv]() {
+        aclDestroyTensorList(key_list);
+        aclDestroyTensorList(val_list);
+        aclDestroyIntArray(seq_q);
+        aclDestroyIntArray(seq_kv);
+      });
+      return;
+    }
+
+    // --- Paged decode ---
+    assert(cu_seqlens_kv.has_value() &&
+           "`FlashAttention` paged decode requires `cu_seqlens_kv`");
+
+    aclTensor* t_query = decode_q_cache_.get(const_cast<void*>(query.data()));
+    aclTensor* t_output = decode_out_cache_.get(output.data());
+
+    // K/V descriptors go into TensorList which takes ownership — must be
+    // per-call.  Use pre-computed metadata to avoid heap allocs.
+    aclTensor* t_key = aclCreateTensor(
+        kv_shape_.data(), static_cast<int64_t>(kv_shape_.size()), kv_acl_dt_,
+        kv_strides_.data(), 0, ACL_FORMAT_ND, kv_storage_shape_.data(),
+        static_cast<int64_t>(kv_storage_shape_.size()),
+        const_cast<void*>(key.data()));
+    aclTensor* t_value = aclCreateTensor(
+        kv_shape_.data(), static_cast<int64_t>(kv_shape_.size()), kv_acl_dt_,
+        kv_strides_.data(), 0, ACL_FORMAT_ND, kv_storage_shape_.data(),
+        static_cast<int64_t>(kv_storage_shape_.size()),
+        const_cast<void*>(value.data()));
+
+    // extractSeqLengths skips D2H when cu_seqlens_kv is a CPU tensor.
+    aclIntArray* seq_kv =
+        detail::extractSeqLengths(cu_seqlens_kv.value(), stream);
+    aclTensor* t_block_table =
+        block_table_cache_.get(const_cast<void*>(block_table.value().data()));
+
+    const aclTensor* k_arr[] = {t_key};
+    const aclTensor* v_arr[] = {t_value};
+    aclTensorList* key_list = aclCreateTensorList(k_arr, 1);
+    aclTensorList* val_list = aclCreateTensorList(v_arr, 1);
+
+    uint64_t ws_needed = 0;
+    aclOpExecutor* executor = nullptr;
+    aclError gws = aclnnFusedInferAttentionScoreV4GetWorkspaceSize(
+        t_query, key_list, val_list,
+        nullptr,  // pseShift
+        nullptr,  // attenMask (sparseMode ignored for Q_S=1)
+        nullptr,  // actualSeqLengths (ignored for Q_S=1)
+        seq_kv,   // actualSeqLengthsKv (mandatory for paged)
+        nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
+        t_block_table,  // blockTable
+        nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
+        nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, num_heads, scale,
+        kNoLimit, kNoLimit,  // preTokens, nextTokens (no bound)
+        const_cast<char*>("BNSD"), num_kv_heads,
+        0,           // sparseMode=0 (ignored for Q_S=1)
+        0,           // innerPrecise
+        block_size,  // blockSize
+        0, false,    // antiquantMode, softmaxLseFlag
+        0, 0, 0,     // keyAntiquantMode, valueAntiquantMode, queryQuantMode
+        t_output, nullptr, &ws_needed, &executor);
+    assert(gws == ACL_SUCCESS &&
+           "aclnnFusedInferAttentionScoreV4GetWorkspaceSize failed (decode)");
+    (void)gws;  // read only by the assert
+
+    auto& arena = ascend::GetWorkspacePool().Ensure(stream, ws_needed);
+    aclError ret =
+        aclnnFusedInferAttentionScoreV4(arena.buf, ws_needed, executor, stream);
+    assert(ret == ACL_SUCCESS &&
+           "aclnnFusedInferAttentionScoreV4 failed (decode)");
+    (void)ret;  // read only by the assert
+
+    // t_query, t_output, t_block_table owned by caches — do NOT destroy.
+    // t_key, t_value owned by TensorLists.
+    ascend::DeferOrRunAclCleanup([key_list, val_list, seq_kv]() {
+      aclDestroyTensorList(key_list);
+      aclDestroyTensorList(val_list);
+      aclDestroyIntArray(seq_kv);
+    });
+  }
+
+ private:
+  // True when a block table and a positive block size were supplied.
+  bool paged_ = false;
+
+  // Descriptor caches; `mutable` so the const operator() can call get().
+  mutable ascend::AclTensorCache prefill_q_cache_;
+  mutable ascend::AclTensorCache prefill_out_cache_;
+  mutable ascend::AclTensorCache decode_q_cache_;
+  mutable ascend::AclTensorCache decode_out_cache_;
+  mutable ascend::AclTensorCache block_table_cache_;
+
+  // Causal mask descriptor and its device buffer, freed in the destructor.
+  aclTensor* causal_mask_ = nullptr;
+  void* causal_mask_buf_ = nullptr;
+
+  // Decode KV view: key dims 0 and 1 kept, dims 2 and 3 merged into one.
+  std::vector<int64_t> kv_shape_;
+  std::vector<int64_t> kv_strides_;
+  std::vector<int64_t> kv_storage_shape_;
+  aclDataType kv_acl_dt_ = ACL_DT_UNDEFINED;
+};
+
+}  // namespace infini::ops
+
+#endif
diff --git a/src/native/ascend/ops/graph_cleanup_.h b/src/native/ascend/ops/graph_cleanup_.h
new file mode 100644
index 000000000..8dfb61319
--- /dev/null
+++ b/src/native/ascend/ops/graph_cleanup_.h
@@ -0,0 +1,60 @@
+#ifndef INFINI_OPS_ASCEND_GRAPH_CLEANUP__H_
+#define INFINI_OPS_ASCEND_GRAPH_CLEANUP__H_
+
+#include <functional>
+#include <utility>
+#include <vector>
+
+namespace infini::ops::ascend {
+
+class DeferredAclCleanupScope;
+
+namespace detail {
+
+inline thread_local DeferredAclCleanupScope* active_acl_cleanup_scope = nullptr;
+
+}  // namespace detail
+
+// Per-thread scope that collects ACL cleanup callbacks instead of running
+// them at once.  Construction makes it the active scope; destruction restores
+// the previously active one and then runs the callbacks still held, in order.
+class DeferredAclCleanupScope {
+ public:
+  DeferredAclCleanupScope() : previous_(detail::active_acl_cleanup_scope) {
+    detail::active_acl_cleanup_scope = this;
+  }
+  DeferredAclCleanupScope(const DeferredAclCleanupScope&) = delete;
+  DeferredAclCleanupScope& operator=(const DeferredAclCleanupScope&) = delete;
+
+  ~DeferredAclCleanupScope() {
+    // Restore first, so the callbacks below run with the outer scope active.
+    detail::active_acl_cleanup_scope = previous_;
+    for (const auto& run : callbacks_) run();
+  }
+
+  // Queues `cleanup` until this scope is destroyed or released.
+  void Defer(std::function<void()> cleanup) {
+    callbacks_.push_back(std::move(cleanup));
+  }
+
+  // Moves the pending callbacks out to the caller.
+  std::vector<std::function<void()>> Release() { return std::move(callbacks_); }
+
+ private:
+  DeferredAclCleanupScope* previous_;  // scope that was active before this one
+  std::vector<std::function<void()>> callbacks_;  // pending, in Defer() order
+};
+
+// Hands `cleanup` to this thread's active DeferredAclCleanupScope, or runs it
+// right away when no scope is active.
+inline void DeferOrRunAclCleanup(std::function<void()> cleanup) {
+  DeferredAclCleanupScope* const scope = detail::active_acl_cleanup_scope;
+
+  // No scope: nothing to postpone, so execute now.
+  if (scope == nullptr) return cleanup();
+  scope->Defer(std::move(cleanup));
+}
+
+}  // namespace infini::ops::ascend
+
+#endif
diff --git a/tests/test_flash_attention.py b/tests/test_flash_attention.py
new file mode 100644
index 000000000..577ababb6
--- /dev/null
+++ b/tests/test_flash_attention.py
@@ -0,0 +1,520 @@
+import infini.ops
+import pytest
+import torch
+
+from tests.utils import Payload, get_stream, randn_strided
+
+
+@pytest.mark.auto_act_and_assert
+@pytest.mark.parametrize(
+    "num_heads, num_kv_heads, head_size",
+    (
+        (32, 32, 128),  # MHA
+        (32, 8, 128),  # GQA (4x)
+        (16, 4, 64),  # GQA (4x), smaller
+    ),
+)
+@pytest.mark.parametrize(
+    ("dtype", "rtol", "atol"),
+    (
+        (torch.float16, 1e-3, 1e-3),
+        (torch.bfloat16, 1e-2, 5e-3),
+    ),
+)
+@pytest.mark.parametrize("device", ("npu",))
+def test_flash_attention_prefill_single(
+    num_heads,
+    num_kv_heads,
+    head_size,
+    dtype,
+    rtol,
+    atol,
+    device,
+):
+    """Causal prefill of one 16-token sequence (no cu_seqlens, no block table).
+
+    Builds the Payload: operator call, SDPA reference, inputs, tolerances.
+    """
+    npu_ready = hasattr(torch, "npu") and torch.npu.is_available()
+    if device == "npu" and not npu_ready:
+        pytest.skip("NPU not available")
+
+    num_tokens = 16
+    scale = 1.0 / head_size**0.5
+    q_shape = (num_tokens, num_heads, head_size)
+    kv_shape = (num_tokens, num_kv_heads, head_size)
+
+    query = randn_strided(q_shape, None, dtype=dtype, device=device)
+    key = randn_strided(kv_shape, None, dtype=dtype, device=device)
+    value = randn_strided(kv_shape, None, dtype=dtype, device=device)
+    output = torch.empty(q_shape, dtype=dtype, device=device)
+
+    return Payload(
+        lambda q, k, v, o: _flash_attention(
+            q,
+            k,
+            v,
+            None,  # cu_seqlens_q
+            None,  # cu_seqlens_kv
+            None,  # block_table
+            num_heads,
+            num_kv_heads,
+            head_size,
+            scale,
+            True,  # causal
+            -1,  # window_left
+            0,  # window_right
+            0,  # block_size
+            o,
+        ),
+        lambda q, k, v, o: _ref_flash_attention(
+            q,
+            k,
+            v,
+            num_heads,
+            num_kv_heads,
+            head_size,
+            scale,
+            causal=True,
+        ),
+        (query, key, value, output),
+        {},
+        rtol=rtol,
+        atol=atol,
+    )
+
+
+@pytest.mark.auto_act_and_assert
+@pytest.mark.parametrize(
+    "num_heads, num_kv_heads, head_size",
+    ((32, 8, 128),),
+)
+@pytest.mark.parametrize(
+    ("dtype", "rtol", "atol"),
+    (
+        (torch.float16, 1e-3, 1e-3),
+        (torch.bfloat16, 1e-2, 5e-3),
+    ),
+)
+@pytest.mark.parametrize("device", ("npu",))
+def test_flash_attention_prefill_multi(
+    num_heads,
+    num_kv_heads,
+    head_size,
+    dtype,
+    rtol,
+    atol,
+    device,
+):
+    """Causal prefill of three packed sequences delimited by cu_seqlens.
+
+    Q and KV use identical boundaries, so the KV offsets are a clone of Q's.
+    """
+    npu_ready = hasattr(torch, "npu") and torch.npu.is_available()
+    if device == "npu" and not npu_ready:
+        pytest.skip("NPU not available")
+
+    seq_lens = [8, 12, 4]
+    num_tokens = sum(seq_lens)
+    scale = 1.0 / head_size**0.5
+    q_shape = (num_tokens, num_heads, head_size)
+    kv_shape = (num_tokens, num_kv_heads, head_size)
+
+    query = randn_strided(q_shape, None, dtype=dtype, device=device)
+    key = randn_strided(kv_shape, None, dtype=dtype, device=device)
+    value = randn_strided(kv_shape, None, dtype=dtype, device=device)
+    output = torch.empty(q_shape, dtype=dtype, device=device)
+
+    # Running totals with a leading zero: [0, 8, 20, 24].
+    boundaries = [0]
+    for length in seq_lens:
+        boundaries.append(boundaries[-1] + length)
+    cu_seqlens_q = torch.tensor(boundaries, dtype=torch.int64, device=device)
+    cu_seqlens_kv = cu_seqlens_q.clone()
+
+    return Payload(
+        lambda q, k, v, o: _flash_attention(
+            q,
+            k,
+            v,
+            cu_seqlens_q,
+            cu_seqlens_kv,
+            None,  # block_table
+            num_heads,
+            num_kv_heads,
+            head_size,
+            scale,
+            True,  # causal
+            -1,  # window_left
+            0,  # window_right
+            0,  # block_size
+            o,
+        ),
+        lambda q, k, v, o: _ref_flash_attention_multi(
+            q,
+            k,
+            v,
+            seq_lens,  # per-sequence Q lengths
+            seq_lens,  # per-sequence KV lengths
+            num_heads,
+            num_kv_heads,
+            head_size,
+            scale,
+            causal=True,
+        ),
+        (query, key, value, output),
+        {},
+        rtol=rtol,
+        atol=atol,
+    )
+
+
+@pytest.mark.auto_act_and_assert
+@pytest.mark.parametrize(
+    "num_heads, num_kv_heads, head_size, block_size",
+    (
+        (32, 8, 128, 128),
+        (16, 4, 64, 128),
+    ),
+)
+@pytest.mark.parametrize(
+    ("dtype", "rtol", "atol"),
+    (
+        (torch.float16, 1e-3, 1e-3),
+        (torch.bfloat16, 1e-2, 5e-3),
+    ),
+)
+@pytest.mark.parametrize("device", ("npu",))
+def test_flash_attention_decode(
+    num_heads,
+    num_kv_heads,
+    head_size,
+    block_size,
+    dtype,
+    rtol,
+    atol,
+    device,
+):
+    """Decode: one query token per request, KV read from a paged cache.
+
+    The block table and both cu_seqlens tensors are placed on the device, so
+    the operator has to copy cu_seqlens_kv to the host itself.
+    """
+    npu_ready = hasattr(torch, "npu") and torch.npu.is_available()
+    if device == "npu" and not npu_ready:
+        pytest.skip("NPU not available")
+
+    num_reqs = 3
+    kv_len = 16  # Total KV length per request.
+    num_blocks_per_req = -(-kv_len // block_size)  # ceil division
+    num_blocks = num_reqs * num_blocks_per_req
+    scale = 1.0 / head_size**0.5
+
+    q_shape = (num_reqs, num_heads, head_size)
+    query = randn_strided(q_shape, None, dtype=dtype, device=device)
+
+    # Paged KV cache: vLLM standard layout [num_blocks, block_size, KV_N, D].
+    cache_shape = (num_blocks, block_size, num_kv_heads, head_size)
+    kv_cache = randn_strided(cache_shape, None, dtype=dtype, device=device)
+    output = torch.empty(q_shape, dtype=dtype, device=device)
+
+    # Block table: request i uses blocks [i*num_blocks_per_req, ...], i.e. the
+    # block ids 0..num_blocks-1 laid out row by row.
+    block_ids = torch.arange(num_blocks, dtype=torch.int32)
+    block_table = block_ids.reshape(num_reqs, num_blocks_per_req).to(device)
+
+    # One query token per request.
+    cu_seqlens_q = torch.arange(0, num_reqs + 1, dtype=torch.int64, device=device)
+
+    # Every request holds kv_len cached tokens.
+    kv_boundaries = [i * kv_len for i in range(num_reqs + 1)]
+    cu_seqlens_kv = torch.tensor(kv_boundaries, dtype=torch.int64, device=device)
+
+    return Payload(
+        lambda q, k, v, o: _flash_attention(
+            q,
+            k,
+            v,
+            cu_seqlens_q,
+            cu_seqlens_kv,
+            block_table,
+            num_heads,
+            num_kv_heads,
+            head_size,
+            scale,
+            True,  # causal
+            -1,  # window_left
+            0,  # window_right
+            block_size,
+            o,
+        ),
+        lambda q, k, v, o: _ref_flash_attention_paged(
+            q,
+            k,  # the one cache tensor serves as both K and V
+            block_table,
+            cu_seqlens_q,
+            cu_seqlens_kv,
+            num_heads,
+            num_kv_heads,
+            head_size,
+            block_size,
+            scale,
+            causal=True,
+        ),
+        (query, kv_cache, kv_cache, output),
+        {},
+        rtol=rtol,
+        atol=atol,
+    )
+
+
+@pytest.mark.auto_act_and_assert
+@pytest.mark.parametrize(
+    "num_heads, num_kv_heads, head_size, block_size",
+    ((32, 8, 128, 128),),
+)
+@pytest.mark.parametrize(
+    ("dtype", "rtol", "atol"),
+    ((torch.float16, 1e-3, 1e-3),),
+)
+@pytest.mark.parametrize("device", ("npu",))
+def test_flash_attention_decode_cpu_cuseqlens(
+    num_heads,
+    num_kv_heads,
+    head_size,
+    block_size,
+    dtype,
+    rtol,
+    atol,
+    device,
+):
+    """Decode with CPU cu_seqlens_kv — exercises the D2H-free code path.
+
+    Query, KV cache, block table and cu_seqlens_q are created on the device;
+    only cu_seqlens_kv stays in host memory.
+    """
+    npu_ready = hasattr(torch, "npu") and torch.npu.is_available()
+    if device == "npu" and not npu_ready:
+        pytest.skip("NPU not available")
+
+    num_reqs = 3
+    kv_len = 16
+    num_blocks_per_req = -(-kv_len // block_size)  # ceil division
+    num_blocks = num_reqs * num_blocks_per_req
+    scale = 1.0 / head_size**0.5
+
+    q_shape = (num_reqs, num_heads, head_size)
+    query = randn_strided(q_shape, None, dtype=dtype, device=device)
+
+    # Paged cache shaped [num_blocks, block_size, num_kv_heads, head_size].
+    cache_shape = (num_blocks, block_size, num_kv_heads, head_size)
+    kv_cache = randn_strided(cache_shape, None, dtype=dtype, device=device)
+    output = torch.empty(q_shape, dtype=dtype, device=device)
+
+    # Request i owns the consecutive block ids starting at
+    # i * num_blocks_per_req.
+    block_ids = torch.arange(num_blocks, dtype=torch.int32)
+    block_table = block_ids.reshape(num_reqs, num_blocks_per_req).to(device)
+
+    # One query token per request.
+    cu_seqlens_q = torch.arange(0, num_reqs + 1, dtype=torch.int64, device=device)
+
+    # CPU cu_seqlens_kv — exercises `detail::extractSeqLengths` host path
+    # (direct pointer read, no D2H copy).  Omitting `device=` keeps the
+    # tensor in host memory.
+    kv_boundaries = [i * kv_len for i in range(num_reqs + 1)]
+    cu_seqlens_kv = torch.tensor(kv_boundaries, dtype=torch.int64)
+
+    return Payload(
+        lambda q, k, v, o: _flash_attention(
+            q,
+            k,
+            v,
+            cu_seqlens_q,
+            cu_seqlens_kv,
+            block_table,
+            num_heads,
+            num_kv_heads,
+            head_size,
+            scale,
+            True,  # causal
+            -1,  # window_left
+            0,  # window_right
+            block_size,
+            o,
+        ),
+        lambda q, k, v, o: _ref_flash_attention_paged(
+            q,
+            k,  # the one cache tensor serves as both K and V
+            block_table,
+            cu_seqlens_q,
+            cu_seqlens_kv,
+            num_heads,
+            num_kv_heads,
+            head_size,
+            block_size,
+            scale,
+            causal=True,
+        ),
+        (query, kv_cache, kv_cache, output),
+        {},
+        rtol=rtol,
+        atol=atol,
+    )
+
+
+def _flash_attention(
+    query,
+    key,
+    value,
+    cu_seqlens_q,
+    cu_seqlens_kv,
+    block_table,
+    num_heads,
+    num_kv_heads,
+    head_size,
+    scale,
+    causal,
+    window_left,
+    window_right,
+    block_size,
+    output,
+):
+    """Call the operator on the stream of ``query``'s device; return ``output``."""
+    positional = (
+        query,
+        key,
+        value,
+        cu_seqlens_q,
+        cu_seqlens_kv,
+        block_table,
+        num_heads,
+        num_kv_heads,
+        head_size,
+        scale,
+        causal,
+        window_left,
+        window_right,
+        block_size,
+        output,
+    )
+    infini.ops.flash_attention(*positional, stream=get_stream(query.device))
+    return output
+
+
+def _ref_flash_attention(
+    query, key, value, num_heads, num_kv_heads, head_size, scale, causal=True
+):
+    """Float32 SDPA reference for one sequence in [T, N, D] layout.
+
+    K/V heads are repeated to match ``num_heads`` (GQA) before the call; the
+    result is returned as [T, N, D] in ``query``'s dtype.
+    """
+
+    def to_heads_first(t):
+        # [T, N, D] -> [N, T, D], in float32.
+        return t.transpose(0, 1).float()
+
+    q, k, v = (to_heads_first(t) for t in (query, key, value))
+
+    if num_kv_heads < num_heads:
+        group = num_heads // num_kv_heads
+        k, v = (t.repeat_interleave(group, dim=0) for t in (k, v))
+
+    # Add the batch dim SDPA expects: [N, T, D] -> [1, N, T, D].
+    out = torch.nn.functional.scaled_dot_product_attention(
+        q.unsqueeze(0), k.unsqueeze(0), v.unsqueeze(0), scale=scale, is_causal=causal
+    )
+
+    # Drop the batch dim, then restore [T, N, D] and the input dtype.
+    return out.squeeze(0).transpose(0, 1).to(query.dtype)
+
+
+def _ref_flash_attention_multi(
+    query,
+    key,
+    value,
+    seq_lens_q,
+    seq_lens_kv,
+    num_heads,
+    num_kv_heads,
+    head_size,
+    scale,
+    causal=True,
+):
+    """SDPA reference for packed sequences; Q and KV keep separate offsets."""
+    outputs = []
+    q_start = kv_start = 0
+    for sq, sk in zip(seq_lens_q, seq_lens_kv):
+        q = query[q_start : q_start + sq]
+        # K/V rows are bounded by the KV length, not by the Q length.
+        k = key[kv_start : kv_start + sk]
+        v = value[kv_start : kv_start + sk]
+        out = _ref_flash_attention(
+            q, k, v, num_heads, num_kv_heads, head_size, scale, causal
+        )
+        outputs.append(out)
+        q_start, kv_start = q_start + sq, kv_start + sk
+    return torch.cat(outputs, dim=0)
+
+
+def _ref_flash_attention_paged(
+    query,
+    kv_cache_arg,
+    block_table,
+    cu_seqlens_q,
+    cu_seqlens_kv,
+    num_heads,
+    num_kv_heads,
+    head_size,
+    block_size,
+    scale,
+    causal=True,
+):
+    """SDPA reference for single-token decode against a paged KV cache.
+
+    ``kv_cache_arg`` ([num_blocks, block_size, KV_N, D]) is read for both keys
+    and values; ``cu_seqlens_q`` and ``causal`` are accepted but not used.
+    Everything is computed on CPU and moved back to ``query.device``.
+    """
+    kv_offsets = cu_seqlens_kv.cpu()
+    tables = block_table.cpu()
+    cache = kv_cache_arg.cpu()
+    queries = query.cpu()
+    results = []
+
+    for req in range(tables.size(0)):
+        kv_len = int(kv_offsets[req + 1] - kv_offsets[req])
+
+        # Gather this request's rows block by block until kv_len is covered.
+        # The same cache tensor backs both keys and values.
+        pages = []
+        remaining = kv_len
+        for block_id in tables[req]:
+            if remaining <= 0:
+                break
+
+            take = min(remaining, block_size)
+            # cache layout: [num_blocks, block_size, KV_N, D]
+            # Slice [take, KV_N, D], transpose to [KV_N, take, D] for cat.
+            page = cache[int(block_id.item()), :take, :, :]
+            pages.append(page.transpose(0, 1))
+            remaining -= take
+
+        kv = torch.cat(pages, dim=1).transpose(0, 1)  # [kv_len, KV_N, D]
+
+        # Decode: Q_S=1 attends to all past KV positions; causal masking is
+        # not applicable here (it would mask everything beyond position 0).
+        out = _ref_flash_attention(
+            queries[req : req + 1],  # [1, N, D] - already TND format
+            kv,
+            kv,
+            num_heads,
+            num_kv_heads,
+            head_size,
+            scale,
+            causal=False,
+        )
+        results.append(out)
+
+    return torch.cat(results, dim=0).to(query.device)