From 2b9fbc5c3dc59e86187350ba875eee819e8f2bf0 Mon Sep 17 00:00:00 2001 From: Charlene Yang <8636796+cyanguwa@users.noreply.github.com> Date: Tue, 5 May 2026 18:50:17 -0700 Subject: [PATCH 01/11] refactor nvte_get_fused_attn_backend with FE calls Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com> --- .../common/fused_attn/fused_attn.cpp | 516 ++++++------------ .../fused_attn_f16_arbitrary_seqlen.cu | 136 +++++ .../fused_attn_f16_arbitrary_seqlen.h | 22 + .../common/fused_attn/fused_attn_fp8.cu | 101 ++++ .../common/fused_attn/fused_attn_fp8.h | 25 + .../include/transformer_engine/fused_attn.h | 50 +- .../jax/csrc/extensions/attention.cpp | 32 +- .../pytorch/csrc/extensions/attention.cpp | 17 +- 8 files changed, 539 insertions(+), 360 deletions(-) diff --git a/transformer_engine/common/fused_attn/fused_attn.cpp b/transformer_engine/common/fused_attn/fused_attn.cpp index 141767b803..615f7c2a03 100644 --- a/transformer_engine/common/fused_attn/fused_attn.cpp +++ b/transformer_engine/common/fused_attn/fused_attn.cpp @@ -226,357 +226,189 @@ NVTE_QKV_Format nvte_get_kv_format(NVTE_QKV_Layout qkv_layout) { } } +namespace { + +// Per-thread storage for the message string handed back through +// NVTEFusedAttnBackendStatus::message. Re-used (cleared + re-populated) on every call to +// nvte_get_fused_attn_backend on this thread, which is exactly the lifetime documented in the +// public header. +thread_local std::string g_fused_attn_backend_status_buffer; + +// Apply (code, msg) to *out_status (if non-null), routing the message through the +// thread-local buffer so the returned `const char*` outlives this function call. +void set_status(NVTEFusedAttnBackendStatus *out_status, cudnn_frontend::error_code_t code, + const std::string &message) { + if (out_status == nullptr) return; + g_fused_attn_backend_status_buffer = message; + out_status->code = static_cast(code); + out_status->message = g_fused_attn_backend_status_buffer.c_str(); +} + +void set_status(NVTEFusedAttnBackendStatus *out_status, const cudnn_frontend::error_t &err) { + set_status(out_status, err.code, err.err_msg); +} + +void set_ok(NVTEFusedAttnBackendStatus *out_status) { + set_status(out_status, cudnn_frontend::error_code_t::OK, ""); +} + +} // namespace + // select a backend for fused attention +// +// Routing flow: +// 1. Apply TE post-filters that encode policies cuDNN-FE doesn't model directly: +// a. requires_64bit_ragged_offset -> cudnn >= 9.5 +// b. qkv_format == THD requires a padding-style mask +// c. cuDNN <= 9.15 + is_training + bshd/sbhd + max_seqlen_kv % 128 != 0 + +// cuda_graph + non-padding mask is rejected (known capture quirk) +// 2. Dispatch by dtype to the appropriate probe(s): +// - FP8 (E4M3/E5M2): is_supported_fp8_fwd (+ is_supported_fp8_bwd if training) +// - FP16/BF16: is_supported_f16_fwd (+ is_supported_f16_bwd if training) +// The probes call the same _impl that the executor uses, with workspace=nullptr. +// They run validate -> build_operation_graph -> create_execution_plans -> +// check_support -> build_plans, and populate a thread-local cache that the +// executor cache-hits on. +// 3. Return the selected backend, or NVTE_No_Backend if any probe rejects. +// +// When `out_status` is non-null, it is filled with a code + message describing the +// rejection (or {OK, ""} on success). TE post-filter rejections synthesize an +// INVALID_VALUE entry; probe rejections forward the cuDNN-FE / NVTE_CHECK error verbatim. 
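// Illustrative caller sketch of the routing contract described above. This is an
// editorial example, not part of the patch: the attention configuration values
// (head counts, sequence lengths, head dims) are made up; only the argument order of
// the new signature and the out_status handling are taken from this change.
//
//   NVTEFusedAttnBackendStatus status;
//   auto handle = cudnnExecutionPlanManager::Instance().GetHandle();
//   NVTE_Fused_Attn_Backend backend = nvte_get_fused_attn_backend(
//       /*is_training=*/true, NVTEDType::kNVTEBFloat16, NVTEDType::kNVTEBFloat16,
//       /*o_dtype=*/NVTEDType::kNVTEBFloat16, NVTE_DELAYED_TENSOR_SCALING,
//       NVTE_QKV_Layout::NVTE_BSHD_BSHD_BSHD, NVTE_Bias_Type::NVTE_NO_BIAS,
//       NVTE_Mask_Type::NVTE_CAUSAL_MASK, NVTE_Softmax_Type::NVTE_VANILLA_SOFTMAX,
//       /*dropout=*/0.0f, /*num_attn_heads=*/16, /*num_gqa_groups=*/16,
//       /*max_seqlen_q=*/2048, /*max_seqlen_kv=*/2048, /*head_dim_qk=*/128,
//       /*head_dim_v=*/128, /*window_size_left=*/-1, /*window_size_right=*/-1,
//       /*return_max_logit=*/false, /*cuda_graph=*/false, /*deterministic=*/false,
//       handle, &status);
//   if (backend == NVTE_Fused_Attn_Backend::NVTE_No_Backend) {
//     // status.message points into thread-local storage; copy it if it must outlive
//     // the next nvte_get_fused_attn_backend call on this thread.
//     std::cout << "No fused attention backend: " << status.message << std::endl;
//   }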
NVTE_Fused_Attn_Backend nvte_get_fused_attn_backend( - bool is_training, NVTEDType q_dtype, NVTEDType kv_dtype, NVTE_QKV_Layout qkv_layout, - NVTE_Bias_Type bias_type, NVTE_Mask_Type attn_mask_type, NVTE_Softmax_Type softmax_type, - float dropout, size_t num_attn_heads, size_t num_gqa_groups, size_t max_seqlen_q, - size_t max_seqlen_kv, size_t head_dim_qk, size_t head_dim_v, int64_t window_size_left, - int64_t window_size_right, bool return_max_logit, bool cuda_graph, bool deterministic) { + bool is_training, NVTEDType q_dtype, NVTEDType kv_dtype, NVTEDType o_dtype, + NVTEScalingMode scaling_mode, NVTE_QKV_Layout qkv_layout, NVTE_Bias_Type bias_type, + NVTE_Mask_Type attn_mask_type, NVTE_Softmax_Type softmax_type, float dropout, + size_t num_attn_heads, size_t num_gqa_groups, size_t max_seqlen_q, size_t max_seqlen_kv, + size_t head_dim_qk, size_t head_dim_v, int64_t window_size_left, int64_t window_size_right, + bool return_max_logit, bool cuda_graph, bool deterministic, cudnnHandle_t handle, + NVTEFusedAttnBackendStatus *out_status) { using namespace transformer_engine; - NVTE_Fused_Attn_Backend backend = NVTE_Fused_Attn_Backend::NVTE_No_Backend; - const int device_id = cuda::current_device(); - const int sm_arch_ = cuda::sm_arch(device_id); + // Initialize to OK so callers get a clean status on the success path without us having to + // remember to set it at every return. + set_ok(out_status); NVTE_CHECK(q_dtype == kv_dtype, "Q and KV must have the same data type."); - NVTE_QKV_Format qkv_format = nvte_get_qkv_format(qkv_layout); - NVTE_QKV_Format q_format = nvte_get_q_format(qkv_layout); - NVTE_QKV_Format kv_format = nvte_get_kv_format(qkv_layout); - NVTE_QKV_Layout_Group layout_group = nvte_get_qkv_layout_group(qkv_layout); - auto cudnn_runtime_version = cudnnGetVersion(); - // For ragged offsets we only support 32-bit prior to cuDNN 9.5 - // Only used when THD format is requested. + const NVTE_QKV_Format qkv_format = nvte_get_qkv_format(qkv_layout); + const NVTE_QKV_Layout_Group layout_group = nvte_get_qkv_layout_group(qkv_layout); + const auto cudnn_runtime_version = cudnnGetVersion(); + + // ---------- TE post-filters (apply before delegating to cuDNN-FE) ---------- + + // (1) Ragged-offset width: cuDNN < 9.5 only supports 32-bit offsets. 
const bool requires_64bit_ragged_offset = (qkv_format == NVTE_THD && fused_attn::get_ragged_offset_dtype( layout_group, num_attn_heads, num_gqa_groups, max_seqlen_q, max_seqlen_kv, head_dim_qk, head_dim_v) == DType::kInt64); - const bool supported_ragged_offset_size = - (!requires_64bit_ragged_offset || cudnn_runtime_version >= 90500); - - if ((q_dtype == NVTEDType::kNVTEFloat8E4M3 || q_dtype == NVTEDType::kNVTEFloat8E5M2) && - sm_arch_ >= 90 && bias_type == NVTE_Bias_Type::NVTE_NO_BIAS && - // 8.9: t3hd, max_s=512, d=64, padding - ((cudnn_runtime_version >= 8900 && sm_arch_ < 100 && - qkv_layout == NVTE_QKV_Layout::NVTE_T3HD && max_seqlen_q == max_seqlen_kv && - max_seqlen_q <= 512 && head_dim_qk == 64 && head_dim_v == 64 && - attn_mask_type == NVTE_Mask_Type::NVTE_PADDING_MASK) || - // 9.2.1: {bshd, sbhd}, any seqlen, d=128, {no_mask, causal} - (cudnn_runtime_version >= 90201 && sm_arch_ < 100 && max_seqlen_q % 128 == 0 && - max_seqlen_kv % 128 == 0 && head_dim_qk == 128 && head_dim_v == 128 && - (attn_mask_type == NVTE_Mask_Type::NVTE_CAUSAL_MASK || - attn_mask_type == NVTE_Mask_Type::NVTE_NO_MASK)) || - // 9.7: {bshd, sbhd}, any seqlen, d<=256 for sm90 and d<=128 for sm100, {padding, padding_causal} - (cudnn_runtime_version >= 90700 && - // TODO (cyang): add is_training to nvte_get_fused_attn_backend - // sm90: fwd d<=256, bwd d=128 only - // sm100: fwd d<=128, bwd d<=128 - ((sm_arch_ < 100 && (!is_training) && head_dim_qk <= 256 && head_dim_v <= 256) || - (sm_arch_ < 100 && is_training && head_dim_qk == 128 && head_dim_v == 128) || - (sm_arch_ >= 100 && head_dim_qk <= 128 && head_dim_v <= 128)) && - head_dim_qk % 16 == 0 && head_dim_v % 16 == 0 && - (attn_mask_type == NVTE_Mask_Type::NVTE_NO_MASK || - attn_mask_type == NVTE_Mask_Type::NVTE_CAUSAL_MASK || - attn_mask_type == NVTE_Mask_Type::NVTE_PADDING_MASK || - attn_mask_type == NVTE_Mask_Type::NVTE_PADDING_CAUSAL_MASK)) || - // 9.21: d_qk=192, d_v=128 - (cudnn_runtime_version >= 92100 && sm_arch_ >= 100 && head_dim_qk <= 192 && - head_dim_v <= 128 && head_dim_qk % 16 == 0 && head_dim_v % 16 == 0 && - (attn_mask_type == NVTE_Mask_Type::NVTE_NO_MASK || - attn_mask_type == NVTE_Mask_Type::NVTE_CAUSAL_MASK || - attn_mask_type == NVTE_Mask_Type::NVTE_CAUSAL_BOTTOM_RIGHT_MASK))) && - // pre-9.21: {bshd, sbhd}, {vanilla} - // 9.21+: {bshd, sbhd, bhsd}, {vanilla, off-by-one, learnable} - ((cudnn_runtime_version < 92100 && - (qkv_format == NVTE_QKV_Format::NVTE_BSHD || qkv_format == NVTE_QKV_Format::NVTE_SBHD) && - softmax_type == NVTE_Softmax_Type::NVTE_VANILLA_SOFTMAX) || - (cudnn_runtime_version >= 92100 && - (qkv_format == NVTE_QKV_Format::NVTE_BSHD || qkv_format == NVTE_QKV_Format::NVTE_SBHD || - qkv_format == NVTE_QKV_Format::NVTE_BHSD))) && - !requires_64bit_ragged_offset && - // 9.10.0: known bugs with SDPA FP8 - (cudnn_runtime_version != 91000) && !return_max_logit) { - if (cudnn_runtime_version >= 8900) { - backend = NVTE_Fused_Attn_Backend::NVTE_FP8; - } else { - backend = NVTE_Fused_Attn_Backend::NVTE_No_Backend; - std::cout << "Warning: FP8 fused attention is supported by cuDNN 8.9.0+." - " Please upgrade your cuDNN version if possible." 
- << std::endl; - } - } else if ((q_dtype == NVTEDType::kNVTEFloat16) || (q_dtype == NVTEDType::kNVTEBFloat16)) { - bool flag_m512 = false; - bool flag_arb = false; - if ((sm_arch_ == 80 || sm_arch_ == 90) && (max_seqlen_q <= 512 && max_seqlen_q % 64 == 0) && - (max_seqlen_kv <= 512 && max_seqlen_kv % 64 == 0) && (head_dim_qk == 64) && - (head_dim_v == 64) && (num_attn_heads == num_gqa_groups) && - ((bias_type == NVTE_Bias_Type::NVTE_NO_BIAS) || - (bias_type == NVTE_Bias_Type::NVTE_POST_SCALE_BIAS)) && - ((attn_mask_type == NVTE_Mask_Type::NVTE_CAUSAL_MASK) || - (attn_mask_type == NVTE_Mask_Type::NVTE_PADDING_MASK) || - (attn_mask_type == NVTE_Mask_Type::NVTE_PADDING_CAUSAL_MASK && - max_seqlen_q == max_seqlen_kv) || - (attn_mask_type == NVTE_Mask_Type::NVTE_NO_MASK)) && - ((qkv_layout == NVTE_QKV_Layout::NVTE_SB3HD) || - (qkv_layout == NVTE_QKV_Layout::NVTE_SBHD_SB2HD) || - (qkv_layout == NVTE_QKV_Layout::NVTE_BS3HD) || - (qkv_layout == NVTE_QKV_Layout::NVTE_BSHD_BS2HD) || - (qkv_layout == NVTE_QKV_Layout::NVTE_BSHD_BSHD_BSHD)) && - ((window_size_left == -1) && (window_size_right == -1 || window_size_right == 0)) && - !requires_64bit_ragged_offset && - (softmax_type == NVTE_Softmax_Type::NVTE_VANILLA_SOFTMAX) && !return_max_logit) { - flag_m512 = true; + if (requires_64bit_ragged_offset && cudnn_runtime_version < 90500) { + set_status(out_status, cudnn_frontend::error_code_t::INVALID_VALUE, + "Configuration requires 64-bit ragged offsets, which require cuDNN >= 9.5."); + return NVTE_Fused_Attn_Backend::NVTE_No_Backend; + } + + // (2) THD requires a padding-style mask. + if (qkv_format == NVTE_QKV_Format::NVTE_THD && + attn_mask_type != NVTE_Mask_Type::NVTE_PADDING_MASK && + attn_mask_type != NVTE_Mask_Type::NVTE_PADDING_CAUSAL_MASK && + attn_mask_type != NVTE_Mask_Type::NVTE_PADDING_CAUSAL_BOTTOM_RIGHT_MASK) { + set_status(out_status, cudnn_frontend::error_code_t::INVALID_VALUE, + "THD-format attention requires a padding-style mask " + "(PADDING / PADDING_CAUSAL / PADDING_CAUSAL_BOTTOM_RIGHT)."); + return NVTE_Fused_Attn_Backend::NVTE_No_Backend; + } + + // (3) cuDNN-Graph capture quirk on cuDNN <= 9.15: training + bshd/sbhd with + // max_seqlen_kv % 128 != 0 + cuda_graph + non-padding mask hangs/miscompiles. + if (cudnn_runtime_version <= 91500 && is_training && + (qkv_format == NVTE_QKV_Format::NVTE_BSHD || qkv_format == NVTE_QKV_Format::NVTE_SBHD) && + (max_seqlen_kv % 128 != 0) && cuda_graph && + attn_mask_type != NVTE_Mask_Type::NVTE_PADDING_MASK && + attn_mask_type != NVTE_Mask_Type::NVTE_PADDING_CAUSAL_MASK && + attn_mask_type != NVTE_Mask_Type::NVTE_PADDING_CAUSAL_BOTTOM_RIGHT_MASK) { + set_status(out_status, cudnn_frontend::error_code_t::INVALID_VALUE, + "Known cuDNN <= 9.15 capture quirk: training + bshd/sbhd + " + "max_seqlen_kv % 128 != 0 + cuda_graph + non-padding mask is unsupported."); + return NVTE_Fused_Attn_Backend::NVTE_No_Backend; + } + + // ---------- Dispatch by dtype ---------- + + // Probes use a single-batch graph; capability checks in cuDNN-FE are batch-agnostic. + constexpr size_t probe_batch = 1; + // bottom_right_diagonal is a runtime API knob the router doesn't see; the BRCM-via-mask + // case is captured by attn_mask_type, so we probe with the default top-left alignment. 
+ constexpr bool probe_bottom_right_diagonal = false; + + const bool is_fp8 = + (q_dtype == NVTEDType::kNVTEFloat8E4M3 || q_dtype == NVTEDType::kNVTEFloat8E5M2); + const bool is_f16_or_bf16 = + (q_dtype == NVTEDType::kNVTEFloat16 || q_dtype == NVTEDType::kNVTEBFloat16); + + if (is_fp8) { + // TE-only FP8 post-filters: no 64-bit ragged offsets, no max-logit output. + if (requires_64bit_ragged_offset) { + set_status(out_status, cudnn_frontend::error_code_t::INVALID_VALUE, + "FP8 fused attention does not support 64-bit ragged offsets."); + return NVTE_Fused_Attn_Backend::NVTE_No_Backend; } - if ( - // TODO(cyang): replace with cudnn-frontend check_support for cleaner logic and better error messaging - // architecture - ((cudnn_runtime_version < 8903 && (sm_arch_ == 80 || sm_arch_ == 90)) || - (cudnn_runtime_version >= 8903 && sm_arch_ >= 80 && sm_arch_ < 100) || - (cudnn_runtime_version >= 90700 && sm_arch_ >= 100)) && - // sequence length - ((cudnn_runtime_version < 90000 && max_seqlen_q % 64 == 0 && max_seqlen_kv % 64 == 0) || - (cudnn_runtime_version >= 90000)) && - // number of heads - ((cudnn_runtime_version < 8907 && num_attn_heads == num_gqa_groups) || - (cudnn_runtime_version >= 8907)) && - // head dimension - // multiples of 8 - (head_dim_qk % 8 == 0 && head_dim_v % 8 == 0 && - // <= 128 - ((head_dim_qk <= 128 && head_dim_v <= 128) || - // 9.1: <= 256 + Hopper + fprop - // 9.5: <= 256 + Hopper + bprop - (head_dim_qk <= 256 && head_dim_v <= 256 && - ((!is_training && sm_arch_ == 90 && cudnn_runtime_version >= 90100) || - (is_training && sm_arch_ == 90 && cudnn_runtime_version >= 90500))) || - // 9.9: any head_dim + Blackwell + fprop + non_paged + sq > 1 - (!is_training && sm_arch_ >= 100 && cudnn_runtime_version >= 90900 && max_seqlen_q > 1 && - layout_group != NVTE_QKV_Layout_Group::NVTE_Paged_KV_HD_HD_HD) || - // 9.10.2: any head_dim + any arch + fprop + paged - // 9.10.2: any head_dim + any arch + fprop + non_paged + sq > 1 - // 9.10.2: any head_dim + any arch + fprop + non_paged + sq = 1 + {no_mask, padding, BRCM, padding_BRCM} - (!is_training && cudnn_runtime_version >= 91002 && - (layout_group == NVTE_QKV_Layout_Group::NVTE_Paged_KV_HD_HD_HD || max_seqlen_q > 1 || - (max_seqlen_q == 1 && attn_mask_type != NVTE_Mask_Type::NVTE_CAUSAL_MASK && - attn_mask_type != NVTE_Mask_Type::NVTE_PADDING_CAUSAL_MASK))) || - // 9.11: d_qk = 192, d_v = 128 + Blackwell + bprop + non-paged - (head_dim_qk == 192 && head_dim_v == 128 && is_training && sm_arch_ >= 100 && - cudnn_runtime_version >= 91100)) && - // 9.11+ bug: 128 < d_qk <= 256, 128 < d_v <= 256 + Hopper + bprop + MLA - // Conditional to temporarily use blanket cudnn_runtime_version >= 9.11 until fixed - (!((cudnn_runtime_version >= 91100) && is_training && sm_arch_ == 90 && - head_dim_qk >= 128 && head_dim_v >= 128 && !(head_dim_qk == 192 && head_dim_v == 128) && - head_dim_qk != head_dim_v))) && - // bias type - ((cudnn_runtime_version < 8906 && bias_type == NVTE_Bias_Type::NVTE_NO_BIAS) || - (cudnn_runtime_version >= 8906 && - (bias_type == NVTE_Bias_Type::NVTE_NO_BIAS || - (bias_type == NVTE_Bias_Type::NVTE_ALIBI && - attn_mask_type != NVTE_Mask_Type::NVTE_NO_MASK && - attn_mask_type != NVTE_Mask_Type::NVTE_PADDING_MASK && - attn_mask_type != NVTE_Mask_Type::NVTE_PADDING_CAUSAL_MASK && - attn_mask_type != NVTE_Mask_Type::NVTE_PADDING_CAUSAL_BOTTOM_RIGHT_MASK && - sm_arch_ >= 90) || - (bias_type == NVTE_Bias_Type::NVTE_POST_SCALE_BIAS && sm_arch_ >= 90))) || - (cudnn_runtime_version >= 90000 && - (bias_type == 
NVTE_Bias_Type::NVTE_POST_SCALE_BIAS && sm_arch_ >= 80))) && - // mask type - // pre-8.9.6: causal - ((cudnn_runtime_version < 8906 && attn_mask_type == NVTE_Mask_Type::NVTE_CAUSAL_MASK) || - // 8.9.6: {bshd, sbhd} + {no_mask, causal, padding, padding_causal} - (cudnn_runtime_version >= 8906 && - (qkv_format == NVTE_QKV_Format::NVTE_SBHD || qkv_format == NVTE_QKV_Format::NVTE_BSHD) && - (attn_mask_type == NVTE_Mask_Type::NVTE_CAUSAL_MASK || - attn_mask_type == NVTE_Mask_Type::NVTE_PADDING_MASK || - attn_mask_type == NVTE_Mask_Type::NVTE_PADDING_CAUSAL_MASK || - attn_mask_type == NVTE_Mask_Type::NVTE_NO_MASK)) || - // 9.1: adds thd + {padding, padding_causal} - (cudnn_runtime_version >= 90100 && qkv_format == NVTE_QKV_Format::NVTE_THD && - (attn_mask_type == NVTE_Mask_Type::NVTE_PADDING_MASK || - attn_mask_type == NVTE_Mask_Type::NVTE_PADDING_CAUSAL_MASK)) || - // 9.3: adds {bshd, sbhd} + causal_bottom_right + self/cross-attn (sq <= skv) - (cudnn_runtime_version >= 90300 && - (qkv_format == NVTE_QKV_Format::NVTE_SBHD || qkv_format == NVTE_QKV_Format::NVTE_BSHD) && - attn_mask_type == NVTE_Mask_Type::NVTE_CAUSAL_BOTTOM_RIGHT_MASK && - max_seqlen_q % 64 == 0 && max_seqlen_kv % 64 == 0 && max_seqlen_q <= max_seqlen_kv && - bias_type == NVTE_Bias_Type::NVTE_NO_BIAS && dropout == 0.0) || - // 9.5: adds {paged_kv_bshd, paged_kv_sbhd} + {padding, padding_causal, padding_causal_bottom_right} - (cudnn_runtime_version >= 90500 && - layout_group == NVTE_QKV_Layout_Group::NVTE_Paged_KV_HD_HD_HD && - (attn_mask_type == NVTE_Mask_Type::NVTE_PADDING_MASK || - attn_mask_type == NVTE_Mask_Type::NVTE_PADDING_CAUSAL_MASK || - (attn_mask_type == NVTE_Mask_Type::NVTE_PADDING_CAUSAL_BOTTOM_RIGHT_MASK && - max_seqlen_q % 64 == 0 && max_seqlen_kv % 64 == 0 && max_seqlen_q <= max_seqlen_kv)) && - bias_type == NVTE_Bias_Type::NVTE_NO_BIAS && dropout == 0.0) || - // 9.6: adds {bshd, sbhd, thd} + padding_causal_bottom_right + self/cross-attn (sq <= skv) - (cudnn_runtime_version >= 90600 && - attn_mask_type == NVTE_Mask_Type::NVTE_PADDING_CAUSAL_BOTTOM_RIGHT_MASK && - max_seqlen_q % 64 == 0 && max_seqlen_kv % 64 == 0 && max_seqlen_q <= max_seqlen_kv && - bias_type == NVTE_Bias_Type::NVTE_NO_BIAS && dropout == 0.0) || - // 9.7: removes s_q/s_kv % 64 = 0 for {causal_bottom_right, padding_causal_bottom_right} - // for any q_format/kv_format, and paged/non-paged - (cudnn_runtime_version >= 90700 && - (attn_mask_type == NVTE_Mask_Type::NVTE_NO_MASK || - attn_mask_type == NVTE_Mask_Type::NVTE_CAUSAL_MASK || - ((attn_mask_type == NVTE_Mask_Type::NVTE_PADDING_MASK || - attn_mask_type == NVTE_Mask_Type::NVTE_PADDING_CAUSAL_MASK || - attn_mask_type == NVTE_Mask_Type::NVTE_PADDING_CAUSAL_BOTTOM_RIGHT_MASK) && - bias_type == NVTE_Bias_Type::NVTE_NO_BIAS && dropout == 0.0) || - ((attn_mask_type == NVTE_Mask_Type::NVTE_CAUSAL_BOTTOM_RIGHT_MASK || - attn_mask_type == NVTE_Mask_Type::NVTE_PADDING_CAUSAL_BOTTOM_RIGHT_MASK) && - max_seqlen_q <= max_seqlen_kv)))) && - // bias + mask combination - (!(cudnn_runtime_version >= 8906 && - (attn_mask_type == NVTE_Mask_Type::NVTE_PADDING_MASK || - attn_mask_type == NVTE_Mask_Type::NVTE_PADDING_CAUSAL_MASK) && - bias_type == NVTE_Bias_Type::NVTE_POST_SCALE_BIAS)) && - // qkv format - (qkv_format == NVTE_QKV_Format::NVTE_SBHD || qkv_format == NVTE_QKV_Format::NVTE_BSHD || - qkv_format == NVTE_QKV_Format::NVTE_BHSD || - (qkv_format == NVTE_QKV_Format::NVTE_THD && sm_arch_ >= 90 && - ((cudnn_runtime_version >= 90100 && num_attn_heads == num_gqa_groups) || - cudnn_runtime_version >= 90600)) || - 
((q_format == NVTE_QKV_Format::NVTE_SBHD || q_format == NVTE_QKV_Format::NVTE_BSHD || - q_format == NVTE_QKV_Format::NVTE_BHSD || - (q_format == NVTE_QKV_Format::NVTE_THD && sm_arch_ >= 90) || - kv_format == NVTE_QKV_Format::NVTE_SBHD || kv_format == NVTE_QKV_Format::NVTE_BSHD || - kv_format == NVTE_QKV_Format::NVTE_BHSD || - (kv_format == NVTE_QKV_Format::NVTE_THD && sm_arch_ >= 90)) && - cudnn_runtime_version >= 90700)) && - // sliding window - // pre-9.2: full attn, causal - ((cudnn_runtime_version < 90200 && window_size_left == -1 && - (window_size_right == -1 || window_size_right == 0)) || - // 9.2: SWA (left, 0) + top-left diagonal + {bshd, sbhd} - (cudnn_runtime_version >= 90200 && - ((window_size_left == -1 && window_size_right == -1 && - attn_mask_type == NVTE_Mask_Type::NVTE_NO_MASK) || - ((window_size_left == -1 || window_size_left >= 0) && window_size_right == 0 && - (attn_mask_type == NVTE_Mask_Type::NVTE_NO_MASK || - attn_mask_type == NVTE_Mask_Type::NVTE_CAUSAL_MASK || - (attn_mask_type == NVTE_Mask_Type::NVTE_CAUSAL_BOTTOM_RIGHT_MASK && - max_seqlen_q == max_seqlen_kv)) && - max_seqlen_q <= max_seqlen_kv && dropout == 0.0 && - bias_type == NVTE_Bias_Type::NVTE_NO_BIAS && - (qkv_format == NVTE_QKV_Format::NVTE_BSHD || - qkv_format == NVTE_QKV_Format::NVTE_SBHD)))) || - // 9.6: SWA (left, 0) + top-left/bottom-right diagonal + {bshd, sbhd, thd} - (cudnn_runtime_version >= 90600 && - ((window_size_left == -1 && (window_size_right == -1 || window_size_right == 0)) || - ((window_size_left >= 0 || window_size_left == -1) && - (window_size_right >= 0 || window_size_right == -1) && - ((attn_mask_type == NVTE_Mask_Type::NVTE_CAUSAL_BOTTOM_RIGHT_MASK && - // TODO(cyang): fix bug for BRCM + cross-attention on sm100 - (sm_arch_ < 100 || (sm_arch_ >= 100 && ((max_seqlen_q == max_seqlen_kv && - cudnn_runtime_version <= 90700) || - cudnn_runtime_version > 90700)))) || - attn_mask_type == NVTE_Mask_Type::NVTE_PADDING_MASK || - attn_mask_type == NVTE_Mask_Type::NVTE_PADDING_CAUSAL_MASK || - (attn_mask_type == NVTE_Mask_Type::NVTE_PADDING_CAUSAL_BOTTOM_RIGHT_MASK && - (sm_arch_ < 100 || (sm_arch_ >= 100 && ((max_seqlen_q == max_seqlen_kv && - cudnn_runtime_version <= 90700) || - cudnn_runtime_version > 90700))))) && - max_seqlen_q <= max_seqlen_kv && bias_type == NVTE_Bias_Type::NVTE_NO_BIAS && - dropout == 0.0)))) && - // check 64-bit ragged offset support - (supported_ragged_offset_size) && - // 9.10.0/9.10.1: known bugs with SDPA F16 - (cudnn_runtime_version != 91000) && (cudnn_runtime_version != 91001) && - // softmax type - // pre-9.13.1: vanilla - // 9.13.1+: vanilla, off-by-one, learnable - (cudnn_runtime_version >= 91301 || - (cudnn_runtime_version < 91301 && - softmax_type == NVTE_Softmax_Type::NVTE_VANILLA_SOFTMAX)) && - // determinism on Blackwell - // pre-9.18.1: fwd: deterministic; bwd: non-deterministic - // 9.18.1+: fwd: deterministic; bwd: non-deterministic/deterministic - (sm_arch_ < 100 || - (sm_arch_ >= 100 && (!is_training || - (is_training && !deterministic && - (dropout == 0.0 || bias_type == NVTE_Bias_Type::NVTE_NO_BIAS)) || - (is_training && deterministic && cudnn_runtime_version >= 91801 && - dropout == 0.0 && bias_type == NVTE_Bias_Type::NVTE_NO_BIAS))))) { - flag_arb = true; + if (return_max_logit) { + set_status(out_status, cudnn_frontend::error_code_t::INVALID_VALUE, + "FP8 fused attention does not support return_max_logit."); + return NVTE_Fused_Attn_Backend::NVTE_No_Backend; } - if (((max_seqlen_q > 512) || (max_seqlen_kv > 512)) && (flag_arb == true)) { - 
backend = NVTE_Fused_Attn_Backend::NVTE_F16_arbitrary_seqlen; + const DType q_t = static_cast(q_dtype); + const DType o_t = static_cast(o_dtype); + auto fwd_status = is_supported_fp8_fwd( + probe_batch, num_attn_heads, num_gqa_groups, max_seqlen_q, max_seqlen_kv, head_dim_qk, + head_dim_v, is_training, dropout, qkv_layout, bias_type, attn_mask_type, softmax_type, + window_size_left, window_size_right, probe_bottom_right_diagonal, q_t, o_t, scaling_mode, + handle); + if (fwd_status.is_bad()) { + set_status(out_status, fwd_status); + return NVTE_Fused_Attn_Backend::NVTE_No_Backend; } - if ((max_seqlen_q <= 512) && (max_seqlen_kv <= 512)) { - if (flag_arb == true) { - backend = NVTE_Fused_Attn_Backend::NVTE_F16_arbitrary_seqlen; - } else if ((flag_arb == false) && (flag_m512 == true)) { - backend = NVTE_Fused_Attn_Backend::NVTE_F16_max512_seqlen; - } - int env_backend = static_cast(backend); - env_backend = transformer_engine::getenv("NVTE_FUSED_ATTN_BACKEND", env_backend); - if (((env_backend == static_cast(NVTE_Fused_Attn_Backend::NVTE_F16_max512_seqlen)) && - flag_m512) || - ((env_backend == static_cast(NVTE_Fused_Attn_Backend::NVTE_F16_arbitrary_seqlen)) && - flag_arb)) { - backend = static_cast(env_backend); + if (is_training) { + auto bwd_status = is_supported_fp8_bwd( + probe_batch, num_attn_heads, num_gqa_groups, max_seqlen_q, max_seqlen_kv, head_dim_qk, + head_dim_v, dropout, qkv_layout, bias_type, attn_mask_type, softmax_type, + window_size_left, window_size_right, probe_bottom_right_diagonal, deterministic, q_t, + o_t, scaling_mode, handle); + if (bwd_status.is_bad()) { + set_status(out_status, bwd_status); + return NVTE_Fused_Attn_Backend::NVTE_No_Backend; } } - if (cudnn_runtime_version < 8901 && - backend == NVTE_Fused_Attn_Backend::NVTE_F16_max512_seqlen) { - backend = NVTE_Fused_Attn_Backend::NVTE_No_Backend; - std::cout << "Warning: FP16/BF16 fused attention is supported by cuDNN 8.9.1+." - " Please upgrade your cuDNN version if possible." - << std::endl; - } - if (cudnn_runtime_version < 8900 && - backend == NVTE_Fused_Attn_Backend::NVTE_F16_arbitrary_seqlen) { - backend = NVTE_Fused_Attn_Backend::NVTE_No_Backend; - std::cout << "Warning: FP16/BF16 fused attention is supported by cuDNN 8.9.0+." - " Please upgrade your cuDNN version if possible." - << std::endl; - } - if ((cudnn_runtime_version == 91400) && (max_seqlen_kv > 1024) && (window_size_left != -1) && - (attn_mask_type != NVTE_Mask_Type::NVTE_CAUSAL_MASK) && - (attn_mask_type != NVTE_Mask_Type::NVTE_CAUSAL_BOTTOM_RIGHT_MASK)) { - backend = NVTE_Fused_Attn_Backend::NVTE_No_Backend; - std::cout << "Warning: Given combination of attention mask (non-causal) and " - "max_seqlen_kv (> 1024) does not support fused attention for cuDNN 9.14.0. " - " Please upgrade your cuDNN version if possible." - << std::endl; - } - if ((cudnn_runtime_version <= 91500) && is_training && - (qkv_format == NVTE_QKV_Format::NVTE_BSHD || qkv_format == NVTE_QKV_Format::NVTE_SBHD) && - (max_seqlen_kv % 128 != 0) && cuda_graph && - (attn_mask_type != NVTE_Mask_Type::NVTE_PADDING_MASK) && - (attn_mask_type != NVTE_Mask_Type::NVTE_PADDING_CAUSAL_MASK) && - (attn_mask_type != NVTE_Mask_Type::NVTE_PADDING_CAUSAL_BOTTOM_RIGHT_MASK)) { - backend = NVTE_Fused_Attn_Backend::NVTE_No_Backend; - std::cout << "Warning: Given combination of attention mask (non-padding)," - " max_seqlen_kv (not divisible by 128), and qkv_format (BSHD/SBHD) for" - " backward fused attention with graph capture requires cuDNN 9.15.1+. 
" - "Please upgrade your cuDNN version if possible." - << std::endl; + return NVTE_Fused_Attn_Backend::NVTE_FP8; + } + + if (is_f16_or_bf16) { + const DType q_t = static_cast(q_dtype); + auto fwd_status = is_supported_f16_fwd( + probe_batch, num_attn_heads, num_gqa_groups, max_seqlen_q, max_seqlen_kv, head_dim_qk, + head_dim_v, is_training, return_max_logit, dropout, qkv_layout, bias_type, attn_mask_type, + softmax_type, window_size_left, window_size_right, probe_bottom_right_diagonal, q_t, + handle); + if (fwd_status.is_bad()) { + set_status(out_status, fwd_status); + return NVTE_Fused_Attn_Backend::NVTE_No_Backend; } - if (backend == NVTE_Fused_Attn_Backend::NVTE_F16_arbitrary_seqlen && sm_arch_ == 120) { - if (cudnn_runtime_version < 91801) { - backend = NVTE_Fused_Attn_Backend::NVTE_No_Backend; - std::cout << "Warning: Given combination of sm_arch_ == 120 and cudnn_runtime_version < " - "91801 is not supported. " - << " Please upgrade your cuDNN version if possible." << std::endl; - } else if (deterministic && is_training) { - backend = NVTE_Fused_Attn_Backend::NVTE_No_Backend; - std::cout << "Warning: Deterministic fused attention on SM120 is not supported." - << std::endl; - } else { - // Known missing support for T3HD/TH3D layouts on SM120 - const bool is_t3hd_or_th3d = - (qkv_layout == NVTE_QKV_Layout::NVTE_T3HD || qkv_layout == NVTE_QKV_Layout::NVTE_TH3D); - if (is_t3hd_or_th3d) { - backend = NVTE_Fused_Attn_Backend::NVTE_No_Backend; - std::cout << "Warning: Given combination of T3HD/TH3D layouts on SM120 is not supported. " - << " Please consider using other THD layouts if possible." << std::endl; - } + if (is_training) { + auto bwd_status = is_supported_f16_bwd( + probe_batch, num_attn_heads, num_gqa_groups, max_seqlen_q, max_seqlen_kv, head_dim_qk, + head_dim_v, dropout, qkv_layout, bias_type, attn_mask_type, softmax_type, + window_size_left, window_size_right, probe_bottom_right_diagonal, deterministic, q_t, + handle); + if (bwd_status.is_bad()) { + set_status(out_status, bwd_status); + return NVTE_Fused_Attn_Backend::NVTE_No_Backend; } } - } else { - backend = NVTE_Fused_Attn_Backend::NVTE_No_Backend; + return NVTE_Fused_Attn_Backend::NVTE_F16_arbitrary_seqlen; } - return backend; + + set_status(out_status, cudnn_frontend::error_code_t::INVALID_VALUE, + "Unsupported Q dtype for fused attention " + "(only FP16/BF16/FP8_E4M3/FP8_E5M2 are routable)."); + return NVTE_Fused_Attn_Backend::NVTE_No_Backend; } // NVTE fused attention FWD with separate Q, K and V @@ -661,11 +493,14 @@ void nvte_fused_attn_fwd(const NVTETensor Q, const NVTETensor K, const NVTETenso auto handle = cudnnExecutionPlanManager::Instance().GetHandle(); const NVTEDType Q_type = static_cast(input_Q->data.dtype); const NVTEDType KV_type = static_cast(input_K->data.dtype); + const NVTEDType O_type = static_cast(output_O->data.dtype); + const NVTEScalingMode scaling_mode = input_Q->scaling_mode; NVTE_Fused_Attn_Backend fused_attention_backend = nvte_get_fused_attn_backend( - is_training, Q_type, KV_type, qkv_layout, bias_type, attn_mask_type, softmax_type, dropout, - h_q, h_kv, max_seqlen_q, max_seqlen_kv, d_qk, d_v, window_size_left, window_size_right, - return_max_logit, cuda_graph, false); + is_training, Q_type, KV_type, O_type, scaling_mode, qkv_layout, bias_type, attn_mask_type, + softmax_type, dropout, h_q, h_kv, max_seqlen_q, max_seqlen_kv, d_qk, d_v, window_size_left, + window_size_right, return_max_logit, cuda_graph, /*deterministic=*/false, handle, + /*out_status=*/nullptr); if (fused_attention_backend == 
NVTE_Fused_Attn_Backend::NVTE_F16_max512_seqlen) { fused_attn_max_512_fwd(b, h_q, max_seqlen_q, max_seqlen_kv, d_qk, is_training, attn_scale, @@ -747,11 +582,14 @@ void nvte_fused_attn_bwd(const NVTETensor Q, const NVTETensor K, const NVTETenso auto handle = cudnnExecutionPlanManager::Instance().GetHandle(); const NVTEDType Q_type = static_cast(input_Q->data.dtype); const NVTEDType KV_type = static_cast(input_K->data.dtype); + const NVTEDType O_type = static_cast(input_O->data.dtype); + const NVTEScalingMode scaling_mode = input_Q->scaling_mode; NVTE_Fused_Attn_Backend fused_attention_backend = nvte_get_fused_attn_backend( - true, Q_type, KV_type, qkv_layout, bias_type, attn_mask_type, softmax_type, dropout, h_q, - h_kv, max_seqlen_q, max_seqlen_kv, d_qk, d_v, window_size_left, window_size_right, false, - cuda_graph, deterministic); + /*is_training=*/true, Q_type, KV_type, O_type, scaling_mode, qkv_layout, bias_type, + attn_mask_type, softmax_type, dropout, h_q, h_kv, max_seqlen_q, max_seqlen_kv, d_qk, d_v, + window_size_left, window_size_right, /*return_max_logit=*/false, cuda_graph, deterministic, + handle, /*out_status=*/nullptr); if (fused_attention_backend == NVTE_Fused_Attn_Backend::NVTE_F16_max512_seqlen) { Tensor *output_S = convertNVTETensorCheck(Aux_CTX_Tensors->tensors[0]); diff --git a/transformer_engine/common/fused_attn/fused_attn_f16_arbitrary_seqlen.cu b/transformer_engine/common/fused_attn/fused_attn_f16_arbitrary_seqlen.cu index 6df7ad35c8..57ca14a3e1 100644 --- a/transformer_engine/common/fused_attn/fused_attn_f16_arbitrary_seqlen.cu +++ b/transformer_engine/common/fused_attn/fused_attn_f16_arbitrary_seqlen.cu @@ -1333,4 +1333,140 @@ void fused_attn_arbitrary_seqlen_bwd( NVTE_ERROR("Unexpected workspace_size."); } } + +namespace { +// Probe-time defaults for runtime-only quantities the router doesn't see (paged-KV dims, +// ragged max-tokens, bias dims). These produce a graph whose support surface matches the +// real executor's: for non-paged / non-ragged paths these are unused inside the impl; +// for ragged-THD we rebind to worst-case bounds; for paged we use 1 page of full s_kv per +// batch (= same dims as non-paged), so cuDNN-FE applies the paged-attention support rules. +struct ProbeDims { + int64_t max_b; + int64_t max_t_q; + int64_t max_t_kv; + int64_t num_pages_k; + int64_t num_pages_v; + int64_t page_size_k; + int64_t page_size_v; + int64_t max_pages_per_seq_k; + int64_t max_pages_per_seq_v; + int64_t bias_b; + int64_t bias_h; + int64_t bias_sq; + int64_t bias_skv; +}; + +ProbeDims compute_probe_dims(int64_t batch, int64_t num_attn_heads, int64_t max_seqlen_q, + int64_t max_seqlen_kv, NVTE_QKV_Layout qkv_layout, + NVTE_Bias_Type bias_type) { + const NVTE_QKV_Format q_format = nvte_get_q_format(qkv_layout); + const NVTE_QKV_Format kv_format = nvte_get_kv_format(qkv_layout); + const bool is_ragged_q = (q_format == NVTE_QKV_Format::NVTE_THD); + const bool is_ragged_kv = (kv_format == NVTE_QKV_Format::NVTE_THD); + const NVTE_QKV_Layout_Group layout_group = nvte_get_qkv_layout_group(qkv_layout); + const bool is_paged_kv = (layout_group == NVTE_QKV_Layout_Group::NVTE_Paged_KV_HD_HD_HD); + const bool has_bias = (bias_type == NVTE_Bias_Type::NVTE_POST_SCALE_BIAS); + + ProbeDims d{}; + d.max_b = (is_ragged_q || is_ragged_kv) ? batch : 0; + d.max_t_q = is_ragged_q ? batch * max_seqlen_q : 0; + d.max_t_kv = is_ragged_kv ? batch * max_seqlen_kv : 0; + d.num_pages_k = is_paged_kv ? batch : 0; + d.num_pages_v = is_paged_kv ? batch : 0; + d.page_size_k = is_paged_kv ? 
max_seqlen_kv : 0; + d.page_size_v = is_paged_kv ? max_seqlen_kv : 0; + d.max_pages_per_seq_k = is_paged_kv ? 1 : 0; + d.max_pages_per_seq_v = is_paged_kv ? 1 : 0; + d.bias_b = has_bias ? batch : 0; + d.bias_h = has_bias ? num_attn_heads : 0; + d.bias_sq = has_bias ? max_seqlen_q : 0; + d.bias_skv = has_bias ? max_seqlen_kv : 0; + return d; +} +} // namespace + +cudnn_frontend::error_t is_supported_f16_fwd( + size_t batch, size_t num_attn_heads, size_t num_gqa_groups, size_t max_seqlen_q, + size_t max_seqlen_kv, size_t head_dim_qk, size_t head_dim_v, bool is_training, + bool return_max_logit, float p_dropout, NVTE_QKV_Layout qkv_layout, NVTE_Bias_Type bias_type, + NVTE_Mask_Type mask_type, NVTE_Softmax_Type softmax_type, int64_t window_size_left, + int64_t window_size_right, bool bottom_right_diagonal, DType q_dtype, cudnnHandle_t handle) { + const ProbeDims d = compute_probe_dims(static_cast(batch), + static_cast(num_attn_heads), + static_cast(max_seqlen_q), + static_cast(max_seqlen_kv), qkv_layout, + bias_type); + const NVTE_QKV_Format o_format = nvte_get_q_format(qkv_layout); + + size_t workspace_size = 0; + try { + fused_attn::fused_attn_arbitrary_seqlen_fwd_impl( + static_cast(batch), static_cast(num_attn_heads), + static_cast(num_gqa_groups), static_cast(max_seqlen_q), + static_cast(max_seqlen_kv), static_cast(head_dim_qk), + static_cast(head_dim_v), d.max_b, d.max_t_q, d.max_t_kv, d.num_pages_k, + d.num_pages_v, d.page_size_k, d.page_size_v, d.max_pages_per_seq_k, + d.max_pages_per_seq_v, d.bias_b, d.bias_h, d.bias_sq, d.bias_skv, is_training, + return_max_logit, /*scaling_factor=*/1.0f, p_dropout, qkv_layout, o_format, bias_type, + mask_type, softmax_type, window_size_left, window_size_right, bottom_right_diagonal, + /*devPtrQ=*/nullptr, /*devPtrK=*/nullptr, /*devPtrV=*/nullptr, /*devPtrBias=*/nullptr, + /*devPtrSoftmaxOffset=*/nullptr, /*devPtrS1=*/nullptr, /*devPtrS2=*/nullptr, + /*devPtrO=*/nullptr, /*devPtrDropoutSeed=*/nullptr, /*devPtrDropoutOffset=*/nullptr, + /*devPtrCuSeqlensQ=*/nullptr, /*devPtrCuSeqlensKV=*/nullptr, + /*devPtrPageTableK=*/nullptr, /*devPtrPageTableV=*/nullptr, + /*devPtrSeqOffsetsQ=*/nullptr, /*devPtrSeqOffsetsKV=*/nullptr, + get_cudnn_fe_dtype(q_dtype), /*workspace=*/nullptr, &workspace_size, + /*stream=*/static_cast(0), handle); + return {cudnn_frontend::error_code_t::OK, ""}; + } catch (const std::exception &e) { + return {cudnn_frontend::error_code_t::GRAPH_NOT_SUPPORTED, e.what()}; + } catch (...) 
{ + return {cudnn_frontend::error_code_t::GRAPH_NOT_SUPPORTED, + "is_supported_f16_fwd: unknown failure"}; + } +} + +cudnn_frontend::error_t is_supported_f16_bwd( + size_t batch, size_t num_attn_heads, size_t num_gqa_groups, size_t max_seqlen_q, + size_t max_seqlen_kv, size_t head_dim_qk, size_t head_dim_v, float p_dropout, + NVTE_QKV_Layout qkv_layout, NVTE_Bias_Type bias_type, NVTE_Mask_Type mask_type, + NVTE_Softmax_Type softmax_type, int64_t window_size_left, int64_t window_size_right, + bool bottom_right_diagonal, bool deterministic, DType q_dtype, cudnnHandle_t handle) { + const ProbeDims d = compute_probe_dims(static_cast(batch), + static_cast(num_attn_heads), + static_cast(max_seqlen_q), + static_cast(max_seqlen_kv), qkv_layout, + bias_type); + const NVTE_QKV_Format o_format = nvte_get_q_format(qkv_layout); + const NVTE_QKV_Format do_format = o_format; + const NVTE_QKV_Layout dqkv_layout = qkv_layout; + + size_t workspace_size = 0; + try { + fused_attn::fused_attn_arbitrary_seqlen_bwd_impl( + static_cast(batch), static_cast(num_attn_heads), + static_cast(num_gqa_groups), static_cast(max_seqlen_q), + static_cast(max_seqlen_kv), static_cast(head_dim_qk), + static_cast(head_dim_v), d.max_b, d.max_t_q, d.max_t_kv, d.bias_b, d.bias_h, + d.bias_sq, d.bias_skv, /*scaling_factor=*/1.0f, p_dropout, qkv_layout, o_format, do_format, + dqkv_layout, bias_type, mask_type, softmax_type, window_size_left, window_size_right, + bottom_right_diagonal, deterministic, /*devPtrQ=*/nullptr, /*devPtrKTranspose=*/nullptr, + /*devPtrVTranspose=*/nullptr, /*devPtrO=*/nullptr, /*devPtrSoftmaxStats=*/nullptr, + /*devPtrBias=*/nullptr, /*devPtrSoftmaxOffset=*/nullptr, /*devPtrdQ=*/nullptr, + /*devPtrdK=*/nullptr, /*devPtrdV=*/nullptr, /*devPtrdO=*/nullptr, + /*devPtrdBias=*/nullptr, /*devPtrdSoftmaxOffset=*/nullptr, + /*devPtrDropoutSeed=*/nullptr, /*devPtrDropoutOffset=*/nullptr, + /*devPtrCuSeqlensQ=*/nullptr, /*devPtrCuSeqlensKV=*/nullptr, + /*devPtrSeqOffsetsQ=*/nullptr, /*devPtrSeqOffsetsKV=*/nullptr, + get_cudnn_fe_dtype(q_dtype), /*workspace=*/nullptr, &workspace_size, + /*stream=*/static_cast(0), handle); + return {cudnn_frontend::error_code_t::OK, ""}; + } catch (const std::exception &e) { + return {cudnn_frontend::error_code_t::GRAPH_NOT_SUPPORTED, e.what()}; + } catch (...) { + return {cudnn_frontend::error_code_t::GRAPH_NOT_SUPPORTED, + "is_supported_f16_bwd: unknown failure"}; + } +} + } // namespace transformer_engine diff --git a/transformer_engine/common/fused_attn/fused_attn_f16_arbitrary_seqlen.h b/transformer_engine/common/fused_attn/fused_attn_f16_arbitrary_seqlen.h index 8f79b5bb4a..38cf48c1f0 100644 --- a/transformer_engine/common/fused_attn/fused_attn_f16_arbitrary_seqlen.h +++ b/transformer_engine/common/fused_attn/fused_attn_f16_arbitrary_seqlen.h @@ -12,6 +12,7 @@ #define TRANSFORMER_ENGINE_COMMON_FUSED_ATTN_FUSED_ATTN_ARBITRARY_SEQLEN_H_ #include +#include #include "common/common.h" #include "transformer_engine/fused_attn.h" @@ -47,6 +48,27 @@ void fused_attn_arbitrary_seqlen_bwd( const Tensor *cu_seqlens_kv_padded, const Tensor *rng_state, Tensor *workspace, cudaStream_t stream, cudnnHandle_t handle); +// Probe: drives cuDNN-FE (validate -> build_operation_graph -> create_execution_plans -> +// check_support -> build_plans) for an F16/BF16 forward graph with the given configuration. +// Returns the cuDNN-FE status: error_code_t::OK iff the graph compiles end-to-end. 
On OK, +// the built graph is inserted into the same thread-local cache used by +// fused_attn_arbitrary_seqlen_fwd_impl, so the executor cache-hits on matching descriptors. +// On rejection, err_msg contains the underlying cuDNN-FE / NVTE_CHECK message. +cudnn_frontend::error_t is_supported_f16_fwd( + size_t batch, size_t num_attn_heads, size_t num_gqa_groups, size_t max_seqlen_q, + size_t max_seqlen_kv, size_t head_dim_qk, size_t head_dim_v, bool is_training, + bool return_max_logit, float p_dropout, NVTE_QKV_Layout qkv_layout, NVTE_Bias_Type bias_type, + NVTE_Mask_Type mask_type, NVTE_Softmax_Type softmax_type, int64_t window_size_left, + int64_t window_size_right, bool bottom_right_diagonal, DType q_dtype, cudnnHandle_t handle); + +// Probe: same as above for the F16/BF16 backward graph. +cudnn_frontend::error_t is_supported_f16_bwd( + size_t batch, size_t num_attn_heads, size_t num_gqa_groups, size_t max_seqlen_q, + size_t max_seqlen_kv, size_t head_dim_qk, size_t head_dim_v, float p_dropout, + NVTE_QKV_Layout qkv_layout, NVTE_Bias_Type bias_type, NVTE_Mask_Type mask_type, + NVTE_Softmax_Type softmax_type, int64_t window_size_left, int64_t window_size_right, + bool bottom_right_diagonal, bool deterministic, DType q_dtype, cudnnHandle_t handle); + } // namespace transformer_engine #endif // TRANSFORMER_ENGINE_COMMON_FUSED_ATTN_FUSED_ATTN_ARBITRARY_SEQLEN_H_ diff --git a/transformer_engine/common/fused_attn/fused_attn_fp8.cu b/transformer_engine/common/fused_attn/fused_attn_fp8.cu index d97f388459..c9f7a9ee76 100644 --- a/transformer_engine/common/fused_attn/fused_attn_fp8.cu +++ b/transformer_engine/common/fused_attn/fused_attn_fp8.cu @@ -2991,4 +2991,105 @@ void fused_attn_fp8_bwd( return; } } + +cudnn_frontend::error_t is_supported_fp8_fwd( + size_t batch, size_t num_attn_heads, size_t num_gqa_groups, size_t max_seqlen_q, + size_t max_seqlen_kv, size_t head_dim_qk, size_t head_dim_v, bool is_training, float p_dropout, + NVTE_QKV_Layout qkv_layout, NVTE_Bias_Type bias_type, NVTE_Mask_Type mask_type, + NVTE_Softmax_Type softmax_type, int64_t window_size_left, int64_t window_size_right, + bool bottom_right_diagonal, DType q_dtype, DType o_dtype, NVTEScalingMode scaling_mode, + cudnnHandle_t handle) { + // FP8 fwd impl rejects any qkv_format other than BSHD/SBHD/BHSD with NVTE_ERROR; mirror that + // here so the probe returns a typed rejection instead of catching the throw. 
+ const NVTE_QKV_Format qkv_format = nvte_get_qkv_format(qkv_layout); + if (qkv_format != NVTE_QKV_Format::NVTE_BSHD && qkv_format != NVTE_QKV_Format::NVTE_SBHD && + qkv_format != NVTE_QKV_Format::NVTE_BHSD) { + return {cudnn_frontend::error_code_t::INVALID_VALUE, + "FP8 fused attention only supports BSHD/SBHD/BHSD layouts."}; + } + size_t workspace_size = 0; + try { + fused_attn::fused_attn_fp8_fwd_impl( + static_cast(batch), static_cast(num_attn_heads), + static_cast(num_gqa_groups), static_cast(max_seqlen_q), + static_cast(max_seqlen_kv), static_cast(head_dim_qk), + static_cast(head_dim_v), is_training, /*scaling_factor=*/1.0f, p_dropout, + qkv_layout, /*o_format=*/qkv_format, bias_type, mask_type, softmax_type, + window_size_left, window_size_right, bottom_right_diagonal, + /*devPtrQ=*/nullptr, /*devPtrK=*/nullptr, /*devPtrV=*/nullptr, + /*devPtrSoftmaxOffset=*/nullptr, /*devPtrM=*/nullptr, /*devPtrO=*/nullptr, + /*devPtrDescaleQ=*/nullptr, /*devPtrDescaleK=*/nullptr, /*devPtrDescaleV=*/nullptr, + /*devPtrDescaleS=*/nullptr, /*devPtrScaleS=*/nullptr, /*devPtrScaleO=*/nullptr, + /*devPtrAmaxO=*/nullptr, /*devPtrAmaxS=*/nullptr, /*devPtrcuSeqlensQ=*/nullptr, + /*devPtrcuSeqlensKV=*/nullptr, /*devPtrDropoutSeed=*/nullptr, + /*devPtrDropoutOffset=*/nullptr, get_cudnn_fe_dtype(q_dtype), + get_cudnn_fe_dtype(o_dtype), scaling_mode, + /*qkv_scale_inv_format=*/NVTE_QKV_Format::NVTE_QKV_Format_NOT_SET, + /*workspace=*/nullptr, &workspace_size, + /*stream=*/static_cast(0), handle); + return {cudnn_frontend::error_code_t::OK, ""}; + } catch (const std::exception &e) { + return {cudnn_frontend::error_code_t::GRAPH_NOT_SUPPORTED, e.what()}; + } catch (...) { + return {cudnn_frontend::error_code_t::GRAPH_NOT_SUPPORTED, + "is_supported_fp8_fwd: unknown failure"}; + } +} + +cudnn_frontend::error_t is_supported_fp8_bwd( + size_t batch, size_t num_attn_heads, size_t num_gqa_groups, size_t max_seqlen_q, + size_t max_seqlen_kv, size_t head_dim_qk, size_t head_dim_v, float p_dropout, + NVTE_QKV_Layout qkv_layout, NVTE_Bias_Type bias_type, NVTE_Mask_Type mask_type, + NVTE_Softmax_Type softmax_type, int64_t window_size_left, int64_t window_size_right, + bool bottom_right_diagonal, bool deterministic, DType q_dtype, DType o_dtype, + NVTEScalingMode scaling_mode, cudnnHandle_t handle) { + const NVTE_QKV_Format qkv_format = nvte_get_qkv_format(qkv_layout); + if (qkv_format != NVTE_QKV_Format::NVTE_BSHD && qkv_format != NVTE_QKV_Format::NVTE_SBHD && + qkv_format != NVTE_QKV_Format::NVTE_BHSD) { + return {cudnn_frontend::error_code_t::INVALID_VALUE, + "FP8 fused attention only supports BSHD/SBHD/BHSD layouts."}; + } + // For FP8 bwd, dO data type matches O data type and dQKV data type matches Q data type + // (this mirrors the assumption used by callers of fused_attn_fp8_bwd in TE). 
+ const cudnn_frontend::DataType_t qkv_t = get_cudnn_fe_dtype(q_dtype); + const cudnn_frontend::DataType_t o_t = get_cudnn_fe_dtype(o_dtype); + const cudnn_frontend::DataType_t do_t = o_t; + const cudnn_frontend::DataType_t dqkv_t = qkv_t; + size_t workspace_size = 0; + try { + fused_attn::fused_attn_fp8_bwd_impl( + static_cast(batch), static_cast(num_attn_heads), + static_cast(num_gqa_groups), static_cast(max_seqlen_q), + static_cast(max_seqlen_kv), static_cast(head_dim_qk), + static_cast(head_dim_v), /*scaling_factor=*/1.0f, p_dropout, qkv_layout, + /*o_format=*/qkv_format, /*do_format=*/qkv_format, /*dqkv_layout=*/qkv_layout, bias_type, + mask_type, softmax_type, window_size_left, window_size_right, bottom_right_diagonal, + deterministic, + /*devPtrQ=*/nullptr, /*devPtrK=*/nullptr, /*devPtrV=*/nullptr, /*devPtrM=*/nullptr, + /*devPtrO=*/nullptr, /*devPtrdO=*/nullptr, /*devPtrSoftmaxOffset=*/nullptr, + /*devPtrdQ=*/nullptr, /*devPtrdK=*/nullptr, /*devPtrdV=*/nullptr, + /*devPtrdSoftmaxOffset=*/nullptr, /*devPtrDescaleQ=*/nullptr, + /*devPtrDescaleK=*/nullptr, /*devPtrDescaleV=*/nullptr, /*devPtrDescaleO=*/nullptr, + /*devPtrDescaledO=*/nullptr, /*devPtrDescaleS=*/nullptr, /*devPtrDescaledP=*/nullptr, + /*devPtrScaleS=*/nullptr, /*devPtrScaledP=*/nullptr, /*devPtrScaledQ=*/nullptr, + /*devPtrScaledK=*/nullptr, /*devPtrScaledV=*/nullptr, /*devPtrAmaxdP=*/nullptr, + /*devPtrAmaxdQ=*/nullptr, /*devPtrAmaxdK=*/nullptr, /*devPtrAmaxdV=*/nullptr, + /*devPtrQ_t=*/nullptr, /*devPtrK_t=*/nullptr, /*devPtrdO_f16=*/nullptr, + /*devPtrdO_t=*/nullptr, /*devPtrDescaleQ_t=*/nullptr, /*devPtrDescaleK_t=*/nullptr, + /*devPtrDescaledO_t=*/nullptr, /*devPtrcuSeqlensQ=*/nullptr, + /*devPtrcuSeqlensKV=*/nullptr, /*devPtrDropoutSeed=*/nullptr, + /*devPtrDropoutOffset=*/nullptr, qkv_t, o_t, do_t, dqkv_t, scaling_mode, + /*qkv_scale_inv_format=*/NVTE_QKV_Format::NVTE_QKV_Format_NOT_SET, + /*do_scale_inv_format=*/NVTE_QKV_Format::NVTE_QKV_Format_NOT_SET, + /*workspace=*/nullptr, &workspace_size, + /*stream=*/static_cast(0), handle); + return {cudnn_frontend::error_code_t::OK, ""}; + } catch (const std::exception &e) { + return {cudnn_frontend::error_code_t::GRAPH_NOT_SUPPORTED, e.what()}; + } catch (...) { + return {cudnn_frontend::error_code_t::GRAPH_NOT_SUPPORTED, + "is_supported_fp8_bwd: unknown failure"}; + } +} + } // namespace transformer_engine diff --git a/transformer_engine/common/fused_attn/fused_attn_fp8.h b/transformer_engine/common/fused_attn/fused_attn_fp8.h index aaf5039eeb..5c7f11d80e 100644 --- a/transformer_engine/common/fused_attn/fused_attn_fp8.h +++ b/transformer_engine/common/fused_attn/fused_attn_fp8.h @@ -8,6 +8,8 @@ * \brief Functions for fused attention for FP8 with seqlen <= 512 */ +#include + #include "transformer_engine/fused_attn.h" #include "transformer_engine/transformer_engine.h" @@ -39,4 +41,27 @@ void fused_attn_fp8_bwd( const Tensor *output_dQ, const Tensor *output_dK, const Tensor *output_dV, Tensor *output_dSoftmaxOffset, const Tensor *cu_seqlens_q, const Tensor *cu_seqlens_kv, const Tensor *rng_state, Tensor *workspace, cudaStream_t stream, cudnnHandle_t handle); + +// Probe: drives cuDNN-FE (validate -> build_operation_graph -> create_execution_plans -> +// check_support -> build_plans) for an FP8 forward graph with the given configuration. +// Returns the cuDNN-FE status: error_code_t::OK iff the graph compiles end-to-end. On OK, +// the built graph is inserted into the same thread-local cache used by fused_attn_fp8_fwd_impl. 
+// On rejection, err_msg contains the underlying cuDNN-FE / NVTE_CHECK message.
+cudnn_frontend::error_t is_supported_fp8_fwd(
+ size_t batch, size_t num_attn_heads, size_t num_gqa_groups, size_t max_seqlen_q,
+ size_t max_seqlen_kv, size_t head_dim_qk, size_t head_dim_v, bool is_training, float p_dropout,
+ NVTE_QKV_Layout qkv_layout, NVTE_Bias_Type bias_type, NVTE_Mask_Type mask_type,
+ NVTE_Softmax_Type softmax_type, int64_t window_size_left, int64_t window_size_right,
+ bool bottom_right_diagonal, DType q_dtype, DType o_dtype, NVTEScalingMode scaling_mode,
+ cudnnHandle_t handle);
+
+// Probe: same as above for the FP8 backward graph.
+cudnn_frontend::error_t is_supported_fp8_bwd(
+ size_t batch, size_t num_attn_heads, size_t num_gqa_groups, size_t max_seqlen_q,
+ size_t max_seqlen_kv, size_t head_dim_qk, size_t head_dim_v, float p_dropout,
+ NVTE_QKV_Layout qkv_layout, NVTE_Bias_Type bias_type, NVTE_Mask_Type mask_type,
+ NVTE_Softmax_Type softmax_type, int64_t window_size_left, int64_t window_size_right,
+ bool bottom_right_diagonal, bool deterministic, DType q_dtype, DType o_dtype,
+ NVTEScalingMode scaling_mode, cudnnHandle_t handle);
 } // namespace transformer_engine
diff --git a/transformer_engine/common/include/transformer_engine/fused_attn.h b/transformer_engine/common/include/transformer_engine/fused_attn.h
index 912dc32d35..787e97d628 100644
--- a/transformer_engine/common/include/transformer_engine/fused_attn.h
+++ b/transformer_engine/common/include/transformer_engine/fused_attn.h
@@ -11,6 +11,8 @@
 #ifndef TRANSFORMER_ENGINE_FUSED_ATTN_FP8_H_
 #define TRANSFORMER_ENGINE_FUSED_ATTN_FP8_H_
+#include
+
 #include "stdint.h"
 #include "transformer_engine.h"
@@ -196,11 +198,40 @@ NVTE_QKV_Format nvte_get_q_format(NVTE_QKV_Layout qkv_layout);
 */
 NVTE_QKV_Format nvte_get_kv_format(NVTE_QKV_Layout qkv_layout);
+/*! \struct NVTEFusedAttnBackendStatus
+ * \brief Diagnostic info from \c nvte_get_fused_attn_backend.
+ *
+ * Filled by \c nvte_get_fused_attn_backend when the caller passes a non-NULL pointer.
+ * When the routing decision is supported, \c code is 0 and \c message is the empty
+ * string. When the routing rejects the configuration, \c code is the underlying
+ * cuDNN-FE \c cudnn_frontend::error_code_t cast to \c int (TE-synthesized post-filter
+ * rejections use \c INVALID_VALUE), and \c message is a null-terminated human-readable
+ * reason that points into per-thread storage owned by TE. The pointer is valid only
+ * until the next call to \c nvte_get_fused_attn_backend on the same thread.
+ */
+typedef struct NVTEFusedAttnBackendStatus {
+ int code;
+ const char *message;
+} NVTEFusedAttnBackendStatus;
+
 /*! \brief Get fused attention backend based on input parameters.
+ *
+ * Authoritative routing: when a non-NVTE_No_Backend value is returned, the configuration
+ * is guaranteed to compile through cuDNN-FE (validate -> build_operation_graph ->
+ * create_execution_plans -> check_support -> build_plans). The router applies a small
+ * set of TE-specific post-filters in addition to delegating to cuDNN-FE for capability
+ * checks. On success the built plan is cached, so the executor avoids rebuilding.
 *
 * \param[in] is_training Whether the model is in training mode.
 * \param[in] q_dtype The data type of Tensor Q.
 * \param[in] kv_dtype The data type of Tensors K, V.
+ * \param[in] o_dtype The data type of output Tensor O.
Used by the FP8 + * branch to disambiguate FP8 vs HALF/BF16 output; + * ignored by the F16/BF16 branch (pass q_dtype). + * \param[in] scaling_mode Scaling mode of the input tensors. Used by the FP8 + * branch to select among delayed/current/MXFP8 recipes; + * ignored by the F16/BF16 branch + * (pass NVTE_DELAYED_TENSOR_SCALING). * \param[in] qkv_layout The layout of Tensors Q, K, V. * \param[in] bias_type The attention bias type. * \param[in] attn_mask_type The attention mask type. @@ -217,13 +248,22 @@ NVTE_QKV_Format nvte_get_kv_format(NVTE_QKV_Layout qkv_layout); * \param[in] return_max_logit Whether to produce Max along with Stats. * \param[in] cuda_graph Whether cuda graph capture is enabled or not. * \param[in] deterministic Whether determinism is required or not. + * \param[in] handle cuDNN handle used for the support chain. Required. + * \param[out] out_status Optional. When non-NULL, populated with a code + + * message describing why the configuration was + * rejected (NVTE_No_Backend) or with code=0 and + * message="" on success. The message buffer lives in + * thread-local storage and is overwritten on every + * call on the same thread. */ NVTE_Fused_Attn_Backend nvte_get_fused_attn_backend( - bool is_training, NVTEDType q_dtype, NVTEDType kv_dtype, NVTE_QKV_Layout qkv_layout, - NVTE_Bias_Type bias_type, NVTE_Mask_Type attn_mask_type, NVTE_Softmax_Type softmax_type, - float dropout, size_t num_attn_heads, size_t num_gqa_groups, size_t max_seqlen_q, - size_t max_seqlen_kv, size_t head_dim_qk, size_t head_dim_v, int64_t window_size_left, - int64_t window_size_right, bool return_max_logit, bool cuda_graph, bool deterministic); + bool is_training, NVTEDType q_dtype, NVTEDType kv_dtype, NVTEDType o_dtype, + NVTEScalingMode scaling_mode, NVTE_QKV_Layout qkv_layout, NVTE_Bias_Type bias_type, + NVTE_Mask_Type attn_mask_type, NVTE_Softmax_Type softmax_type, float dropout, + size_t num_attn_heads, size_t num_gqa_groups, size_t max_seqlen_q, size_t max_seqlen_kv, + size_t head_dim_qk, size_t head_dim_v, int64_t window_size_left, int64_t window_size_right, + bool return_max_logit, bool cuda_graph, bool deterministic, cudnnHandle_t handle, + NVTEFusedAttnBackendStatus *out_status); /*! \brief Compute dot product attention with separate Q, K and V. 
* diff --git a/transformer_engine/jax/csrc/extensions/attention.cpp b/transformer_engine/jax/csrc/extensions/attention.cpp index 76f2d92891..c6a8897089 100644 --- a/transformer_engine/jax/csrc/extensions/attention.cpp +++ b/transformer_engine/jax/csrc/extensions/attention.cpp @@ -5,6 +5,7 @@ ************************************************************************/ #include "../extensions.h" +#include "common/cudnn_utils.h" #include "transformer_engine/fused_attn.h" #include "transformer_engine/transformer_engine.h" @@ -17,11 +18,14 @@ NVTE_Fused_Attn_Backend GetFusedAttnBackend( float dropout_probability, size_t q_attn_heads, size_t kv_attn_heads, size_t q_max_seqlen, size_t kv_max_seqlen, size_t qk_head_dim, size_t v_head_dim, int64_t window_size_left, int64_t window_size_right, bool deterministic) { + auto handle = cudnnExecutionPlanManager::Instance().GetHandle(); auto backend = nvte_get_fused_attn_backend( - is_training, static_cast(q_dtype), static_cast(kv_dtype), qkv_layout, - bias_type, mask_type, softmax_type, dropout_probability, q_attn_heads, kv_attn_heads, - q_max_seqlen, kv_max_seqlen, qk_head_dim, v_head_dim, window_size_left, window_size_right, - false, false, deterministic); + is_training, static_cast(q_dtype), static_cast(kv_dtype), + static_cast(q_dtype), NVTE_DELAYED_TENSOR_SCALING, qkv_layout, bias_type, + mask_type, softmax_type, dropout_probability, q_attn_heads, kv_attn_heads, q_max_seqlen, + kv_max_seqlen, qk_head_dim, v_head_dim, window_size_left, window_size_right, + /*return_max_logit=*/false, /*cuda_graph=*/false, deterministic, handle, + /*out_status=*/nullptr); return backend; } @@ -272,11 +276,13 @@ static void FusedAttnForwardImpl( /* Prepare RNG state */ auto rng_state_tensor = TensorWrapper(rng_state, std::vector{2}, DType::kInt64); + auto _handle_fwd = cudnnExecutionPlanManager::Instance().GetHandle(); auto backend = nvte_get_fused_attn_backend( - is_training, static_cast(dtype), static_cast(dtype), qkv_layout, - bias_type, mask_type, softmax_type, dropout_probability, attn_heads, num_gqa_groups, - q_max_seqlen, kv_max_seqlen, qk_head_dim, v_head_dim, window_size_left, window_size_right, - false, false, deterministic); + is_training, static_cast(dtype), static_cast(dtype), + static_cast(dtype), NVTE_DELAYED_TENSOR_SCALING, qkv_layout, bias_type, mask_type, + softmax_type, dropout_probability, attn_heads, num_gqa_groups, q_max_seqlen, kv_max_seqlen, + qk_head_dim, v_head_dim, window_size_left, window_size_right, /*return_max_logit=*/false, + /*cuda_graph=*/false, deterministic, _handle_fwd, /*out_status=*/nullptr); nvte_populate_rng_state_async(rng_state, seed, q_max_seqlen, kv_max_seqlen, backend, stream); /* Auxiliary tensors (to be propagated to the backward pass later) */ @@ -548,11 +554,13 @@ static void FusedAttnBackwardImpl( /* Auxiliary tensors (propagated from the forward pass) */ NVTETensorPack aux_input_tensors; nvte_tensor_pack_create(&aux_input_tensors); + auto _handle_bwd = cudnnExecutionPlanManager::Instance().GetHandle(); auto backend = nvte_get_fused_attn_backend( - is_training, static_cast(dtype), static_cast(dtype), qkv_layout, - bias_type, mask_type, softmax_type, dropout_probability, attn_heads, num_gqa_groups, - q_max_seqlen, kv_max_seqlen, qk_head_dim, v_head_dim, window_size_left, window_size_right, - false, false, deterministic); + is_training, static_cast(dtype), static_cast(dtype), + static_cast(dtype), NVTE_DELAYED_TENSOR_SCALING, qkv_layout, bias_type, mask_type, + softmax_type, dropout_probability, attn_heads, num_gqa_groups, 
q_max_seqlen, kv_max_seqlen, + qk_head_dim, v_head_dim, window_size_left, window_size_right, /*return_max_logit=*/false, + /*cuda_graph=*/false, deterministic, _handle_bwd, /*out_status=*/nullptr); PrepareFusedAttnBackwardAuxTensors(&aux_input_tensors, input_batch, bias_batch, attn_heads, bias_heads, q_max_seqlen, kv_max_seqlen, dtype, backend, softmax_aux, rng_state, bias, softmax_offset); diff --git a/transformer_engine/pytorch/csrc/extensions/attention.cpp b/transformer_engine/pytorch/csrc/extensions/attention.cpp index e6781bd58a..d67cd4a6b9 100644 --- a/transformer_engine/pytorch/csrc/extensions/attention.cpp +++ b/transformer_engine/pytorch/csrc/extensions/attention.cpp @@ -6,6 +6,7 @@ #include "../extensions.h" #include "common.h" +#include "common/cudnn_utils.h" #include "pybind.h" namespace { @@ -40,17 +41,25 @@ void mha_fill(const transformer_engine::TensorWrapper &self, const at::Tensor &s namespace transformer_engine::pytorch { // get the fused attention backend +// +// NOTE: the underlying nvte_get_fused_attn_backend now takes o_dtype and scaling_mode in +// addition to q_dtype/kv_dtype. For the F16/BF16 routing path those are ignored, so we pass +// q_dtype as o_dtype and DELAYED_TENSOR_SCALING. This Python-facing wrapper therefore keeps +// its existing signature; FP8 callers that want authoritative routing for non-default scaling +// recipes should add o_dtype / scaling_mode parameters in a follow-up. NVTE_Fused_Attn_Backend get_fused_attn_backend( bool is_training, const DType q_dtype, const DType kv_dtype, NVTE_QKV_Layout qkv_layout, NVTE_Bias_Type bias_type, NVTE_Mask_Type attn_mask_type, NVTE_Softmax_Type softmax_type, float p_dropout, size_t num_attn_heads, size_t num_gqa_groups, size_t max_seqlen_q, size_t max_seqlen_kv, size_t head_dim_qk, size_t head_dim_v, int64_t window_size_left, int64_t window_size_right, bool return_max_logit, bool cuda_graph, bool deterministic) { + auto handle = cudnnExecutionPlanManager::Instance().GetHandle(); NVTE_Fused_Attn_Backend fused_attention_backend = nvte_get_fused_attn_backend( - is_training, static_cast(q_dtype), static_cast(kv_dtype), qkv_layout, - bias_type, attn_mask_type, softmax_type, p_dropout, num_attn_heads, num_gqa_groups, - max_seqlen_q, max_seqlen_kv, head_dim_qk, head_dim_v, window_size_left, window_size_right, - return_max_logit, cuda_graph, deterministic); + is_training, static_cast(q_dtype), static_cast(kv_dtype), + static_cast(q_dtype), NVTE_DELAYED_TENSOR_SCALING, qkv_layout, bias_type, + attn_mask_type, softmax_type, p_dropout, num_attn_heads, num_gqa_groups, max_seqlen_q, + max_seqlen_kv, head_dim_qk, head_dim_v, window_size_left, window_size_right, + return_max_logit, cuda_graph, deterministic, handle, /*out_status=*/nullptr); return fused_attention_backend; } From 16b837cd0f3e27b7638bfb2a90d39056680a6b6e Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 6 May 2026 01:58:25 +0000 Subject: [PATCH 02/11] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../common/fused_attn/fused_attn.cpp | 4 +-- .../fused_attn_f16_arbitrary_seqlen.cu | 34 +++++++++---------- .../common/fused_attn/fused_attn_fp8.cu | 12 +++---- .../pytorch/csrc/extensions/attention.cpp | 4 +-- 4 files changed, 26 insertions(+), 28 deletions(-) diff --git a/transformer_engine/common/fused_attn/fused_attn.cpp b/transformer_engine/common/fused_attn/fused_attn.cpp index 615f7c2a03..95405c0d6f 100644 --- 
a/transformer_engine/common/fused_attn/fused_attn.cpp +++ b/transformer_engine/common/fused_attn/fused_attn.cpp @@ -370,8 +370,8 @@ NVTE_Fused_Attn_Backend nvte_get_fused_attn_backend( auto bwd_status = is_supported_fp8_bwd( probe_batch, num_attn_heads, num_gqa_groups, max_seqlen_q, max_seqlen_kv, head_dim_qk, head_dim_v, dropout, qkv_layout, bias_type, attn_mask_type, softmax_type, - window_size_left, window_size_right, probe_bottom_right_diagonal, deterministic, q_t, - o_t, scaling_mode, handle); + window_size_left, window_size_right, probe_bottom_right_diagonal, deterministic, q_t, o_t, + scaling_mode, handle); if (bwd_status.is_bad()) { set_status(out_status, bwd_status); return NVTE_Fused_Attn_Backend::NVTE_No_Backend; diff --git a/transformer_engine/common/fused_attn/fused_attn_f16_arbitrary_seqlen.cu b/transformer_engine/common/fused_attn/fused_attn_f16_arbitrary_seqlen.cu index 57ca14a3e1..1ced84755c 100644 --- a/transformer_engine/common/fused_attn/fused_attn_f16_arbitrary_seqlen.cu +++ b/transformer_engine/common/fused_attn/fused_attn_f16_arbitrary_seqlen.cu @@ -1391,11 +1391,10 @@ cudnn_frontend::error_t is_supported_f16_fwd( bool return_max_logit, float p_dropout, NVTE_QKV_Layout qkv_layout, NVTE_Bias_Type bias_type, NVTE_Mask_Type mask_type, NVTE_Softmax_Type softmax_type, int64_t window_size_left, int64_t window_size_right, bool bottom_right_diagonal, DType q_dtype, cudnnHandle_t handle) { - const ProbeDims d = compute_probe_dims(static_cast(batch), - static_cast(num_attn_heads), - static_cast(max_seqlen_q), - static_cast(max_seqlen_kv), qkv_layout, - bias_type); + const ProbeDims d = + compute_probe_dims(static_cast(batch), static_cast(num_attn_heads), + static_cast(max_seqlen_q), static_cast(max_seqlen_kv), + qkv_layout, bias_type); const NVTE_QKV_Format o_format = nvte_get_q_format(qkv_layout); size_t workspace_size = 0; @@ -1405,17 +1404,17 @@ cudnn_frontend::error_t is_supported_f16_fwd( static_cast(num_gqa_groups), static_cast(max_seqlen_q), static_cast(max_seqlen_kv), static_cast(head_dim_qk), static_cast(head_dim_v), d.max_b, d.max_t_q, d.max_t_kv, d.num_pages_k, - d.num_pages_v, d.page_size_k, d.page_size_v, d.max_pages_per_seq_k, - d.max_pages_per_seq_v, d.bias_b, d.bias_h, d.bias_sq, d.bias_skv, is_training, - return_max_logit, /*scaling_factor=*/1.0f, p_dropout, qkv_layout, o_format, bias_type, - mask_type, softmax_type, window_size_left, window_size_right, bottom_right_diagonal, + d.num_pages_v, d.page_size_k, d.page_size_v, d.max_pages_per_seq_k, d.max_pages_per_seq_v, + d.bias_b, d.bias_h, d.bias_sq, d.bias_skv, is_training, return_max_logit, + /*scaling_factor=*/1.0f, p_dropout, qkv_layout, o_format, bias_type, mask_type, + softmax_type, window_size_left, window_size_right, bottom_right_diagonal, /*devPtrQ=*/nullptr, /*devPtrK=*/nullptr, /*devPtrV=*/nullptr, /*devPtrBias=*/nullptr, /*devPtrSoftmaxOffset=*/nullptr, /*devPtrS1=*/nullptr, /*devPtrS2=*/nullptr, /*devPtrO=*/nullptr, /*devPtrDropoutSeed=*/nullptr, /*devPtrDropoutOffset=*/nullptr, /*devPtrCuSeqlensQ=*/nullptr, /*devPtrCuSeqlensKV=*/nullptr, /*devPtrPageTableK=*/nullptr, /*devPtrPageTableV=*/nullptr, - /*devPtrSeqOffsetsQ=*/nullptr, /*devPtrSeqOffsetsKV=*/nullptr, - get_cudnn_fe_dtype(q_dtype), /*workspace=*/nullptr, &workspace_size, + /*devPtrSeqOffsetsQ=*/nullptr, /*devPtrSeqOffsetsKV=*/nullptr, get_cudnn_fe_dtype(q_dtype), + /*workspace=*/nullptr, &workspace_size, /*stream=*/static_cast(0), handle); return {cudnn_frontend::error_code_t::OK, ""}; } catch (const std::exception &e) { @@ -1432,11 
+1431,10 @@ cudnn_frontend::error_t is_supported_f16_bwd( NVTE_QKV_Layout qkv_layout, NVTE_Bias_Type bias_type, NVTE_Mask_Type mask_type, NVTE_Softmax_Type softmax_type, int64_t window_size_left, int64_t window_size_right, bool bottom_right_diagonal, bool deterministic, DType q_dtype, cudnnHandle_t handle) { - const ProbeDims d = compute_probe_dims(static_cast(batch), - static_cast(num_attn_heads), - static_cast(max_seqlen_q), - static_cast(max_seqlen_kv), qkv_layout, - bias_type); + const ProbeDims d = + compute_probe_dims(static_cast(batch), static_cast(num_attn_heads), + static_cast(max_seqlen_q), static_cast(max_seqlen_kv), + qkv_layout, bias_type); const NVTE_QKV_Format o_format = nvte_get_q_format(qkv_layout); const NVTE_QKV_Format do_format = o_format; const NVTE_QKV_Layout dqkv_layout = qkv_layout; @@ -1457,8 +1455,8 @@ cudnn_frontend::error_t is_supported_f16_bwd( /*devPtrdBias=*/nullptr, /*devPtrdSoftmaxOffset=*/nullptr, /*devPtrDropoutSeed=*/nullptr, /*devPtrDropoutOffset=*/nullptr, /*devPtrCuSeqlensQ=*/nullptr, /*devPtrCuSeqlensKV=*/nullptr, - /*devPtrSeqOffsetsQ=*/nullptr, /*devPtrSeqOffsetsKV=*/nullptr, - get_cudnn_fe_dtype(q_dtype), /*workspace=*/nullptr, &workspace_size, + /*devPtrSeqOffsetsQ=*/nullptr, /*devPtrSeqOffsetsKV=*/nullptr, get_cudnn_fe_dtype(q_dtype), + /*workspace=*/nullptr, &workspace_size, /*stream=*/static_cast(0), handle); return {cudnn_frontend::error_code_t::OK, ""}; } catch (const std::exception &e) { diff --git a/transformer_engine/common/fused_attn/fused_attn_fp8.cu b/transformer_engine/common/fused_attn/fused_attn_fp8.cu index c9f7a9ee76..8a152cf489 100644 --- a/transformer_engine/common/fused_attn/fused_attn_fp8.cu +++ b/transformer_engine/common/fused_attn/fused_attn_fp8.cu @@ -3014,21 +3014,21 @@ cudnn_frontend::error_t is_supported_fp8_fwd( static_cast(num_gqa_groups), static_cast(max_seqlen_q), static_cast(max_seqlen_kv), static_cast(head_dim_qk), static_cast(head_dim_v), is_training, /*scaling_factor=*/1.0f, p_dropout, - qkv_layout, /*o_format=*/qkv_format, bias_type, mask_type, softmax_type, - window_size_left, window_size_right, bottom_right_diagonal, + qkv_layout, /*o_format=*/qkv_format, bias_type, mask_type, softmax_type, window_size_left, + window_size_right, bottom_right_diagonal, /*devPtrQ=*/nullptr, /*devPtrK=*/nullptr, /*devPtrV=*/nullptr, /*devPtrSoftmaxOffset=*/nullptr, /*devPtrM=*/nullptr, /*devPtrO=*/nullptr, /*devPtrDescaleQ=*/nullptr, /*devPtrDescaleK=*/nullptr, /*devPtrDescaleV=*/nullptr, /*devPtrDescaleS=*/nullptr, /*devPtrScaleS=*/nullptr, /*devPtrScaleO=*/nullptr, /*devPtrAmaxO=*/nullptr, /*devPtrAmaxS=*/nullptr, /*devPtrcuSeqlensQ=*/nullptr, /*devPtrcuSeqlensKV=*/nullptr, /*devPtrDropoutSeed=*/nullptr, - /*devPtrDropoutOffset=*/nullptr, get_cudnn_fe_dtype(q_dtype), - get_cudnn_fe_dtype(o_dtype), scaling_mode, + /*devPtrDropoutOffset=*/nullptr, get_cudnn_fe_dtype(q_dtype), get_cudnn_fe_dtype(o_dtype), + scaling_mode, /*qkv_scale_inv_format=*/NVTE_QKV_Format::NVTE_QKV_Format_NOT_SET, /*workspace=*/nullptr, &workspace_size, /*stream=*/static_cast(0), handle); return {cudnn_frontend::error_code_t::OK, ""}; - } catch (const std::exception &e) { + } catch (const std::exception& e) { return {cudnn_frontend::error_code_t::GRAPH_NOT_SUPPORTED, e.what()}; } catch (...) 
{ return {cudnn_frontend::error_code_t::GRAPH_NOT_SUPPORTED, @@ -3084,7 +3084,7 @@ cudnn_frontend::error_t is_supported_fp8_bwd( /*workspace=*/nullptr, &workspace_size, /*stream=*/static_cast(0), handle); return {cudnn_frontend::error_code_t::OK, ""}; - } catch (const std::exception &e) { + } catch (const std::exception& e) { return {cudnn_frontend::error_code_t::GRAPH_NOT_SUPPORTED, e.what()}; } catch (...) { return {cudnn_frontend::error_code_t::GRAPH_NOT_SUPPORTED, diff --git a/transformer_engine/pytorch/csrc/extensions/attention.cpp b/transformer_engine/pytorch/csrc/extensions/attention.cpp index d67cd4a6b9..256ede6e55 100644 --- a/transformer_engine/pytorch/csrc/extensions/attention.cpp +++ b/transformer_engine/pytorch/csrc/extensions/attention.cpp @@ -58,8 +58,8 @@ NVTE_Fused_Attn_Backend get_fused_attn_backend( is_training, static_cast(q_dtype), static_cast(kv_dtype), static_cast(q_dtype), NVTE_DELAYED_TENSOR_SCALING, qkv_layout, bias_type, attn_mask_type, softmax_type, p_dropout, num_attn_heads, num_gqa_groups, max_seqlen_q, - max_seqlen_kv, head_dim_qk, head_dim_v, window_size_left, window_size_right, - return_max_logit, cuda_graph, deterministic, handle, /*out_status=*/nullptr); + max_seqlen_kv, head_dim_qk, head_dim_v, window_size_left, window_size_right, return_max_logit, + cuda_graph, deterministic, handle, /*out_status=*/nullptr); return fused_attention_backend; } From 42bcd89036fc8d093a192985304018c4497b22ab Mon Sep 17 00:00:00 2001 From: Charlene Yang <8636796+cyanguwa@users.noreply.github.com> Date: Thu, 7 May 2026 14:11:46 -0700 Subject: [PATCH 03/11] replace code+string with string only Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com> --- .../common/fused_attn/fused_attn.cpp | 98 +++++++++---------- .../fused_attn_f16_arbitrary_seqlen.cu | 18 ++-- .../fused_attn_f16_arbitrary_seqlen.h | 19 ++-- .../common/fused_attn/fused_attn_fp8.cu | 24 ++--- .../common/fused_attn/fused_attn_fp8.h | 18 ++-- .../include/transformer_engine/fused_attn.h | 37 +++---- .../jax/csrc/extensions/attention.cpp | 6 +- .../pytorch/csrc/extensions/attention.cpp | 2 +- 8 files changed, 103 insertions(+), 119 deletions(-) diff --git a/transformer_engine/common/fused_attn/fused_attn.cpp b/transformer_engine/common/fused_attn/fused_attn.cpp index 95405c0d6f..961b503c1c 100644 --- a/transformer_engine/common/fused_attn/fused_attn.cpp +++ b/transformer_engine/common/fused_attn/fused_attn.cpp @@ -228,28 +228,17 @@ NVTE_QKV_Format nvte_get_kv_format(NVTE_QKV_Layout qkv_layout) { namespace { -// Per-thread storage for the message string handed back through -// NVTEFusedAttnBackendStatus::message. Re-used (cleared + re-populated) on every call to -// nvte_get_fused_attn_backend on this thread, which is exactly the lifetime documented in the -// public header. -thread_local std::string g_fused_attn_backend_status_buffer; - -// Apply (code, msg) to *out_status (if non-null), routing the message through the -// thread-local buffer so the returned `const char*` outlives this function call. 
-void set_status(NVTEFusedAttnBackendStatus *out_status, cudnn_frontend::error_code_t code, - const std::string &message) { - if (out_status == nullptr) return; - g_fused_attn_backend_status_buffer = message; - out_status->code = static_cast(code); - out_status->message = g_fused_attn_backend_status_buffer.c_str(); -} - -void set_status(NVTEFusedAttnBackendStatus *out_status, const cudnn_frontend::error_t &err) { - set_status(out_status, err.code, err.err_msg); -} - -void set_ok(NVTEFusedAttnBackendStatus *out_status) { - set_status(out_status, cudnn_frontend::error_code_t::OK, ""); +// Per-thread storage for the diagnostic string handed back through *out_reason. Re-used +// (cleared + re-populated) on every call to nvte_get_fused_attn_backend on this thread, +// which is exactly the lifetime documented in the public header. +thread_local std::string g_fused_attn_backend_reason_buffer; + +// Stash `reason` in the thread-local buffer and (if non-null) point *out_reason at it, +// so the returned `const char*` outlives this function call. +void set_reason(const char **out_reason, const std::string &reason) { + if (out_reason == nullptr) return; + g_fused_attn_backend_reason_buffer = reason; + *out_reason = g_fused_attn_backend_reason_buffer.c_str(); } } // namespace @@ -271,9 +260,9 @@ void set_ok(NVTEFusedAttnBackendStatus *out_status) { // executor cache-hits on. // 3. Return the selected backend, or NVTE_No_Backend if any probe rejects. // -// When `out_status` is non-null, it is filled with a code + message describing the -// rejection (or {OK, ""} on success). TE post-filter rejections synthesize an -// INVALID_VALUE entry; probe rejections forward the cuDNN-FE / NVTE_CHECK error verbatim. +// When `out_reason` is non-null, it is set to "" on success or to a tagged diagnostic +// string on rejection. TE post-filter rejections are tagged "[INVALID_VALUE] ..."; +// probe rejections forward the probe's tagged string verbatim. NVTE_Fused_Attn_Backend nvte_get_fused_attn_backend( bool is_training, NVTEDType q_dtype, NVTEDType kv_dtype, NVTEDType o_dtype, NVTEScalingMode scaling_mode, NVTE_QKV_Layout qkv_layout, NVTE_Bias_Type bias_type, @@ -281,11 +270,11 @@ NVTE_Fused_Attn_Backend nvte_get_fused_attn_backend( size_t num_attn_heads, size_t num_gqa_groups, size_t max_seqlen_q, size_t max_seqlen_kv, size_t head_dim_qk, size_t head_dim_v, int64_t window_size_left, int64_t window_size_right, bool return_max_logit, bool cuda_graph, bool deterministic, cudnnHandle_t handle, - NVTEFusedAttnBackendStatus *out_status) { + const char **out_reason) { using namespace transformer_engine; - // Initialize to OK so callers get a clean status on the success path without us having to + // Initialize to "" so callers get a clean status on the success path without us having to // remember to set it at every return. 
- set_ok(out_status); + set_reason(out_reason, ""); NVTE_CHECK(q_dtype == kv_dtype, "Q and KV must have the same data type."); const NVTE_QKV_Format qkv_format = nvte_get_qkv_format(qkv_layout); @@ -300,8 +289,9 @@ NVTE_Fused_Attn_Backend nvte_get_fused_attn_backend( layout_group, num_attn_heads, num_gqa_groups, max_seqlen_q, max_seqlen_kv, head_dim_qk, head_dim_v) == DType::kInt64); if (requires_64bit_ragged_offset && cudnn_runtime_version < 90500) { - set_status(out_status, cudnn_frontend::error_code_t::INVALID_VALUE, - "Configuration requires 64-bit ragged offsets, which require cuDNN >= 9.5."); + set_reason(out_reason, + "[INVALID_VALUE] Configuration requires 64-bit ragged offsets, which require " + "cuDNN >= 9.5."); return NVTE_Fused_Attn_Backend::NVTE_No_Backend; } @@ -310,8 +300,8 @@ NVTE_Fused_Attn_Backend nvte_get_fused_attn_backend( attn_mask_type != NVTE_Mask_Type::NVTE_PADDING_MASK && attn_mask_type != NVTE_Mask_Type::NVTE_PADDING_CAUSAL_MASK && attn_mask_type != NVTE_Mask_Type::NVTE_PADDING_CAUSAL_BOTTOM_RIGHT_MASK) { - set_status(out_status, cudnn_frontend::error_code_t::INVALID_VALUE, - "THD-format attention requires a padding-style mask " + set_reason(out_reason, + "[INVALID_VALUE] THD-format attention requires a padding-style mask " "(PADDING / PADDING_CAUSAL / PADDING_CAUSAL_BOTTOM_RIGHT)."); return NVTE_Fused_Attn_Backend::NVTE_No_Backend; } @@ -324,8 +314,8 @@ NVTE_Fused_Attn_Backend nvte_get_fused_attn_backend( attn_mask_type != NVTE_Mask_Type::NVTE_PADDING_MASK && attn_mask_type != NVTE_Mask_Type::NVTE_PADDING_CAUSAL_MASK && attn_mask_type != NVTE_Mask_Type::NVTE_PADDING_CAUSAL_BOTTOM_RIGHT_MASK) { - set_status(out_status, cudnn_frontend::error_code_t::INVALID_VALUE, - "Known cuDNN <= 9.15 capture quirk: training + bshd/sbhd + " + set_reason(out_reason, + "[INVALID_VALUE] Known cuDNN <= 9.15 capture quirk: training + bshd/sbhd + " "max_seqlen_kv % 128 != 0 + cuda_graph + non-padding mask is unsupported."); return NVTE_Fused_Attn_Backend::NVTE_No_Backend; } @@ -346,34 +336,34 @@ NVTE_Fused_Attn_Backend nvte_get_fused_attn_backend( if (is_fp8) { // TE-only FP8 post-filters: no 64-bit ragged offsets, no max-logit output. 
if (requires_64bit_ragged_offset) { - set_status(out_status, cudnn_frontend::error_code_t::INVALID_VALUE, - "FP8 fused attention does not support 64-bit ragged offsets."); + set_reason(out_reason, + "[INVALID_VALUE] FP8 fused attention does not support 64-bit ragged offsets."); return NVTE_Fused_Attn_Backend::NVTE_No_Backend; } if (return_max_logit) { - set_status(out_status, cudnn_frontend::error_code_t::INVALID_VALUE, - "FP8 fused attention does not support return_max_logit."); + set_reason(out_reason, + "[INVALID_VALUE] FP8 fused attention does not support return_max_logit."); return NVTE_Fused_Attn_Backend::NVTE_No_Backend; } const DType q_t = static_cast(q_dtype); const DType o_t = static_cast(o_dtype); - auto fwd_status = is_supported_fp8_fwd( + std::string fwd_reason = is_supported_fp8_fwd( probe_batch, num_attn_heads, num_gqa_groups, max_seqlen_q, max_seqlen_kv, head_dim_qk, head_dim_v, is_training, dropout, qkv_layout, bias_type, attn_mask_type, softmax_type, window_size_left, window_size_right, probe_bottom_right_diagonal, q_t, o_t, scaling_mode, handle); - if (fwd_status.is_bad()) { - set_status(out_status, fwd_status); + if (!fwd_reason.empty()) { + set_reason(out_reason, fwd_reason); return NVTE_Fused_Attn_Backend::NVTE_No_Backend; } if (is_training) { - auto bwd_status = is_supported_fp8_bwd( + std::string bwd_reason = is_supported_fp8_bwd( probe_batch, num_attn_heads, num_gqa_groups, max_seqlen_q, max_seqlen_kv, head_dim_qk, head_dim_v, dropout, qkv_layout, bias_type, attn_mask_type, softmax_type, window_size_left, window_size_right, probe_bottom_right_diagonal, deterministic, q_t, o_t, scaling_mode, handle); - if (bwd_status.is_bad()) { - set_status(out_status, bwd_status); + if (!bwd_reason.empty()) { + set_reason(out_reason, bwd_reason); return NVTE_Fused_Attn_Backend::NVTE_No_Backend; } } @@ -382,31 +372,31 @@ NVTE_Fused_Attn_Backend nvte_get_fused_attn_backend( if (is_f16_or_bf16) { const DType q_t = static_cast(q_dtype); - auto fwd_status = is_supported_f16_fwd( + std::string fwd_reason = is_supported_f16_fwd( probe_batch, num_attn_heads, num_gqa_groups, max_seqlen_q, max_seqlen_kv, head_dim_qk, head_dim_v, is_training, return_max_logit, dropout, qkv_layout, bias_type, attn_mask_type, softmax_type, window_size_left, window_size_right, probe_bottom_right_diagonal, q_t, handle); - if (fwd_status.is_bad()) { - set_status(out_status, fwd_status); + if (!fwd_reason.empty()) { + set_reason(out_reason, fwd_reason); return NVTE_Fused_Attn_Backend::NVTE_No_Backend; } if (is_training) { - auto bwd_status = is_supported_f16_bwd( + std::string bwd_reason = is_supported_f16_bwd( probe_batch, num_attn_heads, num_gqa_groups, max_seqlen_q, max_seqlen_kv, head_dim_qk, head_dim_v, dropout, qkv_layout, bias_type, attn_mask_type, softmax_type, window_size_left, window_size_right, probe_bottom_right_diagonal, deterministic, q_t, handle); - if (bwd_status.is_bad()) { - set_status(out_status, bwd_status); + if (!bwd_reason.empty()) { + set_reason(out_reason, bwd_reason); return NVTE_Fused_Attn_Backend::NVTE_No_Backend; } } return NVTE_Fused_Attn_Backend::NVTE_F16_arbitrary_seqlen; } - set_status(out_status, cudnn_frontend::error_code_t::INVALID_VALUE, - "Unsupported Q dtype for fused attention " + set_reason(out_reason, + "[INVALID_VALUE] Unsupported Q dtype for fused attention " "(only FP16/BF16/FP8_E4M3/FP8_E5M2 are routable)."); return NVTE_Fused_Attn_Backend::NVTE_No_Backend; } @@ -500,7 +490,7 @@ void nvte_fused_attn_fwd(const NVTETensor Q, const NVTETensor K, const NVTETenso 
is_training, Q_type, KV_type, O_type, scaling_mode, qkv_layout, bias_type, attn_mask_type, softmax_type, dropout, h_q, h_kv, max_seqlen_q, max_seqlen_kv, d_qk, d_v, window_size_left, window_size_right, return_max_logit, cuda_graph, /*deterministic=*/false, handle, - /*out_status=*/nullptr); + /*out_reason=*/nullptr); if (fused_attention_backend == NVTE_Fused_Attn_Backend::NVTE_F16_max512_seqlen) { fused_attn_max_512_fwd(b, h_q, max_seqlen_q, max_seqlen_kv, d_qk, is_training, attn_scale, @@ -589,7 +579,7 @@ void nvte_fused_attn_bwd(const NVTETensor Q, const NVTETensor K, const NVTETenso /*is_training=*/true, Q_type, KV_type, O_type, scaling_mode, qkv_layout, bias_type, attn_mask_type, softmax_type, dropout, h_q, h_kv, max_seqlen_q, max_seqlen_kv, d_qk, d_v, window_size_left, window_size_right, /*return_max_logit=*/false, cuda_graph, deterministic, - handle, /*out_status=*/nullptr); + handle, /*out_reason=*/nullptr); if (fused_attention_backend == NVTE_Fused_Attn_Backend::NVTE_F16_max512_seqlen) { Tensor *output_S = convertNVTETensorCheck(Aux_CTX_Tensors->tensors[0]); diff --git a/transformer_engine/common/fused_attn/fused_attn_f16_arbitrary_seqlen.cu b/transformer_engine/common/fused_attn/fused_attn_f16_arbitrary_seqlen.cu index 1ced84755c..70094c4e93 100644 --- a/transformer_engine/common/fused_attn/fused_attn_f16_arbitrary_seqlen.cu +++ b/transformer_engine/common/fused_attn/fused_attn_f16_arbitrary_seqlen.cu @@ -1385,7 +1385,7 @@ ProbeDims compute_probe_dims(int64_t batch, int64_t num_attn_heads, int64_t max_ } } // namespace -cudnn_frontend::error_t is_supported_f16_fwd( +std::string is_supported_f16_fwd( size_t batch, size_t num_attn_heads, size_t num_gqa_groups, size_t max_seqlen_q, size_t max_seqlen_kv, size_t head_dim_qk, size_t head_dim_v, bool is_training, bool return_max_logit, float p_dropout, NVTE_QKV_Layout qkv_layout, NVTE_Bias_Type bias_type, @@ -1416,16 +1416,15 @@ cudnn_frontend::error_t is_supported_f16_fwd( /*devPtrSeqOffsetsQ=*/nullptr, /*devPtrSeqOffsetsKV=*/nullptr, get_cudnn_fe_dtype(q_dtype), /*workspace=*/nullptr, &workspace_size, /*stream=*/static_cast(0), handle); - return {cudnn_frontend::error_code_t::OK, ""}; + return ""; } catch (const std::exception &e) { - return {cudnn_frontend::error_code_t::GRAPH_NOT_SUPPORTED, e.what()}; + return std::string("[GRAPH_NOT_SUPPORTED] ") + e.what(); } catch (...) { - return {cudnn_frontend::error_code_t::GRAPH_NOT_SUPPORTED, - "is_supported_f16_fwd: unknown failure"}; + return "[GRAPH_NOT_SUPPORTED] is_supported_f16_fwd: unknown failure"; } } -cudnn_frontend::error_t is_supported_f16_bwd( +std::string is_supported_f16_bwd( size_t batch, size_t num_attn_heads, size_t num_gqa_groups, size_t max_seqlen_q, size_t max_seqlen_kv, size_t head_dim_qk, size_t head_dim_v, float p_dropout, NVTE_QKV_Layout qkv_layout, NVTE_Bias_Type bias_type, NVTE_Mask_Type mask_type, @@ -1458,12 +1457,11 @@ cudnn_frontend::error_t is_supported_f16_bwd( /*devPtrSeqOffsetsQ=*/nullptr, /*devPtrSeqOffsetsKV=*/nullptr, get_cudnn_fe_dtype(q_dtype), /*workspace=*/nullptr, &workspace_size, /*stream=*/static_cast(0), handle); - return {cudnn_frontend::error_code_t::OK, ""}; + return ""; } catch (const std::exception &e) { - return {cudnn_frontend::error_code_t::GRAPH_NOT_SUPPORTED, e.what()}; + return std::string("[GRAPH_NOT_SUPPORTED] ") + e.what(); } catch (...) 
{ - return {cudnn_frontend::error_code_t::GRAPH_NOT_SUPPORTED, - "is_supported_f16_bwd: unknown failure"}; + return "[GRAPH_NOT_SUPPORTED] is_supported_f16_bwd: unknown failure"; } } diff --git a/transformer_engine/common/fused_attn/fused_attn_f16_arbitrary_seqlen.h b/transformer_engine/common/fused_attn/fused_attn_f16_arbitrary_seqlen.h index 38cf48c1f0..0eabe3e8dc 100644 --- a/transformer_engine/common/fused_attn/fused_attn_f16_arbitrary_seqlen.h +++ b/transformer_engine/common/fused_attn/fused_attn_f16_arbitrary_seqlen.h @@ -12,7 +12,8 @@ #define TRANSFORMER_ENGINE_COMMON_FUSED_ATTN_FUSED_ATTN_ARBITRARY_SEQLEN_H_ #include -#include + +#include #include "common/common.h" #include "transformer_engine/fused_attn.h" @@ -50,11 +51,15 @@ void fused_attn_arbitrary_seqlen_bwd( // Probe: drives cuDNN-FE (validate -> build_operation_graph -> create_execution_plans -> // check_support -> build_plans) for an F16/BF16 forward graph with the given configuration. -// Returns the cuDNN-FE status: error_code_t::OK iff the graph compiles end-to-end. On OK, -// the built graph is inserted into the same thread-local cache used by -// fused_attn_arbitrary_seqlen_fwd_impl, so the executor cache-hits on matching descriptors. -// On rejection, err_msg contains the underlying cuDNN-FE / NVTE_CHECK message. -cudnn_frontend::error_t is_supported_f16_fwd( +// Returns an empty string iff the graph compiles end-to-end; on OK the built graph is +// inserted into the same thread-local cache used by fused_attn_arbitrary_seqlen_fwd_impl, +// so the executor cache-hits on matching descriptors. +// +// On rejection, returns a non-empty diagnostic of the form +// "[CATEGORY] message" +// where CATEGORY is a stable tag mirroring cudnn_frontend::error_code_t names +// (e.g. GRAPH_NOT_SUPPORTED for cuDNN-FE rejections forwarded from the support chain). +std::string is_supported_f16_fwd( size_t batch, size_t num_attn_heads, size_t num_gqa_groups, size_t max_seqlen_q, size_t max_seqlen_kv, size_t head_dim_qk, size_t head_dim_v, bool is_training, bool return_max_logit, float p_dropout, NVTE_QKV_Layout qkv_layout, NVTE_Bias_Type bias_type, @@ -62,7 +67,7 @@ cudnn_frontend::error_t is_supported_f16_fwd( int64_t window_size_right, bool bottom_right_diagonal, DType q_dtype, cudnnHandle_t handle); // Probe: same as above for the F16/BF16 backward graph. 
-cudnn_frontend::error_t is_supported_f16_bwd( +std::string is_supported_f16_bwd( size_t batch, size_t num_attn_heads, size_t num_gqa_groups, size_t max_seqlen_q, size_t max_seqlen_kv, size_t head_dim_qk, size_t head_dim_v, float p_dropout, NVTE_QKV_Layout qkv_layout, NVTE_Bias_Type bias_type, NVTE_Mask_Type mask_type, diff --git a/transformer_engine/common/fused_attn/fused_attn_fp8.cu b/transformer_engine/common/fused_attn/fused_attn_fp8.cu index 8a152cf489..27bd0af3f3 100644 --- a/transformer_engine/common/fused_attn/fused_attn_fp8.cu +++ b/transformer_engine/common/fused_attn/fused_attn_fp8.cu @@ -2992,7 +2992,7 @@ void fused_attn_fp8_bwd( } } -cudnn_frontend::error_t is_supported_fp8_fwd( +std::string is_supported_fp8_fwd( size_t batch, size_t num_attn_heads, size_t num_gqa_groups, size_t max_seqlen_q, size_t max_seqlen_kv, size_t head_dim_qk, size_t head_dim_v, bool is_training, float p_dropout, NVTE_QKV_Layout qkv_layout, NVTE_Bias_Type bias_type, NVTE_Mask_Type mask_type, @@ -3004,8 +3004,7 @@ cudnn_frontend::error_t is_supported_fp8_fwd( const NVTE_QKV_Format qkv_format = nvte_get_qkv_format(qkv_layout); if (qkv_format != NVTE_QKV_Format::NVTE_BSHD && qkv_format != NVTE_QKV_Format::NVTE_SBHD && qkv_format != NVTE_QKV_Format::NVTE_BHSD) { - return {cudnn_frontend::error_code_t::INVALID_VALUE, - "FP8 fused attention only supports BSHD/SBHD/BHSD layouts."}; + return "[INVALID_VALUE] FP8 fused attention only supports BSHD/SBHD/BHSD layouts."; } size_t workspace_size = 0; try { @@ -3027,16 +3026,15 @@ cudnn_frontend::error_t is_supported_fp8_fwd( /*qkv_scale_inv_format=*/NVTE_QKV_Format::NVTE_QKV_Format_NOT_SET, /*workspace=*/nullptr, &workspace_size, /*stream=*/static_cast(0), handle); - return {cudnn_frontend::error_code_t::OK, ""}; + return ""; } catch (const std::exception& e) { - return {cudnn_frontend::error_code_t::GRAPH_NOT_SUPPORTED, e.what()}; + return std::string("[GRAPH_NOT_SUPPORTED] ") + e.what(); } catch (...) { - return {cudnn_frontend::error_code_t::GRAPH_NOT_SUPPORTED, - "is_supported_fp8_fwd: unknown failure"}; + return "[GRAPH_NOT_SUPPORTED] is_supported_fp8_fwd: unknown failure"; } } -cudnn_frontend::error_t is_supported_fp8_bwd( +std::string is_supported_fp8_bwd( size_t batch, size_t num_attn_heads, size_t num_gqa_groups, size_t max_seqlen_q, size_t max_seqlen_kv, size_t head_dim_qk, size_t head_dim_v, float p_dropout, NVTE_QKV_Layout qkv_layout, NVTE_Bias_Type bias_type, NVTE_Mask_Type mask_type, @@ -3046,8 +3044,7 @@ cudnn_frontend::error_t is_supported_fp8_bwd( const NVTE_QKV_Format qkv_format = nvte_get_qkv_format(qkv_layout); if (qkv_format != NVTE_QKV_Format::NVTE_BSHD && qkv_format != NVTE_QKV_Format::NVTE_SBHD && qkv_format != NVTE_QKV_Format::NVTE_BHSD) { - return {cudnn_frontend::error_code_t::INVALID_VALUE, - "FP8 fused attention only supports BSHD/SBHD/BHSD layouts."}; + return "[INVALID_VALUE] FP8 fused attention only supports BSHD/SBHD/BHSD layouts."; } // For FP8 bwd, dO data type matches O data type and dQKV data type matches Q data type // (this mirrors the assumption used by callers of fused_attn_fp8_bwd in TE). 
@@ -3083,12 +3080,11 @@ cudnn_frontend::error_t is_supported_fp8_bwd( /*do_scale_inv_format=*/NVTE_QKV_Format::NVTE_QKV_Format_NOT_SET, /*workspace=*/nullptr, &workspace_size, /*stream=*/static_cast(0), handle); - return {cudnn_frontend::error_code_t::OK, ""}; + return ""; } catch (const std::exception& e) { - return {cudnn_frontend::error_code_t::GRAPH_NOT_SUPPORTED, e.what()}; + return std::string("[GRAPH_NOT_SUPPORTED] ") + e.what(); } catch (...) { - return {cudnn_frontend::error_code_t::GRAPH_NOT_SUPPORTED, - "is_supported_fp8_bwd: unknown failure"}; + return "[GRAPH_NOT_SUPPORTED] is_supported_fp8_bwd: unknown failure"; } } diff --git a/transformer_engine/common/fused_attn/fused_attn_fp8.h b/transformer_engine/common/fused_attn/fused_attn_fp8.h index 5c7f11d80e..f91cdcf291 100644 --- a/transformer_engine/common/fused_attn/fused_attn_fp8.h +++ b/transformer_engine/common/fused_attn/fused_attn_fp8.h @@ -8,7 +8,7 @@ * \brief Functions for fused attention for FP8 with seqlen <= 512 */ -#include +#include #include "transformer_engine/fused_attn.h" #include "transformer_engine/transformer_engine.h" @@ -44,10 +44,15 @@ void fused_attn_fp8_bwd( // Probe: drives cuDNN-FE (validate -> build_operation_graph -> create_execution_plans -> // check_support -> build_plans) for an FP8 forward graph with the given configuration. -// Returns the cuDNN-FE status: error_code_t::OK iff the graph compiles end-to-end. On OK, -// the built graph is inserted into the same thread-local cache used by fused_attn_fp8_fwd_impl. -// On rejection, err_msg contains the underlying cuDNN-FE / NVTE_CHECK message. -cudnn_frontend::error_t is_supported_fp8_fwd( +// Returns an empty string iff the graph compiles end-to-end; on OK the built graph is +// inserted into the same thread-local cache used by fused_attn_fp8_fwd_impl. +// +// On rejection, returns a non-empty diagnostic of the form +// "[CATEGORY] message" +// where CATEGORY mirrors cudnn_frontend::error_code_t names (INVALID_VALUE for the +// FP8-only layout pre-filter, GRAPH_NOT_SUPPORTED for cuDNN-FE rejections forwarded +// from the support chain). +std::string is_supported_fp8_fwd( size_t batch, size_t num_attn_heads, size_t num_gqa_groups, size_t max_seqlen_q, size_t max_seqlen_kv, size_t head_dim_qk, size_t head_dim_v, bool is_training, float p_dropout, NVTE_QKV_Layout qkv_layout, NVTE_Bias_Type bias_type, NVTE_Mask_Type mask_type, @@ -56,12 +61,11 @@ cudnn_frontend::error_t is_supported_fp8_fwd( cudnnHandle_t handle); // Probe: same as above for the FP8 backward graph. 
-cudnn_frontend::error_t is_supported_fp8_bwd( +std::string is_supported_fp8_bwd( size_t batch, size_t num_attn_heads, size_t num_gqa_groups, size_t max_seqlen_q, size_t max_seqlen_kv, size_t head_dim_qk, size_t head_dim_v, float p_dropout, NVTE_QKV_Layout qkv_layout, NVTE_Bias_Type bias_type, NVTE_Mask_Type mask_type, NVTE_Softmax_Type softmax_type, int64_t window_size_left, int64_t window_size_right, bool bottom_right_diagonal, bool deterministic, DType q_dtype, DType o_dtype, NVTEScalingMode scaling_mode, cudnnHandle_t handle); ->>>>>>> c9006435 (refactor nvte_get_fused_attn_backend with FE calls) } // namespace transformer_engine diff --git a/transformer_engine/common/include/transformer_engine/fused_attn.h b/transformer_engine/common/include/transformer_engine/fused_attn.h index 787e97d628..bbcdf08995 100644 --- a/transformer_engine/common/include/transformer_engine/fused_attn.h +++ b/transformer_engine/common/include/transformer_engine/fused_attn.h @@ -198,22 +198,6 @@ NVTE_QKV_Format nvte_get_q_format(NVTE_QKV_Layout qkv_layout); */ NVTE_QKV_Format nvte_get_kv_format(NVTE_QKV_Layout qkv_layout); -/*! \struct NVTEFusedAttnBackendStatus - * \brief Diagnostic info from \c nvte_get_fused_attn_backend. - * - * Filled by \c nvte_get_fused_attn_backend when the caller passes a non-NULL pointer. - * When the routing decision is supported, \c code is 0 and \c message is the empty - * string. When the routing rejects the configuration, \c code is the underlying - * cuDNN-FE \c cudnn_frontend::error_code_t cast to \c int (TE-synthesized post-filter - * rejections use \c INVALID_VALUE), and \c message is a null-terminated human-readable - * reason that points into per-thread storage owned by TE. The pointer is valid only - * until the next call to \c nvte_get_fused_attn_backend on the same thread. - */ -typedef struct NVTEFusedAttnBackendStatus { - int code; - const char *message; -} NVTEFusedAttnBackendStatus; - /*! \brief Get fused attention backend based on input parameters. * * Authoritative routing: when a non-NVTE_No_Backend value is returned, the configuration @@ -249,12 +233,19 @@ typedef struct NVTEFusedAttnBackendStatus { * \param[in] cuda_graph Whether cuda graph capture is enabled or not. * \param[in] deterministic Whether determinism is required or not. * \param[in] handle cuDNN handle used for the support chain. Required. - * \param[out] out_status Optional. When non-NULL, populated with a code + - * message describing why the configuration was - * rejected (NVTE_No_Backend) or with code=0 and - * message="" on success. The message buffer lives in - * thread-local storage and is overwritten on every - * call on the same thread. + * \param[out] out_reason Optional. When non-NULL, set to a null-terminated + * diagnostic string describing why the configuration + * was rejected (NVTE_No_Backend) or set to "" on + * success. Rejection messages are tagged with a + * stable category prefix that mirrors + * \c cudnn_frontend::error_code_t, e.g. + * \c "[INVALID_VALUE] ..." for TE post-filter + * rejections and FP8 layout pre-filter rejections, + * \c "[GRAPH_NOT_SUPPORTED] ..." for cuDNN-FE + * rejections forwarded from the support chain. The + * pointer points into per-thread storage owned by TE + * and is valid only until the next call to + * \c nvte_get_fused_attn_backend on the same thread. 
*/ NVTE_Fused_Attn_Backend nvte_get_fused_attn_backend( bool is_training, NVTEDType q_dtype, NVTEDType kv_dtype, NVTEDType o_dtype, @@ -263,7 +254,7 @@ NVTE_Fused_Attn_Backend nvte_get_fused_attn_backend( size_t num_attn_heads, size_t num_gqa_groups, size_t max_seqlen_q, size_t max_seqlen_kv, size_t head_dim_qk, size_t head_dim_v, int64_t window_size_left, int64_t window_size_right, bool return_max_logit, bool cuda_graph, bool deterministic, cudnnHandle_t handle, - NVTEFusedAttnBackendStatus *out_status); + const char **out_reason); /*! \brief Compute dot product attention with separate Q, K and V. * diff --git a/transformer_engine/jax/csrc/extensions/attention.cpp b/transformer_engine/jax/csrc/extensions/attention.cpp index c6a8897089..669570daa5 100644 --- a/transformer_engine/jax/csrc/extensions/attention.cpp +++ b/transformer_engine/jax/csrc/extensions/attention.cpp @@ -25,7 +25,7 @@ NVTE_Fused_Attn_Backend GetFusedAttnBackend( mask_type, softmax_type, dropout_probability, q_attn_heads, kv_attn_heads, q_max_seqlen, kv_max_seqlen, qk_head_dim, v_head_dim, window_size_left, window_size_right, /*return_max_logit=*/false, /*cuda_graph=*/false, deterministic, handle, - /*out_status=*/nullptr); + /*out_reason=*/nullptr); return backend; } @@ -282,7 +282,7 @@ static void FusedAttnForwardImpl( static_cast(dtype), NVTE_DELAYED_TENSOR_SCALING, qkv_layout, bias_type, mask_type, softmax_type, dropout_probability, attn_heads, num_gqa_groups, q_max_seqlen, kv_max_seqlen, qk_head_dim, v_head_dim, window_size_left, window_size_right, /*return_max_logit=*/false, - /*cuda_graph=*/false, deterministic, _handle_fwd, /*out_status=*/nullptr); + /*cuda_graph=*/false, deterministic, _handle_fwd, /*out_reason=*/nullptr); nvte_populate_rng_state_async(rng_state, seed, q_max_seqlen, kv_max_seqlen, backend, stream); /* Auxiliary tensors (to be propagated to the backward pass later) */ @@ -560,7 +560,7 @@ static void FusedAttnBackwardImpl( static_cast(dtype), NVTE_DELAYED_TENSOR_SCALING, qkv_layout, bias_type, mask_type, softmax_type, dropout_probability, attn_heads, num_gqa_groups, q_max_seqlen, kv_max_seqlen, qk_head_dim, v_head_dim, window_size_left, window_size_right, /*return_max_logit=*/false, - /*cuda_graph=*/false, deterministic, _handle_bwd, /*out_status=*/nullptr); + /*cuda_graph=*/false, deterministic, _handle_bwd, /*out_reason=*/nullptr); PrepareFusedAttnBackwardAuxTensors(&aux_input_tensors, input_batch, bias_batch, attn_heads, bias_heads, q_max_seqlen, kv_max_seqlen, dtype, backend, softmax_aux, rng_state, bias, softmax_offset); diff --git a/transformer_engine/pytorch/csrc/extensions/attention.cpp b/transformer_engine/pytorch/csrc/extensions/attention.cpp index 256ede6e55..3af4ba3831 100644 --- a/transformer_engine/pytorch/csrc/extensions/attention.cpp +++ b/transformer_engine/pytorch/csrc/extensions/attention.cpp @@ -59,7 +59,7 @@ NVTE_Fused_Attn_Backend get_fused_attn_backend( static_cast(q_dtype), NVTE_DELAYED_TENSOR_SCALING, qkv_layout, bias_type, attn_mask_type, softmax_type, p_dropout, num_attn_heads, num_gqa_groups, max_seqlen_q, max_seqlen_kv, head_dim_qk, head_dim_v, window_size_left, window_size_right, return_max_logit, - cuda_graph, deterministic, handle, /*out_status=*/nullptr); + cuda_graph, deterministic, handle, /*out_reason=*/nullptr); return fused_attention_backend; } From de8e81457b8576b222313ae068042ff6f5b68598 Mon Sep 17 00:00:00 2001 From: Charlene Yang <8636796+cyanguwa@users.noreply.github.com> Date: Thu, 7 May 2026 17:05:13 -0700 Subject: [PATCH 04/11] clean up 
logic/comments/structure Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com> --- tests/jax/test_fused_attn.py | 4 +- .../common/fused_attn/fused_attn.cpp | 113 +++++--------- .../fused_attn_f16_arbitrary_seqlen.cu | 140 ++++++++---------- .../fused_attn_f16_arbitrary_seqlen.h | 21 +-- .../common/fused_attn/fused_attn_fp8.cu | 26 ++-- .../common/fused_attn/fused_attn_fp8.h | 21 +-- .../include/transformer_engine/fused_attn.h | 34 +---- .../common/util/pybind_helper.h | 7 + .../jax/cpp_extensions/attention.py | 20 ++- transformer_engine/jax/csrc/extensions.h | 15 +- .../jax/csrc/extensions/attention.cpp | 32 ++-- .../attention/dot_product_attention/utils.py | 9 +- transformer_engine/pytorch/csrc/extensions.h | 15 +- .../pytorch/csrc/extensions/attention.cpp | 30 ++-- 14 files changed, 209 insertions(+), 278 deletions(-) diff --git a/tests/jax/test_fused_attn.py b/tests/jax/test_fused_attn.py index 8b727b1d43..f21c6cb2d0 100644 --- a/tests/jax/test_fused_attn.py +++ b/tests/jax/test_fused_attn.py @@ -444,7 +444,7 @@ def _check_configs(self): "is either BSHD_BSHD_BSHD or THD_THD_THD" ) - self.backend = FusedAttnHelper( + self.backend, message = FusedAttnHelper( self.is_training, self.dtype, self.dtype, @@ -462,7 +462,7 @@ def _check_configs(self): (-1, -1) if self.window_size is None else self.window_size, ).get_fused_attn_backend() if self.backend != NVTE_Fused_Attn_Backend.NVTE_F16_arbitrary_seqlen: - pytest.skip("Unsupported inputs combination or device compute capability.") + pytest.skip(message) if ( self.attn_bias_type == AttnBiasType.POST_SCALE_BIAS diff --git a/transformer_engine/common/fused_attn/fused_attn.cpp b/transformer_engine/common/fused_attn/fused_attn.cpp index 961b503c1c..9587693645 100644 --- a/transformer_engine/common/fused_attn/fused_attn.cpp +++ b/transformer_engine/common/fused_attn/fused_attn.cpp @@ -228,41 +228,19 @@ NVTE_QKV_Format nvte_get_kv_format(NVTE_QKV_Layout qkv_layout) { namespace { -// Per-thread storage for the diagnostic string handed back through *out_reason. Re-used -// (cleared + re-populated) on every call to nvte_get_fused_attn_backend on this thread, -// which is exactly the lifetime documented in the public header. -thread_local std::string g_fused_attn_backend_reason_buffer; - -// Stash `reason` in the thread-local buffer and (if non-null) point *out_reason at it, -// so the returned `const char*` outlives this function call. -void set_reason(const char **out_reason, const std::string &reason) { - if (out_reason == nullptr) return; - g_fused_attn_backend_reason_buffer = reason; - *out_reason = g_fused_attn_backend_reason_buffer.c_str(); +// per-thread storage for the diagnostic string +// re-used (cleared + re-populated) on every call to nvte_get_fused_attn_backend on this thread +thread_local std::string fused_attn_backend_message_buffer; + +void set_message(const char **message, const std::string &reason) { + if (message == nullptr) return; + fused_attn_backend_message_buffer = reason; + *message = fused_attn_backend_message_buffer.c_str(); } } // namespace // select a backend for fused attention -// -// Routing flow: -// 1. Apply TE post-filters that encode policies cuDNN-FE doesn't model directly: -// a. requires_64bit_ragged_offset -> cudnn >= 9.5 -// b. qkv_format == THD requires a padding-style mask -// c. cuDNN <= 9.15 + is_training + bshd/sbhd + max_seqlen_kv % 128 != 0 + -// cuda_graph + non-padding mask is rejected (known capture quirk) -// 2. 
Dispatch by dtype to the appropriate probe(s): -// - FP8 (E4M3/E5M2): is_supported_fp8_fwd (+ is_supported_fp8_bwd if training) -// - FP16/BF16: is_supported_f16_fwd (+ is_supported_f16_bwd if training) -// The probes call the same _impl that the executor uses, with workspace=nullptr. -// They run validate -> build_operation_graph -> create_execution_plans -> -// check_support -> build_plans, and populate a thread-local cache that the -// executor cache-hits on. -// 3. Return the selected backend, or NVTE_No_Backend if any probe rejects. -// -// When `out_reason` is non-null, it is set to "" on success or to a tagged diagnostic -// string on rejection. TE post-filter rejections are tagged "[INVALID_VALUE] ..."; -// probe rejections forward the probe's tagged string verbatim. NVTE_Fused_Attn_Backend nvte_get_fused_attn_backend( bool is_training, NVTEDType q_dtype, NVTEDType kv_dtype, NVTEDType o_dtype, NVTEScalingMode scaling_mode, NVTE_QKV_Layout qkv_layout, NVTE_Bias_Type bias_type, @@ -270,62 +248,50 @@ NVTE_Fused_Attn_Backend nvte_get_fused_attn_backend( size_t num_attn_heads, size_t num_gqa_groups, size_t max_seqlen_q, size_t max_seqlen_kv, size_t head_dim_qk, size_t head_dim_v, int64_t window_size_left, int64_t window_size_right, bool return_max_logit, bool cuda_graph, bool deterministic, cudnnHandle_t handle, - const char **out_reason) { + const char **message) { using namespace transformer_engine; - // Initialize to "" so callers get a clean status on the success path without us having to - // remember to set it at every return. - set_reason(out_reason, ""); + set_message(message, ""); NVTE_CHECK(q_dtype == kv_dtype, "Q and KV must have the same data type."); const NVTE_QKV_Format qkv_format = nvte_get_qkv_format(qkv_layout); const NVTE_QKV_Layout_Group layout_group = nvte_get_qkv_layout_group(qkv_layout); const auto cudnn_runtime_version = cudnnGetVersion(); - // ---------- TE post-filters (apply before delegating to cuDNN-FE) ---------- - - // (1) Ragged-offset width: cuDNN < 9.5 only supports 32-bit offsets. + // THD + 64-bit ragged offsets require cuDNN >= 9.5 const bool requires_64bit_ragged_offset = (qkv_format == NVTE_THD && fused_attn::get_ragged_offset_dtype( layout_group, num_attn_heads, num_gqa_groups, max_seqlen_q, max_seqlen_kv, head_dim_qk, head_dim_v) == DType::kInt64); if (requires_64bit_ragged_offset && cudnn_runtime_version < 90500) { - set_reason(out_reason, - "[INVALID_VALUE] Configuration requires 64-bit ragged offsets, which require " + set_message(message, + "Configuration requires 64-bit ragged offsets, which require " "cuDNN >= 9.5."); return NVTE_Fused_Attn_Backend::NVTE_No_Backend; } - // (2) THD requires a padding-style mask. + // THD requires padding-style mask if (qkv_format == NVTE_QKV_Format::NVTE_THD && attn_mask_type != NVTE_Mask_Type::NVTE_PADDING_MASK && attn_mask_type != NVTE_Mask_Type::NVTE_PADDING_CAUSAL_MASK && attn_mask_type != NVTE_Mask_Type::NVTE_PADDING_CAUSAL_BOTTOM_RIGHT_MASK) { - set_reason(out_reason, - "[INVALID_VALUE] THD-format attention requires a padding-style mask " - "(PADDING / PADDING_CAUSAL / PADDING_CAUSAL_BOTTOM_RIGHT)."); + set_message(message, + "THD format requires PADDING / PADDING_CAUSAL / PADDING_CAUSAL_BOTTOM_RIGHT mask."); return NVTE_Fused_Attn_Backend::NVTE_No_Backend; } - // (3) cuDNN-Graph capture quirk on cuDNN <= 9.15: training + bshd/sbhd with - // max_seqlen_kv % 128 != 0 + cuda_graph + non-padding mask hangs/miscompiles. 
+ // avoid CUDA graph issue with cuDNN <= 9.15 if (cudnn_runtime_version <= 91500 && is_training && (qkv_format == NVTE_QKV_Format::NVTE_BSHD || qkv_format == NVTE_QKV_Format::NVTE_SBHD) && (max_seqlen_kv % 128 != 0) && cuda_graph && attn_mask_type != NVTE_Mask_Type::NVTE_PADDING_MASK && attn_mask_type != NVTE_Mask_Type::NVTE_PADDING_CAUSAL_MASK && attn_mask_type != NVTE_Mask_Type::NVTE_PADDING_CAUSAL_BOTTOM_RIGHT_MASK) { - set_reason(out_reason, - "[INVALID_VALUE] Known cuDNN <= 9.15 capture quirk: training + bshd/sbhd + " - "max_seqlen_kv % 128 != 0 + cuda_graph + non-padding mask is unsupported."); + set_message(message, + "Known cuDNN <= 9.15 issue with CUDA graph. Please upgrade cuDNN."); return NVTE_Fused_Attn_Backend::NVTE_No_Backend; } - // ---------- Dispatch by dtype ---------- - - // Probes use a single-batch graph; capability checks in cuDNN-FE are batch-agnostic. constexpr size_t probe_batch = 1; - // bottom_right_diagonal is a runtime API knob the router doesn't see; the BRCM-via-mask - // case is captured by attn_mask_type, so we probe with the default top-left alignment. constexpr bool probe_bottom_right_diagonal = false; const bool is_fp8 = @@ -334,36 +300,30 @@ NVTE_Fused_Attn_Backend nvte_get_fused_attn_backend( (q_dtype == NVTEDType::kNVTEFloat16 || q_dtype == NVTEDType::kNVTEBFloat16); if (is_fp8) { - // TE-only FP8 post-filters: no 64-bit ragged offsets, no max-logit output. - if (requires_64bit_ragged_offset) { - set_reason(out_reason, - "[INVALID_VALUE] FP8 fused attention does not support 64-bit ragged offsets."); - return NVTE_Fused_Attn_Backend::NVTE_No_Backend; - } if (return_max_logit) { - set_reason(out_reason, - "[INVALID_VALUE] FP8 fused attention does not support return_max_logit."); + set_message(message, + "FP8 fused attention does not support return_max_logit=True."); return NVTE_Fused_Attn_Backend::NVTE_No_Backend; } - const DType q_t = static_cast(q_dtype); + const DType qkv_t = static_cast(q_dtype); const DType o_t = static_cast(o_dtype); std::string fwd_reason = is_supported_fp8_fwd( probe_batch, num_attn_heads, num_gqa_groups, max_seqlen_q, max_seqlen_kv, head_dim_qk, head_dim_v, is_training, dropout, qkv_layout, bias_type, attn_mask_type, softmax_type, - window_size_left, window_size_right, probe_bottom_right_diagonal, q_t, o_t, scaling_mode, + window_size_left, window_size_right, probe_bottom_right_diagonal, qkv_t, o_t, scaling_mode, handle); if (!fwd_reason.empty()) { - set_reason(out_reason, fwd_reason); + set_message(message, fwd_reason); return NVTE_Fused_Attn_Backend::NVTE_No_Backend; } if (is_training) { std::string bwd_reason = is_supported_fp8_bwd( probe_batch, num_attn_heads, num_gqa_groups, max_seqlen_q, max_seqlen_kv, head_dim_qk, head_dim_v, dropout, qkv_layout, bias_type, attn_mask_type, softmax_type, - window_size_left, window_size_right, probe_bottom_right_diagonal, deterministic, q_t, o_t, - scaling_mode, handle); + window_size_left, window_size_right, probe_bottom_right_diagonal, deterministic, qkv_t, + o_t, scaling_mode, handle); if (!bwd_reason.empty()) { - set_reason(out_reason, bwd_reason); + set_message(message, bwd_reason); return NVTE_Fused_Attn_Backend::NVTE_No_Backend; } } @@ -371,33 +331,32 @@ NVTE_Fused_Attn_Backend nvte_get_fused_attn_backend( } if (is_f16_or_bf16) { - const DType q_t = static_cast(q_dtype); + const DType qkv_t = static_cast(q_dtype); std::string fwd_reason = is_supported_f16_fwd( probe_batch, num_attn_heads, num_gqa_groups, max_seqlen_q, max_seqlen_kv, head_dim_qk, head_dim_v, is_training, 
return_max_logit, dropout, qkv_layout, bias_type, attn_mask_type, - softmax_type, window_size_left, window_size_right, probe_bottom_right_diagonal, q_t, + softmax_type, window_size_left, window_size_right, probe_bottom_right_diagonal, qkv_t, handle); if (!fwd_reason.empty()) { - set_reason(out_reason, fwd_reason); + set_message(message, fwd_reason); return NVTE_Fused_Attn_Backend::NVTE_No_Backend; } if (is_training) { std::string bwd_reason = is_supported_f16_bwd( probe_batch, num_attn_heads, num_gqa_groups, max_seqlen_q, max_seqlen_kv, head_dim_qk, head_dim_v, dropout, qkv_layout, bias_type, attn_mask_type, softmax_type, - window_size_left, window_size_right, probe_bottom_right_diagonal, deterministic, q_t, + window_size_left, window_size_right, probe_bottom_right_diagonal, deterministic, qkv_t, handle); if (!bwd_reason.empty()) { - set_reason(out_reason, bwd_reason); + set_message(message, bwd_reason); return NVTE_Fused_Attn_Backend::NVTE_No_Backend; } } return NVTE_Fused_Attn_Backend::NVTE_F16_arbitrary_seqlen; } - set_reason(out_reason, - "[INVALID_VALUE] Unsupported Q dtype for fused attention " - "(only FP16/BF16/FP8_E4M3/FP8_E5M2 are routable)."); + set_message(message, + "Unsupported QKV dtype qkv_dtype=" + std::to_string(q_dtype) + " ."); return NVTE_Fused_Attn_Backend::NVTE_No_Backend; } @@ -490,7 +449,7 @@ void nvte_fused_attn_fwd(const NVTETensor Q, const NVTETensor K, const NVTETenso is_training, Q_type, KV_type, O_type, scaling_mode, qkv_layout, bias_type, attn_mask_type, softmax_type, dropout, h_q, h_kv, max_seqlen_q, max_seqlen_kv, d_qk, d_v, window_size_left, window_size_right, return_max_logit, cuda_graph, /*deterministic=*/false, handle, - /*out_reason=*/nullptr); + /*message=*/nullptr); if (fused_attention_backend == NVTE_Fused_Attn_Backend::NVTE_F16_max512_seqlen) { fused_attn_max_512_fwd(b, h_q, max_seqlen_q, max_seqlen_kv, d_qk, is_training, attn_scale, @@ -579,7 +538,7 @@ void nvte_fused_attn_bwd(const NVTETensor Q, const NVTETensor K, const NVTETenso /*is_training=*/true, Q_type, KV_type, O_type, scaling_mode, qkv_layout, bias_type, attn_mask_type, softmax_type, dropout, h_q, h_kv, max_seqlen_q, max_seqlen_kv, d_qk, d_v, window_size_left, window_size_right, /*return_max_logit=*/false, cuda_graph, deterministic, - handle, /*out_reason=*/nullptr); + handle, /*message=*/nullptr); if (fused_attention_backend == NVTE_Fused_Attn_Backend::NVTE_F16_max512_seqlen) { Tensor *output_S = convertNVTETensorCheck(Aux_CTX_Tensors->tensors[0]); diff --git a/transformer_engine/common/fused_attn/fused_attn_f16_arbitrary_seqlen.cu b/transformer_engine/common/fused_attn/fused_attn_f16_arbitrary_seqlen.cu index 70094c4e93..81e66d8800 100644 --- a/transformer_engine/common/fused_attn/fused_attn_f16_arbitrary_seqlen.cu +++ b/transformer_engine/common/fused_attn/fused_attn_f16_arbitrary_seqlen.cu @@ -1334,31 +1334,18 @@ void fused_attn_arbitrary_seqlen_bwd( } } -namespace { -// Probe-time defaults for runtime-only quantities the router doesn't see (paged-KV dims, -// ragged max-tokens, bias dims). These produce a graph whose support surface matches the -// real executor's: for non-paged / non-ragged paths these are unused inside the impl; -// for ragged-THD we rebind to worst-case bounds; for paged we use 1 page of full s_kv per -// batch (= same dims as non-paged), so cuDNN-FE applies the paged-attention support rules. 
-struct ProbeDims { - int64_t max_b; - int64_t max_t_q; - int64_t max_t_kv; - int64_t num_pages_k; - int64_t num_pages_v; - int64_t page_size_k; - int64_t page_size_v; - int64_t max_pages_per_seq_k; - int64_t max_pages_per_seq_v; - int64_t bias_b; - int64_t bias_h; - int64_t bias_sq; - int64_t bias_skv; -}; - -ProbeDims compute_probe_dims(int64_t batch, int64_t num_attn_heads, int64_t max_seqlen_q, - int64_t max_seqlen_kv, NVTE_QKV_Layout qkv_layout, - NVTE_Bias_Type bias_type) { +std::string is_supported_f16_fwd( + size_t batch, size_t num_attn_heads, size_t num_gqa_groups, size_t max_seqlen_q, + size_t max_seqlen_kv, size_t head_dim_qk, size_t head_dim_v, bool is_training, + bool return_max_logit, float p_dropout, NVTE_QKV_Layout qkv_layout, NVTE_Bias_Type bias_type, + NVTE_Mask_Type mask_type, NVTE_Softmax_Type softmax_type, int64_t window_size_left, + int64_t window_size_right, bool bottom_right_diagonal, DType qkv_dtype, + cudnnHandle_t handle) { + const auto b = static_cast(batch); + const auto h = static_cast(num_attn_heads); + const auto sq = static_cast(max_seqlen_q); + const auto skv = static_cast(max_seqlen_kv); + const NVTE_QKV_Format q_format = nvte_get_q_format(qkv_layout); const NVTE_QKV_Format kv_format = nvte_get_kv_format(qkv_layout); const bool is_ragged_q = (q_format == NVTE_QKV_Format::NVTE_THD); @@ -1367,45 +1354,29 @@ ProbeDims compute_probe_dims(int64_t batch, int64_t num_attn_heads, int64_t max_ const bool is_paged_kv = (layout_group == NVTE_QKV_Layout_Group::NVTE_Paged_KV_HD_HD_HD); const bool has_bias = (bias_type == NVTE_Bias_Type::NVTE_POST_SCALE_BIAS); - ProbeDims d{}; - d.max_b = (is_ragged_q || is_ragged_kv) ? batch : 0; - d.max_t_q = is_ragged_q ? batch * max_seqlen_q : 0; - d.max_t_kv = is_ragged_kv ? batch * max_seqlen_kv : 0; - d.num_pages_k = is_paged_kv ? batch : 0; - d.num_pages_v = is_paged_kv ? batch : 0; - d.page_size_k = is_paged_kv ? max_seqlen_kv : 0; - d.page_size_v = is_paged_kv ? max_seqlen_kv : 0; - d.max_pages_per_seq_k = is_paged_kv ? 1 : 0; - d.max_pages_per_seq_v = is_paged_kv ? 1 : 0; - d.bias_b = has_bias ? batch : 0; - d.bias_h = has_bias ? num_attn_heads : 0; - d.bias_sq = has_bias ? max_seqlen_q : 0; - d.bias_skv = has_bias ? max_seqlen_kv : 0; - return d; -} -} // namespace - -std::string is_supported_f16_fwd( - size_t batch, size_t num_attn_heads, size_t num_gqa_groups, size_t max_seqlen_q, - size_t max_seqlen_kv, size_t head_dim_qk, size_t head_dim_v, bool is_training, - bool return_max_logit, float p_dropout, NVTE_QKV_Layout qkv_layout, NVTE_Bias_Type bias_type, - NVTE_Mask_Type mask_type, NVTE_Softmax_Type softmax_type, int64_t window_size_left, - int64_t window_size_right, bool bottom_right_diagonal, DType q_dtype, cudnnHandle_t handle) { - const ProbeDims d = - compute_probe_dims(static_cast(batch), static_cast(num_attn_heads), - static_cast(max_seqlen_q), static_cast(max_seqlen_kv), - qkv_layout, bias_type); - const NVTE_QKV_Format o_format = nvte_get_q_format(qkv_layout); + const int64_t max_b = (is_ragged_q || is_ragged_kv) ? b : 0; + const int64_t max_t_q = is_ragged_q ? b * sq : 0; + const int64_t max_t_kv = is_ragged_kv ? b * skv : 0; + const int64_t num_pages_k = is_paged_kv ? b : 0; + const int64_t num_pages_v = is_paged_kv ? b : 0; + const int64_t page_size_k = is_paged_kv ? skv : 0; + const int64_t page_size_v = is_paged_kv ? skv : 0; + const int64_t max_pages_per_seq_k = is_paged_kv ? 1 : 0; + const int64_t max_pages_per_seq_v = is_paged_kv ? 1 : 0; + const int64_t bias_b = has_bias ? 
b : 0; + const int64_t bias_h = has_bias ? h : 0; + const int64_t bias_sq = has_bias ? sq : 0; + const int64_t bias_skv = has_bias ? skv : 0; + + const NVTE_QKV_Format o_format = q_format; size_t workspace_size = 0; try { fused_attn::fused_attn_arbitrary_seqlen_fwd_impl( - static_cast(batch), static_cast(num_attn_heads), - static_cast(num_gqa_groups), static_cast(max_seqlen_q), - static_cast(max_seqlen_kv), static_cast(head_dim_qk), - static_cast(head_dim_v), d.max_b, d.max_t_q, d.max_t_kv, d.num_pages_k, - d.num_pages_v, d.page_size_k, d.page_size_v, d.max_pages_per_seq_k, d.max_pages_per_seq_v, - d.bias_b, d.bias_h, d.bias_sq, d.bias_skv, is_training, return_max_logit, + b, h, static_cast(num_gqa_groups), sq, skv, static_cast(head_dim_qk), + static_cast(head_dim_v), max_b, max_t_q, max_t_kv, num_pages_k, num_pages_v, + page_size_k, page_size_v, max_pages_per_seq_k, max_pages_per_seq_v, bias_b, bias_h, + bias_sq, bias_skv, is_training, return_max_logit, /*scaling_factor=*/1.0f, p_dropout, qkv_layout, o_format, bias_type, mask_type, softmax_type, window_size_left, window_size_right, bottom_right_diagonal, /*devPtrQ=*/nullptr, /*devPtrK=*/nullptr, /*devPtrV=*/nullptr, /*devPtrBias=*/nullptr, @@ -1413,14 +1384,15 @@ std::string is_supported_f16_fwd( /*devPtrO=*/nullptr, /*devPtrDropoutSeed=*/nullptr, /*devPtrDropoutOffset=*/nullptr, /*devPtrCuSeqlensQ=*/nullptr, /*devPtrCuSeqlensKV=*/nullptr, /*devPtrPageTableK=*/nullptr, /*devPtrPageTableV=*/nullptr, - /*devPtrSeqOffsetsQ=*/nullptr, /*devPtrSeqOffsetsKV=*/nullptr, get_cudnn_fe_dtype(q_dtype), + /*devPtrSeqOffsetsQ=*/nullptr, /*devPtrSeqOffsetsKV=*/nullptr, + get_cudnn_fe_dtype(qkv_dtype), /*workspace=*/nullptr, &workspace_size, /*stream=*/static_cast(0), handle); return ""; } catch (const std::exception &e) { - return std::string("[GRAPH_NOT_SUPPORTED] ") + e.what(); + return e.what(); } catch (...) { - return "[GRAPH_NOT_SUPPORTED] is_supported_f16_fwd: unknown failure"; + return "is_supported_f16_fwd: unknown failure."; } } @@ -1429,23 +1401,36 @@ std::string is_supported_f16_bwd( size_t max_seqlen_kv, size_t head_dim_qk, size_t head_dim_v, float p_dropout, NVTE_QKV_Layout qkv_layout, NVTE_Bias_Type bias_type, NVTE_Mask_Type mask_type, NVTE_Softmax_Type softmax_type, int64_t window_size_left, int64_t window_size_right, - bool bottom_right_diagonal, bool deterministic, DType q_dtype, cudnnHandle_t handle) { - const ProbeDims d = - compute_probe_dims(static_cast(batch), static_cast(num_attn_heads), - static_cast(max_seqlen_q), static_cast(max_seqlen_kv), - qkv_layout, bias_type); - const NVTE_QKV_Format o_format = nvte_get_q_format(qkv_layout); + bool bottom_right_diagonal, bool deterministic, DType qkv_dtype, cudnnHandle_t handle) { + const auto b = static_cast(batch); + const auto h = static_cast(num_attn_heads); + const auto sq = static_cast(max_seqlen_q); + const auto skv = static_cast(max_seqlen_kv); + + const NVTE_QKV_Format q_format = nvte_get_q_format(qkv_layout); + const NVTE_QKV_Format kv_format = nvte_get_kv_format(qkv_layout); + const bool is_ragged_q = (q_format == NVTE_QKV_Format::NVTE_THD); + const bool is_ragged_kv = (kv_format == NVTE_QKV_Format::NVTE_THD); + const bool has_bias = (bias_type == NVTE_Bias_Type::NVTE_POST_SCALE_BIAS); + + const int64_t max_b = (is_ragged_q || is_ragged_kv) ? b : 0; + const int64_t max_t_q = is_ragged_q ? b * sq : 0; + const int64_t max_t_kv = is_ragged_kv ? b * skv : 0; + const int64_t bias_b = has_bias ? b : 0; + const int64_t bias_h = has_bias ? h : 0; + const int64_t bias_sq = has_bias ? 
sq : 0; + const int64_t bias_skv = has_bias ? skv : 0; + + const NVTE_QKV_Format o_format = q_format; const NVTE_QKV_Format do_format = o_format; const NVTE_QKV_Layout dqkv_layout = qkv_layout; size_t workspace_size = 0; try { fused_attn::fused_attn_arbitrary_seqlen_bwd_impl( - static_cast(batch), static_cast(num_attn_heads), - static_cast(num_gqa_groups), static_cast(max_seqlen_q), - static_cast(max_seqlen_kv), static_cast(head_dim_qk), - static_cast(head_dim_v), d.max_b, d.max_t_q, d.max_t_kv, d.bias_b, d.bias_h, - d.bias_sq, d.bias_skv, /*scaling_factor=*/1.0f, p_dropout, qkv_layout, o_format, do_format, + b, h, static_cast(num_gqa_groups), sq, skv, static_cast(head_dim_qk), + static_cast(head_dim_v), max_b, max_t_q, max_t_kv, bias_b, bias_h, bias_sq, + bias_skv, /*scaling_factor=*/1.0f, p_dropout, qkv_layout, o_format, do_format, dqkv_layout, bias_type, mask_type, softmax_type, window_size_left, window_size_right, bottom_right_diagonal, deterministic, /*devPtrQ=*/nullptr, /*devPtrKTranspose=*/nullptr, /*devPtrVTranspose=*/nullptr, /*devPtrO=*/nullptr, /*devPtrSoftmaxStats=*/nullptr, @@ -1454,14 +1439,15 @@ std::string is_supported_f16_bwd( /*devPtrdBias=*/nullptr, /*devPtrdSoftmaxOffset=*/nullptr, /*devPtrDropoutSeed=*/nullptr, /*devPtrDropoutOffset=*/nullptr, /*devPtrCuSeqlensQ=*/nullptr, /*devPtrCuSeqlensKV=*/nullptr, - /*devPtrSeqOffsetsQ=*/nullptr, /*devPtrSeqOffsetsKV=*/nullptr, get_cudnn_fe_dtype(q_dtype), + /*devPtrSeqOffsetsQ=*/nullptr, /*devPtrSeqOffsetsKV=*/nullptr, + get_cudnn_fe_dtype(qkv_dtype), /*workspace=*/nullptr, &workspace_size, /*stream=*/static_cast(0), handle); return ""; } catch (const std::exception &e) { - return std::string("[GRAPH_NOT_SUPPORTED] ") + e.what(); + return e.what(); } catch (...) { - return "[GRAPH_NOT_SUPPORTED] is_supported_f16_bwd: unknown failure"; + return "is_supported_f16_bwd: unknown failure."; } } diff --git a/transformer_engine/common/fused_attn/fused_attn_f16_arbitrary_seqlen.h b/transformer_engine/common/fused_attn/fused_attn_f16_arbitrary_seqlen.h index 0eabe3e8dc..3f5ae717bb 100644 --- a/transformer_engine/common/fused_attn/fused_attn_f16_arbitrary_seqlen.h +++ b/transformer_engine/common/fused_attn/fused_attn_f16_arbitrary_seqlen.h @@ -49,30 +49,25 @@ void fused_attn_arbitrary_seqlen_bwd( const Tensor *cu_seqlens_kv_padded, const Tensor *rng_state, Tensor *workspace, cudaStream_t stream, cudnnHandle_t handle); -// Probe: drives cuDNN-FE (validate -> build_operation_graph -> create_execution_plans -> -// check_support -> build_plans) for an F16/BF16 forward graph with the given configuration. -// Returns an empty string iff the graph compiles end-to-end; on OK the built graph is -// inserted into the same thread-local cache used by fused_attn_arbitrary_seqlen_fwd_impl, -// so the executor cache-hits on matching descriptors. -// -// On rejection, returns a non-empty diagnostic of the form -// "[] " -// where is a stable tag mirroring cudnn_frontend::error_code_t names -// (e.g. GRAPH_NOT_SUPPORTED for cuDNN-FE rejections forwarded from the support chain). +// check if a given configuration is supported for F16/BF16 forward; +// if it is, cache the graph built for this config, and return an empty string; +// if not, return a diagnostic message in the form of a string. 
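As an illustration of this probe contract (a hypothetical helper, not part of the patch; it assumes the transformer_engine namespace and uses placeholder problem sizes), a caller treats an empty return value as "supported and cached" and anything else as the rejection diagnostic:

// Hypothetical routing helper built on the probe above. Dimensions are placeholders;
// enum arguments and the cuDNN handle are supplied by the caller.
NVTE_Fused_Attn_Backend route_f16_fwd(NVTE_QKV_Layout qkv_layout, NVTE_Bias_Type bias_type,
                                      NVTE_Mask_Type mask_type, NVTE_Softmax_Type softmax_type,
                                      DType qkv_dtype, cudnnHandle_t handle, std::string *why) {
  std::string reason = is_supported_f16_fwd(
      /*batch=*/1, /*num_attn_heads=*/16, /*num_gqa_groups=*/16,
      /*max_seqlen_q=*/2048, /*max_seqlen_kv=*/2048, /*head_dim_qk=*/128, /*head_dim_v=*/128,
      /*is_training=*/false, /*return_max_logit=*/false, /*p_dropout=*/0.0f,
      qkv_layout, bias_type, mask_type, softmax_type,
      /*window_size_left=*/-1, /*window_size_right=*/-1, /*bottom_right_diagonal=*/false,
      qkv_dtype, handle);
  if (!reason.empty()) {
    if (why != nullptr) *why = reason;  // forward the diagnostic to the caller
    return NVTE_Fused_Attn_Backend::NVTE_No_Backend;
  }
  return NVTE_Fused_Attn_Backend::NVTE_F16_arbitrary_seqlen;  // graph is built and cached
}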
std::string is_supported_f16_fwd( size_t batch, size_t num_attn_heads, size_t num_gqa_groups, size_t max_seqlen_q, size_t max_seqlen_kv, size_t head_dim_qk, size_t head_dim_v, bool is_training, bool return_max_logit, float p_dropout, NVTE_QKV_Layout qkv_layout, NVTE_Bias_Type bias_type, NVTE_Mask_Type mask_type, NVTE_Softmax_Type softmax_type, int64_t window_size_left, - int64_t window_size_right, bool bottom_right_diagonal, DType q_dtype, cudnnHandle_t handle); + int64_t window_size_right, bool bottom_right_diagonal, DType qkv_dtype, cudnnHandle_t handle); -// Probe: same as above for the F16/BF16 backward graph. +// check if a given configuration is supported for F16/BF16 backward; +// if it is, cache the graph built for this config, and return an empty string; +// if not, return a diagnostic message in the form of a string. std::string is_supported_f16_bwd( size_t batch, size_t num_attn_heads, size_t num_gqa_groups, size_t max_seqlen_q, size_t max_seqlen_kv, size_t head_dim_qk, size_t head_dim_v, float p_dropout, NVTE_QKV_Layout qkv_layout, NVTE_Bias_Type bias_type, NVTE_Mask_Type mask_type, NVTE_Softmax_Type softmax_type, int64_t window_size_left, int64_t window_size_right, - bool bottom_right_diagonal, bool deterministic, DType q_dtype, cudnnHandle_t handle); + bool bottom_right_diagonal, bool deterministic, DType qkv_dtype, cudnnHandle_t handle); } // namespace transformer_engine diff --git a/transformer_engine/common/fused_attn/fused_attn_fp8.cu b/transformer_engine/common/fused_attn/fused_attn_fp8.cu index 27bd0af3f3..c0b515138c 100644 --- a/transformer_engine/common/fused_attn/fused_attn_fp8.cu +++ b/transformer_engine/common/fused_attn/fused_attn_fp8.cu @@ -2997,14 +2997,12 @@ std::string is_supported_fp8_fwd( size_t max_seqlen_kv, size_t head_dim_qk, size_t head_dim_v, bool is_training, float p_dropout, NVTE_QKV_Layout qkv_layout, NVTE_Bias_Type bias_type, NVTE_Mask_Type mask_type, NVTE_Softmax_Type softmax_type, int64_t window_size_left, int64_t window_size_right, - bool bottom_right_diagonal, DType q_dtype, DType o_dtype, NVTEScalingMode scaling_mode, + bool bottom_right_diagonal, DType qkv_dtype, DType o_dtype, NVTEScalingMode scaling_mode, cudnnHandle_t handle) { - // FP8 fwd impl rejects any qkv_format other than BSHD/SBHD/BHSD with NVTE_ERROR; mirror that - // here so the probe returns a typed rejection instead of catching the throw. 
const NVTE_QKV_Format qkv_format = nvte_get_qkv_format(qkv_layout); if (qkv_format != NVTE_QKV_Format::NVTE_BSHD && qkv_format != NVTE_QKV_Format::NVTE_SBHD && qkv_format != NVTE_QKV_Format::NVTE_BHSD) { - return "[INVALID_VALUE] FP8 fused attention only supports BSHD/SBHD/BHSD layouts."; + return "FP8 fused attention supports BSHD/SBHD/BHSD formats, found " + qkv_format + "."; } size_t workspace_size = 0; try { @@ -3021,16 +3019,16 @@ std::string is_supported_fp8_fwd( /*devPtrDescaleS=*/nullptr, /*devPtrScaleS=*/nullptr, /*devPtrScaleO=*/nullptr, /*devPtrAmaxO=*/nullptr, /*devPtrAmaxS=*/nullptr, /*devPtrcuSeqlensQ=*/nullptr, /*devPtrcuSeqlensKV=*/nullptr, /*devPtrDropoutSeed=*/nullptr, - /*devPtrDropoutOffset=*/nullptr, get_cudnn_fe_dtype(q_dtype), get_cudnn_fe_dtype(o_dtype), - scaling_mode, + /*devPtrDropoutOffset=*/nullptr, get_cudnn_fe_dtype(qkv_dtype), + get_cudnn_fe_dtype(o_dtype), scaling_mode, /*qkv_scale_inv_format=*/NVTE_QKV_Format::NVTE_QKV_Format_NOT_SET, /*workspace=*/nullptr, &workspace_size, /*stream=*/static_cast(0), handle); return ""; } catch (const std::exception& e) { - return std::string("[GRAPH_NOT_SUPPORTED] ") + e.what(); + return e.what(); } catch (...) { - return "[GRAPH_NOT_SUPPORTED] is_supported_fp8_fwd: unknown failure"; + return "is_supported_fp8_fwd: unknown failure."; } } @@ -3039,16 +3037,14 @@ std::string is_supported_fp8_bwd( size_t max_seqlen_kv, size_t head_dim_qk, size_t head_dim_v, float p_dropout, NVTE_QKV_Layout qkv_layout, NVTE_Bias_Type bias_type, NVTE_Mask_Type mask_type, NVTE_Softmax_Type softmax_type, int64_t window_size_left, int64_t window_size_right, - bool bottom_right_diagonal, bool deterministic, DType q_dtype, DType o_dtype, + bool bottom_right_diagonal, bool deterministic, DType qkv_dtype, DType o_dtype, NVTEScalingMode scaling_mode, cudnnHandle_t handle) { const NVTE_QKV_Format qkv_format = nvte_get_qkv_format(qkv_layout); if (qkv_format != NVTE_QKV_Format::NVTE_BSHD && qkv_format != NVTE_QKV_Format::NVTE_SBHD && qkv_format != NVTE_QKV_Format::NVTE_BHSD) { - return "[INVALID_VALUE] FP8 fused attention only supports BSHD/SBHD/BHSD layouts."; + return "FP8 fused attention supports BSHD/SBHD/BHSD formats, found " + qkv_format + "."; } - // For FP8 bwd, dO data type matches O data type and dQKV data type matches Q data type - // (this mirrors the assumption used by callers of fused_attn_fp8_bwd in TE). - const cudnn_frontend::DataType_t qkv_t = get_cudnn_fe_dtype(q_dtype); + const cudnn_frontend::DataType_t qkv_t = get_cudnn_fe_dtype(qkv_dtype); const cudnn_frontend::DataType_t o_t = get_cudnn_fe_dtype(o_dtype); const cudnn_frontend::DataType_t do_t = o_t; const cudnn_frontend::DataType_t dqkv_t = qkv_t; @@ -3082,9 +3078,9 @@ std::string is_supported_fp8_bwd( /*stream=*/static_cast(0), handle); return ""; } catch (const std::exception& e) { - return std::string("[GRAPH_NOT_SUPPORTED] ") + e.what(); + return e.what(); } catch (...) 
{ - return "[GRAPH_NOT_SUPPORTED] is_supported_fp8_bwd: unknown failure"; + return "is_supported_fp8_bwd: unknown failure."; } } diff --git a/transformer_engine/common/fused_attn/fused_attn_fp8.h b/transformer_engine/common/fused_attn/fused_attn_fp8.h index f91cdcf291..7c7460e4ea 100644 --- a/transformer_engine/common/fused_attn/fused_attn_fp8.h +++ b/transformer_engine/common/fused_attn/fused_attn_fp8.h @@ -42,30 +42,25 @@ void fused_attn_fp8_bwd( Tensor *output_dSoftmaxOffset, const Tensor *cu_seqlens_q, const Tensor *cu_seqlens_kv, const Tensor *rng_state, Tensor *workspace, cudaStream_t stream, cudnnHandle_t handle); -// Probe: drives cuDNN-FE (validate -> build_operation_graph -> create_execution_plans -> -// check_support -> build_plans) for an FP8 forward graph with the given configuration. -// Returns an empty string iff the graph compiles end-to-end; on OK the built graph is -// inserted into the same thread-local cache used by fused_attn_fp8_fwd_impl. -// -// On rejection, returns a non-empty diagnostic of the form -// "[] " -// where mirrors cudnn_frontend::error_code_t names (INVALID_VALUE for the -// FP8-only layout pre-filter, GRAPH_NOT_SUPPORTED for cuDNN-FE rejections forwarded -// from the support chain). +// check if a given configuration is supported for FP8 forward; +// if it is, cache the graph built for this config, and return an empty string; +// if not, return a diagnostic message in the form of a string. std::string is_supported_fp8_fwd( size_t batch, size_t num_attn_heads, size_t num_gqa_groups, size_t max_seqlen_q, size_t max_seqlen_kv, size_t head_dim_qk, size_t head_dim_v, bool is_training, float p_dropout, NVTE_QKV_Layout qkv_layout, NVTE_Bias_Type bias_type, NVTE_Mask_Type mask_type, NVTE_Softmax_Type softmax_type, int64_t window_size_left, int64_t window_size_right, - bool bottom_right_diagonal, DType q_dtype, DType o_dtype, NVTEScalingMode scaling_mode, + bool bottom_right_diagonal, DType qkv_dtype, DType o_dtype, NVTEScalingMode scaling_mode, cudnnHandle_t handle); -// Probe: same as above for the FP8 backward graph. +// check if a given configuration is supported for FP8 backward; +// if it is, cache the graph built for this config, and return an empty string; +// if not, return a diagnostic message in the form of a string. std::string is_supported_fp8_bwd( size_t batch, size_t num_attn_heads, size_t num_gqa_groups, size_t max_seqlen_q, size_t max_seqlen_kv, size_t head_dim_qk, size_t head_dim_v, float p_dropout, NVTE_QKV_Layout qkv_layout, NVTE_Bias_Type bias_type, NVTE_Mask_Type mask_type, NVTE_Softmax_Type softmax_type, int64_t window_size_left, int64_t window_size_right, - bool bottom_right_diagonal, bool deterministic, DType q_dtype, DType o_dtype, + bool bottom_right_diagonal, bool deterministic, DType qkv_dtype, DType o_dtype, NVTEScalingMode scaling_mode, cudnnHandle_t handle); } // namespace transformer_engine diff --git a/transformer_engine/common/include/transformer_engine/fused_attn.h b/transformer_engine/common/include/transformer_engine/fused_attn.h index bbcdf08995..b90749c8ee 100644 --- a/transformer_engine/common/include/transformer_engine/fused_attn.h +++ b/transformer_engine/common/include/transformer_engine/fused_attn.h @@ -199,23 +199,12 @@ NVTE_QKV_Format nvte_get_q_format(NVTE_QKV_Layout qkv_layout); NVTE_QKV_Format nvte_get_kv_format(NVTE_QKV_Layout qkv_layout); /*! \brief Get fused attention backend based on input parameters. 
- * - * Authoritative routing: when a non-NVTE_No_Backend value is returned, the configuration - * is guaranteed to compile through cuDNN-FE (validate -> build_operation_graph -> - * create_execution_plans -> check_support -> build_plans). The router applies a small - * set of TE-specific post-filters in addition to delegating to cuDNN-FE for capability - * checks. On success the built plan is cached, so the executor avoids rebuilding. * * \param[in] is_training Whether the model is in training mode. * \param[in] q_dtype The data type of Tensor Q. * \param[in] kv_dtype The data type of Tensors K, V. - * \param[in] o_dtype The data type of output Tensor O. Used by the FP8 - * branch to disambiguate FP8 vs HALF/BF16 output; - * ignored by the F16/BF16 branch (pass q_dtype). - * \param[in] scaling_mode Scaling mode of the input tensors. Used by the FP8 - * branch to select among delayed/current/MXFP8 recipes; - * ignored by the F16/BF16 branch - * (pass NVTE_DELAYED_TENSOR_SCALING). + * \param[in] o_dtype The data type of Tensor O. + * \param[in] scaling_mode Scaling mode of attention. * \param[in] qkv_layout The layout of Tensors Q, K, V. * \param[in] bias_type The attention bias type. * \param[in] attn_mask_type The attention mask type. @@ -232,20 +221,9 @@ NVTE_QKV_Format nvte_get_kv_format(NVTE_QKV_Layout qkv_layout); * \param[in] return_max_logit Whether to produce Max along with Stats. * \param[in] cuda_graph Whether cuda graph capture is enabled or not. * \param[in] deterministic Whether determinism is required or not. - * \param[in] handle cuDNN handle used for the support chain. Required. - * \param[out] out_reason Optional. When non-NULL, set to a null-terminated - * diagnostic string describing why the configuration - * was rejected (NVTE_No_Backend) or set to "" on - * success. Rejection messages are tagged with a - * stable category prefix that mirrors - * \c cudnn_frontend::error_code_t, e.g. - * \c "[INVALID_VALUE] ..." for TE post-filter - * rejections and FP8 layout pre-filter rejections, - * \c "[GRAPH_NOT_SUPPORTED] ..." for cuDNN-FE - * rejections forwarded from the support chain. The - * pointer points into per-thread storage owned by TE - * and is valid only until the next call to - * \c nvte_get_fused_attn_backend on the same thread. + * \param[in] handle cuDNN handle. + * \param[out] message Empty string on success, otherwise a diagnostic string + * describing why the configuration was rejected. */ NVTE_Fused_Attn_Backend nvte_get_fused_attn_backend( bool is_training, NVTEDType q_dtype, NVTEDType kv_dtype, NVTEDType o_dtype, @@ -254,7 +232,7 @@ NVTE_Fused_Attn_Backend nvte_get_fused_attn_backend( size_t num_attn_heads, size_t num_gqa_groups, size_t max_seqlen_q, size_t max_seqlen_kv, size_t head_dim_qk, size_t head_dim_v, int64_t window_size_left, int64_t window_size_right, bool return_max_logit, bool cuda_graph, bool deterministic, cudnnHandle_t handle, - const char **out_reason); + const char **message); /*! \brief Compute dot product attention with separate Q, K and V. 
* diff --git a/transformer_engine/common/util/pybind_helper.h b/transformer_engine/common/util/pybind_helper.h index fdfa47da8f..fb5096b9a7 100644 --- a/transformer_engine/common/util/pybind_helper.h +++ b/transformer_engine/common/util/pybind_helper.h @@ -83,6 +83,13 @@ .value("NVTE_F16_arbitrary_seqlen", NVTE_Fused_Attn_Backend::NVTE_F16_arbitrary_seqlen) \ .value("NVTE_FP8", NVTE_Fused_Attn_Backend::NVTE_FP8) \ .value("NVTE_No_Backend", NVTE_Fused_Attn_Backend::NVTE_No_Backend); \ + pybind11::enum_(m, "NVTEScalingMode", pybind11::module_local()) \ + .value("NVTE_DELAYED_TENSOR_SCALING", NVTEScalingMode::NVTE_DELAYED_TENSOR_SCALING) \ + .value("NVTE_MXFP8_1D_SCALING", NVTEScalingMode::NVTE_MXFP8_1D_SCALING) \ + .value("NVTE_BLOCK_SCALING_1D", NVTEScalingMode::NVTE_BLOCK_SCALING_1D) \ + .value("NVTE_BLOCK_SCALING_2D", NVTEScalingMode::NVTE_BLOCK_SCALING_2D) \ + .value("NVTE_NVFP4_1D_SCALING", NVTEScalingMode::NVTE_NVFP4_1D_SCALING) \ + .value("NVTE_INVALID_SCALING", NVTEScalingMode::NVTE_INVALID_SCALING); \ pybind11::enum_( \ m, "Float8BlockScaleTensorFormat", pybind11::module_local()) \ .value("GEMM_READY", transformer_engine::Float8BlockScaleTensorFormat::GEMM_READY) \ diff --git a/transformer_engine/jax/cpp_extensions/attention.py b/transformer_engine/jax/cpp_extensions/attention.py index 40d02f40e1..2a38e5f6bd 100644 --- a/transformer_engine/jax/cpp_extensions/attention.py +++ b/transformer_engine/jax/cpp_extensions/attention.py @@ -16,7 +16,7 @@ from jax.experimental.custom_partitioning import SdyShardingRule import transformer_engine_jax -from transformer_engine_jax import NVTE_Fused_Attn_Backend +from transformer_engine_jax import NVTE_Fused_Attn_Backend, NVTEScalingMode from transformer_engine.jax.attention import ( AttnBiasType, AttnMaskType, @@ -125,14 +125,22 @@ class FusedAttnHelper: def is_fused_attn_kernel_available(self): """Check if there is available fused attention kernel""" - return self.get_fused_attn_backend() != NVTE_Fused_Attn_Backend.NVTE_No_Backend + backend, _ = self.get_fused_attn_backend() + return backend != NVTE_Fused_Attn_Backend.NVTE_No_Backend def get_fused_attn_backend(self): - """Get the fused attention kernel backend""" + """Get the fused attention kernel backend. + + Returns a ``(backend, message)`` tuple. ``message`` is empty on success, otherwise a + diagnostic string describing why the configuration was rejected when backend = NVTE_No_Backend. 
+ """ + q_type = jax_dtype_to_te_dtype(self.q_dtype) return transformer_engine_jax.get_fused_attn_backend( self.is_training, - jax_dtype_to_te_dtype(self.q_dtype), + q_type, jax_dtype_to_te_dtype(self.kv_dtype), + q_type, + NVTEScalingMode.NVTE_INVALID_SCALING, self.qkv_layout.value, self.attn_bias_type.value, self.attn_mask_type.value, @@ -335,7 +343,7 @@ def abstract( out_aval = q_aval.update(shape=output_shape, dtype=q_dtype) # backend determines the softmax buffer shape/dtype - backend = FusedAttnHelper( + backend, message = FusedAttnHelper( config.is_training, q_dtype, k_dtype, @@ -372,7 +380,7 @@ def abstract( ) softmax_dtype = dtypes.canonicalize_dtype(jnp.float32) else: - raise ValueError(f"Unsupported {backend=}") + raise ValueError(f"Unsupported backend: {message}") softmax_aux_aval = q_aval.update(shape=softmax_shape, dtype=softmax_dtype) # JAX does not enable 64-bit int by default so we get XLA to allocate x8 memory with diff --git a/transformer_engine/jax/csrc/extensions.h b/transformer_engine/jax/csrc/extensions.h index 2ecfedc8a2..629b6dc3bf 100644 --- a/transformer_engine/jax/csrc/extensions.h +++ b/transformer_engine/jax/csrc/extensions.h @@ -23,6 +23,7 @@ #include #include #include +#include #include #include "common/common.h" @@ -146,12 +147,14 @@ XLA_FFI_DECLARE_HANDLER_SYMBOL(FusedAttnForwardHandler); XLA_FFI_DECLARE_HANDLER_SYMBOL(FusedAttnBackwardHandler); -NVTE_Fused_Attn_Backend GetFusedAttnBackend( - bool is_training, DType q_dtype, DType kv_dtype, NVTE_QKV_Layout qkv_layout, - NVTE_Bias_Type bias_type, NVTE_Mask_Type mask_type, NVTE_Softmax_Type softmax_type, - float dropout_probability, size_t q_attn_heads, size_t kv_attn_heads, size_t q_max_seqlen, - size_t kv_max_seqlen, size_t qk_head_dim, size_t v_head_dim, int64_t window_size_left, - int64_t window_size_right, bool deterministic); +// Returns (backend, message). `message` is empty on success, otherwise a diagnostic string +// describing why the configuration was rejected when backend = NVTE_No_Backend. 
+std::tuple<NVTE_Fused_Attn_Backend, std::string> GetFusedAttnBackend(
+    bool is_training, DType q_dtype, DType kv_dtype, DType o_dtype, NVTEScalingMode scaling_mode,
+    NVTE_QKV_Layout qkv_layout, NVTE_Bias_Type bias_type, NVTE_Mask_Type mask_type,
+    NVTE_Softmax_Type softmax_type, float dropout_probability, size_t q_attn_heads,
+    size_t kv_attn_heads, size_t q_max_seqlen, size_t kv_max_seqlen, size_t qk_head_dim,
+    size_t v_head_dim, int64_t window_size_left, int64_t window_size_right, bool deterministic);
 
 pybind11::tuple GetFusedAttnForwardWorkspaceSizes(
     size_t input_batch, size_t bias_batch, size_t q_max_seqlen, size_t kv_max_seqlen,
diff --git a/transformer_engine/jax/csrc/extensions/attention.cpp b/transformer_engine/jax/csrc/extensions/attention.cpp
index 669570daa5..83bddcabb1 100644
--- a/transformer_engine/jax/csrc/extensions/attention.cpp
+++ b/transformer_engine/jax/csrc/extensions/attention.cpp
@@ -12,21 +12,21 @@ namespace transformer_engine {
 namespace jax {
 
-NVTE_Fused_Attn_Backend GetFusedAttnBackend(
-    bool is_training, DType q_dtype, DType kv_dtype, NVTE_QKV_Layout qkv_layout,
-    NVTE_Bias_Type bias_type, NVTE_Mask_Type mask_type, NVTE_Softmax_Type softmax_type,
-    float dropout_probability, size_t q_attn_heads, size_t kv_attn_heads, size_t q_max_seqlen,
-    size_t kv_max_seqlen, size_t qk_head_dim, size_t v_head_dim, int64_t window_size_left,
-    int64_t window_size_right, bool deterministic) {
+std::tuple<NVTE_Fused_Attn_Backend, std::string> GetFusedAttnBackend(
+    bool is_training, DType q_dtype, DType kv_dtype, DType o_dtype, NVTEScalingMode scaling_mode,
+    NVTE_QKV_Layout qkv_layout, NVTE_Bias_Type bias_type, NVTE_Mask_Type mask_type,
+    NVTE_Softmax_Type softmax_type, float dropout_probability, size_t q_attn_heads,
+    size_t kv_attn_heads, size_t q_max_seqlen, size_t kv_max_seqlen, size_t qk_head_dim,
+    size_t v_head_dim, int64_t window_size_left, int64_t window_size_right, bool deterministic) {
   auto handle = cudnnExecutionPlanManager::Instance().GetHandle();
+  const char *message = nullptr;
   auto backend = nvte_get_fused_attn_backend(
       is_training, static_cast<NVTEDType>(q_dtype), static_cast<NVTEDType>(kv_dtype),
-      static_cast<NVTEDType>(q_dtype), NVTE_DELAYED_TENSOR_SCALING, qkv_layout, bias_type,
-      mask_type, softmax_type, dropout_probability, q_attn_heads, kv_attn_heads, q_max_seqlen,
-      kv_max_seqlen, qk_head_dim, v_head_dim, window_size_left, window_size_right,
-      /*return_max_logit=*/false, /*cuda_graph=*/false, deterministic, handle,
-      /*out_reason=*/nullptr);
-  return backend;
+      static_cast<NVTEDType>(o_dtype), scaling_mode, qkv_layout, bias_type, mask_type, softmax_type,
+      dropout_probability, q_attn_heads, kv_attn_heads, q_max_seqlen, kv_max_seqlen, qk_head_dim,
+      v_head_dim, window_size_left, window_size_right,
+      /*return_max_logit=*/false, /*cuda_graph=*/false, deterministic, handle, &message);
+  return {backend, message ?
std::string(message) : std::string()}; } /* @@ -279,10 +279,10 @@ static void FusedAttnForwardImpl( auto _handle_fwd = cudnnExecutionPlanManager::Instance().GetHandle(); auto backend = nvte_get_fused_attn_backend( is_training, static_cast(dtype), static_cast(dtype), - static_cast(dtype), NVTE_DELAYED_TENSOR_SCALING, qkv_layout, bias_type, mask_type, + static_cast(dtype), NVTE_INVALID_SCALING, qkv_layout, bias_type, mask_type, softmax_type, dropout_probability, attn_heads, num_gqa_groups, q_max_seqlen, kv_max_seqlen, qk_head_dim, v_head_dim, window_size_left, window_size_right, /*return_max_logit=*/false, - /*cuda_graph=*/false, deterministic, _handle_fwd, /*out_reason=*/nullptr); + /*cuda_graph=*/false, deterministic, _handle_fwd, /*message=*/nullptr); nvte_populate_rng_state_async(rng_state, seed, q_max_seqlen, kv_max_seqlen, backend, stream); /* Auxiliary tensors (to be propagated to the backward pass later) */ @@ -557,10 +557,10 @@ static void FusedAttnBackwardImpl( auto _handle_bwd = cudnnExecutionPlanManager::Instance().GetHandle(); auto backend = nvte_get_fused_attn_backend( is_training, static_cast(dtype), static_cast(dtype), - static_cast(dtype), NVTE_DELAYED_TENSOR_SCALING, qkv_layout, bias_type, mask_type, + static_cast(dtype), NVTE_INVALID_SCALING, qkv_layout, bias_type, mask_type, softmax_type, dropout_probability, attn_heads, num_gqa_groups, q_max_seqlen, kv_max_seqlen, qk_head_dim, v_head_dim, window_size_left, window_size_right, /*return_max_logit=*/false, - /*cuda_graph=*/false, deterministic, _handle_bwd, /*out_reason=*/nullptr); + /*cuda_graph=*/false, deterministic, _handle_bwd, /*message=*/nullptr); PrepareFusedAttnBackwardAuxTensors(&aux_input_tensors, input_batch, bias_batch, attn_heads, bias_heads, q_max_seqlen, kv_max_seqlen, dtype, backend, softmax_aux, rng_state, bias, softmax_offset); diff --git a/transformer_engine/pytorch/attention/dot_product_attention/utils.py b/transformer_engine/pytorch/attention/dot_product_attention/utils.py index ed87423534..f236d5a26c 100644 --- a/transformer_engine/pytorch/attention/dot_product_attention/utils.py +++ b/transformer_engine/pytorch/attention/dot_product_attention/utils.py @@ -1229,10 +1229,12 @@ def _is_fa3_supported(num_heads, num_gqa_groups, head_dim_qk, head_dim_v, qkv_dt if fp8 and fp8_meta["recipe"].fp8_dpa: q_type = get_fp8_te_dtype(fp8_meta["recipe"], fprop_tensor=True) kv_type = q_type - fused_attention_backend = tex.get_fused_attn_backend( + fused_attention_backend, reject_message = tex.get_fused_attn_backend( is_training, q_type, kv_type, + q_type, + tex.NVTEScalingMode.NVTE_INVALID_SCALING, QKVLayout[qkv_layout], AttnBiasType[fu_core_attention_bias_type], AttnMaskType[attn_mask_type], @@ -1251,7 +1253,10 @@ def _is_fa3_supported(num_heads, num_gqa_groups, head_dim_qk, head_dim_v, qkv_dt deterministic, ) if fused_attention_backend == FusedAttnBackend["No_Backend"]: - logger.debug("Disabling FusedAttention as no backend supports the provided input") + logger.debug( + "Disabling FusedAttention as %s", + reject_message, + ) use_fused_attention = False fused_attention_backend = None if ( diff --git a/transformer_engine/pytorch/csrc/extensions.h b/transformer_engine/pytorch/csrc/extensions.h index 4a2ea7412b..733f98e575 100644 --- a/transformer_engine/pytorch/csrc/extensions.h +++ b/transformer_engine/pytorch/csrc/extensions.h @@ -75,12 +75,15 @@ std::tuple moe_unpermute_bwd(at::Tensor input_bwd, at::T * Attention **************************************************************************************************/ 
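For the C entry point itself, a hedged sketch of querying the router with the message out-parameter, matching the fused_attn.h declaration above (hypothetical wrapper; dimensions are placeholders, and the returned C string is copied out immediately):

// Query the backend and copy the rejection reason (empty on success) into *why.
NVTE_Fused_Attn_Backend query_backend(NVTE_QKV_Layout qkv_layout, NVTE_Bias_Type bias_type,
                                      NVTE_Mask_Type attn_mask_type,
                                      NVTE_Softmax_Type softmax_type, cudnnHandle_t handle,
                                      std::string *why) {
  const char *message = nullptr;
  NVTE_Fused_Attn_Backend backend = nvte_get_fused_attn_backend(
      /*is_training=*/true, kNVTEBFloat16, kNVTEBFloat16, /*o_dtype=*/kNVTEBFloat16,
      NVTE_DELAYED_TENSOR_SCALING, qkv_layout, bias_type, attn_mask_type, softmax_type,
      /*dropout=*/0.0f, /*num_attn_heads=*/16, /*num_gqa_groups=*/16,
      /*max_seqlen_q=*/2048, /*max_seqlen_kv=*/2048, /*head_dim_qk=*/128, /*head_dim_v=*/128,
      /*window_size_left=*/-1, /*window_size_right=*/-1,
      /*return_max_logit=*/false, /*cuda_graph=*/false, /*deterministic=*/false, handle, &message);
  if (why != nullptr) *why = (message != nullptr) ? std::string(message) : std::string();
  return backend;
}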
-NVTE_Fused_Attn_Backend get_fused_attn_backend( - bool is_training, const DType q_dtype, const DType kv_dtype, NVTE_QKV_Layout qkv_layout, - NVTE_Bias_Type bias_type, NVTE_Mask_Type attn_mask_type, NVTE_Softmax_Type softmax_type, - float p_dropout, size_t num_attn_heads, size_t num_gqa_groups, size_t max_seqlen_q, - size_t max_seqlen_kv, size_t head_dim_qk, size_t head_dim_v, int64_t window_size_left, - int64_t window_size_right, bool return_max_logit, bool cuda_graph, bool deterministic); +// Returns (backend, reason). `reason` is empty on success, otherwise a diagnostic string +// describing why the configuration was rejected when backend = NVTE_No_Backend. +std::tuple get_fused_attn_backend( + bool is_training, const DType q_dtype, const DType kv_dtype, const DType o_dtype, + NVTEScalingMode scaling_mode, NVTE_QKV_Layout qkv_layout, NVTE_Bias_Type bias_type, + NVTE_Mask_Type attn_mask_type, NVTE_Softmax_Type softmax_type, float p_dropout, + size_t num_attn_heads, size_t num_gqa_groups, size_t max_seqlen_q, size_t max_seqlen_kv, + size_t head_dim_qk, size_t head_dim_v, int64_t window_size_left, int64_t window_size_right, + bool return_max_logit, bool cuda_graph, bool deterministic); std::vector fused_attn_fwd( size_t max_seqlen_q, size_t max_seqlen_kv, bool is_training, float attn_scale, float p_dropout, diff --git a/transformer_engine/pytorch/csrc/extensions/attention.cpp b/transformer_engine/pytorch/csrc/extensions/attention.cpp index 3af4ba3831..0c5f99ef33 100644 --- a/transformer_engine/pytorch/csrc/extensions/attention.cpp +++ b/transformer_engine/pytorch/csrc/extensions/attention.cpp @@ -41,26 +41,22 @@ void mha_fill(const transformer_engine::TensorWrapper &self, const at::Tensor &s namespace transformer_engine::pytorch { // get the fused attention backend -// -// NOTE: the underlying nvte_get_fused_attn_backend now takes o_dtype and scaling_mode in -// addition to q_dtype/kv_dtype. For the F16/BF16 routing path those are ignored, so we pass -// q_dtype as o_dtype and DELAYED_TENSOR_SCALING. This Python-facing wrapper therefore keeps -// its existing signature; FP8 callers that want authoritative routing for non-default scaling -// recipes should add o_dtype / scaling_mode parameters in a follow-up. 
-NVTE_Fused_Attn_Backend get_fused_attn_backend( - bool is_training, const DType q_dtype, const DType kv_dtype, NVTE_QKV_Layout qkv_layout, - NVTE_Bias_Type bias_type, NVTE_Mask_Type attn_mask_type, NVTE_Softmax_Type softmax_type, - float p_dropout, size_t num_attn_heads, size_t num_gqa_groups, size_t max_seqlen_q, - size_t max_seqlen_kv, size_t head_dim_qk, size_t head_dim_v, int64_t window_size_left, - int64_t window_size_right, bool return_max_logit, bool cuda_graph, bool deterministic) { +std::tuple get_fused_attn_backend( + bool is_training, const DType q_dtype, const DType kv_dtype, const DType o_dtype, + NVTEScalingMode scaling_mode, NVTE_QKV_Layout qkv_layout, NVTE_Bias_Type bias_type, + NVTE_Mask_Type attn_mask_type, NVTE_Softmax_Type softmax_type, float p_dropout, + size_t num_attn_heads, size_t num_gqa_groups, size_t max_seqlen_q, size_t max_seqlen_kv, + size_t head_dim_qk, size_t head_dim_v, int64_t window_size_left, int64_t window_size_right, + bool return_max_logit, bool cuda_graph, bool deterministic) { auto handle = cudnnExecutionPlanManager::Instance().GetHandle(); + const char *message = nullptr; NVTE_Fused_Attn_Backend fused_attention_backend = nvte_get_fused_attn_backend( is_training, static_cast(q_dtype), static_cast(kv_dtype), - static_cast(q_dtype), NVTE_DELAYED_TENSOR_SCALING, qkv_layout, bias_type, - attn_mask_type, softmax_type, p_dropout, num_attn_heads, num_gqa_groups, max_seqlen_q, - max_seqlen_kv, head_dim_qk, head_dim_v, window_size_left, window_size_right, return_max_logit, - cuda_graph, deterministic, handle, /*out_reason=*/nullptr); - return fused_attention_backend; + static_cast(o_dtype), scaling_mode, qkv_layout, bias_type, attn_mask_type, + softmax_type, p_dropout, num_attn_heads, num_gqa_groups, max_seqlen_q, max_seqlen_kv, + head_dim_qk, head_dim_v, window_size_left, window_size_right, return_max_logit, cuda_graph, + deterministic, handle, &message); + return {fused_attention_backend, message ? 
std::string(message) : std::string()}; } // helper function for S and dP quantizers From 81e59a9c86d8a6de3686244e7a1801e5cd3db487 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 8 May 2026 00:11:48 +0000 Subject: [PATCH 05/11] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../common/fused_attn/fused_attn.cpp | 15 ++++---- .../fused_attn_f16_arbitrary_seqlen.cu | 36 ++++++++++--------- .../fused_attn_f16_arbitrary_seqlen.h | 27 +++++++------- .../common/fused_attn/fused_attn_fp8.cu | 34 +++++++++--------- .../common/fused_attn/fused_attn_fp8.h | 30 ++++++++-------- 5 files changed, 74 insertions(+), 68 deletions(-) diff --git a/transformer_engine/common/fused_attn/fused_attn.cpp b/transformer_engine/common/fused_attn/fused_attn.cpp index 21b0d80f4f..41607f05f7 100644 --- a/transformer_engine/common/fused_attn/fused_attn.cpp +++ b/transformer_engine/common/fused_attn/fused_attn.cpp @@ -263,8 +263,8 @@ NVTE_Fused_Attn_Backend nvte_get_fused_attn_backend( max_seqlen_kv, head_dim_qk, head_dim_v) == DType::kInt64); if (requires_64bit_ragged_offset && cudnn_runtime_version < 90500) { set_message(message, - "Configuration requires 64-bit ragged offsets, which require " - "cuDNN >= 9.5."); + "Configuration requires 64-bit ragged offsets, which require " + "cuDNN >= 9.5."); return NVTE_Fused_Attn_Backend::NVTE_No_Backend; } @@ -274,7 +274,7 @@ NVTE_Fused_Attn_Backend nvte_get_fused_attn_backend( attn_mask_type != NVTE_Mask_Type::NVTE_PADDING_CAUSAL_MASK && attn_mask_type != NVTE_Mask_Type::NVTE_PADDING_CAUSAL_BOTTOM_RIGHT_MASK) { set_message(message, - "THD format requires PADDING / PADDING_CAUSAL / PADDING_CAUSAL_BOTTOM_RIGHT mask."); + "THD format requires PADDING / PADDING_CAUSAL / PADDING_CAUSAL_BOTTOM_RIGHT mask."); return NVTE_Fused_Attn_Backend::NVTE_No_Backend; } @@ -285,8 +285,7 @@ NVTE_Fused_Attn_Backend nvte_get_fused_attn_backend( attn_mask_type != NVTE_Mask_Type::NVTE_PADDING_MASK && attn_mask_type != NVTE_Mask_Type::NVTE_PADDING_CAUSAL_MASK && attn_mask_type != NVTE_Mask_Type::NVTE_PADDING_CAUSAL_BOTTOM_RIGHT_MASK) { - set_message(message, - "Known cuDNN <= 9.15 issue with CUDA graph. Please upgrade cuDNN."); + set_message(message, "Known cuDNN <= 9.15 issue with CUDA graph. 
Please upgrade cuDNN."); return NVTE_Fused_Attn_Backend::NVTE_No_Backend; } @@ -300,8 +299,7 @@ NVTE_Fused_Attn_Backend nvte_get_fused_attn_backend( if (is_fp8) { if (return_max_logit) { - set_message(message, - "FP8 fused attention does not support return_max_logit=True."); + set_message(message, "FP8 fused attention does not support return_max_logit=True."); return NVTE_Fused_Attn_Backend::NVTE_No_Backend; } const DType qkv_t = static_cast(q_dtype); @@ -354,8 +352,7 @@ NVTE_Fused_Attn_Backend nvte_get_fused_attn_backend( return NVTE_Fused_Attn_Backend::NVTE_F16_arbitrary_seqlen; } - set_message(message, - "Unsupported QKV dtype qkv_dtype=" + std::to_string(q_dtype) + " ."); + set_message(message, "Unsupported QKV dtype qkv_dtype=" + std::to_string(q_dtype) + " ."); return NVTE_Fused_Attn_Backend::NVTE_No_Backend; } diff --git a/transformer_engine/common/fused_attn/fused_attn_f16_arbitrary_seqlen.cu b/transformer_engine/common/fused_attn/fused_attn_f16_arbitrary_seqlen.cu index 81e66d8800..3a2b296ffc 100644 --- a/transformer_engine/common/fused_attn/fused_attn_f16_arbitrary_seqlen.cu +++ b/transformer_engine/common/fused_attn/fused_attn_f16_arbitrary_seqlen.cu @@ -1334,13 +1334,14 @@ void fused_attn_arbitrary_seqlen_bwd( } } -std::string is_supported_f16_fwd( - size_t batch, size_t num_attn_heads, size_t num_gqa_groups, size_t max_seqlen_q, - size_t max_seqlen_kv, size_t head_dim_qk, size_t head_dim_v, bool is_training, - bool return_max_logit, float p_dropout, NVTE_QKV_Layout qkv_layout, NVTE_Bias_Type bias_type, - NVTE_Mask_Type mask_type, NVTE_Softmax_Type softmax_type, int64_t window_size_left, - int64_t window_size_right, bool bottom_right_diagonal, DType qkv_dtype, - cudnnHandle_t handle) { +std::string is_supported_f16_fwd(size_t batch, size_t num_attn_heads, size_t num_gqa_groups, + size_t max_seqlen_q, size_t max_seqlen_kv, size_t head_dim_qk, + size_t head_dim_v, bool is_training, bool return_max_logit, + float p_dropout, NVTE_QKV_Layout qkv_layout, + NVTE_Bias_Type bias_type, NVTE_Mask_Type mask_type, + NVTE_Softmax_Type softmax_type, int64_t window_size_left, + int64_t window_size_right, bool bottom_right_diagonal, + DType qkv_dtype, cudnnHandle_t handle) { const auto b = static_cast(batch); const auto h = static_cast(num_attn_heads); const auto sq = static_cast(max_seqlen_q); @@ -1375,8 +1376,8 @@ std::string is_supported_f16_fwd( fused_attn::fused_attn_arbitrary_seqlen_fwd_impl( b, h, static_cast(num_gqa_groups), sq, skv, static_cast(head_dim_qk), static_cast(head_dim_v), max_b, max_t_q, max_t_kv, num_pages_k, num_pages_v, - page_size_k, page_size_v, max_pages_per_seq_k, max_pages_per_seq_v, bias_b, bias_h, - bias_sq, bias_skv, is_training, return_max_logit, + page_size_k, page_size_v, max_pages_per_seq_k, max_pages_per_seq_v, bias_b, bias_h, bias_sq, + bias_skv, is_training, return_max_logit, /*scaling_factor=*/1.0f, p_dropout, qkv_layout, o_format, bias_type, mask_type, softmax_type, window_size_left, window_size_right, bottom_right_diagonal, /*devPtrQ=*/nullptr, /*devPtrK=*/nullptr, /*devPtrV=*/nullptr, /*devPtrBias=*/nullptr, @@ -1396,12 +1397,13 @@ std::string is_supported_f16_fwd( } } -std::string is_supported_f16_bwd( - size_t batch, size_t num_attn_heads, size_t num_gqa_groups, size_t max_seqlen_q, - size_t max_seqlen_kv, size_t head_dim_qk, size_t head_dim_v, float p_dropout, - NVTE_QKV_Layout qkv_layout, NVTE_Bias_Type bias_type, NVTE_Mask_Type mask_type, - NVTE_Softmax_Type softmax_type, int64_t window_size_left, int64_t window_size_right, - bool 
bottom_right_diagonal, bool deterministic, DType qkv_dtype, cudnnHandle_t handle) { +std::string is_supported_f16_bwd(size_t batch, size_t num_attn_heads, size_t num_gqa_groups, + size_t max_seqlen_q, size_t max_seqlen_kv, size_t head_dim_qk, + size_t head_dim_v, float p_dropout, NVTE_QKV_Layout qkv_layout, + NVTE_Bias_Type bias_type, NVTE_Mask_Type mask_type, + NVTE_Softmax_Type softmax_type, int64_t window_size_left, + int64_t window_size_right, bool bottom_right_diagonal, + bool deterministic, DType qkv_dtype, cudnnHandle_t handle) { const auto b = static_cast(batch); const auto h = static_cast(num_attn_heads); const auto sq = static_cast(max_seqlen_q); @@ -1430,8 +1432,8 @@ std::string is_supported_f16_bwd( fused_attn::fused_attn_arbitrary_seqlen_bwd_impl( b, h, static_cast(num_gqa_groups), sq, skv, static_cast(head_dim_qk), static_cast(head_dim_v), max_b, max_t_q, max_t_kv, bias_b, bias_h, bias_sq, - bias_skv, /*scaling_factor=*/1.0f, p_dropout, qkv_layout, o_format, do_format, - dqkv_layout, bias_type, mask_type, softmax_type, window_size_left, window_size_right, + bias_skv, /*scaling_factor=*/1.0f, p_dropout, qkv_layout, o_format, do_format, dqkv_layout, + bias_type, mask_type, softmax_type, window_size_left, window_size_right, bottom_right_diagonal, deterministic, /*devPtrQ=*/nullptr, /*devPtrKTranspose=*/nullptr, /*devPtrVTranspose=*/nullptr, /*devPtrO=*/nullptr, /*devPtrSoftmaxStats=*/nullptr, /*devPtrBias=*/nullptr, /*devPtrSoftmaxOffset=*/nullptr, /*devPtrdQ=*/nullptr, diff --git a/transformer_engine/common/fused_attn/fused_attn_f16_arbitrary_seqlen.h b/transformer_engine/common/fused_attn/fused_attn_f16_arbitrary_seqlen.h index 3f5ae717bb..fe94d0c10c 100644 --- a/transformer_engine/common/fused_attn/fused_attn_f16_arbitrary_seqlen.h +++ b/transformer_engine/common/fused_attn/fused_attn_f16_arbitrary_seqlen.h @@ -52,22 +52,25 @@ void fused_attn_arbitrary_seqlen_bwd( // check if a given configuration is supported for F16/BF16 forward; // if it is, cache the graph built for this config, and return an empty string; // if not, return a diagnostic message in the form of a string. -std::string is_supported_f16_fwd( - size_t batch, size_t num_attn_heads, size_t num_gqa_groups, size_t max_seqlen_q, - size_t max_seqlen_kv, size_t head_dim_qk, size_t head_dim_v, bool is_training, - bool return_max_logit, float p_dropout, NVTE_QKV_Layout qkv_layout, NVTE_Bias_Type bias_type, - NVTE_Mask_Type mask_type, NVTE_Softmax_Type softmax_type, int64_t window_size_left, - int64_t window_size_right, bool bottom_right_diagonal, DType qkv_dtype, cudnnHandle_t handle); +std::string is_supported_f16_fwd(size_t batch, size_t num_attn_heads, size_t num_gqa_groups, + size_t max_seqlen_q, size_t max_seqlen_kv, size_t head_dim_qk, + size_t head_dim_v, bool is_training, bool return_max_logit, + float p_dropout, NVTE_QKV_Layout qkv_layout, + NVTE_Bias_Type bias_type, NVTE_Mask_Type mask_type, + NVTE_Softmax_Type softmax_type, int64_t window_size_left, + int64_t window_size_right, bool bottom_right_diagonal, + DType qkv_dtype, cudnnHandle_t handle); // check if a given configuration is supported for F16/BF16 backward; // if it is, cache the graph built for this config, and return an empty string; // if not, return a diagnostic message in the form of a string. 
-std::string is_supported_f16_bwd( - size_t batch, size_t num_attn_heads, size_t num_gqa_groups, size_t max_seqlen_q, - size_t max_seqlen_kv, size_t head_dim_qk, size_t head_dim_v, float p_dropout, - NVTE_QKV_Layout qkv_layout, NVTE_Bias_Type bias_type, NVTE_Mask_Type mask_type, - NVTE_Softmax_Type softmax_type, int64_t window_size_left, int64_t window_size_right, - bool bottom_right_diagonal, bool deterministic, DType qkv_dtype, cudnnHandle_t handle); +std::string is_supported_f16_bwd(size_t batch, size_t num_attn_heads, size_t num_gqa_groups, + size_t max_seqlen_q, size_t max_seqlen_kv, size_t head_dim_qk, + size_t head_dim_v, float p_dropout, NVTE_QKV_Layout qkv_layout, + NVTE_Bias_Type bias_type, NVTE_Mask_Type mask_type, + NVTE_Softmax_Type softmax_type, int64_t window_size_left, + int64_t window_size_right, bool bottom_right_diagonal, + bool deterministic, DType qkv_dtype, cudnnHandle_t handle); } // namespace transformer_engine diff --git a/transformer_engine/common/fused_attn/fused_attn_fp8.cu b/transformer_engine/common/fused_attn/fused_attn_fp8.cu index db4f25c05f..40b4aa4299 100644 --- a/transformer_engine/common/fused_attn/fused_attn_fp8.cu +++ b/transformer_engine/common/fused_attn/fused_attn_fp8.cu @@ -1325,13 +1325,14 @@ void fused_attn_fp8_bwd( } } -std::string is_supported_fp8_fwd( - size_t batch, size_t num_attn_heads, size_t num_gqa_groups, size_t max_seqlen_q, - size_t max_seqlen_kv, size_t head_dim_qk, size_t head_dim_v, bool is_training, float p_dropout, - NVTE_QKV_Layout qkv_layout, NVTE_Bias_Type bias_type, NVTE_Mask_Type mask_type, - NVTE_Softmax_Type softmax_type, int64_t window_size_left, int64_t window_size_right, - bool bottom_right_diagonal, DType qkv_dtype, DType o_dtype, NVTEScalingMode scaling_mode, - cudnnHandle_t handle) { +std::string is_supported_fp8_fwd(size_t batch, size_t num_attn_heads, size_t num_gqa_groups, + size_t max_seqlen_q, size_t max_seqlen_kv, size_t head_dim_qk, + size_t head_dim_v, bool is_training, float p_dropout, + NVTE_QKV_Layout qkv_layout, NVTE_Bias_Type bias_type, + NVTE_Mask_Type mask_type, NVTE_Softmax_Type softmax_type, + int64_t window_size_left, int64_t window_size_right, + bool bottom_right_diagonal, DType qkv_dtype, DType o_dtype, + NVTEScalingMode scaling_mode, cudnnHandle_t handle) { const NVTE_QKV_Format qkv_format = nvte_get_qkv_format(qkv_layout); if (qkv_format != NVTE_QKV_Format::NVTE_BSHD && qkv_format != NVTE_QKV_Format::NVTE_SBHD && qkv_format != NVTE_QKV_Format::NVTE_BHSD) { @@ -1352,8 +1353,8 @@ std::string is_supported_fp8_fwd( /*devPtrDescaleS=*/nullptr, /*devPtrScaleS=*/nullptr, /*devPtrScaleO=*/nullptr, /*devPtrAmaxO=*/nullptr, /*devPtrAmaxS=*/nullptr, /*devPtrcuSeqlensQ=*/nullptr, /*devPtrcuSeqlensKV=*/nullptr, /*devPtrDropoutSeed=*/nullptr, - /*devPtrDropoutOffset=*/nullptr, get_cudnn_fe_dtype(qkv_dtype), - get_cudnn_fe_dtype(o_dtype), scaling_mode, + /*devPtrDropoutOffset=*/nullptr, get_cudnn_fe_dtype(qkv_dtype), get_cudnn_fe_dtype(o_dtype), + scaling_mode, /*qkv_scale_inv_format=*/NVTE_QKV_Format::NVTE_QKV_Format_NOT_SET, /*workspace=*/nullptr, &workspace_size, /*stream=*/static_cast(0), handle); @@ -1365,13 +1366,14 @@ std::string is_supported_fp8_fwd( } } -std::string is_supported_fp8_bwd( - size_t batch, size_t num_attn_heads, size_t num_gqa_groups, size_t max_seqlen_q, - size_t max_seqlen_kv, size_t head_dim_qk, size_t head_dim_v, float p_dropout, - NVTE_QKV_Layout qkv_layout, NVTE_Bias_Type bias_type, NVTE_Mask_Type mask_type, - NVTE_Softmax_Type softmax_type, int64_t window_size_left, int64_t 
window_size_right, - bool bottom_right_diagonal, bool deterministic, DType qkv_dtype, DType o_dtype, - NVTEScalingMode scaling_mode, cudnnHandle_t handle) { +std::string is_supported_fp8_bwd(size_t batch, size_t num_attn_heads, size_t num_gqa_groups, + size_t max_seqlen_q, size_t max_seqlen_kv, size_t head_dim_qk, + size_t head_dim_v, float p_dropout, NVTE_QKV_Layout qkv_layout, + NVTE_Bias_Type bias_type, NVTE_Mask_Type mask_type, + NVTE_Softmax_Type softmax_type, int64_t window_size_left, + int64_t window_size_right, bool bottom_right_diagonal, + bool deterministic, DType qkv_dtype, DType o_dtype, + NVTEScalingMode scaling_mode, cudnnHandle_t handle) { const NVTE_QKV_Format qkv_format = nvte_get_qkv_format(qkv_layout); if (qkv_format != NVTE_QKV_Format::NVTE_BSHD && qkv_format != NVTE_QKV_Format::NVTE_SBHD && qkv_format != NVTE_QKV_Format::NVTE_BHSD) { diff --git a/transformer_engine/common/fused_attn/fused_attn_fp8.h b/transformer_engine/common/fused_attn/fused_attn_fp8.h index 96f5d54968..d52dfd246b 100644 --- a/transformer_engine/common/fused_attn/fused_attn_fp8.h +++ b/transformer_engine/common/fused_attn/fused_attn_fp8.h @@ -45,22 +45,24 @@ void fused_attn_fp8_bwd( // check if a given configuration is supported for FP8 forward; // if it is, cache the graph built for this config, and return an empty string; // if not, return a diagnostic message in the form of a string. -std::string is_supported_fp8_fwd( - size_t batch, size_t num_attn_heads, size_t num_gqa_groups, size_t max_seqlen_q, - size_t max_seqlen_kv, size_t head_dim_qk, size_t head_dim_v, bool is_training, float p_dropout, - NVTE_QKV_Layout qkv_layout, NVTE_Bias_Type bias_type, NVTE_Mask_Type mask_type, - NVTE_Softmax_Type softmax_type, int64_t window_size_left, int64_t window_size_right, - bool bottom_right_diagonal, DType qkv_dtype, DType o_dtype, NVTEScalingMode scaling_mode, - cudnnHandle_t handle); +std::string is_supported_fp8_fwd(size_t batch, size_t num_attn_heads, size_t num_gqa_groups, + size_t max_seqlen_q, size_t max_seqlen_kv, size_t head_dim_qk, + size_t head_dim_v, bool is_training, float p_dropout, + NVTE_QKV_Layout qkv_layout, NVTE_Bias_Type bias_type, + NVTE_Mask_Type mask_type, NVTE_Softmax_Type softmax_type, + int64_t window_size_left, int64_t window_size_right, + bool bottom_right_diagonal, DType qkv_dtype, DType o_dtype, + NVTEScalingMode scaling_mode, cudnnHandle_t handle); // check if a given configuration is supported for FP8 backward; // if it is, cache the graph built for this config, and return an empty string; // if not, return a diagnostic message in the form of a string. 
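As an illustration of how the two FP8 probes pair up for a training configuration (hypothetical helper, not part of the patch; the FP8-in / BF16-out dtype choice and all dimensions are placeholder assumptions):

// Probe FP8 forward first, then backward; an empty string means both graphs built and cached.
std::string check_fp8_training(NVTE_QKV_Layout qkv_layout, NVTE_Bias_Type bias_type,
                               NVTE_Mask_Type mask_type, NVTE_Softmax_Type softmax_type,
                               NVTEScalingMode scaling_mode, cudnnHandle_t handle) {
  const DType qkv_dtype = DType::kFloat8E4M3;  // placeholder FP8 input dtype
  const DType o_dtype = DType::kBFloat16;      // placeholder output dtype
  std::string reason = is_supported_fp8_fwd(
      /*batch=*/1, /*num_attn_heads=*/16, /*num_gqa_groups=*/16,
      /*max_seqlen_q=*/2048, /*max_seqlen_kv=*/2048, /*head_dim_qk=*/128, /*head_dim_v=*/128,
      /*is_training=*/true, /*p_dropout=*/0.0f, qkv_layout, bias_type, mask_type, softmax_type,
      /*window_size_left=*/-1, /*window_size_right=*/-1, /*bottom_right_diagonal=*/false,
      qkv_dtype, o_dtype, scaling_mode, handle);
  if (reason.empty()) {
    reason = is_supported_fp8_bwd(
        /*batch=*/1, /*num_attn_heads=*/16, /*num_gqa_groups=*/16,
        /*max_seqlen_q=*/2048, /*max_seqlen_kv=*/2048, /*head_dim_qk=*/128, /*head_dim_v=*/128,
        /*p_dropout=*/0.0f, qkv_layout, bias_type, mask_type, softmax_type,
        /*window_size_left=*/-1, /*window_size_right=*/-1, /*bottom_right_diagonal=*/false,
        /*deterministic=*/true, qkv_dtype, o_dtype, scaling_mode, handle);
  }
  return reason;  // empty => supported; otherwise the first rejection diagnostic
}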
-std::string is_supported_fp8_bwd( - size_t batch, size_t num_attn_heads, size_t num_gqa_groups, size_t max_seqlen_q, - size_t max_seqlen_kv, size_t head_dim_qk, size_t head_dim_v, float p_dropout, - NVTE_QKV_Layout qkv_layout, NVTE_Bias_Type bias_type, NVTE_Mask_Type mask_type, - NVTE_Softmax_Type softmax_type, int64_t window_size_left, int64_t window_size_right, - bool bottom_right_diagonal, bool deterministic, DType qkv_dtype, DType o_dtype, - NVTEScalingMode scaling_mode, cudnnHandle_t handle); +std::string is_supported_fp8_bwd(size_t batch, size_t num_attn_heads, size_t num_gqa_groups, + size_t max_seqlen_q, size_t max_seqlen_kv, size_t head_dim_qk, + size_t head_dim_v, float p_dropout, NVTE_QKV_Layout qkv_layout, + NVTE_Bias_Type bias_type, NVTE_Mask_Type mask_type, + NVTE_Softmax_Type softmax_type, int64_t window_size_left, + int64_t window_size_right, bool bottom_right_diagonal, + bool deterministic, DType qkv_dtype, DType o_dtype, + NVTEScalingMode scaling_mode, cudnnHandle_t handle); } // namespace transformer_engine From 6c5126db51cf52657eafa01d853503fd254113e2 Mon Sep 17 00:00:00 2001 From: Charlene Yang <8636796+cyanguwa@users.noreply.github.com> Date: Thu, 7 May 2026 17:22:09 -0700 Subject: [PATCH 06/11] fix compilation errors Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com> --- transformer_engine/common/fused_attn/fused_attn_fp8.cu | 6 ++++-- transformer_engine/common/fused_attn/fused_attn_fp8.h | 4 ++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/transformer_engine/common/fused_attn/fused_attn_fp8.cu b/transformer_engine/common/fused_attn/fused_attn_fp8.cu index 40b4aa4299..842e3958bc 100644 --- a/transformer_engine/common/fused_attn/fused_attn_fp8.cu +++ b/transformer_engine/common/fused_attn/fused_attn_fp8.cu @@ -1336,7 +1336,8 @@ std::string is_supported_fp8_fwd(size_t batch, size_t num_attn_heads, size_t num const NVTE_QKV_Format qkv_format = nvte_get_qkv_format(qkv_layout); if (qkv_format != NVTE_QKV_Format::NVTE_BSHD && qkv_format != NVTE_QKV_Format::NVTE_SBHD && qkv_format != NVTE_QKV_Format::NVTE_BHSD) { - return "FP8 fused attention supports BSHD/SBHD/BHSD formats, found " + qkv_format + "."; + return "FP8 fused attention supports BSHD/SBHD/BHSD formats, found " + + std::to_string(static_cast(qkv_format)) + "."; } size_t workspace_size = 0; try { @@ -1377,7 +1378,8 @@ std::string is_supported_fp8_bwd(size_t batch, size_t num_attn_heads, size_t num const NVTE_QKV_Format qkv_format = nvte_get_qkv_format(qkv_layout); if (qkv_format != NVTE_QKV_Format::NVTE_BSHD && qkv_format != NVTE_QKV_Format::NVTE_SBHD && qkv_format != NVTE_QKV_Format::NVTE_BHSD) { - return "FP8 fused attention supports BSHD/SBHD/BHSD formats, found " + qkv_format + "."; + return "FP8 fused attention supports BSHD/SBHD/BHSD formats, found " + + std::to_string(static_cast(qkv_format)) + "."; } const cudnn_frontend::DataType_t qkv_t = get_cudnn_fe_dtype(qkv_dtype); const cudnn_frontend::DataType_t o_t = get_cudnn_fe_dtype(o_dtype); diff --git a/transformer_engine/common/fused_attn/fused_attn_fp8.h b/transformer_engine/common/fused_attn/fused_attn_fp8.h index d52dfd246b..21487898a6 100644 --- a/transformer_engine/common/fused_attn/fused_attn_fp8.h +++ b/transformer_engine/common/fused_attn/fused_attn_fp8.h @@ -36,8 +36,8 @@ void fused_attn_fp8_bwd( NVTE_Softmax_Type softmax_type, size_t window_size_left, size_t window_size_right, bool bottom_right_diagonal, bool deterministic, const Tensor *input_Q, const Tensor *input_K, const Tensor *input_V, const Tensor 
*input_O, const Tensor *input_dO, - const Tensor *input_dO_f16, const Tensor *input_M, const Tensor *input_ZInv, - const Tensor *input_S, const Tensor *input_SoftmaxOffset, Tensor *input_output_dP, + const Tensor *input_dO_f16, const Tensor *input_M, const Tensor *input_S, + const Tensor *input_SoftmaxOffset, Tensor *input_output_dP, const Tensor *output_dQ, const Tensor *output_dK, const Tensor *output_dV, Tensor *output_dSoftmaxOffset, const Tensor *cu_seqlens_q, const Tensor *cu_seqlens_kv, const Tensor *rng_state, Tensor *workspace, cudaStream_t stream, cudnnHandle_t handle); From d35bff72911e239577e63aa99db96db7571dbb97 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 8 May 2026 00:23:00 +0000 Subject: [PATCH 07/11] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- transformer_engine/common/fused_attn/fused_attn_fp8.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/transformer_engine/common/fused_attn/fused_attn_fp8.h b/transformer_engine/common/fused_attn/fused_attn_fp8.h index 21487898a6..01c7561402 100644 --- a/transformer_engine/common/fused_attn/fused_attn_fp8.h +++ b/transformer_engine/common/fused_attn/fused_attn_fp8.h @@ -37,10 +37,10 @@ void fused_attn_fp8_bwd( bool bottom_right_diagonal, bool deterministic, const Tensor *input_Q, const Tensor *input_K, const Tensor *input_V, const Tensor *input_O, const Tensor *input_dO, const Tensor *input_dO_f16, const Tensor *input_M, const Tensor *input_S, - const Tensor *input_SoftmaxOffset, Tensor *input_output_dP, - const Tensor *output_dQ, const Tensor *output_dK, const Tensor *output_dV, - Tensor *output_dSoftmaxOffset, const Tensor *cu_seqlens_q, const Tensor *cu_seqlens_kv, - const Tensor *rng_state, Tensor *workspace, cudaStream_t stream, cudnnHandle_t handle); + const Tensor *input_SoftmaxOffset, Tensor *input_output_dP, const Tensor *output_dQ, + const Tensor *output_dK, const Tensor *output_dV, Tensor *output_dSoftmaxOffset, + const Tensor *cu_seqlens_q, const Tensor *cu_seqlens_kv, const Tensor *rng_state, + Tensor *workspace, cudaStream_t stream, cudnnHandle_t handle); // check if a given configuration is supported for FP8 forward; // if it is, cache the graph built for this config, and return an empty string; From f6fc58568823aae209065c387f55d59c44d2dd4f Mon Sep 17 00:00:00 2001 From: Charlene Yang <8636796+cyanguwa@users.noreply.github.com> Date: Thu, 7 May 2026 18:08:56 -0700 Subject: [PATCH 08/11] remove handle from API; add bottom_right_diagonal Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com> --- tests/jax/test_fused_attn.py | 1 + .../common/fused_attn/fused_attn.cpp | 49 ++++++++++-------- .../common/fused_attn/fused_attn_fp8.cu | 10 ---- .../include/transformer_engine/fused_attn.h | 51 ++++++++++--------- transformer_engine/jax/attention.py | 12 ++++- .../jax/cpp_extensions/attention.py | 3 ++ transformer_engine/jax/csrc/extensions.h | 3 +- .../jax/csrc/extensions/attention.cpp | 19 +++---- .../attention/dot_product_attention/utils.py | 3 +- transformer_engine/pytorch/csrc/extensions.h | 2 +- .../pytorch/csrc/extensions/attention.cpp | 8 ++- 11 files changed, 85 insertions(+), 76 deletions(-) diff --git a/tests/jax/test_fused_attn.py b/tests/jax/test_fused_attn.py index e8da8c7366..7dc7cc4c97 100644 --- a/tests/jax/test_fused_attn.py +++ b/tests/jax/test_fused_attn.py @@ -460,6 +460,7 @@ def _check_configs(self): self.head_dim_qk, 
self.head_dim_v, (-1, -1) if self.window_size is None else self.window_size, + self.attn_mask_type.is_bottom_right(), ).get_fused_attn_backend() if self.backend != NVTE_Fused_Attn_Backend.NVTE_F16_arbitrary_seqlen: pytest.skip(message) diff --git a/transformer_engine/common/fused_attn/fused_attn.cpp b/transformer_engine/common/fused_attn/fused_attn.cpp index 41607f05f7..9c5b91d1fc 100644 --- a/transformer_engine/common/fused_attn/fused_attn.cpp +++ b/transformer_engine/common/fused_attn/fused_attn.cpp @@ -246,12 +246,13 @@ NVTE_Fused_Attn_Backend nvte_get_fused_attn_backend( NVTE_Mask_Type attn_mask_type, NVTE_Softmax_Type softmax_type, float dropout, size_t num_attn_heads, size_t num_gqa_groups, size_t max_seqlen_q, size_t max_seqlen_kv, size_t head_dim_qk, size_t head_dim_v, int64_t window_size_left, int64_t window_size_right, - bool return_max_logit, bool cuda_graph, bool deterministic, cudnnHandle_t handle, + bool bottom_right_diagonal, bool return_max_logit, bool cuda_graph, bool deterministic, const char **message) { using namespace transformer_engine; set_message(message, ""); NVTE_CHECK(q_dtype == kv_dtype, "Q and KV must have the same data type."); + cudnnHandle_t handle = cudnnExecutionPlanManager::Instance().GetHandle(); const NVTE_QKV_Format qkv_format = nvte_get_qkv_format(qkv_layout); const NVTE_QKV_Layout_Group layout_group = nvte_get_qkv_layout_group(qkv_layout); const auto cudnn_runtime_version = cudnnGetVersion(); @@ -278,19 +279,10 @@ NVTE_Fused_Attn_Backend nvte_get_fused_attn_backend( return NVTE_Fused_Attn_Backend::NVTE_No_Backend; } - // avoid CUDA graph issue with cuDNN <= 9.15 - if (cudnn_runtime_version <= 91500 && is_training && - (qkv_format == NVTE_QKV_Format::NVTE_BSHD || qkv_format == NVTE_QKV_Format::NVTE_SBHD) && - (max_seqlen_kv % 128 != 0) && cuda_graph && - attn_mask_type != NVTE_Mask_Type::NVTE_PADDING_MASK && - attn_mask_type != NVTE_Mask_Type::NVTE_PADDING_CAUSAL_MASK && - attn_mask_type != NVTE_Mask_Type::NVTE_PADDING_CAUSAL_BOTTOM_RIGHT_MASK) { - set_message(message, "Known cuDNN <= 9.15 issue with CUDA graph. Please upgrade cuDNN."); - return NVTE_Fused_Attn_Backend::NVTE_No_Backend; - } - + // Use batch=1 for the probe to keep graph caches minimal; batch is not part of cuDNN-FE's + // support-check criteria. All other params are passed through verbatim so the cached graph + // matches what the eventual nvte_fused_attn_fwd/bwd call will build. 
constexpr size_t probe_batch = 1; - constexpr bool probe_bottom_right_diagonal = false; const bool is_fp8 = (q_dtype == NVTEDType::kNVTEFloat8E4M3 || q_dtype == NVTEDType::kNVTEFloat8E5M2); @@ -302,12 +294,18 @@ NVTE_Fused_Attn_Backend nvte_get_fused_attn_backend( set_message(message, "FP8 fused attention does not support return_max_logit=True."); return NVTE_Fused_Attn_Backend::NVTE_No_Backend; } + if (qkv_format != NVTE_QKV_Format::NVTE_BSHD && qkv_format != NVTE_QKV_Format::NVTE_SBHD && + qkv_format != NVTE_QKV_Format::NVTE_BHSD) { + set_message(message, "FP8 fused attention supports BSHD/SBHD/BHSD formats, found " + + std::to_string(static_cast<int>(qkv_format)) + "."); + return NVTE_Fused_Attn_Backend::NVTE_No_Backend; + } const DType qkv_t = static_cast<DType>(q_dtype); const DType o_t = static_cast<DType>(o_dtype); std::string fwd_reason = is_supported_fp8_fwd( probe_batch, num_attn_heads, num_gqa_groups, max_seqlen_q, max_seqlen_kv, head_dim_qk, head_dim_v, is_training, dropout, qkv_layout, bias_type, attn_mask_type, softmax_type, - window_size_left, window_size_right, probe_bottom_right_diagonal, qkv_t, o_t, scaling_mode, + window_size_left, window_size_right, bottom_right_diagonal, qkv_t, o_t, scaling_mode, handle); if (!fwd_reason.empty()) { set_message(message, fwd_reason); @@ -317,7 +315,7 @@ NVTE_Fused_Attn_Backend nvte_get_fused_attn_backend( std::string bwd_reason = is_supported_fp8_bwd( probe_batch, num_attn_heads, num_gqa_groups, max_seqlen_q, max_seqlen_kv, head_dim_qk, head_dim_v, dropout, qkv_layout, bias_type, attn_mask_type, softmax_type, - window_size_left, window_size_right, probe_bottom_right_diagonal, deterministic, qkv_t, + window_size_left, window_size_right, bottom_right_diagonal, deterministic, qkv_t, o_t, scaling_mode, handle); if (!bwd_reason.empty()) { set_message(message, bwd_reason); @@ -328,11 +326,20 @@ NVTE_Fused_Attn_Backend nvte_get_fused_attn_backend( if (is_f16_or_bf16) { + if (cudnn_runtime_version <= 91500 && is_training && + (qkv_format == NVTE_QKV_Format::NVTE_BSHD || qkv_format == NVTE_QKV_Format::NVTE_SBHD) && + (max_seqlen_kv % 128 != 0) && cuda_graph && + attn_mask_type != NVTE_Mask_Type::NVTE_PADDING_MASK && + attn_mask_type != NVTE_Mask_Type::NVTE_PADDING_CAUSAL_MASK && + attn_mask_type != NVTE_Mask_Type::NVTE_PADDING_CAUSAL_BOTTOM_RIGHT_MASK) { + set_message(message, "Known cuDNN <= 9.15 issue with CUDA graph.
Please upgrade cuDNN."); + return NVTE_Fused_Attn_Backend::NVTE_No_Backend; + } const DType qkv_t = static_cast(q_dtype); std::string fwd_reason = is_supported_f16_fwd( probe_batch, num_attn_heads, num_gqa_groups, max_seqlen_q, max_seqlen_kv, head_dim_qk, head_dim_v, is_training, return_max_logit, dropout, qkv_layout, bias_type, attn_mask_type, - softmax_type, window_size_left, window_size_right, probe_bottom_right_diagonal, qkv_t, + softmax_type, window_size_left, window_size_right, bottom_right_diagonal, qkv_t, handle); if (!fwd_reason.empty()) { set_message(message, fwd_reason); @@ -342,7 +349,7 @@ NVTE_Fused_Attn_Backend nvte_get_fused_attn_backend( std::string bwd_reason = is_supported_f16_bwd( probe_batch, num_attn_heads, num_gqa_groups, max_seqlen_q, max_seqlen_kv, head_dim_qk, head_dim_v, dropout, qkv_layout, bias_type, attn_mask_type, softmax_type, - window_size_left, window_size_right, probe_bottom_right_diagonal, deterministic, qkv_t, + window_size_left, window_size_right, bottom_right_diagonal, deterministic, qkv_t, handle); if (!bwd_reason.empty()) { set_message(message, bwd_reason); @@ -444,8 +451,8 @@ void nvte_fused_attn_fwd(const NVTETensor Q, const NVTETensor K, const NVTETenso NVTE_Fused_Attn_Backend fused_attention_backend = nvte_get_fused_attn_backend( is_training, Q_type, KV_type, O_type, scaling_mode, qkv_layout, bias_type, attn_mask_type, softmax_type, dropout, h_q, h_kv, max_seqlen_q, max_seqlen_kv, d_qk, d_v, window_size_left, - window_size_right, return_max_logit, cuda_graph, /*deterministic=*/false, handle, - /*message=*/nullptr); + window_size_right, bottom_right_diagonal, return_max_logit, cuda_graph, + /*deterministic=*/false, /*message=*/nullptr); if (fused_attention_backend == NVTE_Fused_Attn_Backend::NVTE_F16_arbitrary_seqlen) { fused_attn_arbitrary_seqlen_fwd( @@ -528,8 +535,8 @@ void nvte_fused_attn_bwd(const NVTETensor Q, const NVTETensor K, const NVTETenso NVTE_Fused_Attn_Backend fused_attention_backend = nvte_get_fused_attn_backend( /*is_training=*/true, Q_type, KV_type, O_type, scaling_mode, qkv_layout, bias_type, attn_mask_type, softmax_type, dropout, h_q, h_kv, max_seqlen_q, max_seqlen_kv, d_qk, d_v, - window_size_left, window_size_right, /*return_max_logit=*/false, cuda_graph, deterministic, - handle, /*message=*/nullptr); + window_size_left, window_size_right, bottom_right_diagonal, /*return_max_logit=*/false, + cuda_graph, deterministic, /*message=*/nullptr); if (fused_attention_backend == NVTE_Fused_Attn_Backend::NVTE_F16_arbitrary_seqlen) { size_t i = 0; diff --git a/transformer_engine/common/fused_attn/fused_attn_fp8.cu b/transformer_engine/common/fused_attn/fused_attn_fp8.cu index 842e3958bc..f4064a8d34 100644 --- a/transformer_engine/common/fused_attn/fused_attn_fp8.cu +++ b/transformer_engine/common/fused_attn/fused_attn_fp8.cu @@ -1334,11 +1334,6 @@ std::string is_supported_fp8_fwd(size_t batch, size_t num_attn_heads, size_t num bool bottom_right_diagonal, DType qkv_dtype, DType o_dtype, NVTEScalingMode scaling_mode, cudnnHandle_t handle) { const NVTE_QKV_Format qkv_format = nvte_get_qkv_format(qkv_layout); - if (qkv_format != NVTE_QKV_Format::NVTE_BSHD && qkv_format != NVTE_QKV_Format::NVTE_SBHD && - qkv_format != NVTE_QKV_Format::NVTE_BHSD) { - return "FP8 fused attention supports BSHD/SBHD/BHSD formats, found " + - std::to_string(static_cast(qkv_format)) + "."; - } size_t workspace_size = 0; try { fused_attn::fused_attn_fp8_fwd_impl( @@ -1376,11 +1371,6 @@ std::string is_supported_fp8_bwd(size_t batch, size_t num_attn_heads, size_t num 
bool deterministic, DType qkv_dtype, DType o_dtype, NVTEScalingMode scaling_mode, cudnnHandle_t handle) { const NVTE_QKV_Format qkv_format = nvte_get_qkv_format(qkv_layout); - if (qkv_format != NVTE_QKV_Format::NVTE_BSHD && qkv_format != NVTE_QKV_Format::NVTE_SBHD && - qkv_format != NVTE_QKV_Format::NVTE_BHSD) { - return "FP8 fused attention supports BSHD/SBHD/BHSD formats, found " + - std::to_string(static_cast(qkv_format)) + "."; - } const cudnn_frontend::DataType_t qkv_t = get_cudnn_fe_dtype(qkv_dtype); const cudnn_frontend::DataType_t o_t = get_cudnn_fe_dtype(o_dtype); const cudnn_frontend::DataType_t do_t = o_t; diff --git a/transformer_engine/common/include/transformer_engine/fused_attn.h b/transformer_engine/common/include/transformer_engine/fused_attn.h index 9bcbcc5716..85e3ea68ed 100644 --- a/transformer_engine/common/include/transformer_engine/fused_attn.h +++ b/transformer_engine/common/include/transformer_engine/fused_attn.h @@ -198,30 +198,31 @@ NVTE_QKV_Format nvte_get_kv_format(NVTE_QKV_Layout qkv_layout); /*! \brief Get fused attention backend based on input parameters. * - * \param[in] is_training Whether the model is in training mode. - * \param[in] q_dtype The data type of Tensor Q. - * \param[in] kv_dtype The data type of Tensors K, V. - * \param[in] o_dtype The data type of Tensor O. - * \param[in] scaling_mode Scaling mode of attention. - * \param[in] qkv_layout The layout of Tensors Q, K, V. - * \param[in] bias_type The attention bias type. - * \param[in] attn_mask_type The attention mask type. - * \param[in] softmax_type The attention softmax type. - * \param[in] dropout The dropout probability. - * \param[in] num_attn_heads The number of heads in Q. - * \param[in] num_gqa_groups The number of heads in K, V. - * \param[in] max_seqlen_q The sequence length of Q. - * \param[in] max_seqlen_kv The sequence length of K, V. - * \param[in] head_dim_qk The head dimension of Q, K. - * \param[in] head_dim_v The head dimension of V. - * \param[in] window_size_left Sliding window size (the left half). - * \param[in] window_size_right Sliding window size (the right half). - * \param[in] return_max_logit Whether to produce Max along with Stats. - * \param[in] cuda_graph Whether cuda graph capture is enabled or not. - * \param[in] deterministic Whether determinism is required or not. - * \param[in] handle cuDNN handle. - * \param[out] message Empty string on success, otherwise a diagnostic string - * describing why the configuration was rejected. + * \param[in] is_training Whether the model is in training mode. + * \param[in] q_dtype The data type of Tensor Q. + * \param[in] kv_dtype The data type of Tensors K, V. + * \param[in] o_dtype The data type of Tensor O. + * \param[in] scaling_mode Scaling mode of attention. + * \param[in] qkv_layout The layout of Tensors Q, K, V. + * \param[in] bias_type The attention bias type. + * \param[in] attn_mask_type The attention mask type. + * \param[in] softmax_type The attention softmax type. + * \param[in] dropout The dropout probability. + * \param[in] num_attn_heads The number of heads in Q. + * \param[in] num_gqa_groups The number of heads in K, V. + * \param[in] max_seqlen_q The sequence length of Q. + * \param[in] max_seqlen_kv The sequence length of K, V. + * \param[in] head_dim_qk The head dimension of Q, K. + * \param[in] head_dim_v The head dimension of V. + * \param[in] window_size_left Sliding window size (the left half). + * \param[in] window_size_right Sliding window size (the right half). 
+ * \param[in] bottom_right_diagonal Whether to align sliding window and ALiBi diagonal to the + * bottom right corner of the softmax matrix. + * \param[in] return_max_logit Whether to produce Max along with Stats. + * \param[in] cuda_graph Whether cuda graph capture is enabled or not. + * \param[in] deterministic Whether determinism is required or not. + * \param[out] message Empty string on success, otherwise a diagnostic string + * describing why the configuration was rejected. */ NVTE_Fused_Attn_Backend nvte_get_fused_attn_backend( bool is_training, NVTEDType q_dtype, NVTEDType kv_dtype, NVTEDType o_dtype, @@ -229,7 +230,7 @@ NVTE_Fused_Attn_Backend nvte_get_fused_attn_backend( NVTE_Mask_Type attn_mask_type, NVTE_Softmax_Type softmax_type, float dropout, size_t num_attn_heads, size_t num_gqa_groups, size_t max_seqlen_q, size_t max_seqlen_kv, size_t head_dim_qk, size_t head_dim_v, int64_t window_size_left, int64_t window_size_right, - bool return_max_logit, bool cuda_graph, bool deterministic, cudnnHandle_t handle, + bool bottom_right_diagonal, bool return_max_logit, bool cuda_graph, bool deterministic, const char **message); /*! \brief Compute dot product attention with separate Q, K and V. diff --git a/transformer_engine/jax/attention.py b/transformer_engine/jax/attention.py index f54a043fd2..d0e125297f 100644 --- a/transformer_engine/jax/attention.py +++ b/transformer_engine/jax/attention.py @@ -339,13 +339,22 @@ def is_fused_attn_kernel_available( head_dim_qk, head_dim_v, window_size: Optional[Tuple[int, int]] = None, + bottom_right_diagonal: Optional[bool] = None, ): """ - To check whether the fused attention kernel is supported + To check whether the fused attention kernel is supported. + + If ``bottom_right_diagonal`` is None, it is derived from the mask type, matching the + convention used everywhere else in JAX TE (see ``_FusedAttnConfig`` constructions). 
""" window_size_tuple = (-1, -1) if window_size is None else window_size def make_helper(attn_mask_type): + bottom_right = ( + attn_mask_type.is_bottom_right() + if bottom_right_diagonal is None + else bottom_right_diagonal + ) return tex.FusedAttnHelper( is_training, q_dtype, @@ -362,6 +371,7 @@ def make_helper(attn_mask_type): head_dim_qk, head_dim_v, window_size_tuple, + bottom_right, ) return make_helper(attn_mask_type).is_fused_attn_kernel_available() diff --git a/transformer_engine/jax/cpp_extensions/attention.py b/transformer_engine/jax/cpp_extensions/attention.py index 1631afe4f4..e6cbb10e44 100644 --- a/transformer_engine/jax/cpp_extensions/attention.py +++ b/transformer_engine/jax/cpp_extensions/attention.py @@ -122,6 +122,7 @@ class FusedAttnHelper: head_dim_qk: int head_dim_v: int window_size: Tuple[int, int] + bottom_right_diagonal: bool def is_fused_attn_kernel_available(self): """Check if there is available fused attention kernel""" @@ -154,6 +155,7 @@ def get_fused_attn_backend(self): self.head_dim_v, self.window_size[0], self.window_size[1], + self.bottom_right_diagonal, not self.is_non_deterministic_allowed(), ) @@ -359,6 +361,7 @@ def abstract( q_head_dim, v_head_dim, config.window_size, + config.bottom_right_diagonal, ).get_fused_attn_backend() if backend == NVTE_Fused_Attn_Backend.NVTE_F16_arbitrary_seqlen: diff --git a/transformer_engine/jax/csrc/extensions.h b/transformer_engine/jax/csrc/extensions.h index 629b6dc3bf..d958193a7d 100644 --- a/transformer_engine/jax/csrc/extensions.h +++ b/transformer_engine/jax/csrc/extensions.h @@ -154,7 +154,8 @@ std::tuple GetFusedAttnBackend( NVTE_QKV_Layout qkv_layout, NVTE_Bias_Type bias_type, NVTE_Mask_Type mask_type, NVTE_Softmax_Type softmax_type, float dropout_probability, size_t q_attn_heads, size_t kv_attn_heads, size_t q_max_seqlen, size_t kv_max_seqlen, size_t qk_head_dim, - size_t v_head_dim, int64_t window_size_left, int64_t window_size_right, bool deterministic); + size_t v_head_dim, int64_t window_size_left, int64_t window_size_right, + bool bottom_right_diagonal, bool deterministic); pybind11::tuple GetFusedAttnForwardWorkspaceSizes( size_t input_batch, size_t bias_batch, size_t q_max_seqlen, size_t kv_max_seqlen, diff --git a/transformer_engine/jax/csrc/extensions/attention.cpp b/transformer_engine/jax/csrc/extensions/attention.cpp index 4ce09368d8..d5673df8a5 100644 --- a/transformer_engine/jax/csrc/extensions/attention.cpp +++ b/transformer_engine/jax/csrc/extensions/attention.cpp @@ -5,7 +5,6 @@ ************************************************************************/ #include "../extensions.h" -#include "common/cudnn_utils.h" #include "transformer_engine/fused_attn.h" #include "transformer_engine/transformer_engine.h" @@ -17,15 +16,15 @@ std::tuple GetFusedAttnBackend( NVTE_QKV_Layout qkv_layout, NVTE_Bias_Type bias_type, NVTE_Mask_Type mask_type, NVTE_Softmax_Type softmax_type, float dropout_probability, size_t q_attn_heads, size_t kv_attn_heads, size_t q_max_seqlen, size_t kv_max_seqlen, size_t qk_head_dim, - size_t v_head_dim, int64_t window_size_left, int64_t window_size_right, bool deterministic) { - auto handle = cudnnExecutionPlanManager::Instance().GetHandle(); + size_t v_head_dim, int64_t window_size_left, int64_t window_size_right, + bool bottom_right_diagonal, bool deterministic) { const char *message = nullptr; auto backend = nvte_get_fused_attn_backend( is_training, static_cast(q_dtype), static_cast(kv_dtype), static_cast(o_dtype), scaling_mode, qkv_layout, bias_type, mask_type, softmax_type, 
dropout_probability, q_attn_heads, kv_attn_heads, q_max_seqlen, kv_max_seqlen, qk_head_dim, - v_head_dim, window_size_left, window_size_right, - /*return_max_logit=*/false, /*cuda_graph=*/false, deterministic, handle, &message); + v_head_dim, window_size_left, window_size_right, bottom_right_diagonal, + /*return_max_logit=*/false, /*cuda_graph=*/false, deterministic, &message); return {backend, message ? std::string(message) : std::string()}; } @@ -265,13 +264,12 @@ static void FusedAttnForwardImpl( /* Prepare RNG state */ auto rng_state_tensor = TensorWrapper(rng_state, std::vector{2}, DType::kInt64); - auto _handle_fwd = cudnnExecutionPlanManager::Instance().GetHandle(); auto backend = nvte_get_fused_attn_backend( is_training, static_cast(dtype), static_cast(dtype), static_cast(dtype), NVTE_INVALID_SCALING, qkv_layout, bias_type, mask_type, softmax_type, dropout_probability, attn_heads, num_gqa_groups, q_max_seqlen, kv_max_seqlen, - qk_head_dim, v_head_dim, window_size_left, window_size_right, /*return_max_logit=*/false, - /*cuda_graph=*/false, deterministic, _handle_fwd, /*message=*/nullptr); + qk_head_dim, v_head_dim, window_size_left, window_size_right, bottom_right_diagonal, + /*return_max_logit=*/false, /*cuda_graph=*/false, deterministic, /*message=*/nullptr); nvte_populate_rng_state_async(rng_state, seed, q_max_seqlen, kv_max_seqlen, backend, stream); /* Auxiliary tensors (to be propagated to the backward pass later) */ @@ -543,13 +541,12 @@ static void FusedAttnBackwardImpl( /* Auxiliary tensors (propagated from the forward pass) */ NVTETensorPack aux_input_tensors; nvte_tensor_pack_create(&aux_input_tensors); - auto _handle_bwd = cudnnExecutionPlanManager::Instance().GetHandle(); auto backend = nvte_get_fused_attn_backend( is_training, static_cast(dtype), static_cast(dtype), static_cast(dtype), NVTE_INVALID_SCALING, qkv_layout, bias_type, mask_type, softmax_type, dropout_probability, attn_heads, num_gqa_groups, q_max_seqlen, kv_max_seqlen, - qk_head_dim, v_head_dim, window_size_left, window_size_right, /*return_max_logit=*/false, - /*cuda_graph=*/false, deterministic, _handle_bwd, /*message=*/nullptr); + qk_head_dim, v_head_dim, window_size_left, window_size_right, bottom_right_diagonal, + /*return_max_logit=*/false, /*cuda_graph=*/false, deterministic, /*message=*/nullptr); PrepareFusedAttnBackwardAuxTensors(&aux_input_tensors, input_batch, bias_batch, attn_heads, bias_heads, q_max_seqlen, kv_max_seqlen, dtype, backend, softmax_aux, rng_state, bias, softmax_offset); diff --git a/transformer_engine/pytorch/attention/dot_product_attention/utils.py b/transformer_engine/pytorch/attention/dot_product_attention/utils.py index 7f97a1e0f2..38542586d2 100644 --- a/transformer_engine/pytorch/attention/dot_product_attention/utils.py +++ b/transformer_engine/pytorch/attention/dot_product_attention/utils.py @@ -1244,13 +1244,14 @@ def _is_fa3_supported(num_heads, num_gqa_groups, head_dim_qk, head_dim_v, qkv_dt head_dim_v, window_size[0], window_size[1], + bottom_right_diagonal, return_max_logit, cuda_graph, deterministic, ) if fused_attention_backend == FusedAttnBackend["No_Backend"]: logger.debug( - "Disabling FusedAttention as %s", + "Disabling FusedAttention: %s", reject_message, ) use_fused_attention = False diff --git a/transformer_engine/pytorch/csrc/extensions.h b/transformer_engine/pytorch/csrc/extensions.h index 733f98e575..016721f8b0 100644 --- a/transformer_engine/pytorch/csrc/extensions.h +++ b/transformer_engine/pytorch/csrc/extensions.h @@ -83,7 +83,7 @@ std::tuple 
<NVTE_Fused_Attn_Backend, std::string> get_fused_attn_backend( NVTE_Mask_Type attn_mask_type, NVTE_Softmax_Type softmax_type, float p_dropout, size_t num_attn_heads, size_t num_gqa_groups, size_t max_seqlen_q, size_t max_seqlen_kv, size_t head_dim_qk, size_t head_dim_v, int64_t window_size_left, int64_t window_size_right, - bool return_max_logit, bool cuda_graph, bool deterministic); + bool bottom_right_diagonal, bool return_max_logit, bool cuda_graph, bool deterministic); std::vector fused_attn_fwd( size_t max_seqlen_q, size_t max_seqlen_kv, bool is_training, float attn_scale, float p_dropout, diff --git a/transformer_engine/pytorch/csrc/extensions/attention.cpp b/transformer_engine/pytorch/csrc/extensions/attention.cpp index 4732d47908..2f5c7058c5 100644 --- a/transformer_engine/pytorch/csrc/extensions/attention.cpp +++ b/transformer_engine/pytorch/csrc/extensions/attention.cpp @@ -6,7 +6,6 @@ #include "../extensions.h" #include "common.h" -#include "common/cudnn_utils.h" #include "pybind.h" namespace { @@ -47,15 +46,14 @@ std::tuple<NVTE_Fused_Attn_Backend, std::string> get_fused_attn_backend( NVTE_Mask_Type attn_mask_type, NVTE_Softmax_Type softmax_type, float p_dropout, size_t num_attn_heads, size_t num_gqa_groups, size_t max_seqlen_q, size_t max_seqlen_kv, size_t head_dim_qk, size_t head_dim_v, int64_t window_size_left, int64_t window_size_right, - bool return_max_logit, bool cuda_graph, bool deterministic) { - auto handle = cudnnExecutionPlanManager::Instance().GetHandle(); + bool bottom_right_diagonal, bool return_max_logit, bool cuda_graph, bool deterministic) { const char *message = nullptr; NVTE_Fused_Attn_Backend fused_attention_backend = nvte_get_fused_attn_backend( is_training, static_cast<NVTEDType>(q_dtype), static_cast<NVTEDType>(kv_dtype), static_cast<NVTEDType>(o_dtype), scaling_mode, qkv_layout, bias_type, attn_mask_type, softmax_type, p_dropout, num_attn_heads, num_gqa_groups, max_seqlen_q, max_seqlen_kv, head_dim_qk, head_dim_v, window_size_left, window_size_right, bottom_right_diagonal, return_max_logit, cuda_graph, deterministic, &message); return {fused_attention_backend, message ?
std::string(message) : std::string()}; } From 3e666b0c59d90a2af5ab4be38b892e1549aefd91 Mon Sep 17 00:00:00 2001 From: Charlene Yang <8636796+cyanguwa@users.noreply.github.com> Date: Thu, 7 May 2026 18:30:39 -0700 Subject: [PATCH 09/11] add batch_size to API Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com> --- tests/jax/test_distributed_fused_attn.py | 3 +++ tests/jax/test_fused_attn.py | 1 + .../common/fused_attn/fused_attn.cpp | 19 +++++++------------ .../include/transformer_engine/fused_attn.h | 3 ++- transformer_engine/jax/attention.py | 2 ++ .../jax/cpp_extensions/attention.py | 4 ++++ transformer_engine/jax/csrc/extensions.h | 10 +++++----- .../jax/csrc/extensions/attention.cpp | 16 ++++++++-------- transformer_engine/jax/flax/transformer.py | 3 +++ .../attention/dot_product_attention/utils.py | 1 + transformer_engine/pytorch/csrc/extensions.h | 13 +++++++------ .../pytorch/csrc/extensions/attention.cpp | 15 ++++++++------- 12 files changed, 51 insertions(+), 39 deletions(-) diff --git a/tests/jax/test_distributed_fused_attn.py b/tests/jax/test_distributed_fused_attn.py index 50c5de1db7..39efabc598 100644 --- a/tests/jax/test_distributed_fused_attn.py +++ b/tests/jax/test_distributed_fused_attn.py @@ -75,6 +75,7 @@ def impl_test_self_attn( if not is_fused_attn_kernel_available( is_training, + batch, dtype, dtype, QKVLayout.BS3HD, @@ -227,6 +228,7 @@ def test_cross_attn( if not is_fused_attn_kernel_available( is_training, + batch, dtype, dtype, QKVLayout.BSHD_BS2HD, @@ -368,6 +370,7 @@ def impl_test_context_parallel_attn( def check_has_backend_for_mask(mask_type): return is_fused_attn_kernel_available( is_training, + batch, dtype, dtype, qkv_layout, diff --git a/tests/jax/test_fused_attn.py b/tests/jax/test_fused_attn.py index 7dc7cc4c97..88c485db81 100644 --- a/tests/jax/test_fused_attn.py +++ b/tests/jax/test_fused_attn.py @@ -446,6 +446,7 @@ def _check_configs(self): self.backend, message = FusedAttnHelper( self.is_training, + self.batch_size, self.dtype, self.dtype, self.qkv_layout, diff --git a/transformer_engine/common/fused_attn/fused_attn.cpp b/transformer_engine/common/fused_attn/fused_attn.cpp index 9c5b91d1fc..e0d524c783 100644 --- a/transformer_engine/common/fused_attn/fused_attn.cpp +++ b/transformer_engine/common/fused_attn/fused_attn.cpp @@ -241,7 +241,7 @@ void set_message(const char **message, const std::string &reason) { // select a backend for fused attention NVTE_Fused_Attn_Backend nvte_get_fused_attn_backend( - bool is_training, NVTEDType q_dtype, NVTEDType kv_dtype, NVTEDType o_dtype, + bool is_training, size_t batch_size, NVTEDType q_dtype, NVTEDType kv_dtype, NVTEDType o_dtype, NVTEScalingMode scaling_mode, NVTE_QKV_Layout qkv_layout, NVTE_Bias_Type bias_type, NVTE_Mask_Type attn_mask_type, NVTE_Softmax_Type softmax_type, float dropout, size_t num_attn_heads, size_t num_gqa_groups, size_t max_seqlen_q, size_t max_seqlen_kv, @@ -279,11 +279,6 @@ NVTE_Fused_Attn_Backend nvte_get_fused_attn_backend( return NVTE_Fused_Attn_Backend::NVTE_No_Backend; } - // Use batch=1 for the probe to keep graph caches minimal; batch is not part of cuDNN-FE's - // support-check criteria. All other params are passed through verbatim so the cached graph - // matches what the eventual nvte_fused_attn_fwd/bwd call will build. 
- constexpr size_t probe_batch = 1; - const bool is_fp8 = (q_dtype == NVTEDType::kNVTEFloat8E4M3 || q_dtype == NVTEDType::kNVTEFloat8E5M2); const bool is_f16_or_bf16 = @@ -303,7 +298,7 @@ NVTE_Fused_Attn_Backend nvte_get_fused_attn_backend( const DType qkv_t = static_cast(q_dtype); const DType o_t = static_cast(o_dtype); std::string fwd_reason = is_supported_fp8_fwd( - probe_batch, num_attn_heads, num_gqa_groups, max_seqlen_q, max_seqlen_kv, head_dim_qk, + batch_size, num_attn_heads, num_gqa_groups, max_seqlen_q, max_seqlen_kv, head_dim_qk, head_dim_v, is_training, dropout, qkv_layout, bias_type, attn_mask_type, softmax_type, window_size_left, window_size_right, bottom_right_diagonal, qkv_t, o_t, scaling_mode, handle); @@ -313,7 +308,7 @@ NVTE_Fused_Attn_Backend nvte_get_fused_attn_backend( } if (is_training) { std::string bwd_reason = is_supported_fp8_bwd( - probe_batch, num_attn_heads, num_gqa_groups, max_seqlen_q, max_seqlen_kv, head_dim_qk, + batch_size, num_attn_heads, num_gqa_groups, max_seqlen_q, max_seqlen_kv, head_dim_qk, head_dim_v, dropout, qkv_layout, bias_type, attn_mask_type, softmax_type, window_size_left, window_size_right, bottom_right_diagonal, deterministic, qkv_t, o_t, scaling_mode, handle); @@ -337,7 +332,7 @@ NVTE_Fused_Attn_Backend nvte_get_fused_attn_backend( } const DType qkv_t = static_cast(q_dtype); std::string fwd_reason = is_supported_f16_fwd( - probe_batch, num_attn_heads, num_gqa_groups, max_seqlen_q, max_seqlen_kv, head_dim_qk, + batch_size, num_attn_heads, num_gqa_groups, max_seqlen_q, max_seqlen_kv, head_dim_qk, head_dim_v, is_training, return_max_logit, dropout, qkv_layout, bias_type, attn_mask_type, softmax_type, window_size_left, window_size_right, bottom_right_diagonal, qkv_t, handle); @@ -347,7 +342,7 @@ NVTE_Fused_Attn_Backend nvte_get_fused_attn_backend( } if (is_training) { std::string bwd_reason = is_supported_f16_bwd( - probe_batch, num_attn_heads, num_gqa_groups, max_seqlen_q, max_seqlen_kv, head_dim_qk, + batch_size, num_attn_heads, num_gqa_groups, max_seqlen_q, max_seqlen_kv, head_dim_qk, head_dim_v, dropout, qkv_layout, bias_type, attn_mask_type, softmax_type, window_size_left, window_size_right, bottom_right_diagonal, deterministic, qkv_t, handle); @@ -449,7 +444,7 @@ void nvte_fused_attn_fwd(const NVTETensor Q, const NVTETensor K, const NVTETenso const NVTEScalingMode scaling_mode = input_Q->scaling_mode; NVTE_Fused_Attn_Backend fused_attention_backend = nvte_get_fused_attn_backend( - is_training, Q_type, KV_type, O_type, scaling_mode, qkv_layout, bias_type, attn_mask_type, + is_training, b, Q_type, KV_type, O_type, scaling_mode, qkv_layout, bias_type, attn_mask_type, softmax_type, dropout, h_q, h_kv, max_seqlen_q, max_seqlen_kv, d_qk, d_v, window_size_left, window_size_right, bottom_right_diagonal, return_max_logit, cuda_graph, /*deterministic=*/false, /*message=*/nullptr); @@ -533,7 +528,7 @@ void nvte_fused_attn_bwd(const NVTETensor Q, const NVTETensor K, const NVTETenso const NVTEScalingMode scaling_mode = input_Q->scaling_mode; NVTE_Fused_Attn_Backend fused_attention_backend = nvte_get_fused_attn_backend( - /*is_training=*/true, Q_type, KV_type, O_type, scaling_mode, qkv_layout, bias_type, + /*is_training=*/true, b, Q_type, KV_type, O_type, scaling_mode, qkv_layout, bias_type, attn_mask_type, softmax_type, dropout, h_q, h_kv, max_seqlen_q, max_seqlen_kv, d_qk, d_v, window_size_left, window_size_right, bottom_right_diagonal, /*return_max_logit=*/false, cuda_graph, deterministic, /*message=*/nullptr); diff --git 
a/transformer_engine/common/include/transformer_engine/fused_attn.h b/transformer_engine/common/include/transformer_engine/fused_attn.h index 85e3ea68ed..227afed24e 100644 --- a/transformer_engine/common/include/transformer_engine/fused_attn.h +++ b/transformer_engine/common/include/transformer_engine/fused_attn.h @@ -199,6 +199,7 @@ NVTE_QKV_Format nvte_get_kv_format(NVTE_QKV_Layout qkv_layout); /*! \brief Get fused attention backend based on input parameters. * * \param[in] is_training Whether the model is in training mode. + * \param[in] batch_size Batch size. * \param[in] q_dtype The data type of Tensor Q. * \param[in] kv_dtype The data type of Tensors K, V. * \param[in] o_dtype The data type of Tensor O. @@ -225,7 +226,7 @@ NVTE_QKV_Format nvte_get_kv_format(NVTE_QKV_Layout qkv_layout); * describing why the configuration was rejected. */ NVTE_Fused_Attn_Backend nvte_get_fused_attn_backend( - bool is_training, NVTEDType q_dtype, NVTEDType kv_dtype, NVTEDType o_dtype, + bool is_training, size_t batch_size, NVTEDType q_dtype, NVTEDType kv_dtype, NVTEDType o_dtype, NVTEScalingMode scaling_mode, NVTE_QKV_Layout qkv_layout, NVTE_Bias_Type bias_type, NVTE_Mask_Type attn_mask_type, NVTE_Softmax_Type softmax_type, float dropout, size_t num_attn_heads, size_t num_gqa_groups, size_t max_seqlen_q, size_t max_seqlen_kv, diff --git a/transformer_engine/jax/attention.py b/transformer_engine/jax/attention.py index d0e125297f..ac6cf8975c 100644 --- a/transformer_engine/jax/attention.py +++ b/transformer_engine/jax/attention.py @@ -325,6 +325,7 @@ def canonicalize_attn_mask_type(attn_mask_type: str): def is_fused_attn_kernel_available( is_training, + batch_size, q_dtype, kv_dtype, qkv_layout, @@ -357,6 +358,7 @@ def make_helper(attn_mask_type): ) return tex.FusedAttnHelper( is_training, + batch_size, q_dtype, kv_dtype, qkv_layout, diff --git a/transformer_engine/jax/cpp_extensions/attention.py b/transformer_engine/jax/cpp_extensions/attention.py index e6cbb10e44..2a533c3f3e 100644 --- a/transformer_engine/jax/cpp_extensions/attention.py +++ b/transformer_engine/jax/cpp_extensions/attention.py @@ -108,6 +108,7 @@ class FusedAttnHelper: """ is_training: bool + batch_size: int q_dtype: jnp.dtype kv_dtype: jnp.dtype qkv_layout: QKVLayout @@ -138,6 +139,7 @@ def get_fused_attn_backend(self): q_type = jax_dtype_to_te_dtype(self.q_dtype) return transformer_engine_jax.get_fused_attn_backend( self.is_training, + self.batch_size, q_type, jax_dtype_to_te_dtype(self.kv_dtype), q_type, @@ -345,8 +347,10 @@ def abstract( out_aval = q_aval.update(shape=output_shape, dtype=q_dtype) # backend determines the softmax buffer shape/dtype + input_batch = reduce(operator.mul, batch_shape) backend, message = FusedAttnHelper( config.is_training, + input_batch, q_dtype, k_dtype, config.qkv_layout, diff --git a/transformer_engine/jax/csrc/extensions.h b/transformer_engine/jax/csrc/extensions.h index d958193a7d..1e8d99c3d8 100644 --- a/transformer_engine/jax/csrc/extensions.h +++ b/transformer_engine/jax/csrc/extensions.h @@ -150,11 +150,11 @@ XLA_FFI_DECLARE_HANDLER_SYMBOL(FusedAttnBackwardHandler); // Returns (backend, message). `message` is empty on success, otherwise a diagnostic string // describing why the configuration was rejected when backend = NVTE_No_Backend. 
std::tuple GetFusedAttnBackend( - bool is_training, DType q_dtype, DType kv_dtype, DType o_dtype, NVTEScalingMode scaling_mode, - NVTE_QKV_Layout qkv_layout, NVTE_Bias_Type bias_type, NVTE_Mask_Type mask_type, - NVTE_Softmax_Type softmax_type, float dropout_probability, size_t q_attn_heads, - size_t kv_attn_heads, size_t q_max_seqlen, size_t kv_max_seqlen, size_t qk_head_dim, - size_t v_head_dim, int64_t window_size_left, int64_t window_size_right, + bool is_training, size_t batch_size, DType q_dtype, DType kv_dtype, DType o_dtype, + NVTEScalingMode scaling_mode, NVTE_QKV_Layout qkv_layout, NVTE_Bias_Type bias_type, + NVTE_Mask_Type mask_type, NVTE_Softmax_Type softmax_type, float dropout_probability, + size_t q_attn_heads, size_t kv_attn_heads, size_t q_max_seqlen, size_t kv_max_seqlen, + size_t qk_head_dim, size_t v_head_dim, int64_t window_size_left, int64_t window_size_right, bool bottom_right_diagonal, bool deterministic); pybind11::tuple GetFusedAttnForwardWorkspaceSizes( diff --git a/transformer_engine/jax/csrc/extensions/attention.cpp b/transformer_engine/jax/csrc/extensions/attention.cpp index d5673df8a5..5cd3265c3e 100644 --- a/transformer_engine/jax/csrc/extensions/attention.cpp +++ b/transformer_engine/jax/csrc/extensions/attention.cpp @@ -12,15 +12,15 @@ namespace transformer_engine { namespace jax { std::tuple GetFusedAttnBackend( - bool is_training, DType q_dtype, DType kv_dtype, DType o_dtype, NVTEScalingMode scaling_mode, - NVTE_QKV_Layout qkv_layout, NVTE_Bias_Type bias_type, NVTE_Mask_Type mask_type, - NVTE_Softmax_Type softmax_type, float dropout_probability, size_t q_attn_heads, - size_t kv_attn_heads, size_t q_max_seqlen, size_t kv_max_seqlen, size_t qk_head_dim, - size_t v_head_dim, int64_t window_size_left, int64_t window_size_right, + bool is_training, size_t batch_size, DType q_dtype, DType kv_dtype, DType o_dtype, + NVTEScalingMode scaling_mode, NVTE_QKV_Layout qkv_layout, NVTE_Bias_Type bias_type, + NVTE_Mask_Type mask_type, NVTE_Softmax_Type softmax_type, float dropout_probability, + size_t q_attn_heads, size_t kv_attn_heads, size_t q_max_seqlen, size_t kv_max_seqlen, + size_t qk_head_dim, size_t v_head_dim, int64_t window_size_left, int64_t window_size_right, bool bottom_right_diagonal, bool deterministic) { const char *message = nullptr; auto backend = nvte_get_fused_attn_backend( - is_training, static_cast(q_dtype), static_cast(kv_dtype), + is_training, batch_size, static_cast(q_dtype), static_cast(kv_dtype), static_cast(o_dtype), scaling_mode, qkv_layout, bias_type, mask_type, softmax_type, dropout_probability, q_attn_heads, kv_attn_heads, q_max_seqlen, kv_max_seqlen, qk_head_dim, v_head_dim, window_size_left, window_size_right, bottom_right_diagonal, @@ -265,7 +265,7 @@ static void FusedAttnForwardImpl( auto rng_state_tensor = TensorWrapper(rng_state, std::vector{2}, DType::kInt64); auto backend = nvte_get_fused_attn_backend( - is_training, static_cast(dtype), static_cast(dtype), + is_training, input_batch, static_cast(dtype), static_cast(dtype), static_cast(dtype), NVTE_INVALID_SCALING, qkv_layout, bias_type, mask_type, softmax_type, dropout_probability, attn_heads, num_gqa_groups, q_max_seqlen, kv_max_seqlen, qk_head_dim, v_head_dim, window_size_left, window_size_right, bottom_right_diagonal, @@ -542,7 +542,7 @@ static void FusedAttnBackwardImpl( NVTETensorPack aux_input_tensors; nvte_tensor_pack_create(&aux_input_tensors); auto backend = nvte_get_fused_attn_backend( - is_training, static_cast(dtype), static_cast(dtype), + is_training, input_batch, 
static_cast(dtype), static_cast(dtype), static_cast(dtype), NVTE_INVALID_SCALING, qkv_layout, bias_type, mask_type, softmax_type, dropout_probability, attn_heads, num_gqa_groups, q_max_seqlen, kv_max_seqlen, qk_head_dim, v_head_dim, window_size_left, window_size_right, bottom_right_diagonal, diff --git a/transformer_engine/jax/flax/transformer.py b/transformer_engine/jax/flax/transformer.py index a2e7920843..184547aa92 100644 --- a/transformer_engine/jax/flax/transformer.py +++ b/transformer_engine/jax/flax/transformer.py @@ -748,6 +748,8 @@ def __call__( enable_fused_attn = int(os.getenv("NVTE_FUSED_ATTN", "1")) sequence_dim = 0 if self.transpose_batch_sequence else 1 + batch_dim = 1 - sequence_dim + batch_size = query.shape[batch_dim] seqlen_q = query.shape[sequence_dim] if qkv_layout == QKVLayout.BS3HD: seqlen_kv = seqlen_q @@ -763,6 +765,7 @@ def __call__( has_fused_attn_kernel = is_fused_attn_kernel_available( # This needs to be fixed: TE-Jax has historically correlated training mode with deterministic mode. not deterministic, + batch_size, input_dtype, # self._assert_dtypes enforces Q, K, V, bias to have the same dtype so using input_dtype as kv dtype is sufficient input_dtype, diff --git a/transformer_engine/pytorch/attention/dot_product_attention/utils.py b/transformer_engine/pytorch/attention/dot_product_attention/utils.py index 38542586d2..52bb687851 100644 --- a/transformer_engine/pytorch/attention/dot_product_attention/utils.py +++ b/transformer_engine/pytorch/attention/dot_product_attention/utils.py @@ -1227,6 +1227,7 @@ def _is_fa3_supported(num_heads, num_gqa_groups, head_dim_qk, head_dim_v, qkv_dt kv_type = q_type fused_attention_backend, reject_message = tex.get_fused_attn_backend( is_training, + batch_size, q_type, kv_type, q_type, diff --git a/transformer_engine/pytorch/csrc/extensions.h b/transformer_engine/pytorch/csrc/extensions.h index 016721f8b0..205e7eb834 100644 --- a/transformer_engine/pytorch/csrc/extensions.h +++ b/transformer_engine/pytorch/csrc/extensions.h @@ -78,12 +78,13 @@ std::tuple moe_unpermute_bwd(at::Tensor input_bwd, at::T // Returns (backend, reason). `reason` is empty on success, otherwise a diagnostic string // describing why the configuration was rejected when backend = NVTE_No_Backend. 
std::tuple<NVTE_Fused_Attn_Backend, std::string> get_fused_attn_backend( - bool is_training, const DType q_dtype, const DType kv_dtype, const DType o_dtype, - NVTEScalingMode scaling_mode, NVTE_QKV_Layout qkv_layout, NVTE_Bias_Type bias_type, - NVTE_Mask_Type attn_mask_type, NVTE_Softmax_Type softmax_type, float p_dropout, - size_t num_attn_heads, size_t num_gqa_groups, size_t max_seqlen_q, size_t max_seqlen_kv, - size_t head_dim_qk, size_t head_dim_v, int64_t window_size_left, int64_t window_size_right, - bool bottom_right_diagonal, bool return_max_logit, bool cuda_graph, bool deterministic); + bool is_training, size_t batch_size, const DType q_dtype, const DType kv_dtype, + const DType o_dtype, NVTEScalingMode scaling_mode, NVTE_QKV_Layout qkv_layout, + NVTE_Bias_Type bias_type, NVTE_Mask_Type attn_mask_type, NVTE_Softmax_Type softmax_type, + float p_dropout, size_t num_attn_heads, size_t num_gqa_groups, size_t max_seqlen_q, + size_t max_seqlen_kv, size_t head_dim_qk, size_t head_dim_v, int64_t window_size_left, + int64_t window_size_right, bool bottom_right_diagonal, bool return_max_logit, bool cuda_graph, + bool deterministic); std::vector fused_attn_fwd( size_t max_seqlen_q, size_t max_seqlen_kv, bool is_training, float attn_scale, float p_dropout, diff --git a/transformer_engine/pytorch/csrc/extensions/attention.cpp b/transformer_engine/pytorch/csrc/extensions/attention.cpp index 2f5c7058c5..41dcd3301a 100644 --- a/transformer_engine/pytorch/csrc/extensions/attention.cpp +++ b/transformer_engine/pytorch/csrc/extensions/attention.cpp @@ -41,15 +41,16 @@ namespace transformer_engine::pytorch { // get the fused attention backend std::tuple<NVTE_Fused_Attn_Backend, std::string> get_fused_attn_backend( - bool is_training, const DType q_dtype, const DType kv_dtype, const DType o_dtype, - NVTEScalingMode scaling_mode, NVTE_QKV_Layout qkv_layout, NVTE_Bias_Type bias_type, - NVTE_Mask_Type attn_mask_type, NVTE_Softmax_Type softmax_type, float p_dropout, - size_t num_attn_heads, size_t num_gqa_groups, size_t max_seqlen_q, size_t max_seqlen_kv, - size_t head_dim_qk, size_t head_dim_v, int64_t window_size_left, int64_t window_size_right, - bool bottom_right_diagonal, bool return_max_logit, bool cuda_graph, bool deterministic) { + bool is_training, size_t batch_size, const DType q_dtype, const DType kv_dtype, + const DType o_dtype, NVTEScalingMode scaling_mode, NVTE_QKV_Layout qkv_layout, + NVTE_Bias_Type bias_type, NVTE_Mask_Type attn_mask_type, NVTE_Softmax_Type softmax_type, + float p_dropout, size_t num_attn_heads, size_t num_gqa_groups, size_t max_seqlen_q, + size_t max_seqlen_kv, size_t head_dim_qk, size_t head_dim_v, int64_t window_size_left, + int64_t window_size_right, bool bottom_right_diagonal, bool return_max_logit, bool cuda_graph, + bool deterministic) { const char *message = nullptr; NVTE_Fused_Attn_Backend fused_attention_backend = nvte_get_fused_attn_backend( - is_training, static_cast<NVTEDType>(q_dtype), static_cast<NVTEDType>(kv_dtype), + is_training, batch_size, static_cast<NVTEDType>(q_dtype), static_cast<NVTEDType>(kv_dtype), static_cast<NVTEDType>(o_dtype), scaling_mode, qkv_layout, bias_type, attn_mask_type, softmax_type, p_dropout, num_attn_heads, num_gqa_groups, max_seqlen_q, max_seqlen_kv, head_dim_qk, head_dim_v, window_size_left, window_size_right, bottom_right_diagonal, From 056aba6aebbfbab3e580d3155ccb07c076bcf940 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 8 May 2026 01:31:54 +0000 Subject: [PATCH 10/11] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci ---
transformer_engine/common/fused_attn/fused_attn.cpp | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/transformer_engine/common/fused_attn/fused_attn.cpp b/transformer_engine/common/fused_attn/fused_attn.cpp index e0d524c783..628bce1b54 100644 --- a/transformer_engine/common/fused_attn/fused_attn.cpp +++ b/transformer_engine/common/fused_attn/fused_attn.cpp @@ -310,8 +310,8 @@ NVTE_Fused_Attn_Backend nvte_get_fused_attn_backend( std::string bwd_reason = is_supported_fp8_bwd( batch_size, num_attn_heads, num_gqa_groups, max_seqlen_q, max_seqlen_kv, head_dim_qk, head_dim_v, dropout, qkv_layout, bias_type, attn_mask_type, softmax_type, - window_size_left, window_size_right, bottom_right_diagonal, deterministic, qkv_t, - o_t, scaling_mode, handle); + window_size_left, window_size_right, bottom_right_diagonal, deterministic, qkv_t, o_t, + scaling_mode, handle); if (!bwd_reason.empty()) { set_message(message, bwd_reason); return NVTE_Fused_Attn_Backend::NVTE_No_Backend; @@ -334,8 +334,7 @@ NVTE_Fused_Attn_Backend nvte_get_fused_attn_backend( std::string fwd_reason = is_supported_f16_fwd( batch_size, num_attn_heads, num_gqa_groups, max_seqlen_q, max_seqlen_kv, head_dim_qk, head_dim_v, is_training, return_max_logit, dropout, qkv_layout, bias_type, attn_mask_type, - softmax_type, window_size_left, window_size_right, bottom_right_diagonal, qkv_t, - handle); + softmax_type, window_size_left, window_size_right, bottom_right_diagonal, qkv_t, handle); if (!fwd_reason.empty()) { set_message(message, fwd_reason); return NVTE_Fused_Attn_Backend::NVTE_No_Backend; @@ -344,8 +343,7 @@ NVTE_Fused_Attn_Backend nvte_get_fused_attn_backend( std::string bwd_reason = is_supported_f16_bwd( batch_size, num_attn_heads, num_gqa_groups, max_seqlen_q, max_seqlen_kv, head_dim_qk, head_dim_v, dropout, qkv_layout, bias_type, attn_mask_type, softmax_type, - window_size_left, window_size_right, bottom_right_diagonal, deterministic, qkv_t, - handle); + window_size_left, window_size_right, bottom_right_diagonal, deterministic, qkv_t, handle); if (!bwd_reason.empty()) { set_message(message, bwd_reason); return NVTE_Fused_Attn_Backend::NVTE_No_Backend; From e054863c2fda315bd43866160e188f2be95e4aea Mon Sep 17 00:00:00 2001 From: Charlene Yang <8636796+cyanguwa@users.noreply.github.com> Date: Thu, 7 May 2026 20:34:12 -0700 Subject: [PATCH 11/11] fix jax binding Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com> --- transformer_engine/jax/csrc/extensions/pybind.cpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/transformer_engine/jax/csrc/extensions/pybind.cpp b/transformer_engine/jax/csrc/extensions/pybind.cpp index 70d0403b3e..2d55abedc6 100644 --- a/transformer_engine/jax/csrc/extensions/pybind.cpp +++ b/transformer_engine/jax/csrc/extensions/pybind.cpp @@ -206,6 +206,14 @@ PYBIND11_MODULE(transformer_engine_jax, m) { .value("NVFP4_2D_SCALING", JAXX_Scaling_Mode::NVFP4_2D_SCALING) .export_values(); + pybind11::enum_<NVTEScalingMode>(m, "NVTEScalingMode", pybind11::module_local()) + .value("NVTE_DELAYED_TENSOR_SCALING", NVTEScalingMode::NVTE_DELAYED_TENSOR_SCALING) + .value("NVTE_MXFP8_1D_SCALING", NVTEScalingMode::NVTE_MXFP8_1D_SCALING) + .value("NVTE_BLOCK_SCALING_1D", NVTEScalingMode::NVTE_BLOCK_SCALING_1D) + .value("NVTE_BLOCK_SCALING_2D", NVTEScalingMode::NVTE_BLOCK_SCALING_2D) + .value("NVTE_NVFP4_1D_SCALING", NVTEScalingMode::NVTE_NVFP4_1D_SCALING) + .value("NVTE_INVALID_SCALING", NVTEScalingMode::NVTE_INVALID_SCALING); + pybind11::enum_<JAXX_Quantize_Layout>(m, "JAXX_Quantize_Layout",
pybind11::module_local()) .value("ROWWISE", JAXX_Quantize_Layout::ROWWISE) .value("COLWISE", JAXX_Quantize_Layout::COLWISE)
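
For reference, a minimal caller-side sketch of the selector as it stands at the end of this series (signature taken from the final fused_attn.h diff above). The concrete dtype/layout/bias/mask/softmax enum values and sizes below are illustrative assumptions, not values shown in these patches; substitute whatever your configuration actually uses.

#include <cstdio>

#include <transformer_engine/fused_attn.h>

// Probe whether a fused backend exists for a BF16, causal, BSHD self-attention config.
// batch_size and bottom_right_diagonal are the two parameters added by this series;
// all concrete enum values and sizes here are illustrative only.
NVTE_Fused_Attn_Backend pick_backend(bool is_training) {
  const char *message = nullptr;
  NVTE_Fused_Attn_Backend backend = nvte_get_fused_attn_backend(
      is_training, /*batch_size=*/8, NVTEDType::kNVTEBFloat16, NVTEDType::kNVTEBFloat16,
      NVTEDType::kNVTEBFloat16, NVTEScalingMode::NVTE_DELAYED_TENSOR_SCALING,
      NVTE_QKV_Layout::NVTE_BSHD_BSHD_BSHD, NVTE_Bias_Type::NVTE_NO_BIAS,
      NVTE_Mask_Type::NVTE_CAUSAL_MASK, NVTE_Softmax_Type::NVTE_VANILLA_SOFTMAX,
      /*dropout=*/0.0f, /*num_attn_heads=*/16, /*num_gqa_groups=*/16,
      /*max_seqlen_q=*/2048, /*max_seqlen_kv=*/2048, /*head_dim_qk=*/128, /*head_dim_v=*/128,
      /*window_size_left=*/-1, /*window_size_right=*/-1, /*bottom_right_diagonal=*/false,
      /*return_max_logit=*/false, /*cuda_graph=*/false, /*deterministic=*/false, &message);
  if (backend == NVTE_Fused_Attn_Backend::NVTE_No_Backend && message != nullptr) {
    // `message` carries the rejection reason; it is an empty string when a backend is found.
    std::printf("fused attention unavailable: %s\n", message);
  }
  return backend;
}

The framework bindings in the patches above (GetFusedAttnBackend in JAX, get_fused_attn_backend in PyTorch) wrap exactly this call and hand (backend, message) back to Python as a tuple.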