
Commit b0ed511

fhoushmand authored and Google-ML-Automation committed
Force layout on Q for MLA.
This helps in non-Pallas splash attention and removes copies when num_heads is 128.

Major-to-minor layout, original query: 1, 2, 192, 1024, 128
Major-to-minor layout, attention expectation: 1, 2, 128, 192, 1024

PiperOrigin-RevId: 855382451
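For intuition, "major to minor" describes the physical memory order of the query's axes, not its logical shape. A quick NumPy sketch of the reordering (shapes here are illustrative, not the ones above; the actual constraint is applied in attention_mla.py below):

```python
import numpy as np

# Row-major [B, S, N, H]: major_to_minor is (0, 1, 2, 3), so H varies fastest.
q = np.zeros((2, 1024, 128, 128), dtype=np.float32)

# Forcing major_to_minor=(0, 2, 3, 1) corresponds to a physical [B, N, H, S]
# order: the same elements, with SEQ as the fastest-varying axis in memory.
q_physical = np.ascontiguousarray(q.transpose(0, 2, 3, 1))
print(q_physical.shape)  # (2, 128, 128, 1024)
```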
1 parent 4bcee99 commit b0ed511

4 files changed: 27 additions & 8 deletions


src/MaxText/configs/base.yml

Lines changed: 2 additions & 0 deletions
```diff
@@ -1016,3 +1016,5 @@ use_jax_splash: false
 vllm_hf_config_path: ""
 # JSON string containing additional configuration for the vLLM model (e.g. '{"maxtext_config": {...}}')
 vllm_additional_config: {}
+# When use_jax_splash=True, force the layout of the query tensor to be [..., NUM_HEADS, HEAD_DIM, SEQ_LENGTH]
+force_q_layout: false
```

src/MaxText/configs/types.py

Lines changed: 5 additions & 0 deletions
```diff
@@ -480,6 +480,7 @@ class Attention(BaseModel):
   enable_padding_causal_mask: bool = Field(True, description="Temporary flag for TE padding.")
   use_tokamax_splash: bool = Field(False, description="Whether to use tokamax splash attention.")
   use_jax_splash: bool = Field(False, description="Whether to use jax splash attention.")
+  force_q_layout: bool = Field(False, description="Force the Q layout")
 
 
 class MoBa(BaseModel):
@@ -2231,6 +2232,10 @@ def calculate_global_batch_sizes(per_device_batch_size, expansion_factor, num_de
         "Muon dimension numbers haven't been tested for this model. Run this command first: "
         f"`python3 -m MaxText.muon_utils {self.model_name} True`"
     )
+    if self.force_q_layout and not self.use_jax_splash:
+      raise ValueError(
+          "`force_q_layout` can only be true if `use_jax_splash` is also true."
+      )
 
   # I. FINAL TYPE CONVERSIONS AND DERIVED LISTS
   # Create the ici_parallelism and dcn_parallelism lists for legacy compatibility.
```
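A minimal runnable sketch of the new guard, assuming a Pydantic v2 model; `AttentionConfig` is hypothetical — in MaxText the fields live on the larger config models in types.py:

```python
from pydantic import BaseModel, Field, model_validator

class AttentionConfig(BaseModel):
  use_jax_splash: bool = Field(False, description="Whether to use jax splash attention.")
  force_q_layout: bool = Field(False, description="Force the Q layout")

  @model_validator(mode="after")
  def _check_force_q_layout(self):
    # Mirrors the validation added above: the layout override only makes
    # sense on the jax splash attention path.
    if self.force_q_layout and not self.use_jax_splash:
      raise ValueError("`force_q_layout` can only be true if `use_jax_splash` is also true.")
    return self

AttentionConfig(use_jax_splash=True, force_q_layout=True)  # OK
# AttentionConfig(force_q_layout=True)  # raises ValueError
```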

src/MaxText/kernels/jax_flash_attention.py

Lines changed: 7 additions & 6 deletions
```diff
@@ -107,14 +107,14 @@ def flash_attention_block_masked(
   # `l` is initialized to 0 since no blocks have been processed yet and the sum
   # is 0.
   l = jnp.zeros(
-      (batch_size, num_kv_heads, q_groups, q_seq_len), dtype=jnp.float32
+      (batch_size, num_kv_heads, q_groups, q_seq_len), dtype=data_type
   )
   # `m` is initialized to the mask_value so that the first block's maximum logit
   # correctly becomes the running maximum.
   m = jnp.full(
       (batch_size, num_kv_heads, q_groups, q_seq_len),
       mask_value,
-      dtype=jnp.float32,
+      dtype=data_type,
   )
 
   output = jnp.zeros(
@@ -138,11 +138,12 @@ def outer_loop_body(j, carried):
     def inner_loop_body(i, carried_inner):
       output, l, m = carried_inner
 
+      # let's get the slice of Q in N dimension
+      q_slice = jax.lax.dynamic_slice_in_dim(q, i * block_q, block_q, axis=-2)
+
       # Calculates the attention computation (Q@K.T)@V with online softmax for
       # the current query and key/value blocks.
       def compute_attention_block(output, l, m):
-        # let's get the slice of Q in N dimension
-        q_slice = jax.lax.dynamic_slice_in_dim(q, i * block_q, block_q, axis=-2)
         output_i_slice = jax.lax.dynamic_slice_in_dim(
             output, i * block_q, block_q, axis=-2
         )
@@ -156,7 +157,7 @@ def compute_attention_block(output, l, m):
             "bxhqc,bxkc->bxhqk",
             q_slice,
             k_j_slice,
-            preferred_element_type=jnp.float32,
+            preferred_element_type=data_type,
         )
         full_mask_i_j_slice = jax.lax.dynamic_slice(
             mask_full,
@@ -193,7 +194,7 @@ def compute_attention_block(output, l, m):
 
         output_i_slice_new = numerator / divider
         output = jax.lax.dynamic_update_index_in_dim(
-            output, output_i_slice_new.astype(data_type), i * block_q, axis=-2
+            output, output_i_slice_new, i * block_q, axis=-2
         )
         l = jax.lax.dynamic_update_index_in_dim(
             l, l_i_new, i * block_q, axis=-1
```
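Two things change in this file: the running sum `l`, running max `m`, and the Q@K.T product now use the kernel's `data_type` instead of hard-coded `jnp.float32` (so the final `astype(data_type)` cast is dropped), and the Q block slice is hoisted out of `compute_attention_block` so it is taken once per inner-loop step. A standalone sketch of the slicing, with shapes following the kernel's `bxhqc` einsum convention (sizes are illustrative):

```python
import jax
import jax.numpy as jnp

batch, kv_heads, q_groups, q_seq_len, head_dim = 1, 2, 4, 1024, 128
block_q = 256
q = jnp.zeros((batch, kv_heads, q_groups, q_seq_len, head_dim), dtype=jnp.bfloat16)

# Slice the i-th block of Q along the sequence axis (axis=-2), as in the kernel.
i = 3
q_slice = jax.lax.dynamic_slice_in_dim(q, i * block_q, block_q, axis=-2)
print(q_slice.shape)  # (1, 2, 4, 256, 128)
```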

src/MaxText/layers/attention_mla.py

Lines changed: 13 additions & 2 deletions
```diff
@@ -16,10 +16,17 @@
 
 import math
 from typing import Any, Optional, Tuple
-
+import jax
 from jax.ad_checkpoint import checkpoint_name
-from jax.sharding import Mesh, NamedSharding
+from jax.experimental import layout
 import jax.numpy as jnp
+from jax.sharding import Mesh, NamedSharding
+
+Layout = layout.Format
+if jax.__version_info__ >= (0, 6, 3):
+  DLL = layout.Layout
+else:
+  DLL = layout.DeviceLocalLayout  # type: ignore
 
 from flax import nnx
@@ -738,6 +745,10 @@ def __call__(
     out_logical_name = (BATCH, LENGTH_NO_EXP, HEAD, D_KV)
 
     query = self.mla_query_projection(inputs_q, inputs_positions, model_mode)
+    if self.config.force_q_layout:
+      query = layout.with_layout_constraint(
+          query, DLL(major_to_minor=(0, 2, 3, 1))
+      )
     key, value, cached_values = self.mla_kv_projection(
         inputs_kv, inputs_positions, decoder_segment_ids, model_mode, previous_chunk
     )
```