Fix unit tests: build scanned NNX decoder layers with jax.lax.scan and propagate scan-axis sharding metadata

hsuan-lun-chiang · hsuan-lun-chiang · commit 80ebfcb4e266 · 2026-03-25T09:06:47.000Z
diff --git a/src/maxtext/layers/initializers.py b/src/maxtext/layers/initializers.py
@@ -94,6 +94,16 @@ def variable_to_logically_partitioned(variable: nnx.VariableState):
     out_sharding = metadata["sharding"]
 
   if out_sharding is not None:
+    if nnx.PARTITION_NAME in metadata:
+      partition_name = metadata[nnx.PARTITION_NAME]
+      scan_axis = metadata.get("param_scan_axis", 0) if variable.type == nnx.Param else 0
+
+      sharding_list = [out_sharding] if isinstance(out_sharding, str) else list(out_sharding)
+      if partition_name not in sharding_list:
+        sharding_list.insert(scan_axis, partition_name)
+
+      out_sharding = tuple(sharding_list)
+
     return nn.LogicallyPartitioned(  # type: ignore[wrong-keyword-args]
         variable.value,
         out_sharding,  # type: ignore[arg-type]
diff --git a/src/maxtext/layers/nnx_decoders.py b/src/maxtext/layers/nnx_decoders.py
@@ -71,7 +71,7 @@
 
 class NNXDecoderLayer(nnx.Module):
   """
-  Transformer decoder layer converted to NNX.
+  Transformer decoder layer converted to NNX
   """
 
   def __init__(
@@ -307,11 +307,10 @@ def __init__(
         dense_cls, moe_cls = decoder_block_classes
 
         num_dense = config.first_num_dense_layers
-        self.dense_layers = self._create_scanned_layers(dense_cls, length=num_dense, rngs=rngs)
-
+        self.dense_layers = self._create_scanned_layers(dense_cls, length=num_dense, metadata_axis_name="dense_layers", rngs=rngs)
         num_moe = config.num_decoder_layers - config.first_num_dense_layers
-
-        self.moe_layers = self._create_scanned_layers(moe_cls, length=num_moe, rngs=rngs)
+        self.moe_layers = self._create_scanned_layers(moe_cls, length=num_moe, metadata_axis_name="moe_layers", rngs=rngs)      
+      
       elif self.is_gemma3:
         attention_pattern_length = len(gemma3.GEMMA3_ATTENTION_PATTERN)
         scan_length = config.num_decoder_layers // attention_pattern_length
@@ -323,7 +322,9 @@ def __init__(
         RemattedGemma3Block = gemma3.Gemma3ScannableBlock
 
         if scan_length > 0:
-          self.layers = self._create_scanned_layers(RemattedGemma3Block, length=scan_length, rngs=rngs, **layer_kwargs)
+          self.layers = self._create_scanned_layers(
+              RemattedGemma3Block, length=scan_length, metadata_axis_name="layers", rngs=rngs, **layer_kwargs
+          )
         self.layers_remainder = RemattedGemma3Block(
             config=self.config, mesh=mesh, quant=self.quant, model_mode=self.model_mode, **rem_layer_kwargs, rngs=rngs
         )  # pytype: disable=wrong-keyword-args
@@ -338,7 +339,9 @@ def __init__(
           }
 
         if num_layers > 0:
-          self.layers = self._create_scanned_layers(layer_cls, length=num_layers, rngs=rngs, **layer_kwargs)
+          self.layers = self._create_scanned_layers(
+              layer_cls, length=num_layers, metadata_axis_name="layers", rngs=rngs, **layer_kwargs
+          )        
         else:
           self.layers = nnx.List([])
    
@@ -390,34 +393,80 @@ def _create_single_layer(self, decoder_layer_class, rngs, **kwargs):
       )
       return nnx_wrappers.ToNNX(layer_linen, rngs=rngs)
 
-  def _create_scanned_layers(self, decoder_layer_class, length: int, rngs: nnx.Rngs, **layer_kwargs):
-    """Creates a VMapped stack of layers, forcing parameter init for Compact modules."""
+  def _create_scanned_layers(self, decoder_layer_class, length: int, metadata_axis_name: str, rngs: nnx.Rngs, **layer_kwargs):
+    """Creates a scanned stack of layers using jax.lax.scan for memory-efficient initialization.
 
-    def create_layer_fn(rng):
-      layer = decoder_layer_class(
-          config=self.config, mesh=self.mesh, quant=self.quant, model_mode=self.model_mode, rngs=rng, **layer_kwargs
-      )
-
-      return layer
+    Uses jax.lax.scan instead of nnx.vmap to reduce peak memory during initialization.
+    With vmap, all layers' parameters are created simultaneously (O(N) peak memory).
+    With scan, parameters are created one layer at a time (O(1) peak intermediate memory),
+    which prevents OOM on memory-constrained devices like TPU v6e-4.
+    """
+    scan_axis = self.config.param_scan_axis
 
-    # Workaround for Deepseek MTP test failure.
-    # TODO: Handle this properly.
+    # Fork rngs into per-layer RNG states. NOTE: if fork() raises, forked_rngs stays unbound and nnx.split below fails.
     try:
       forked_rngs = rngs.fork(split=length)
-
     except:  # pylint: disable=bare-except
       pass
 
-    out_axes = nnx.StateAxes({nnx.Param: self.config.param_scan_axis, ...: 0})
-    layers_vmapped = nnx.vmap(
-        create_layer_fn,
-        in_axes=0,
-        out_axes=out_axes,
-        axis_name="layers",
-        transform_metadata={nnx.PARTITION_NAME: "layers"},
-    )(forked_rngs)
+    rngs_graphdef, rngs_state = nnx.split(forked_rngs)
+
+    # Create a reference layer to capture the module graph structure (graphdef).
+    # This layer's params are discarded — only the structure is kept.
+    # Must use the first slice of the forked rngs (not a dummy Rngs(0)) so the
+    # graphdef has the same number of RNG state leaves as the scan-created layers.
+    first_rng_state = jax.tree.map(lambda x: x[0], rngs_state)
+    ref_rngs = nnx.merge(rngs_graphdef, first_rng_state)
+    ref_layer = decoder_layer_class(
+        config=self.config, mesh=self.mesh, quant=self.quant,
+        model_mode=self.model_mode, rngs=ref_rngs, **layer_kwargs
+    )
+    layer_graphdef, _, _ = nnx.split(ref_layer, nnx.Param, ...)
+    del ref_layer
+
+    # Sequentially create each layer's parameters via jax.lax.scan.
+    # The scan body is traced once; XLA executes it N times with different RNG keys,
+    # keeping only one layer's intermediate state alive at a time.
+    def scan_body(carry, rng_state_slice):
+      layer_rngs = nnx.merge(rngs_graphdef, rng_state_slice)
+      layer = decoder_layer_class(
+          config=self.config, mesh=self.mesh, quant=self.quant,
+          model_mode=self.model_mode, rngs=layer_rngs, **layer_kwargs
+      )
+      _, params, rest = nnx.split(layer, nnx.Param, ...)
+      return carry, (params, rest)
+
+    _, (stacked_params, stacked_rest) = jax.lax.scan(scan_body, None, rngs_state)
 
-    return layers_vmapped
+    # jax.lax.scan stacks outputs along axis 0. Move params to the configured scan axis.
+    if scan_axis != 0:
+      stacked_params = jax.tree.map(lambda x: jnp.moveaxis(x, 0, scan_axis), stacked_params)
+
+    # Add partition metadata that nnx.vmap's transform_metadata would normally set.
+    # This metadata is read by variable_to_logically_partitioned() in initializers.py
+    # and by nnx.get_partition_spec() (via the updated out_sharding) to produce
+    # correct sharding specs that include the scan axis dimension.
+    def _add_scan_metadata(state, axis):
+      def _update_leaf(leaf):
+        if isinstance(leaf, nnx.VariableState):
+          metadata = leaf.get_metadata()
+          metadata[nnx.PARTITION_NAME] = metadata_axis_name
+          metadata["param_scan_axis"] = axis
+          # Insert the scan axis name into out_sharding so that
+          # nnx.get_partition_spec returns specs matching the actual tensor rank.
+          # Without this, scanned params gain a stacked layer dimension that the unscanned specs lack.
+          if "out_sharding" in metadata and metadata["out_sharding"]:
+            sharding = list(metadata["out_sharding"])
+            sharding.insert(axis, metadata_axis_name)
+            metadata["out_sharding"] = tuple(sharding)
+          return leaf.replace(**metadata)
+        return leaf
+      return jax.tree.map(_update_leaf, state, is_leaf=lambda x: isinstance(x, nnx.VariableState))
+
+    stacked_params = _add_scan_metadata(stacked_params, scan_axis)
+    stacked_rest = _add_scan_metadata(stacked_rest, 0)
+
+    return nnx.merge(layer_graphdef, stacked_params, stacked_rest)
 
   def _apply_layer_with_remat(self, layer: nnx.Module, y: jax.Array, policy: Any, prevent_cse: bool, **kwargs):
     """Helper to cleanly apply jax.checkpoint to a single unscanned layer or block."""
@@ -439,9 +488,7 @@ def _apply_layers_sequentially(self, layers, x_in, *args, length: int, **kwargs)
     """Runs the layer stack using nnx.scan."""
     policy = self.get_remat_policy()
     prevent_cse = maxtext_utils.should_prevent_cse_in_remat(self.config)
-    graphdef, params, state = nnx.split(
-        layers, nnx.Param, ...
-    )  # state: the mutable state we carry (KV cache, RNGs, etc.)
+    graphdef, params, state = nnx.split(layers, nnx.Param, ...)
 
     scan_axis = self.config.param_scan_axis
     if scan_axis != 0:
@@ -451,6 +498,13 @@ def _apply_layers_sequentially(self, layers, x_in, *args, length: int, **kwargs)
     sig = inspect.signature(layer_cls.__call__)
     valid_kwargs = {k: v for k, v in kwargs.items() if k in sig.parameters or "kwargs" in sig.parameters}
 
+    def _extract_matching_state(template, full):
+      if isinstance(template, nnx.State):
+        return nnx.State({k: _extract_matching_state(v, full[k]) for k, v in template.items()})
+      elif isinstance(template, dict):
+        return {k: _extract_matching_state(v, full[k]) for k, v in template.items()}
+      return full
+
     def layer_fn(carry, scanned_vars):
       current_params, current_state = scanned_vars
 
@@ -460,20 +514,28 @@ def layer_fn(carry, scanned_vars):
       layer = nnx.merge(graphdef, current_params, current_state)
       layer_out = layer(carry, *args, **valid_kwargs)
       new_carry = layer_out[0] if isinstance(layer_out, tuple) else layer_out
-      new_current_state = nnx.state(layer)
-
+      
+      new_full_state = nnx.state(layer)
+      new_current_state = _extract_matching_state(current_state, new_full_state)
+      
+      # ONLY return non-param state to prevent memory duplication of weights
       return new_carry, new_current_state
 
     layer_fn = jax.checkpoint(layer_fn, policy=policy, prevent_cse=prevent_cse)
 
-    final_carry, scanned_state = jax.lax.scan(layer_fn, x_in, (params, state))
+    final_carry, scanned_other = jax.lax.scan(layer_fn, x_in, (params, state))
 
     if scan_axis != 0:
-      scanned_params, scanned_other = scanned_state.split(nnx.Param, ...)
-      scanned_params = jax.tree.map(lambda x: jnp.moveaxis(x, 0, scan_axis), scanned_params)
-      scanned_state = nnx.State.merge(scanned_params, scanned_other)
-
-    return final_carry, nnx.merge(graphdef, scanned_state)
+      params = jax.tree.map(lambda x: jnp.moveaxis(x, 0, scan_axis), params)
+      
+    scanned_state = nnx.State.merge(params, scanned_other)
+    # Update the existing module in-place rather than creating a new one.
+    # Creating a new module via nnx.merge and reassigning (self.layers = new_module)
+    # would replace a child node in the NNX graph, which is detected as a graph
+    # structure mutation when the parent module is inside a JAX transformation
+    # (e.g., nnx.jit in PeftTrainer). In-place update preserves object identity.
+    nnx.update(layers, scanned_state)
+    return final_carry, layers
 
   def get_decoder_layers(self):
     """Retrieves decoder layer classes based on config using a dictionary lookup."""
@@ -1159,7 +1221,7 @@ def decoder_as_linen(
     model_mode: str,
     quant: None | Quant = None,
 ):
-  """Creates a Decoder module."""
+  """Creates a Decoder module"""
   module = nnx_wrappers.to_linen(
       NNXDecoder,
       config=config,
@@ -1171,4 +1233,4 @@ def decoder_as_linen(
       abstract_init=False,
       metadata_fn=initializers.variable_to_logically_partitioned,
   )
-  return module
+  return module
diff --git a/src/maxtext/layers/quantizations.py b/src/maxtext/layers/quantizations.py
@@ -26,6 +26,7 @@
 from aqt.jax.v2 import tiled_dot_general
 from aqt.jax.v2 import calibration
 
+from maxtext.layers import nnx_wrappers
 import qwix
 from qwix._src.core import dot_general_qt
 
@@ -285,7 +286,7 @@ class Fp8Quantization(Quantization):
 
   def dot_general_cls(self, mesh_axes: Tuple[str, ...] = ()):
     """Returns dot_general configured with aqt params."""
-    return nn.Fp8DirectDotGeneralOp
+    return nnx_wrappers.ToNNX(nn.Fp8DirectDotGeneralOp)
 
   def einsum(self, dtype: DType = jnp.float32):
     return _Fp8EinsumWrapper(dtype=dtype)
diff --git a/src/maxtext/models/models.py b/src/maxtext/models/models.py
@@ -33,7 +33,7 @@
 from maxtext.layers.decoders import Decoder
 from maxtext.layers.embeddings import Embed, embed_as_linen
 from maxtext.layers.encoders import AudioEncoder, VisionEncoder, audio_encoder_as_linen, vision_encoder_as_linen
-from maxtext.layers.multi_token_prediction import multi_token_prediction_block_as_linen
+from maxtext.layers.multi_token_prediction import MultiTokenPredictionBlock, multi_token_prediction_block_as_linen
 from maxtext.layers.quantizations import AqtQuantization as Quant
 from maxtext.multimodal import processor as mm_processor
 from maxtext.utils import max_utils
@@ -376,25 +376,12 @@ def __init__(
       # For MTP, we use the DecoderLayer blueprint to ensure architectural consistency.
       # By convention, this is the last layer in the list.
       mtp_layer = layer_types[-1]
-      mtp_block_linen = multi_token_prediction_block_as_linen(
+      self.mtp_block = MultiTokenPredictionBlock(
           config=self.config,
           mesh=self.mesh,
           transformer_layer_module=mtp_layer,
           decoder=self.decoder,
           rngs=rngs,
-          name="mtp_block",
-      )
-      self.mtp_block = nnx_wrappers.ToNNX(mtp_block_linen, rngs=rngs)
-
-      self.mtp_block.lazy_init(
-          shared_embedding=self.token_embedder,
-          main_hidden_state=jnp.ones((1, 1, self.config.emb_dim), dtype=self.config.dtype),
-          input_ids=jnp.ones((1, 1), dtype=jnp.int32),
-          target_ids=jnp.ones((1, 1), dtype=jnp.int32),
-          target_mask=jnp.ones((1, 1), dtype=jnp.int32),
-          position_ids=jnp.ones((1, 1), dtype=jnp.int32),
-          decoder_segment_ids=jnp.ones((1, 1), dtype=jnp.int32),
-          deterministic=True,
       )
 
   def no_op(self, *args, **kwargs):
diff --git a/tests/unit/a_max_utils_test.py b/tests/unit/a_max_utils_test.py