Fix unit test errors

Charles Li · Charles Li · commit b732cb3ead70 · 2026-03-09T00:33:27.000Z
diff --git a/src/maxtext/configs/base.yml b/src/maxtext/configs/base.yml
@@ -1079,7 +1079,7 @@ subslice_shape: ""
 
 # NNX
 enable_nnx: false
-pure_nnx_decoder: True
+pure_nnx_decoder: false 
 
 ################################## Qwen3-Next Specific Configs ##################################
 # Kernel size for the 1D convolution in the Gated Delta Net
diff --git a/src/maxtext/layers/attentions.py b/src/maxtext/layers/attentions.py
@@ -533,14 +533,14 @@ def __init__(
     elif self.is_qwen3_next:
       self.query_norm = Qwen3NextRMSNorm(
           num_features=self.config.head_dim,
-          eps=self.config.normalization_layer_epsilon,
+          epsilon=self.config.normalization_layer_epsilon,
           dtype=self.config.dtype,
           weight_dtype=self.config.weight_dtype,
           rngs=self.rngs,
       )
       self.key_norm = Qwen3NextRMSNorm(
           num_features=self.config.head_dim,
-          eps=self.config.normalization_layer_epsilon,
+          epsilon=self.config.normalization_layer_epsilon,
           dtype=self.config.dtype,
           weight_dtype=self.config.weight_dtype,
           rngs=self.rngs,
diff --git a/src/maxtext/layers/nnx_decoders.py b/src/maxtext/layers/nnx_decoders.py
@@ -668,7 +668,7 @@ def get_norm_layer(self, num_features: int, rngs: nnx.Rngs):
       )
     elif self.config.decoder_block == DecoderBlockType.QWEN3_NEXT:
       return functools.partial(
-          normalizations.Qwen3NextRMSNorm, num_features=num_features, shard_mode=self.config.shard_mode
+          normalizations.Qwen3NextRMSNorm, num_features=num_features, shard_mode=self.config.shard_mode, rngs=rngs
       )
     else:
       raise ValueError(f"Incorrect decoder_block name {self.config.decoder_block.value=}")
diff --git a/src/maxtext/layers/normalizations.py b/src/maxtext/layers/normalizations.py
@@ -102,7 +102,17 @@ def __call__(self, x: jnp.ndarray, out_sharding: NamedSharding | None = None) ->
     return y_flat.reshape(input_shape)
 
 
-def Qwen3NextRMSNorm(num_features: int, eps: float, dtype: DType, weight_dtype: DType, *, rngs: nnx.Rngs):
+def Qwen3NextRMSNorm(
+    num_features: int,
+    epsilon: float,
+    dtype: DType,
+    weight_dtype: DType,
+    shard_mode: ShardMode = ShardMode.AUTO,
+    kernel_axes: tuple[None | str, ...] = (),
+    parameter_memory_host_offload: bool = False,
+    *,
+    rngs: nnx.Rngs,
+):
   """
   Used for input and post attention layernorms
   in Qwen3NextDecoderLayer.
@@ -115,10 +125,13 @@ def Qwen3NextRMSNorm(num_features: int, eps: float, dtype: DType, weight_dtype:
   return nnx.data(
       RMSNorm(
           num_features=num_features,
-          epsilon=eps,
+          epsilon=epsilon,
           dtype=dtype,
           weight_dtype=weight_dtype,
+          shard_mode=shard_mode,
+          kernel_axes=kernel_axes,
           scale_init=linen_initializers.zeros,
+          parameter_memory_host_offload=parameter_memory_host_offload,
           scale_offset=1.0,
           rngs=rngs,
       )
diff --git a/src/maxtext/models/models.py b/src/maxtext/models/models.py
@@ -27,7 +27,7 @@
 
 from maxtext.common.common_types import Config, DECODING_ACTIVE_SEQUENCE_INDICATOR, MODEL_MODE_AUTOREGRESSIVE, MODEL_MODE_TRAIN
 from maxtext.inference import page_manager
-from maxtext.layers.nnx_decoders import NNXDecoder, decoder_as_linen
+from maxtext.layers.nnx_decoders import NNXDecoder
 from maxtext.layers import initializers
 from maxtext.layers import nnx_wrappers
 from maxtext.layers.decoders import Decoder
@@ -88,12 +88,7 @@ def setup(self):
     )
     self.vision_encoder = vision_encoder_as_linen(config=cfg, mesh=mesh) if cfg.use_multimodal else None
     self.audio_encoder = audio_encoder_as_linen(config=cfg, mesh=mesh) if cfg.use_audio else None
-    if cfg.pure_nnx_decoder:
-      self.decoder = decoder_as_linen(
-          config=cfg, mesh=mesh, quant=self.quant, model_mode=self.model_mode, rngs=nnx.Rngs(0)
-      )
-    else:
-      self.decoder = Decoder(config=cfg, mesh=mesh, quant=self.quant, model_mode=self.model_mode)
+    self.decoder = Decoder(config=cfg, mesh=mesh, quant=self.quant, model_mode=self.model_mode)
 
     # If MTP is enabled via config, set up the MTP block.
     if self.config.mtp_num_layers > 0:
diff --git a/src/maxtext/models/qwen3.py b/src/maxtext/models/qwen3.py
@@ -962,7 +962,7 @@ def __init__(
     # First LayerNorm, applied before the attention block.
     self.input_layernorm = Qwen3NextRMSNorm(
         num_features=cfg.emb_dim,
-        eps=cfg.normalization_layer_epsilon,
+        epsilon=cfg.normalization_layer_epsilon,
         dtype=cfg.dtype,
         weight_dtype=cfg.weight_dtype,
         rngs=rngs,
@@ -987,7 +987,7 @@ def __init__(
     # Second LayerNorm, applied before the MoE block.
     self.post_attention_layernorm = Qwen3NextRMSNorm(
         num_features=cfg.emb_dim,
-        eps=cfg.normalization_layer_epsilon,
+        epsilon=cfg.normalization_layer_epsilon,
         dtype=cfg.dtype,
         weight_dtype=cfg.weight_dtype,
         rngs=rngs,
diff --git a/tests/unit/qwen3_next_vs_reference_test.py b/tests/unit/qwen3_next_vs_reference_test.py
diff --git a/tests/unit/train_compile_test.py b/tests/unit/train_compile_test.py

Original file line number	Diff line number	Diff line change
`@@ -668,7 +668,7 @@ def get_norm_layer(self, num_features: int, rngs: nnx.Rngs):`
`668`	`668`	`)`
`669`	`669`	`elif self.config.decoder_block == DecoderBlockType.QWEN3_NEXT:`
`670`	`670`	`return functools.partial(`
`671`		`- normalizations.Qwen3NextRMSNorm, num_features=num_features, shard_mode=self.config.shard_mode`
	`671`	`+ normalizations.Qwen3NextRMSNorm, num_features=num_features, shard_mode=self.config.shard_mode, rngs=rngs`
`672`	`672`	`)`
`673`	`673`	`else:`
`674`	`674`	`raise ValueError(f"Incorrect decoder_block name {self.config.decoder_block.value=}")`