Skip to content
Draft

Test #3422

Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions src/maxtext/configs/base.yml
Original file line number Diff line number Diff line change
Expand Up @@ -1111,8 +1111,8 @@ position_id_per_seconds: 25
subslice_shape: ""

# NNX
enable_nnx: false
pure_nnx_decoder: false
enable_nnx: true
pure_nnx_decoder: true

################################## Qwen3-Next Specific Configs ##################################
# Kernel size for the 1D convolution in the Gated Delta Net
Expand Down
4 changes: 2 additions & 2 deletions src/maxtext/layers/attentions.py
Original file line number Diff line number Diff line change
Expand Up @@ -534,14 +534,14 @@ def __init__(
elif self.is_qwen3_next:
self.query_norm = Qwen3NextRMSNorm(
num_features=self.config.head_dim,
eps=self.config.normalization_layer_epsilon,
epsilon=self.config.normalization_layer_epsilon,
dtype=self.config.dtype,
weight_dtype=self.config.weight_dtype,
rngs=self.rngs,
)
self.key_norm = Qwen3NextRMSNorm(
num_features=self.config.head_dim,
eps=self.config.normalization_layer_epsilon,
epsilon=self.config.normalization_layer_epsilon,
dtype=self.config.dtype,
weight_dtype=self.config.weight_dtype,
rngs=self.rngs,
Expand Down
13 changes: 13 additions & 0 deletions src/maxtext/layers/initializers.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,19 @@ def variable_to_logically_partitioned(variable: nnx.VariableState):
out_sharding = metadata["sharding"]

if out_sharding is not None:
if nnx.PARTITION_NAME in metadata:
partition_name = metadata[nnx.PARTITION_NAME]
# Only nnx.Param variables are typically scanned across the param_scan_axis
scan_axis = metadata.get("param_scan_axis", 0) if variable.type == nnx.Param else 0

# Check if the scan axis name was already inserted into out_sharding
# (e.g., by _create_scanned_layers). Skip insertion to avoid duplicates.
sharding_list = [out_sharding] if isinstance(out_sharding, str) else list(out_sharding)
if partition_name not in sharding_list:
sharding_list.insert(scan_axis, partition_name)

out_sharding = tuple(sharding_list)

return nn.LogicallyPartitioned( # type: ignore[wrong-keyword-args]
variable.value,
out_sharding, # type: ignore[arg-type]
Expand Down
245 changes: 171 additions & 74 deletions src/maxtext/layers/nnx_decoders.py

Large diffs are not rendered by default.

17 changes: 15 additions & 2 deletions src/maxtext/layers/normalizations.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,17 @@ def __call__(self, x: jnp.ndarray, out_sharding: NamedSharding | None = None) ->
return y_flat.reshape(input_shape)


def Qwen3NextRMSNorm(num_features: int, eps: float, dtype: DType, weight_dtype: DType, *, rngs: nnx.Rngs):
def Qwen3NextRMSNorm(
num_features: int,
epsilon: float,
dtype: DType,
weight_dtype: DType,
shard_mode: ShardMode = ShardMode.AUTO,
kernel_axes: tuple[None | str, ...] = (),
parameter_memory_host_offload: bool = False,
*,
rngs: nnx.Rngs,
):
"""
Used for input and post attention layernorms
in Qwen3NextDecoderLayer.
Expand All @@ -115,10 +125,13 @@ def Qwen3NextRMSNorm(num_features: int, eps: float, dtype: DType, weight_dtype:
return nnx.data(
RMSNorm(
num_features=num_features,
epsilon=eps,
epsilon=epsilon,
dtype=dtype,
weight_dtype=weight_dtype,
shard_mode=shard_mode,
kernel_axes=kernel_axes,
scale_init=linen_initializers.zeros,
parameter_memory_host_offload=parameter_memory_host_offload,
scale_offset=1.0,
rngs=rngs,
)
Expand Down
4 changes: 2 additions & 2 deletions src/maxtext/models/qwen3.py
Original file line number Diff line number Diff line change
Expand Up @@ -962,7 +962,7 @@ def __init__(
# First LayerNorm, applied before the attention block.
self.input_layernorm = Qwen3NextRMSNorm(
num_features=cfg.emb_dim,
eps=cfg.normalization_layer_epsilon,
epsilon=cfg.normalization_layer_epsilon,
dtype=cfg.dtype,
weight_dtype=cfg.weight_dtype,
rngs=rngs,
Expand All @@ -987,7 +987,7 @@ def __init__(
# Second LayerNorm, applied before the MoE block.
self.post_attention_layernorm = Qwen3NextRMSNorm(
num_features=cfg.emb_dim,
eps=cfg.normalization_layer_epsilon,
epsilon=cfg.normalization_layer_epsilon,
dtype=cfg.dtype,
weight_dtype=cfg.weight_dtype,
rngs=rngs,
Expand Down
File renamed without changes.
Loading
Loading