Skip to content

Commit 0d02ff3

Browse files
committed
NNX migration: NNX utils
- Add utils to manipulate the NNX shardings with abstract state of a model - also add unit tests for the utils - Extract mesh creation function to maxtext_utils.get_mesh_from_config() - also add unit tests for this func Note: flax v0.12 has DeprecationWarning in multiple places: - DeprecationWarning: '.value' access is now deprecated. Use variable.get_value() or variable[...] (for [Array]). - DeprecationWarning: 'VariableState' was removed, this is just an alias to 'Variable'. Plase use 'Variable' directly instead. But since the code needs to work with post-training, which currently requires flax v0.11, we didn't change code for these warnings.
1 parent efa44ad commit 0d02ff3

5 files changed

Lines changed: 386 additions & 23 deletions

File tree

src/MaxText/maxtext_utils.py

Lines changed: 28 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717

1818
import functools
1919
import pickle
20+
from typing import Sequence
2021

2122
from flax import linen as nn
2223
from flax.linen import partitioning as nn_partitioning
@@ -26,6 +27,7 @@
2627

2728
from jax.experimental import mesh_utils
2829
from jax.experimental.serialize_executable import deserialize_and_load
30+
from jax.sharding import AxisType, Mesh
2931

3032
import jax
3133
import jax.numpy as jnp
@@ -39,8 +41,9 @@
3941
from MaxText import max_logging
4042
from MaxText import max_utils
4143
from MaxText import multimodal_utils
44+
from MaxText import pyconfig
4245
from MaxText import sharding
43-
from MaxText.common_types import DecoderBlockType, MODEL_MODE_PREFILL, MODEL_MODE_AUTOREGRESSIVE
46+
from MaxText.common_types import DecoderBlockType, MODEL_MODE_PREFILL, MODEL_MODE_AUTOREGRESSIVE, ShardMode
4447
from MaxText.inference.page_manager import PageState
4548

4649
OVERWRITE_WITH_GRADIENT = "_overwrite_with_gradient"
@@ -1178,3 +1181,27 @@ def print_state_mesh_shardings_params(state, state_sharding, mesh):
11781181
shape = jax.typeof(leaf_val)
11791182
pspec = sharding.remove_size_one_mesh_axis(leaf_sharding.spec, mesh)
11801183
max_logging.log(f"{path_str:.<80} {shape} {tuple(pspec)}")
1184+
1185+
1186+
def get_mesh_from_config(
    config: pyconfig.HyperParameters,
    devices: Sequence[jax.Device] | None = None,
) -> Mesh:
  """
  Get the device mesh from the configuration.

  Args:
    config: the configuration
    devices: the devices to build the mesh from; defaults to all available devices

  Returns:
    the device mesh
  """
  devices_array = create_device_mesh(config, devices)

  # Every mesh axis gets the same axis type: Explicit sharding propagates
  # shardings through jax types; Auto lets the compiler decide.
  axis_type = AxisType.Explicit if config.shard_mode == ShardMode.EXPLICIT else AxisType.Auto
  axis_types = (axis_type,) * len(config.mesh_axes)

  return Mesh(devices_array, config.mesh_axes, axis_types=axis_types)

src/MaxText/maxtext_utils_nnx.py

Lines changed: 138 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,138 @@
1+
# Copyright 2023–2026 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# https://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
""" Utils for MaxText NNX. """
15+
16+
from functools import partial
17+
from typing import Any, Callable
18+
19+
from flax import nnx
20+
import jax
21+
from jax.sharding import Mesh, NamedSharding
22+
23+
from MaxText import max_logging
24+
from MaxText import pyconfig
25+
26+
27+
def create_nnx_rngs(
    config: pyconfig.HyperParameters, is_training: bool = True, rng_key: jax.Array | None = None
) -> nnx.Rngs:
  """
  Create NNX Rngs.

  Args:
    config: the configuration
    is_training: if the Rngs are for training
    rng_key: the Rng key

  Returns:
    The NNX Rngs
  """
  base_key = jax.random.PRNGKey(config.init_weights_seed) if rng_key is None else rng_key

  if not is_training:
    # disable dropout RNG and aqt for inference
    return nnx.Rngs(params=base_key)

  # Derive independent streams for params, dropout and aqt from the base key.
  params_key, dropout_key, aqt_key = (jax.random.fold_in(base_key, i) for i in range(3))
  return nnx.Rngs(params=params_key, dropout=dropout_key, aqt=aqt_key)
49+
50+
51+
def get_named_sharding_nnx(abstract_state: Any) -> Any:
  """Get named sharding from NNX abstract state.

  Args:
    abstract_state: NNX model abstract state created from nnx.get_abstract_model.

  Returns:
    named sharding structure
  """

  def _is_abstract_leaf(node: Any) -> bool:
    # Treat the abstract array descriptors themselves as tree leaves.
    return isinstance(node, jax.ShapeDtypeStruct)

  def _leaf_sharding(leaf: jax.ShapeDtypeStruct) -> Any:
    return leaf.sharding

  # Don't use nnx.get_named_sharding() because it constructs new shardings.
  # Instead, read the sharding already attached to each existing
  # jax.ShapeDtypeStruct(shape, dtype, sharding) leaf of the abstract state.
  return jax.tree.map(_leaf_sharding, abstract_state, is_leaf=_is_abstract_leaf)
68+
69+
70+
def set_named_sharding_nnx(abstract_state: Any, named_sharding: Any) -> Any:
  """Set named sharding to NNX abstract state.

  Args:
    abstract_state: NNX model abstract state created from nnx.get_abstract_model().
    named_sharding: named sharding. It must have the same tree structure with abstract_state.

  Returns:
    updated abstract_state
  """

  def _with_sharding(leaf: jax.ShapeDtypeStruct, new_sharding: Any) -> jax.ShapeDtypeStruct:
    # Rebuild each leaf so its shape/dtype are preserved and only the sharding changes.
    return jax.ShapeDtypeStruct(leaf.shape, leaf.dtype, sharding=new_sharding)

  return jax.tree.map(_with_sharding, abstract_state, named_sharding)
81+
82+
83+
def move_memory_to_host(path: tuple[str, ...], x: NamedSharding) -> NamedSharding:
  """
  Change the memory_kind of the NamedSharding to "pinned_host". This function can be
  called by jax.tree_util.tree_map_with_path on a NNX state structure.

  Args:
    path: the tree path tuple
    x: the NamedSharding corresponding to the path

  Returns:
    the NamedSharding with memory_kind set to "pinned_host"
  """
  # Log with the correct module name: this helper lives in maxtext_utils_nnx.py,
  # not max_utils.py (the original message misattributed the source file).
  max_logging.log(f"maxtext_utils_nnx.py: Moving {path} to host")
  # Create the new sharding with the target memory kind
  return x.with_memory_kind(kind="pinned_host")
98+
99+
100+
def create_nnx_sharded_model(
    abstract_model: nnx.Module,
    init_fn: Callable,
    mesh: Mesh | None = None,
    named_sharding: Any | None = None,
) -> nnx.Module:
  """
  Create the model with the given sharding.

  Initializes the model via ``init_fn`` inside a jitted function whose
  ``out_shardings`` match the target sharding, so parameters are materialized
  sharded from the start.

  Args:
    abstract_model: the abstract model; when ``mesh`` is None it must expose a
      ``.mesh`` attribute
    init_fn: the model init function; called with no arguments and must return
      an initialized model instance
    mesh: the device mesh
    named_sharding: the given sharding; when None it is read from the abstract
      model's state leaves

  Returns:
    The initialized sharded model
  """
  # Separate static structure (graphdef) from the abstract state so the
  # initialized state can be merged back into the same structure at the end.
  graphdef, abstract_state = nnx.split(abstract_model)
  if named_sharding is None:
    # The state leaf is of type jax.ShapeDtypeStruct(shape, dtype, sharding)
    # we get the sharding directly from it.
    named_sharding = get_named_sharding_nnx(abstract_state)

  if mesh is None:
    mesh = abstract_model.mesh

  # JIT a function that creates the model state with proper sharding from the start.
  # By providing out_shardings, we instruct JAX to produce sharded output directly,
  # avoiding a large intermediate allocation on a single device.
  @partial(jax.jit, out_shardings=named_sharding)
  def create_sharded_state():
    model = init_fn()
    # Constrain the in-flight state too, so intermediates inside the trace
    # follow the target sharding rather than only the jit outputs.
    return jax.lax.with_sharding_constraint(nnx.state(model), named_sharding)

  # Create the model with sharded parameters.
  # set_mesh makes the mesh current for the jitted initialization call.
  with jax.set_mesh(mesh):
    sharded_state = create_sharded_state()
  return nnx.merge(graphdef, sharded_state)

src/MaxText/model_creation_utils.py

Lines changed: 18 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -21,11 +21,12 @@
2121
from flax import nnx
2222
import flax.linen as nn
2323
import jax
24-
from jax.sharding import Mesh, AxisType
24+
from jax.sharding import Mesh
2525
from MaxText import maxtext_utils
26+
from MaxText import maxtext_utils_nnx
2627
from MaxText import pyconfig
2728
from MaxText.layers import quantizations
28-
from MaxText.common_types import MODEL_MODE_TRAIN, ShardMode
29+
from MaxText.common_types import MODEL_MODE_TRAIN
2930
from MaxText.layers import models
3031
from orbax import checkpoint as ocp
3132
from functools import partial
@@ -39,6 +40,7 @@ def from_config(
3940
mesh: Mesh | None = None,
4041
*,
4142
model_mode: str = MODEL_MODE_TRAIN,
43+
rngs: None = None,
4244
) -> nn.Module:
4345
...
4446

@@ -79,15 +81,7 @@ def from_config(
7981
model = from_config(config)
8082
"""
8183
if mesh is None:
82-
devices_array = maxtext_utils.create_device_mesh(config, devices)
83-
84-
if config.shard_mode == ShardMode.EXPLICIT:
85-
axis_types = tuple([AxisType.Explicit] * len(config.mesh_axes))
86-
else:
87-
axis_types = tuple([AxisType.Auto] * len(config.mesh_axes))
88-
89-
mesh = Mesh(devices_array, config.mesh_axes, axis_types=axis_types)
90-
84+
mesh = maxtext_utils.get_mesh_from_config(config, devices)
9185
model = create_model(config, mesh, model_mode=model_mode, rngs=rngs)
9286

9387
# Return only the model
@@ -113,16 +107,10 @@ def create_model(config, mesh, model_mode: str = MODEL_MODE_TRAIN, rngs: nnx.Rng
113107

114108
def create_nnx_model(config, mesh=None, devices=None, model_mode=MODEL_MODE_TRAIN, rng_key=None):
115109
"""Creates a NNX model with sharded parameters, possibly loading from a checkpoint."""
110+
is_training = model_mode == MODEL_MODE_TRAIN
116111

117112
def _create_model(mesh: Mesh | None = None, model_mode: str = MODEL_MODE_TRAIN, rng_key: jax.Array | None = None):
118-
if rng_key is None:
119-
rng_key = jax.random.PRNGKey(config.init_weights_seed)
120-
121-
if model_mode == MODEL_MODE_TRAIN:
122-
rngs = nnx.Rngs(params=rng_key, dropout=1)
123-
else:
124-
rngs = nnx.Rngs(params=rng_key) # disable dropout RNG for inference
125-
113+
rngs = maxtext_utils_nnx.create_nnx_rngs(config, is_training=is_training, rng_key=rng_key)
126114
return from_config(config, devices, mesh, rngs=rngs, model_mode=model_mode)
127115

128116
_create_model_partial = partial(_create_model, mesh=mesh, model_mode=model_mode, rng_key=rng_key)
@@ -135,6 +123,17 @@ def _create_model(mesh: Mesh | None = None, model_mode: str = MODEL_MODE_TRAIN,
135123
if mesh is None:
136124
mesh = abstract_model.mesh
137125

126+
# Note for pure_nnx:
127+
# Currently, the NNX model returned has a linen decoder wrapped to NNX. So it is not a pure NNX model and
128+
# we still need to use nn.logical_axis_rules(config.logical_axis_rules) to get the out sharding from the linen
129+
# LogicallyPartitioned structure.
130+
# In the future if the pure NNX model is used, with pure NNX's eager sharding, there will be no LogicallyPartitioned
131+
# structure in the abstract state and we can get the sharded state with the following code:
132+
# graphdef, state = nnx.get_abstract_model(_create_model_partial, mesh)
133+
# abstract_model = nnx.merge(graphdef, state)
134+
# model = maxtext_utils_nnx.create_nnx_sharded_model(abstract_model, _create_model_partial, mesh=mesh)
135+
# sharded_state = nnx.state(model)
136+
138137
# JIT a function that creates the model state with proper sharding from the start.
139138
# By providing out_shardings, we instruct JAX to produce sharded output directly,
140139
# avoiding a large intermediate allocation on a single device.

0 commit comments

Comments
 (0)