
Commit 669dc01

[WIP] NNX: fix model and test compatibility issues
- Replace nn.Dropout with linears.Dropout in the gpt_oss and olmo3 decoder layers
- Add a num_activations logical axis rule to base.yml
- Fix integration and unit tests for NNX compatibility

I will relocate these files accordingly once the work is done.
1 parent 3694725 commit 669dc01

19 files changed

Lines changed: 90 additions & 33 deletions
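
The model-side change follows the NNX convention of building submodules eagerly in __init__ instead of instantiating them inline during __call__, as Linen allows. A minimal before/after sketch, using flax.nnx.Dropout directly on the assumption that maxtext's linears.Dropout exposes the same interface:

from flax import nnx

class DecoderLayer(nnx.Module):

  def __init__(self, dropout_rate: float, *, rngs: nnx.Rngs):
    # NNX style: the dropout module and its RNG state are created once,
    # when the layer is built, and reused on every call.
    self.dropout = nnx.Dropout(rate=dropout_rate, broadcast_dims=(-2,), rngs=rngs)

  def __call__(self, x, deterministic: bool):
    # Linen style (what this commit removes) would create the module
    # inline on each call:
    #   nn.Dropout(rate=cfg.dropout_rate, broadcast_dims=(-2,))(x, deterministic=deterministic)
    return self.dropout(x, deterministic=deterministic)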

src/maxtext/configs/base.yml

Lines changed: 1 addition & 0 deletions
@@ -497,6 +497,7 @@ logical_axis_rules: [
   ['paged_kv_head_dim_size', []],
   ['dense_layers', []],
   ['moe_layers', []],
+  ['num_activations', []],
   ['engram_dim', ['tensor']],
   ['mhc', []],
   ['diloco', 'diloco'],
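
Each logical_axis_rules entry pairs a logical axis name with the mesh axes it may shard over; an empty list leaves that dimension replicated, which is what the new num_activations rule does. A hedged sketch of how such rules resolve, using flax's logical partitioning helper with None standing in for the YAML's [] (MaxText is assumed to lower [] to an unsharded mapping):

from flax import linen as nn

rules = (("num_activations", None), ("engram_dim", "tensor"))
spec = nn.logical_to_mesh_axes(("num_activations", "engram_dim"), rules)
# -> PartitionSpec(None, 'tensor'): 'num_activations' stays replicated,
#    while 'engram_dim' shards across the 'tensor' mesh axis.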

src/maxtext/models/gpt_oss.py

Lines changed: 4 additions & 1 deletion
@@ -28,6 +28,7 @@
 from maxtext.common.common_types import AttentionType, Config
 from maxtext.layers import attentions
 from maxtext.layers import initializers
+from maxtext.layers import linears
 from maxtext.layers import moe
 from maxtext.layers import nnx_wrappers
 from maxtext.layers import quantizations
@@ -130,6 +131,8 @@ def __init__(
         rngs=rngs,
     )
 
+    self.dropout = linears.Dropout(rate=config.dropout_rate, broadcast_dims=(-2,), rngs=rngs)
+
   def __call__(
       self,
       inputs,
@@ -181,7 +184,7 @@ def __call__(
     mlp_lnx = nn.with_logical_constraint(mlp_lnx, ("activation_batch", "activation_norm_length", "activation_embed"))
 
     layer_output = mlp_lnx + intermediate_inputs
-    layer_output = nn.Dropout(rate=cfg.dropout_rate, broadcast_dims=(-2,))(layer_output, deterministic=deterministic)
+    layer_output = self.dropout(layer_output, deterministic=deterministic)
 
     layer_output = nn.with_logical_constraint(
         layer_output,
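
The refactor keeps broadcast_dims=(-2,), which shares one dropout mask along the sequence axis. A small sketch of the effect, again via flax.nnx.Dropout under the same interface assumption:

import jax.numpy as jnp
from flax import nnx

drop = nnx.Dropout(rate=0.5, broadcast_dims=(-2,), rngs=nnx.Rngs(0))
x = jnp.ones((2, 16, 8))  # (batch, sequence, embed)
y = drop(x, deterministic=False)
# The internal mask has shape (2, 1, 8): each (batch, embed) coordinate
# is kept or dropped uniformly across all 16 sequence positions.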

src/maxtext/models/olmo3.py

Lines changed: 4 additions & 1 deletion
@@ -29,6 +29,7 @@
 from maxtext.common.common_types import AttentionType, Config
 from maxtext.layers import attentions
 from maxtext.layers import initializers
+from maxtext.layers import linears
 from maxtext.layers import nnx_wrappers
 from maxtext.layers import quantizations
 from maxtext.layers.attentions import Attention
@@ -140,6 +141,8 @@ def __init__(
         rngs=rngs,
     )
 
+    self.dropout = linears.Dropout(rate=config.dropout_rate, broadcast_dims=(-2,), rngs=rngs)
+
   def __call__(
       self,
       inputs,
@@ -193,7 +196,7 @@ def __call__(
     mlp_lnx = nn.with_logical_constraint(mlp_lnx, ("activation_batch", "activation_norm_length", "activation_embed"))
 
     layer_output = mlp_lnx + intermediate_inputs
-    layer_output = nn.Dropout(rate=cfg.dropout_rate, broadcast_dims=(-2,))(layer_output, deterministic=deterministic)
+    layer_output = self.dropout(layer_output, deterministic=deterministic)
 
     layer_output = nn.with_logical_constraint(
         layer_output,

src/maxtext/trainers/post_train/sft/train_sft.py

Lines changed: 4 additions & 2 deletions
@@ -47,8 +47,6 @@
 
 from orbax import checkpoint as ocp
 
-from tunix.sft import metrics_logger, peft_trainer, profiler
-
 from maxtext.configs import pyconfig
 from maxtext.trainers.pre_train.train import loss_fn
 from maxtext.common.goodput import (
@@ -74,6 +72,8 @@ def get_tunix_config(mt_config):
   Returns:
     A Tunix `TrainingConfig` object.
   """
+  from tunix.sft import metrics_logger, peft_trainer, profiler  # pylint: disable=g-import-not-at-top
+
   # Checkpointing configurations
   checkpointing_options = ocp.CheckpointManagerOptions(
       save_interval_steps=mt_config.checkpoint_period,
@@ -140,6 +140,8 @@ def loss_func(model, inputs, inputs_position, inputs_segmentation, targets, targ
 
 def setup_trainer_state(mt_config, goodput_recorder=None):
   """Set up prerequisites for training loop."""
+  from tunix.sft import peft_trainer  # pylint: disable=g-import-not-at-top
+
   tunix_config = get_tunix_config(mt_config)
 
   with maybe_record_goodput(goodput_recorder, GoodputEvent.TPU_INIT):
tests/integration/aot_identical_test.py

Lines changed: 1 addition & 0 deletions
@@ -179,6 +179,7 @@ def assert_compile_and_real_match_jaxpr(self, test_name, *extra_args):
         "enable_checkpointing=False",
         "dump_jaxpr=True",
         "dump_jaxpr_delete_local_after=False",
+        "skip_first_n_steps_for_profiler=0",
     ]
     if extra_args:
       shared_args.extend(extra_args)

tests/integration/checkpointing_test.py

Lines changed: 7 additions & 6 deletions
@@ -93,6 +93,7 @@ def get_checkpointing_command(run_date, hardware, steps, metrics_file, attention
       f"dataset_type={dataset_type}",
       "async_checkpointing=False",
       f"attention={attention_type}",
+      "profiler=''",
   ]
   + model_params
   + pathways_command
@@ -135,19 +136,19 @@ def run_checkpointing(hardware, attention_type):
   # Determine dataset path/pattern depending on decoupled mode.
   gcsfuse_pattern = "/tmp/gcsfuse/array-record/c4/en/3.0.1/c4-train.array_record*"
   local_decoupled_root = os.path.join(
-      MAXTEXT_PKG_DIR, "..", "tests", "assets", "local_datasets", "c4_en_dataset_minimal", "c4", "en", "3.0.1"
+      MAXTEXT_PKG_DIR, "..", "..", "tests", "assets", "local_datasets", "c4_en_dataset_minimal", "c4", "en", "3.0.1"
   )
   local_pattern = os.path.join(local_decoupled_root, "c4-train.array_record*")
   selected_pattern = gcsfuse_pattern
   dataset_path = "/tmp/gcsfuse"
 
-  if is_decoupled():
+  if not glob.glob(gcsfuse_pattern):
     # Prefer local minimal dataset if gcsfuse data absent
-    if not glob.glob(gcsfuse_pattern) and glob.glob(local_pattern):
+    if glob.glob(local_pattern):
       selected_pattern = local_pattern
-      dataset_path = os.path.join(MAXTEXT_PKG_DIR, "..", "tests", "assets", "local_datasets")
-    elif not glob.glob(gcsfuse_pattern) and not glob.glob(local_pattern):
-      pytest.skip("No grain ArrayRecord shards found for checkpointing test in decoupled mode.")
+      dataset_path = os.path.join(MAXTEXT_PKG_DIR, "..", "..", "tests", "assets", "local_datasets")
+    else:
+      pytest.skip("No grain ArrayRecord shards found for checkpointing test.")
 
   grain_command = [
       "grain_worker_count=0",

tests/integration/decode_tests.py

Lines changed: 6 additions & 0 deletions
@@ -49,6 +49,8 @@ class DecodeTests(unittest.TestCase):
         "max_target_length=128",
         "per_device_batch_size=1",
         rf"tokenizer_path={os.path.join(MAXTEXT_ASSETS_ROOT, 'tokenizers', 'tokenizer.llama2')}",
+        "profiler=''",
+        "pure_nnx=False",
     ],
     "int8": [  # tests decode with int8 quantization
         None,
@@ -64,6 +66,8 @@ class DecodeTests(unittest.TestCase):
         "quantization=int8",
         "quantize_kvcache=True",
         rf"tokenizer_path={os.path.join(MAXTEXT_ASSETS_ROOT, 'tokenizers', 'tokenizer.llama2')}",
+        "profiler=''",
+        "pure_nnx=False",
     ],
     "pdb_lt_1": [  # tests decode with per_device_batch_size < 1
         None,
@@ -77,6 +81,8 @@ class DecodeTests(unittest.TestCase):
         "max_target_length=128",
         "per_device_batch_size=.25",
         rf"tokenizer_path={os.path.join(MAXTEXT_ASSETS_ROOT, 'tokenizers', 'tokenizer.llama2')}",
+        "profiler=''",
+        "pure_nnx=False",
     ],
     "decode_sampling": [
         None,

tests/integration/generate_param_only_checkpoint_test.py

Lines changed: 3 additions & 0 deletions
@@ -54,6 +54,8 @@ def run_e2e_test_flow(hardware, model_config, attention_type="autoselected", sta
       f"attention={attention_type}",
       "max_target_length=128",
       "per_device_batch_size=1",
+      "profiler=''",
+      "pure_nnx=False",
   ] + model_config
 
   pathways_command = []
@@ -72,6 +74,7 @@ def run_e2e_test_flow(hardware, model_config, attention_type="autoselected", sta
           dataset_type="tfds",
           dataset_path=dataset_path,
       )
+      + ["pure_nnx=False"]
   )
   state_path = f"{base_output_directory}/runner_{run_date}/checkpoints/0/items"

tests/integration/gradient_accumulation_test.py

Lines changed: 5 additions & 6 deletions
@@ -28,9 +28,9 @@
 from maxtext.common.gcloud_stub import is_decoupled
 from maxtext.trainers.pre_train.train import main as train_main
 from maxtext.utils.globals import MAXTEXT_ASSETS_ROOT
-from maxtext.trainers.post_train.sft.train_sft_deprecated import main as sft_main
+from maxtext.trainers.post_train.sft.train_sft import main as sft_main
 
-from tests.utils.test_helpers import get_test_config_path, get_test_dataset_path, get_test_base_output_directory
+from tests.utils.test_helpers import get_test_config_path, get_test_dataset_path, get_test_base_output_directory, get_post_train_test_config_path
 
 
 def generate_random_string(length=10):
@@ -151,9 +151,8 @@ def test_sft_grad_accumulate_same_loss(self):
     sft_main(
         [
             None,
-            get_test_config_path(),
-            "base_output_directory=gs://runner-maxtext-logs",
-            "dataset_path=gs://maxtext-dataset",
+            get_post_train_test_config_path("sft"),
+            f"base_output_directory={self.base_output_directory}",
             "gradient_clipping_threshold=0",  # Ensures we are testing raw scales of gradients (clipping off).
             "enable_checkpointing=False",
             "enable_goodput_recording=False",
@@ -162,6 +161,6 @@ def test_sft_grad_accumulate_same_loss(self):
             rf"tokenizer_path={os.path.join(MAXTEXT_ASSETS_ROOT, 'tokenizers', 'tokenizer.llama2')}",
             "steps=3",
             "gradient_accumulation_steps=2",
-            "use_sft=True",
+            "dataset_type=synthetic",
         ]
     )

tests/integration/smoke/inference_microbenchmark_smoke_test.py

Lines changed: 3 additions & 0 deletions
@@ -53,6 +53,9 @@ def test(self):
         "weight_dtype=bfloat16",
         "attention=dot_product",
         "skip_jax_distributed_system=True",
+        "profiler=''",
+        "pure_nnx=False",
+        "enable_nnx=False",
         ]
     )
     run_benchmarks(config)
