
Commit bf35003

jasont314, nazar-ospanov, zimo0110, and sanjay-adhikesaven committed
fix: preserve rebased PP/EP path compatibility and refresh artifacts
Co-authored-by: Nazar Ospanov <aimogenius@berkeley.edu>
Co-authored-by: Zoir Imomaliev <91550816+zimo0110@users.noreply.github.com>
Co-authored-by: Sanjay Adhikesaven <sanjay.adhikesaven1@gmail.com>
Signed-off-by: Jason Trinh <jasontrinh@berkeley.edu>
1 parent fa19503 commit bf35003

10 files changed

Lines changed: 248 additions & 84 deletions


checkpoints/optimized_training.jsonl

Lines changed: 12 additions & 50 deletions
Large diffs are not rendered by default.

examples/llm_finetune/nemotron/nemotron_nano_v3_pp_ep_squad.yaml

Lines changed: 3 additions & 1 deletion
@@ -110,7 +110,9 @@ validation_dataset:
   _target_: nemo_automodel.components.datasets.llm.squad.make_squad_dataset
   dataset_name: rajpurkar/squad
   split: validation
-  limit_dataset_samples: 64
+  # With dp=2 and local_batch_size=64, keep at least 128 samples so each DP rank
+  # gets a full local batch during validation (avoids PP microbatch shape mismatch).
+  limit_dataset_samples: 128
   seq_length: 1024
   padding: max_length
   truncation: true
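
The comment in this hunk encodes a general rule: validation needs at least dp_size × local_batch_size samples, or one DP rank receives a short batch and PP microbatch shapes stop matching across stages. A minimal sketch of the arithmetic (the function and argument names are illustrative, not from this repo):

    def min_validation_samples(dp_size: int, local_batch_size: int) -> int:
        # Smallest limit_dataset_samples that gives every DP rank a full local batch.
        return dp_size * local_batch_size

    # Config above: dp=2, local_batch_size=64 -> limit_dataset_samples >= 128.
    assert min_validation_samples(2, 64) == 128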

nemo_automodel/_transformers/auto_model.py

Lines changed: 11 additions & 2 deletions
@@ -39,12 +39,21 @@
 from transformers import (  # noqa: E402
     AutoModelForCausalLM,
     AutoModelForImageTextToText,
-    AutoModelForMultimodalLM,
     AutoModelForSequenceClassification,
     AutoModelForTextToWaveform,
     PreTrainedModel,
 )
-from transformers.initialization import no_init_weights  # noqa: E402
+try:  # noqa: E402
+    from transformers import AutoModelForMultimodalLM  # noqa: E402
+except ImportError:  # transformers<4.58
+    # Older transformers releases expose image-text multimodal auto-models
+    # under AutoModelForImageTextToText but not AutoModelForMultimodalLM.
+    AutoModelForMultimodalLM = AutoModelForImageTextToText
+try:  # noqa: E402
+    from transformers.initialization import no_init_weights  # noqa: E402
+except ImportError:  # transformers<4.58
+    from transformers.modeling_utils import no_init_weights  # noqa: E402
+
 
 from transformers.models.auto.auto_factory import _BaseAutoModelClass  # noqa: E402
 from transformers.utils import ContextManagers  # noqa: E402
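
Both guards use the same pattern: import the symbol from its new location, and on ImportError bind a compatible fallback to the same name so call sites stay version-agnostic. A standalone sketch of the idiom (the version boundary shown is the one the diff's own comments name):

    # Version-gated import with a compatibility alias. Downstream code can use
    # AutoModelForMultimodalLM unconditionally on either side of the boundary.
    try:
        from transformers import AutoModelForMultimodalLM
    except ImportError:  # transformers<4.58: no multimodal auto-class yet
        from transformers import AutoModelForImageTextToText as AutoModelForMultimodalLM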

nemo_automodel/_transformers/utils.py

Lines changed: 4 additions & 1 deletion
@@ -117,7 +117,10 @@ def _patch_special_tokens_pattern():
     lack CLS/SEP tokens end up with ``None`` IDs in the sequence, crashing
     ``pad()``.
     """
-    from transformers.tokenization_python import PreTrainedTokenizer
+    try:
+        from transformers.tokenization_python import PreTrainedTokenizer
+    except ModuleNotFoundError:  # transformers<5.x
+        from transformers.tokenization_utils import PreTrainedTokenizer
 
     _orig_init = PreTrainedTokenizer.__init__
 
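
With the right PreTrainedTokenizer located, the surrounding helper monkey-patches its __init__. A self-contained sketch of that wrap-and-replace idiom, using a stand-in class rather than the real tokenizer:

    import functools

    class Tokenizer:  # stand-in for PreTrainedTokenizer in this sketch
        def __init__(self, pad_token=None):
            self.pad_token = pad_token

    _orig_init = Tokenizer.__init__

    @functools.wraps(_orig_init)
    def _patched_init(self, *args, **kwargs):
        _orig_init(self, *args, **kwargs)
        # Post-init fixup: never leave an optional special token as None.
        if self.pad_token is None:
            self.pad_token = "<pad>"

    Tokenizer.__init__ = _patched_init
    assert Tokenizer().pad_token == "<pad>"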

nemo_automodel/components/distributed/pipelining/functional.py

Lines changed: 88 additions & 2 deletions
@@ -482,8 +482,94 @@ def split_model_into_stages(
     pp_rank = pp_mesh.get_local_rank()
     pp_size = pp_mesh.size()
     # Detect model structure
-    has_model_attr = hasattr(model, "model") and getattr(model, "model", None) is not None
-    has_backbone_attr = (not has_model_attr) and hasattr(model, "backbone") and getattr(model, "backbone", None) is not None
+    model_has_model_attr = hasattr(model, "model") and getattr(model, "model", None) is not None
+    model_has_backbone_attr = hasattr(model, "backbone") and getattr(model, "backbone", None) is not None
+
+    def _submodule_exists(module_root: nn.Module, module_fqn: str) -> bool:
+        if not module_fqn:
+            return True
+        try:
+            module_root.get_submodule(module_fqn)
+            return True
+        except Exception:
+            return False
+
+    def _normalize_stage_fqn_aliases(explicit_stages: list[list[str]]) -> list[list[str]]:
+        alias_suffixes = (
+            (".embeddings", ".embed_tokens"),
+            (".embed_tokens", ".embeddings"),
+            (".norm_f", ".norm"),
+            (".norm", ".norm_f"),
+        )
+        rewrites: list[tuple[str, str]] = []
+        normalized_stages: list[list[str]] = []
+        for stage_modules in explicit_stages:
+            normalized_stage: list[str] = []
+            for module_fqn in stage_modules:
+                normalized_fqn = module_fqn
+                if not _submodule_exists(model, normalized_fqn):
+                    for src_suffix, dst_suffix in alias_suffixes:
+                        if normalized_fqn.endswith(src_suffix):
+                            candidate = normalized_fqn[: -len(src_suffix)] + dst_suffix
+                            if _submodule_exists(model, candidate):
+                                rewrites.append((normalized_fqn, candidate))
+                                normalized_fqn = candidate
+                                break
+                normalized_stage.append(normalized_fqn)
+            normalized_stages.append(normalized_stage)
+
+        if rewrites:
+            # De-duplicate while preserving insertion order.
+            unique_rewrites = list(dict.fromkeys(rewrites))
+            logger.info(
+                "Rewriting pipeline stage FQN aliases for current model structure: %s",
+                ", ".join(f"{src}->{dst}" for src, dst in unique_rewrites),
+            )
+
+        return normalized_stages
+
+    # Normalize explicit stage FQNs to the model's actual root attribute.
+    if module_names_per_stage is not None:
+        uses_backbone_prefix = any(
+            module_fqn == "backbone" or module_fqn.startswith("backbone.")
+            for stage_modules in module_names_per_stage
+            for module_fqn in stage_modules
+        )
+        uses_model_prefix = any(
+            module_fqn == "model" or module_fqn.startswith("model.")
+            for stage_modules in module_names_per_stage
+            for module_fqn in stage_modules
+        )
+
+        if uses_backbone_prefix and not model_has_backbone_attr and model_has_model_attr:
+            logger.info("Rewriting pipeline stage FQNs from backbone.* to model.* for current model structure.")
+            module_names_per_stage = [
+                [
+                    ("model." + module_fqn[len("backbone.") :] if module_fqn.startswith("backbone.") else module_fqn)
+                    for module_fqn in stage_modules
+                ]
+                for stage_modules in module_names_per_stage
+            ]
+        elif uses_model_prefix and not model_has_model_attr and model_has_backbone_attr:
+            logger.info("Rewriting pipeline stage FQNs from model.* to backbone.* for current model structure.")
+            module_names_per_stage = [
+                [
+                    ("backbone." + module_fqn[len("model.") :] if module_fqn.startswith("model.") else module_fqn)
+                    for module_fqn in stage_modules
+                ]
+                for stage_modules in module_names_per_stage
+            ]
+
+        module_names_per_stage = _normalize_stage_fqn_aliases(module_names_per_stage)
+
+    prefer_backbone_attr = False
+    if module_names_per_stage is not None:
+        prefer_backbone_attr = any(
+            module_fqn.startswith("backbone.") for stage_modules in module_names_per_stage for module_fqn in stage_modules
+        )
+
+    has_backbone_attr = model_has_backbone_attr and (prefer_backbone_attr or not model_has_model_attr)
+    has_model_attr = model_has_model_attr and not has_backbone_attr
     if has_backbone_attr:
         text_model = model.backbone
         text_model_attr_name = ""
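
The alias table handles models whose stage plans and module trees disagree on leaf names (embeddings vs embed_tokens, norm_f vs norm). The probe is nn.Module.get_submodule, which raises AttributeError when an FQN does not resolve. A standalone sketch of the probe-and-alias step with a toy model:

    import torch.nn as nn

    ALIASES = ((".embeddings", ".embed_tokens"), (".norm_f", ".norm"))

    def resolve_fqn(root: nn.Module, fqn: str) -> str:
        # Return fqn if it resolves; otherwise the first alias that does.
        def exists(name: str) -> bool:
            try:
                root.get_submodule(name)
                return True
            except AttributeError:
                return False
        if exists(fqn):
            return fqn
        for src, dst in ALIASES:
            if fqn.endswith(src) and exists(fqn[: -len(src)] + dst):
                return fqn[: -len(src)] + dst
        return fqn  # leave unresolved names untouched

    class Text(nn.Module):
        def __init__(self):
            super().__init__()
            self.embed_tokens = nn.Embedding(8, 4)

    class Model(nn.Module):
        def __init__(self):
            super().__init__()
            self.model = Text()

    # A stage plan written against `.embeddings` still finds the real module:
    assert resolve_fqn(Model(), "model.embeddings") == "model.embed_tokens"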

nemo_automodel/components/distributed/pipelining/hf_utils.py

Lines changed: 26 additions & 8 deletions
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 import logging
+import inspect
 import types
 from typing import TYPE_CHECKING, Callable, Optional, Union
 
@@ -25,7 +26,7 @@
 logger = logging.getLogger(__name__)
 
 # Constants for identifying text/language modules in multimodal models
-TEXT_MODULE_ATTRS = ("language_model", "text_model", "text_decoder")
+TEXT_MODULE_ATTRS = ("language_model", "text_model", "text_decoder", "backbone")
 MULTIMODAL_SUFFIXES = (
     "vision_tower",
     "visual",
@@ -127,7 +128,7 @@ def pipeline_forward(
        causal_mask = (
            self._update_causal_mask(attention_mask, inputs_embeds, cache_position)
            if hasattr(self, "_update_causal_mask")
-            else attention_mask
+            else None
        )
        mamba_mask = (
            self._update_mamba_mask(attention_mask, cache_position)
@@ -142,12 +143,29 @@
            layer_mask = causal_mask if pp_needs_attention_mask else None
        else:
            layer_mask = None
-        hidden_states = mixer_block(
-            hidden_states,
-            cache_params=past_key_values,
-            cache_position=cache_position if pp_needs_cache_position else None,
-            attention_mask=layer_mask,
-        )
+        # Some NemotronH-like blocks (e.g., local NemotronV3Block) do not accept
+        # cache kwargs, while HF NemotronH blocks do. Use signature-aware dispatch.
+        signature_owner = getattr(mixer_block, "_checkpoint_wrapped_module", mixer_block)
+        supports_cache_params = getattr(signature_owner, "_nemo_pp_supports_cache_params", None)
+        supports_cache_position = getattr(signature_owner, "_nemo_pp_supports_cache_position", None)
+        if supports_cache_params is None or supports_cache_position is None:
+            try:
+                forward_params = inspect.signature(signature_owner.forward).parameters
+                supports_cache_params = "cache_params" in forward_params
+                supports_cache_position = "cache_position" in forward_params
+            except (TypeError, ValueError):
+                supports_cache_params = True
+                supports_cache_position = True
+            setattr(signature_owner, "_nemo_pp_supports_cache_params", supports_cache_params)
+            setattr(signature_owner, "_nemo_pp_supports_cache_position", supports_cache_position)
+
+        block_kwargs = {"attention_mask": layer_mask}
+        if supports_cache_params:
+            block_kwargs["cache_params"] = past_key_values
+        if supports_cache_position:
+            block_kwargs["cache_position"] = cache_position if pp_needs_cache_position else None
+
+        hidden_states = mixer_block(hidden_states, **block_kwargs)
    else:
        # Attention mask handling (compilation-friendly):
        # causal_mask_mapping should be precomputed in data pipeline via default_collater
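
The probe-and-cache dispatch generalizes: inspect the callee's signature once, remember the result, and only pass kwargs it accepts. A standalone sketch of the same technique (the helper name is illustrative, not from this repo):

    import inspect

    def call_with_supported_kwargs(fn, *args, **kwargs):
        # Drop kwargs the callee does not accept; cache the probe on the callee.
        if not hasattr(fn, "_supported_kwargs"):
            try:
                fn._supported_kwargs = frozenset(inspect.signature(fn).parameters)
            except (TypeError, ValueError):
                fn._supported_kwargs = None  # unintrospectable: pass everything
        allowed = fn._supported_kwargs
        if allowed is None:
            return fn(*args, **kwargs)
        return fn(*args, **{k: v for k, v in kwargs.items() if k in allowed})

    def new_block(x, attention_mask=None, cache_params=None):
        return x

    def old_block(x, attention_mask=None):  # predates the cache kwargs
        return x

    call_with_supported_kwargs(new_block, 0, attention_mask=None, cache_params=[])
    call_with_supported_kwargs(old_block, 0, attention_mask=None, cache_params=[])  # cache_params silently dropped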

nemo_automodel/components/models/nemotron_v3/model.py

Lines changed: 10 additions & 5 deletions
@@ -160,12 +160,15 @@ def initialize_weights(self, buffer_device: torch.device | None = None) -> None:
         """
         # Embedding weights: normal initialization
         with buffer_device:
-            nn.init.normal_(self.embed_tokens.weight, mean=0.0, std=self.config.initializer_range)
-            self.norm.reset_parameters()
+            if self.embed_tokens is not None and getattr(self.embed_tokens, "weight", None) is not None:
+                nn.init.normal_(self.embed_tokens.weight, mean=0.0, std=self.config.initializer_range)
+            if self.norm is not None and hasattr(self.norm, "reset_parameters"):
+                self.norm.reset_parameters()
 
         # Initialize all layers via delegation
         for block in self.layers.values():
-            block.init_weights(buffer_device=buffer_device)
+            if block is not None:
+                block.init_weights(buffer_device=buffer_device)
 
 
 class NemotronHForCausalLM(HFCheckpointingMixin, nn.Module, MoEFSDPSyncMixin):
@@ -307,8 +310,10 @@ def initialize_weights(
         """
         buffer_device = buffer_device or torch.device(f"cuda:{torch.cuda.current_device()}")
         with buffer_device:
-            self.model.initialize_weights(buffer_device=buffer_device)
-            nn.init.normal_(self.lm_head.weight, mean=0.0, std=self.config.initializer_range)
+            if self.model is not None:
+                self.model.initialize_weights(buffer_device=buffer_device)
+            if self.lm_head is not None and getattr(self.lm_head, "weight", None) is not None:
+                nn.init.normal_(self.lm_head.weight, mean=0.0, std=self.config.initializer_range)
 
         self.to(dtype)
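
These guards exist because pipeline parallelism prunes modules per stage: typically only the first stage keeps embed_tokens and only the last keeps lm_head, with None left behind elsewhere, so unconditional initialization would raise on most ranks. A toy sketch of the failure mode (a hypothetical two-stage split, not this repo's classes):

    import torch.nn as nn

    class Stage(nn.Module):
        # Toy PP stage: modules not owned by this stage are replaced with None.
        def __init__(self, is_first: bool, is_last: bool):
            super().__init__()
            self.embed_tokens = nn.Embedding(8, 4) if is_first else None
            self.layers = nn.ModuleList([nn.Linear(4, 4)])
            self.lm_head = nn.Linear(4, 8, bias=False) if is_last else None

        def initialize_weights(self):
            if self.embed_tokens is not None:
                nn.init.normal_(self.embed_tokens.weight, std=0.02)
            if self.lm_head is not None:
                nn.init.normal_(self.lm_head.weight, std=0.02)

    # The last stage owns no embeddings; the guard keeps this from raising.
    Stage(is_first=False, is_last=True).initialize_weights()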

nemo_automodel/components/moe/parallelizer.py

Lines changed: 65 additions & 12 deletions
@@ -47,6 +47,39 @@ def _get_cp_stream() -> torch.cuda.Stream:
     return _CP_STREAM
 
 
+def _resolve_text_layer_container(model: nn.Module) -> nn.Module:
+    """Return the module that owns transformer `layers` for block-wise iteration."""
+    if hasattr(model, "layers") and model.layers is not None:
+        return model
+
+    # Try common nested containers first.
+    for attr_name in ("backbone", "model", "language_model", "text_model", "text_decoder"):
+        if hasattr(model, attr_name):
+            nested = getattr(model, attr_name)
+            if nested is None:
+                continue
+            if hasattr(nested, "layers") and nested.layers is not None:
+                return nested
+            nested_text = get_text_module(nested)
+            if hasattr(nested_text, "layers") and nested_text.layers is not None:
+                return nested_text
+
+    # Fallback: search any nested submodule exposing `layers`.
+    for _, submod in model.named_modules():
+        if submod is model:
+            continue
+        if hasattr(submod, "layers") and submod.layers is not None:
+            return submod
+
+    child_names = list(model._modules.keys()) if hasattr(model, "_modules") else []
+    has_backbone = hasattr(model, "backbone") and getattr(model, "backbone") is not None
+    has_model_attr = hasattr(model, "model") and getattr(model, "model") is not None
+    raise AttributeError(
+        "Could not find a module with `layers` under "
+        f"{type(model).__name__} (children={child_names[:24]}, has_backbone={has_backbone}, has_model={has_model_attr})"
+    )
+
+
 class ExpertParallel(ParallelStyle):
     """
     ExpertParallel class is used to shard the MoE parameters on the EP mesh.
@@ -83,8 +116,9 @@ def apply_ep(model: nn.Module, ep_mesh: DeviceMesh, moe_mesh: DeviceMesh | None
     _model = model
     # Prefer nested text modules when present
     _model = get_text_module(_model)
+    _layer_container = _resolve_text_layer_container(_model)
 
-    for _, block in _model.layers.named_children():
+    for _, block in _layer_container.layers.named_children():
         moe_module = block.moe if hasattr(block, "moe") else block.mlp
         if isinstance(moe_module, MoE):
             # GroupedExpertsTEGroupedLinear uses TE's GroupedLinear which creates
@@ -145,15 +179,17 @@ def selective_checkpointing_context_fn():
         _model = model.model
     else:
         _model = model
-    for layer_id, block in _model.layers.named_children():
+    _model = get_text_module(_model)
+    _layer_container = _resolve_text_layer_container(_model)
+    for layer_id, block in _layer_container.layers.named_children():
         if ignore_router:
             block = ptd_checkpoint_wrapper(
                 block, preserve_rng_state=True, context_fn=selective_checkpointing_context_fn
             )
         else:
             block = ptd_checkpoint_wrapper(block, preserve_rng_state=True)
 
-        _model.layers.register_module(layer_id, block)
+        _layer_container.layers.register_module(layer_id, block)
 
 
 def apply_fsdp(
@@ -193,8 +229,9 @@ def apply_fsdp(
     _model = model
     # handle VLM
     _model = get_text_module(_model)
+    _layer_container = _resolve_text_layer_container(_model)
 
-    for _, block in _model.layers.named_children():
+    for _, block in _layer_container.layers.named_children():
         moe_module = block.moe if hasattr(block, "moe") else block.mlp
         if isinstance(moe_module, MoE) and ep_shard_enabled:
             # Apply FSDP on dim=1 for grouped experts since we may have more
@@ -217,8 +254,8 @@ def apply_fsdp(
 
         fully_shard_default(block, ignored_params=ignored_params)
 
-    if hasattr(_model, "embed_tokens") and _model.embed_tokens is not None:
-        fully_shard_default(_model.embed_tokens)
+    if hasattr(_layer_container, "embed_tokens") and _layer_container.embed_tokens is not None:
+        fully_shard_default(_layer_container.embed_tokens)
 
     lm_head = getattr(_model, "lm_head", None) or getattr(model, "lm_head", None)
     if lm_head is not None:
@@ -252,7 +289,7 @@ def apply_fsdp(
     else:
         logging.info("Skipping FSDP wrap for frozen visual tower")
 
-    fully_shard_default(_model)
+    fully_shard_default(_layer_container)
 
     # If model has a nested structure (outer model wrapping inner _model), wrap the outer model if requested
     if wrap_outer_model and model is not _model:
@@ -266,8 +303,10 @@ def apply_cp(model: torch.nn.Module, cp_mesh: DeviceMesh, cp_comm_type: str = "p
         _model = model.model
     else:
         _model = model
+    _model = get_text_module(_model)
+    _layer_container = _resolve_text_layer_container(_model)
 
-    for _, block in _model.layers.named_children():
+    for _, block in _layer_container.layers.named_children():
         attn_module = block.self_attn.attn_module
         assert isinstance(attn_module, DotProductAttention), (
             "Context parallelism is only supported for TransformerEngine's DotProductAttention"
@@ -307,10 +346,24 @@ def parallelize_model(
 
     ep_enabled = ep_axis_name is not None and moe_mesh is not None and moe_mesh[ep_axis_name].size() > 1
     if ep_enabled:
-        assert model.model.moe_config.n_routed_experts % moe_mesh[ep_axis_name].size() == 0, (
-            f"n_routed_experts {model.model.moe_config.n_routed_experts} must be divisible by "
-            f"expert_parallel_degree {moe_mesh[ep_axis_name].size()}"
-        )
+        _model = model.model if hasattr(model, "model") and model.model is not None else model
+        _model = get_text_module(_model)
+        n_routed_experts = None
+        if hasattr(_model, "moe_config") and _model.moe_config is not None:
+            n_routed_experts = getattr(_model.moe_config, "n_routed_experts", None)
+        if n_routed_experts is None and hasattr(_model, "config"):
+            for attr in ("n_routed_experts", "moe_num_experts", "num_experts"):
+                if hasattr(_model.config, attr):
+                    n_routed_experts = getattr(_model.config, attr)
+                    break
+
+        if n_routed_experts is not None:
+            assert n_routed_experts % moe_mesh[ep_axis_name].size() == 0, (
+                f"n_routed_experts {n_routed_experts} must be divisible by "
+                f"expert_parallel_degree {moe_mesh[ep_axis_name].size()}"
+            )
+        else:
+            logger.warning("Could not infer n_routed_experts; skipping EP divisibility assertion.")
 
         apply_ep(model, moe_mesh[ep_axis_name], moe_mesh=moe_mesh)
 
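
Most of this file now funnels through _resolve_text_layer_container, which makes the EP/FSDP/CP loops agnostic to whether blocks live at model.layers (HF decoder style) or model.backbone.layers (Mamba/NemotronH style). A compact standalone sketch of that search order with toy modules (the helper name here is illustrative):

    import torch.nn as nn

    def find_layer_container(model: nn.Module) -> nn.Module:
        # Check the root, then common nested roots, then any submodule.
        candidates = [model] + [
            getattr(model, name) for name in ("backbone", "model", "language_model")
            if getattr(model, name, None) is not None
        ]
        for cand in candidates:
            if getattr(cand, "layers", None) is not None:
                return cand
        for _, sub in model.named_modules():
            if sub is not model and getattr(sub, "layers", None) is not None:
                return sub
        raise AttributeError(f"No `layers` container under {type(model).__name__}")

    class Backbone(nn.Module):
        def __init__(self):
            super().__init__()
            self.layers = nn.ModuleList([nn.Linear(4, 4)])

    class MambaLM(nn.Module):
        def __init__(self):
            super().__init__()
            self.backbone = Backbone()

    m = MambaLM()
    assert find_layer_container(m) is m.backbone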
