
Commit 2aadb35

Extend FSDP2 unit tests to include DCP checkpointing and parity tests.

Signed-off-by: Cory Ye <cye@nvidia.com>

1 parent 018543c
6 files changed: 179 additions & 19 deletions
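The new test exercises PyTorch Distributed Checkpoint (DCP) through a `Stateful` wrapper that bundles model and optimizer state, mirroring the `AppState` class in the diff below. Here is a minimal single-process sketch of that save/load flow (DCP also runs without an initialized process group); the `nn.Linear` stand-in and the `/tmp/dcp-sketch` path are illustrative assumptions, not taken from the commit:

import torch
import torch.nn as nn
import torch.distributed.checkpoint as dcp
from torch.distributed.checkpoint.state_dict import get_state_dict, set_state_dict
from torch.distributed.checkpoint.stateful import Stateful


class AppState(Stateful):
    """Bundle model + optimizer so DCP saves/loads them as one unit."""

    def __init__(self, model, optimizer):
        self.model = model
        self.optimizer = optimizer

    def state_dict(self):
        # get_state_dict returns checkpoint-friendly model/optimizer state,
        # handling FSDP/DTensor parameters transparently.
        model_sd, optim_sd = get_state_dict(self.model, self.optimizer)
        return {"model": model_sd, "optim": optim_sd}

    def load_state_dict(self, state_dict):
        set_state_dict(
            self.model,
            self.optimizer,
            model_state_dict=state_dict["model"],
            optim_state_dict=state_dict["optim"],
        )


model = nn.Linear(8, 8)  # hypothetical stand-in for the FSDP2-sharded TE model
optimizer = torch.optim.Adam(model.parameters())
dcp.save({"app": AppState(model, optimizer)}, checkpoint_id="/tmp/dcp-sketch")
dcp.load({"app": AppState(model, optimizer)}, checkpoint_id="/tmp/dcp-sketch")

DCP calls `state_dict()` on save and `load_state_dict()` on load for any `Stateful` value in the top-level dictionary; the parity tests below rely on exactly this round trip.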


tests/pytorch/distributed/run_fsdp2_model.py

Lines changed: 157 additions & 7 deletions
@@ -4,14 +4,18 @@
 #
 # See LICENSE for license information.
 
+import argparse
 import os
 import sys
-import argparse
+import shutil
+from contextlib import nullcontext
+from copy import deepcopy
 from dataclasses import dataclass
+from pathlib import Path
 
 import transformer_engine.pytorch as te
 import transformer_engine.common.recipe
-
+from transformer_engine.pytorch import QuantizedTensor
 import torch
 import torch.distributed as dist
 from torch.distributed.checkpoint import save, load
@@ -27,11 +31,13 @@
 from torch.distributed import DeviceMesh
 from torch.distributed._composable.fsdp import fully_shard
 from torch.distributed.device_mesh import init_device_mesh
-from transformer_engine.pytorch import QuantizedTensor
-from contextlib import nullcontext
 
 LOCAL_RANK = None
 
+# Needed for `torch.distributed.checkpoint.{save,load}` because
+# multiple processes need to write to the same directory.
+SHARED_TMP_DIR = "/tmp/pytest-shared-tmp"
+
 
 @dataclass
 class AppState(Stateful):
@@ -63,7 +69,7 @@ def state_dict(self):
                 # yet get_state_dict / _init_optim_state produce empty Tensors.
                 # TransformerEngine uses empty Tensors for dummy Parameters.
                 optimizer_state_dict["state"][fqn] = {}
-            if fqn.endswith("._extra_state"):
+            if fqn.endswith("_extra_state"):
                 # Evict `_extra_state` quantization data from model checkpoint.
                 model_state_dict.pop(fqn)
         return {
@@ -352,7 +358,9 @@ def test_fp8_fsdp2_allgather(model):
     # FP32 manual weight allgather
     fp32_allgathered_params = {}
     for name, param in model.named_parameters():
-        assert isinstance(param, DTensor)
+        assert isinstance(
+            param, DTensor
+        ), f"[test_fp8_fsdp2_allgather] {param} should be a DTensor."
         local_tensor = param._local_tensor
         device_mesh = param.device_mesh
         dist_group = (
@@ -471,7 +479,7 @@ def _train(args):
     optimizer = optim.Adam(model.parameters(), lr=1e-3)
 
     """
-    Pre-Save Training
+    FSDP2 Training
     """
     for iteration in range(args.iter):
         # Zero the parameter gradients
@@ -499,6 +507,148 @@ def _train(args):
     if args.fp8_init:
         test_fp8_fsdp2_allgather(model)
 
+    """
+    DCP Checkpoint Testing
+    """
+    # Compute the pre-save model loss to the last random input
+    # with respect to the last random target.
+    model.eval()
+    with te.autocast(enabled=True, recipe=fp8_recipe):
+        output = model(input_data)
+        pre_save_loss = F.mse_loss(output, target)
+
+    # Save deep copy of the model and optimizer state before checkpointing.
+    # NOTE(@cspades): deepcopy has issues with DTensors. Just clone().
+    s1 = {}
+    for key, val in model.state_dict().items():
+        s1[key] = val.clone()
+    optim_state_dict = optimizer.state_dict()
+    o1 = {"state": {}}
+    for idx, state in optim_state_dict["state"].items():
+        o1_state = o1["state"].setdefault(idx, {})
+        for key, val in state.items():
+            o1_state[key] = val.clone()
+    o1["param_groups"] = deepcopy(optim_state_dict["param_groups"])
+
+    # Write model to checkpoint.
+    CKPT_DIR = (
+        Path(SHARED_TMP_DIR)
+        / "run_fsdp2_model"
+        / f"dcp-{'_'.join(str(x) for x in args.sharding_dims)}-{args.layer_type}-{args.recipe}-fp8_init_{args.fp8_init}"
+    )
+    CKPT_DIR.mkdir(parents=True, exist_ok=True, mode=0o777)
+    state_dict = {"app": AppState(model=model, optimizer=optimizer)}
+    torch.distributed.checkpoint.save(state_dict, checkpoint_id=str(CKPT_DIR))
+
+    # Perform an extra training step to change the weights such that
+    # state parity tests will fail unless the checkpoint is loaded
+    # without any errors or incongruities vs. the saved model state.
+    model.train()
+    for iteration in range(args.iter):
+        optimizer.zero_grad()
+        with (
+            torch.autocast(device_type="cuda", dtype=torch.bfloat16)
+            if args.recipe == "NVFP4BlockScaling"
+            else nullcontext()
+        ):
+            with te.autocast(enabled=True, recipe=fp8_recipe):
+                output = model(torch.randn(inp_shape).to(device))
+                loss = F.mse_loss(output, torch.randn(out_shape).to(device))
+        loss.backward()
+        optimizer.step()
+
+    # Load the checkpoint.
+    state_dict = {"app": AppState(model=model, optimizer=optimizer)}
+    torch.distributed.checkpoint.load(state_dict=state_dict, checkpoint_id=str(CKPT_DIR))
+
+    # FIXME(@cspades): DelayedScaling checkpointing has tiny uint8 parity issues
+    # that affects the dequantized model state. Only test loss parity.
+    if args.recipe != "DelayedScaling" and args.fp8_init:
+        # Validate checkpoint parity with pre-save state dictionaries.
+        # Compare pre-save and post-load model state dictionaries.
+        s2 = model.state_dict()
+        nonempty_model_state = False
+        for key in s1.keys() | s2.keys():
+            if key.endswith("_extra_state"):
+                # Don't parity test _extra_state. Shape can change after reset_parameters().
+                continue
+            v1 = s1.get(key, None)
+            if isinstance(v1, DTensor):
+                v1 = v1.to_local()
+            v2 = s2.get(key, None)
+            if isinstance(v2, DTensor):
+                v2 = v2.to_local()
+            assert (
+                v1 is not None and v2 is not None
+            ), f"[{key} Not Found] Original Param: {v1} | Checkpoint Param: {v2}"
+            assert (
+                v1.shape == v2.shape
+            ), f"[Checkpoint Param {key} Shape Mismatch] {v1.shape} != {v2.shape}"
+            assert torch.allclose(v1, v2), f"[Checkpoint Param {key} Value Mismatch] {v1} != {v2}"
+            nonempty_model_state = True
+        assert nonempty_model_state, "Model state should not be empty for evenly-sharded DTensors!"
+
+        # Compare pre-save and post-load optimizer state dictionaries.
+        o2 = optimizer.state_dict()
+        nonempty_optim_state = False
+        for param_id in o1["state"].keys() | o2["state"].keys():
+            param_state_1 = o1["state"].get(param_id, None)
+            param_state_2 = o2["state"].get(param_id, None)
+            assert param_state_1 is not None and param_state_2 is not None, (
+                f"[{param_id} Not Found] Original Optim State: {param_state_1} | Checkpoint Optim"
+                f" State: {param_state_2}"
+            )
+            for key in param_state_1.keys() | param_state_2.keys():
+                v1 = param_state_1.get(key, None)
+                if isinstance(v1, DTensor):
+                    v1 = v1.to_local()
+                v2 = param_state_2.get(key, None)
+                if isinstance(v2, DTensor):
+                    v2 = v2.to_local()
+                assert v1 is not None and v2 is not None, (
+                    f"[{param_id} {key} Not Found] Original Optim State: {v1} | Checkpoint Optim"
+                    f" State: {v2}"
+                )
+                assert (
+                    v1.shape == v2.shape
+                ), f"[Optim State {param_id} {key} Shape Mismatch] {v1.shape} != {v2.shape}"
+                assert torch.allclose(
+                    v1, v2
+                ), f"[Optim State {param_id} {key} Value Mismatch] {v1} != {v2}"
+                nonempty_optim_state = True  # Optimizer state depends on wgrad, verify this!
+        assert (
+            nonempty_optim_state
+        ), "Optimizer state should not be empty for evenly-sharded DTensors!"
+        assert len(o1["param_groups"]) == len(o2["param_groups"]), (
+            f"[Optim State Param Groups Length Mismatch] {o1['param_groups']} !="
+            f" {o2['param_groups']}"
+        )
+        for i in range(len(o2["param_groups"])):
+            for key in o1["param_groups"][i].keys():
+                v1 = o1["param_groups"][i][key]
+                v2 = o2["param_groups"][i][key]
+                assert v1 == v2, f"[Optim State Param Group {i} {key} Value Mismatch] {v1} != {v2}"
+
+    # Validate post-load model loss.
+    model.eval()
+    with (
+        torch.autocast(device_type="cuda", dtype=torch.bfloat16)
+        if args.recipe == "NVFP4BlockScaling"
+        else nullcontext()
+    ):
+        with te.autocast(enabled=True, recipe=fp8_recipe):
+            output = model(input_data)
+            post_load_loss = F.mse_loss(output, target)
+    # Allow for 1% disparity due to _extra_state disparity.
+    assert torch.allclose(
+        pre_save_loss, post_load_loss, rtol=1e-2
+    ), f"Pre-Save Loss: {pre_save_loss} != Post-Load Loss: {post_load_loss}"
+
+    # Clean up temporary checkpoint directory.
+    if not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0:
+        shutil.rmtree(CKPT_DIR)
+    torch.distributed.barrier()
+
     dist.destroy_process_group()
     return 0

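The core of the new parity check is DTensor-aware: each rank compares only its local shard, obtained with `to_local()`, and `_extra_state` entries (serialized quantizer metadata whose shape may legitimately change) are skipped. A condensed sketch of that comparison logic as a hypothetical helper (`assert_state_parity` does not appear in the commit):

import torch
from torch.distributed.tensor import DTensor


def assert_state_parity(sd1: dict, sd2: dict) -> None:
    """Compare two possibly DTensor-valued state dicts shard-by-shard."""
    for key in sd1.keys() | sd2.keys():
        if key.endswith("_extra_state"):
            continue  # quantizer metadata; excluded from parity testing
        v1, v2 = sd1.get(key), sd2.get(key)
        assert v1 is not None and v2 is not None, f"{key} missing on one side"
        # Each rank holds one shard of a DTensor; compare local shards only.
        v1 = v1.to_local() if isinstance(v1, DTensor) else v1
        v2 = v2.to_local() if isinstance(v2, DTensor) else v2
        assert v1.shape == v2.shape, f"{key}: {v1.shape} != {v2.shape}"
        assert torch.allclose(v1, v2), f"{key}: value mismatch"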
tests/pytorch/distributed/test_torch_fsdp2.py

Lines changed: 8 additions & 4 deletions
@@ -74,7 +74,7 @@ def _run_test(fp_init, sharding_dims, recipe, layer_type):
     subprocess.run(test_cmd, env=os.environ, check=True)
 
 
-@pytest.mark.skipif(NUM_PROCS % 2 != 0, reason="Requires even number of GPUs")
+@pytest.mark.skipif(NUM_PROCS % 2 != 0, reason="Requires even number of GPUs.")
 @pytest.mark.skipif(not te.torch_version() >= (2, 4, 0), reason="Requires PyTorch 2.4.0+")
 @pytest.mark.parametrize(
     "sharding_dims",
@@ -83,16 +83,20 @@ def _run_test(fp_init, sharding_dims, recipe, layer_type):
         [NUM_PROCS],
         # HSDP
         [2, NUM_PROCS // 2],
-        # FSDP-TP
-        [1, 2, NUM_PROCS // 2],
-        # HSDP-TP
+        # (H/F)SDP-TP
         [NUM_PROCS // 4, 2, 2],
     ),
 )
 @pytest.mark.parametrize("fp8_init", (False, True))
 @pytest.mark.parametrize("layer_type", ("LayerNormLinear", "TransformerLayer"))
 def test_distributed(fp8_init, sharding_dims, fp_recipe, layer_type):
 
+    parallel_size = math.prod(x for x in sharding_dims if x != 0)
+    if NUM_PROCS < parallel_size:
+        pytest.skip(
+            f"Insufficient devices ({NUM_PROCS}) to test sharding configuration: {sharding_dims}"
+        )
+
     if fp_recipe in ("Float8BlockScaling", "NVFP4BlockScaling") and fp8_init:
         pytest.xfail(f"{fp_recipe} + fp8_init: test_fp8_fsdp2_allgather is currently failing.")

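The new guard computes how many devices a sharding configuration actually needs and skips the test when the host has fewer GPUs. Zero entries, which integer division produces on small hosts, are dropped before taking the product. A worked example, assuming a 2-GPU host:

import math

NUM_PROCS = 2  # assumed GPU count for illustration
sharding_dims = [NUM_PROCS // 4, 2, 2]  # -> [0, 2, 2]
# Drop the zero dims before computing the required device count.
parallel_size = math.prod(x for x in sharding_dims if x != 0)  # 2 * 2 = 4
print(NUM_PROCS < parallel_size)  # True -> the test is skipped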
transformer_engine/pytorch/attention/dot_product_attention/dot_product_attention.py

Lines changed: 2 additions & 1 deletion
@@ -575,7 +575,8 @@ def set_device_mesh(
         weight_mesh : Optional[DeviceMesh]
             Not used for DotProductAttention as there are no quantized weights.
         """
-        warnings.warn(f"weight_mesh not necessary for {self.__class__.__name__}: {weight_mesh}")
+        if weight_mesh is not None:
+            warnings.warn(f"weight_mesh not necessary for {self.__class__.__name__}: {weight_mesh}")
         if tp_mesh is not None:
             # Validate TP DeviceMesh / Group. Must be consistent with tp_size.
             assert tp_mesh.ndim == 1 and self.tp_size == tp_mesh.size(), (
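The identical guard is applied to LayerNorm and RMSNorm below: these modules keep no quantized weights, so the warning should fire only when a caller actually passes a `weight_mesh` that will be ignored, not on every `set_device_mesh` call. A minimal sketch of the pattern:

import warnings

def set_device_mesh_sketch(weight_mesh=None):
    # Warn only on a needlessly supplied mesh instead of unconditionally.
    if weight_mesh is not None:
        warnings.warn(f"weight_mesh not necessary here: {weight_mesh}")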

transformer_engine/pytorch/module/grouped_linear.py

Lines changed: 8 additions & 5 deletions
@@ -11,6 +11,7 @@
 import torch
 from torch.distributed import DeviceMesh
 from torch.distributed.tensor import DTensor
+from torch.distributed.tensor.placement_types import Replicate, Shard, _StridedShard
 
 import transformer_engine_torch as tex
 
@@ -800,13 +801,17 @@ def make_grouped_weights(self, defer_init=False) -> None:
                 weight_quantizers[0] is None or not weight_quantizers[0].internal
             ), "Found internal quantizer with `single_grouped_parameter=True`."
             grouped_param = torch.nn.Parameter(grouped_weights)
-            if isinstance(getattr(self, f"weight0", None), DTensor):
+            if isinstance(getattr(self, "weight0", None), DTensor):
                 # Convert to DTensor with properties equivalent to the original DTensor.
-                dtensor_member_param = getattr(self, f"weight0")
+                dtensor_member_param = getattr(self, "weight0")
+                grouped_3d_placements = tuple(
+                    type(p)(p.dim + 1) if isinstance(p, (Shard, _StridedShard)) else p
+                    for p in dtensor_member_param.placements
+                )
                 grouped_param = _convert_param_to_dtensor_param(
                     grouped_param,
                     device_mesh=dtensor_member_param.device_mesh,
-                    placements=dtensor_member_param.placements,
+                    placements=grouped_3d_placements,
                     # DTensor / DCP will view this as a TP-sharded 3-D Tensor.
                     shape=(self.num_gemms, self.out_features, self.in_features),
                     # Default Stride: (out*in, in, 1)
@@ -878,8 +883,6 @@ def set_device_mesh(
             self.set_tensor_parallel_group(tp_mesh.get_group())
 
             # Construct TP-sharded DTensors.
-            from torch.distributed.tensor.placement_types import Replicate, Shard
-
             for weight in self.weight_names:
                 param = getattr(self, weight)
                 placements = (Replicate(),)
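`make_grouped_weights` stacks the per-GEMM 2-D weights into a single 3-D grouped parameter, so every sharded placement must shift one dimension to the right: dim 0 becomes the GEMM index. A standalone sketch of that remap, simplified to plain `Shard` (the commit also handles `_StridedShard`, whose constructor takes an additional split factor):

from torch.distributed.tensor.placement_types import Replicate, Shard

# A per-GEMM (out_features, in_features) weight sharded on dim 0.
placements_2d = (Shard(0),)

# After stacking into (num_gemms, out_features, in_features), the same
# shard lives on dim 1; Replicate placements pass through unchanged.
placements_3d = tuple(
    Shard(p.dim + 1) if isinstance(p, Shard) else p for p in placements_2d
)
print(placements_3d)  # (Shard(dim=1),)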

transformer_engine/pytorch/module/layernorm.py

Lines changed: 2 additions & 1 deletion
@@ -168,7 +168,8 @@ def set_device_mesh(
             Quantized DTensor parameters are currently not supported for FusibleOperation(s),
             and this mesh is not used.
         """
-        warnings.warn(f"weight_mesh not necessary for {self.__class__.__name__}: {weight_mesh}")
+        if weight_mesh is not None:
+            warnings.warn(f"weight_mesh not necessary for {self.__class__.__name__}: {weight_mesh}")
         if tp_mesh is not None:
             # Construct TP-Replicate DTensors. Used to shim non-TP parameters for compatibility
             # with DTensor parameters in TP layers to support DTensor operations.

transformer_engine/pytorch/module/rmsnorm.py

Lines changed: 2 additions & 1 deletion
@@ -171,7 +171,8 @@ def set_device_mesh(
             Quantized DTensor parameters are currently not supported for FusibleOperation(s),
             and this mesh is not used.
         """
-        warnings.warn(f"weight_mesh not necessary for {self.__class__.__name__}: {weight_mesh}")
+        if weight_mesh is not None:
+            warnings.warn(f"weight_mesh not necessary for {self.__class__.__name__}: {weight_mesh}")
         if tp_mesh is not None:
             # Construct TP-Replicate DTensors. Used to shim non-TP parameters for compatibility
             # with DTensor parameters in TP layers to support DTensor operations.
