Merged
46 commits
5e39835
Python GroupedTensor and contiguous weights for GroupedLinear
ksivaman Jan 15, 2026
66e7d7f
Merge branch 'main' into grouped_tensor_python
ksivaman Jan 15, 2026
40c619e
Graph safe C API for grouped RHT, needs testing
ksivaman Jan 16, 2026
cf61339
Merge branch 'main' into grouped_tensor_python
ksivaman Jan 16, 2026
759e7bb
C++ utils, untested
ksivaman Jan 16, 2026
1d09c2a
Merge branch 'main' into grouped_tensor_python
vthumbe1503 Jan 23, 2026
e1b65ac
Merge branch 'main' into grouped_tensor_python
ksivaman Feb 2, 2026
3ba639e
Pytorch Binding for GroupedTensor APIs (#13)
vthumbe1503 Feb 4, 2026
ebf2194
Merge branch 'main' into grouped_tensor_python
ksivaman Feb 4, 2026
4337520
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Feb 4, 2026
5ab30f5
Fix make grouped tensor api
ksivaman Feb 5, 2026
05dab12
Merge branch 'main' into grouped_tensor_python
ksivaman Feb 6, 2026
68ce836
Fixes to tests
ksivaman Feb 6, 2026
3e7859c
PyTorch-Python GroupedTensor
ksivaman Feb 6, 2026
53c38ec
Merge branch 'main' into grouped_tensor_python
ksivaman Feb 8, 2026
d57651d
Fix test
ksivaman Feb 9, 2026
bd41fd0
All tests pass
ksivaman Feb 9, 2026
351b74d
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Feb 9, 2026
fd8ce0f
Update transformer_engine/pytorch/tensor/storage/grouped_tensor.py
ksivaman Feb 9, 2026
24cfd8c
Remove mxfp8 gq test
ksivaman Feb 9, 2026
97a1f33
C++ PyTorch GroupedTensor changes WIP
ksivaman Feb 9, 2026
82f7ebe
Merge branch 'main' into pytorch_python_grouped_tensor
ksivaman Feb 10, 2026
e1788b3
Compiles
ksivaman Feb 10, 2026
9022383
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Feb 10, 2026
11095a9
Fix runtime failure for test
ksivaman Feb 10, 2026
373d9e3
Fix IMA in mxfp8 GQ
ksivaman Feb 10, 2026
1601960
Add CG test for grouped_quantize
ksivaman Feb 10, 2026
bd57000
Fix recipe tests and FP8 weights
ksivaman Feb 10, 2026
91ab416
Fix recipe tests and FP8 weights
ksivaman Feb 10, 2026
52ab0ed
Merge branch 'main' into pytorch_python_grouped_tensor
ksivaman Feb 10, 2026
a5de7a5
Fix device test
ksivaman Feb 11, 2026
77fa728
Disable grouped weights for unsupported recipes
ksivaman Feb 11, 2026
9009f75
Merge branch 'pytorch_python_grouped_tensor' into grouped_tensor_python
ksivaman Feb 11, 2026
bea794f
Merge branch 'main' into grouped_tensor_python
ksivaman Feb 11, 2026
6b0c420
Merge branch 'main' into grouped_tensor_python
ksivaman Feb 11, 2026
e3278dd
Merge branch 'main' into grouped_tensor_python
ksivaman Feb 12, 2026
864c484
Integrate NVFP4 Graph Safe Group Quantize (#14)
zhongbozhu Feb 14, 2026
4ee0339
improve mxfp8 unit test
zhongbozhu Feb 17, 2026
9f5f24c
pre-swizzle nvfp4 mxfp8 for MoE
zhongbozhu Feb 18, 2026
22f8a5b
avoid having nvte_get_grouped_tensor_param_v2
zhongbozhu Feb 18, 2026
63e1563
more tests
zhongbozhu Feb 18, 2026
4d66324
fix group quantize mxfp8 kernel
zhongbozhu Feb 20, 2026
621fb0e
Relaxed restriction for the last dim to be a multiple of 128
Oleg-Goncharov Feb 24, 2026
439c933
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Feb 24, 2026
3de9850
Merge branch 'main' into grouped_tensor_python
ksivaman Feb 25, 2026
5bf0cf9
Merge branch 'main' into grouped_tensor_python
ksivaman Feb 26, 2026
429 changes: 429 additions & 0 deletions tests/pytorch/test_grouped_tensor.py

Large diffs are not rendered by default.
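The new test file's diff is collapsed above, so here is a rough, hypothetical sketch of the invariant it presumably exercises (not the actual file contents): a GroupedTensor packs its member tensors back-to-back in one allocation, so each tensor's data pointer must equal the previous tensor's pointer plus its size in bytes. The helper name and shapes below are illustrative; only the pointer arithmetic mirrors the check added to test_sanity.py further down.

# Hypothetical sketch, not the contents of test_grouped_tensor.py.
import torch

def assert_contiguous_storage(tensors, num_elems_in_byte=1):
    """Assert each tensor starts exactly where the previous one ends in memory."""
    for prev, curr in zip(tensors[:-1], tensors[1:]):
        expected = prev.data_ptr() + (prev.numel() // num_elems_in_byte) * prev.element_size()
        assert curr.data_ptr() == expected

# Plain (non-quantized) analogue: three views carved out of one bf16 buffer.
storage = torch.empty(3 * 4 * 8, dtype=torch.bfloat16)
views = [storage[i * 32 : (i + 1) * 32].view(4, 8) for i in range(3)]
assert_contiguous_storage(views)  # passes: the views share one contiguous buffer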

131 changes: 128 additions & 3 deletions tests/pytorch/test_sanity.py
@@ -2,7 +2,7 @@
#
# See LICENSE for license information.

from typing import Optional
from typing import Optional, List

import torch
import pytest
@@ -137,6 +137,117 @@ def reset_global_fp8_state():
FP8GlobalStateManager.reset()


def check_grouped_tensor_pointers_helper(tensors, num_elems_in_byte=1, tensor_name="tensor"):
"""
Verify that tensors are stored in contiguous memory.

Args:
tensors: List or iterable of tensors to check
num_elems_in_byte: Number of elements packed per byte (1 for normal, 2 for NVFP4)
tensor_name: Name to use in error messages
"""
tensor_list = list(tensors)
if len(tensor_list) < 2:
return # Nothing to check

for i in range(1, len(tensor_list)):
prev_tensor = tensor_list[i - 1]
curr_tensor = tensor_list[i]

# Calculate expected offset based on previous tensor size
prev_numel = prev_tensor.numel()
expected_offset = (prev_numel // num_elems_in_byte) * prev_tensor.element_size()

# Verify current tensor's data pointer is correctly offset
expected_ptr = prev_tensor.data_ptr() + expected_offset
actual_ptr = curr_tensor.data_ptr()

assert (
actual_ptr == expected_ptr
), f"{tensor_name} {i} data pointer mismatch: expected {expected_ptr}, got {actual_ptr}"


def check_grouped_tensor_pointers(
weights: List[torch.Tensor], fp8_recipe: Optional[recipe.Recipe] = None
):
"""
Verify that the pointers of the weights are in contiguous memory for GroupedTensor.
TODO(ksivaman): This check can be made way more efficient but for now leaving the brute force approach.
"""

num_elems_in_a_data_byte = 1 if fp8_recipe is None else 2 if fp8_recipe.nvfp4() else 1

# Check data.
if hasattr(weights[0], "_data") and weights[0]._data is not None:
data_tensors = [w._data for w in weights]
check_grouped_tensor_pointers_helper(data_tensors, num_elems_in_byte=1, tensor_name="data")

# Check transpose.
if hasattr(weights[0], "_transpose") and weights[0]._transpose is not None:
transpose_tensors = [w._transpose for w in weights]
check_grouped_tensor_pointers_helper(
transpose_tensors, num_elems_in_byte=1, tensor_name="transpose"
)

# Check scale_inv.
if hasattr(weights[0], "_scale_inv") and weights[0]._scale_inv is not None:
scale_inv_tensors = [w._scale_inv for w in weights]
check_grouped_tensor_pointers_helper(
scale_inv_tensors, num_elems_in_byte=1, tensor_name="scale_inv"
)

# Check rowwise scale_inv.
if hasattr(weights[0], "_rowwise_scale_inv") and weights[0]._rowwise_scale_inv is not None:
scale_inv_tensors = [w._rowwise_scale_inv for w in weights]
check_grouped_tensor_pointers_helper(
scale_inv_tensors, num_elems_in_byte=1, tensor_name="rowwise_scale_inv"
)

# Check columnwise scale_inv.
if (
hasattr(weights[0], "_columnwise_scale_inv")
and weights[0]._columnwise_scale_inv is not None
):
columnwise_scale_inv_tensors = [w._columnwise_scale_inv for w in weights]
check_grouped_tensor_pointers_helper(
columnwise_scale_inv_tensors,
num_elems_in_byte=1,
tensor_name="columnwise scale_inv",
)

# Check rowwise amax.
if hasattr(weights[0], "_rowwise_amax") and weights[0]._rowwise_amax is not None:
rowwise_amax_tensors = [w._rowwise_amax for w in weights]
check_grouped_tensor_pointers_helper(
rowwise_amax_tensors, num_elems_in_byte=1, tensor_name="rowwise amax"
)

# Check columnwise amax.
if hasattr(weights[0], "_columnwise_amax") and weights[0]._columnwise_amax is not None:
columnwise_amax_tensors = [w._columnwise_amax for w in weights]
check_grouped_tensor_pointers_helper(
columnwise_amax_tensors, num_elems_in_byte=1, tensor_name="columnwise amax"
)

# Check rowwise data.
if hasattr(weights[0], "_rowwise_data") and weights[0]._rowwise_data is not None:
rowwise_data_tensors = [w._rowwise_data for w in weights]
check_grouped_tensor_pointers_helper(
rowwise_data_tensors,
num_elems_in_byte=num_elems_in_a_data_byte,
tensor_name="rowwise data",
)

# Check columnwise data.
if hasattr(weights[0], "_columnwise_data") and weights[0]._columnwise_data is not None:
columnwise_data_tensors = [w._columnwise_data for w in weights]
check_grouped_tensor_pointers_helper(
columnwise_data_tensors,
num_elems_in_byte=num_elems_in_a_data_byte,
tensor_name="columnwise data",
)


def _test_sanity_e2e_amp(block, dtype, config, fp8_recipe, skip_wgrad):
te_inp_hidden_states = torch.randn(
(config.max_seqlen_q, config.batch_size, config.hidden_size),
@@ -495,9 +606,17 @@ def test_sanity_grouped_linear(
use_fp8 = fp8_recipe is not None
with quantized_model_init(enabled=use_fp8 and fp8_model_params, recipe=fp8_recipe):
te_grouped_linear = GroupedLinear(
num_gemms, config.hidden_size, ffn_hidden_size, bias=use_bias, params_dtype=dtype
num_gemms,
config.hidden_size,
ffn_hidden_size,
bias=use_bias,
params_dtype=dtype,
).cuda()

# Verify that weights are stored in contiguous GroupedTensor storage.
weights = [getattr(te_grouped_linear, f"weight{i}") for i in range(num_gemms)]
check_grouped_tensor_pointers(weights, fp8_recipe)

inp_hidden_states = torch.randn(
num_tokens, config.hidden_size, dtype=dtype, requires_grad=True
).cuda()
@@ -956,7 +1075,13 @@ def test_replace_raw_data_for_float8tensor():
random_bf16_data = torch.randn(fp8_tensor.shape, dtype=torch.bfloat16, device="cuda")
fp8_quantizer.update_quantized(random_bf16_data, fp8_tensor)

attrs_to_check = ["_quantizer", "_fp8_dtype", "_scale_inv", "_transpose", "_transpose_invalid"]
attrs_to_check = [
"_quantizer",
"_fp8_dtype",
"_scale_inv",
"_transpose",
"_transpose_invalid",
]
attrs = {}
for attr in attrs_to_check:
attrs[attr] = getattr(fp8_tensor, attr)
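One detail worth spelling out from check_grouped_tensor_pointers_helper in this file: num_elems_in_byte exists because NVFP4 packs two 4-bit values into each byte, so the expected byte offset between consecutive data buffers is half the element count. A small worked example with made-up numbers (not part of the diff):

# Offset arithmetic used by check_grouped_tensor_pointers_helper (illustrative numbers).
numel = 4096        # elements in the previous tensor
element_size = 1    # bytes per storage element for a uint8-backed FP8 buffer

# FP8 storage: one element per byte, so the next tensor starts 4096 bytes later.
assert (numel // 1) * element_size == 4096

# NVFP4 storage: two 4-bit elements per byte, so the next tensor starts 2048 bytes later.
assert (numel // 2) * element_size == 2048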
35 changes: 21 additions & 14 deletions transformer_engine/common/recipe/__init__.py
@@ -88,33 +88,40 @@ class Recipe:
Base recipe class.
"""

def nvfp4(self):
@classmethod
def nvfp4(cls):
"""Whether the given recipe is NVFP4 1D block scaling."""
return isinstance(self, NVFP4BlockScaling)
return issubclass(cls, NVFP4BlockScaling)

def mxfp8(self):
@classmethod
def mxfp8(cls):
"""Whether the given recipe is MXFP8 block scaling."""
return isinstance(self, MXFP8BlockScaling)
return issubclass(cls, MXFP8BlockScaling)

def delayed(self):
@classmethod
def delayed(cls):
"""Whether the given recipe is delayed scaling."""
return isinstance(self, DelayedScaling)
return issubclass(cls, DelayedScaling)

def float8_current_scaling(self):
@classmethod
def float8_current_scaling(cls):
"""Whether the given recipe is (per-tensor) current scaling."""
return isinstance(self, Float8CurrentScaling)
return issubclass(cls, Float8CurrentScaling)

def float8_per_tensor_scaling(self):
@classmethod
def float8_per_tensor_scaling(cls):
"""Whether the given recipe is per-tensor scaling."""
return isinstance(self, (DelayedScaling, Float8CurrentScaling))
return issubclass(cls, (DelayedScaling, Float8CurrentScaling))

def float8_block_scaling(self):
@classmethod
def float8_block_scaling(cls):
"""Whether the given recipe is float8 blockwise scaling."""
return isinstance(self, Float8BlockScaling)
return issubclass(cls, Float8BlockScaling)

def custom(self):
@classmethod
def custom(cls):
"""Whether the given recipe is custom."""
return isinstance(self, CustomRecipe)
return issubclass(cls, CustomRecipe)


@dataclass()
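The predicates above change from instance checks (`isinstance(self, ...)`) to classmethods using `issubclass(cls, ...)`, so a recipe's type can now be queried from the class itself as well as from an instance; existing call sites such as `fp8_recipe.nvfp4()` in the new test helper keep working. A small behavioural sketch, assuming the recipe classes construct with their defaults (this snippet is not part of the diff):

# Sketch: classmethod predicates answer on both the class and an instance.
from transformer_engine.common import recipe

assert recipe.MXFP8BlockScaling.mxfp8()      # now answerable on the class itself
assert recipe.MXFP8BlockScaling().mxfp8()    # and, as before, on an instance
assert not recipe.MXFP8BlockScaling.nvfp4()  # other predicates stay False
assert recipe.DelayedScaling().float8_per_tensor_scaling()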
78 changes: 69 additions & 9 deletions transformer_engine/pytorch/module/grouped_linear.py
@@ -13,6 +13,7 @@
import transformer_engine_torch as tex

from transformer_engine.common.recipe import Recipe
from transformer_engine.pytorch.tensor.storage.grouped_tensor import GroupedTensor
from .base import (
get_dummy_wgrad,
TransformerEngineBaseModule,
@@ -147,7 +148,10 @@ def forward(
# tensors (like scales), but bulk allocation shares storage across all tensors,
# so if scales can't be offloaded, nothing in the group can be offloaded.
inputmats = tex.split_quantize(
inp_view, m_splits, input_quantizers, disable_bulk_allocation=cpu_offloading
inp_view,
m_splits,
input_quantizers,
disable_bulk_allocation=cpu_offloading,
)
elif debug:
inputmats = DebugQuantizer.multi_tensor_quantize(
@@ -365,7 +369,10 @@ def backward(ctx, grad_output: torch.Tensor) -> Tuple[Union[torch.Tensor, None],
for i in range(ctx.num_gemms):
grad_biases[i] = grad_output_mats[i].sum(dim=0)
grad_output = DebugQuantizer.multi_tensor_quantize(
grad_output_view, ctx.grad_output_quantizers, ctx.m_splits, ctx.activation_dtype
grad_output_view,
ctx.grad_output_quantizers,
ctx.m_splits,
ctx.activation_dtype,
)
else:
# Only split grad output. Grad bias is fused with
@@ -436,7 +443,8 @@ def backward(ctx, grad_output: torch.Tensor) -> Tuple[Union[torch.Tensor, None],
if ctx.input_quantizers[0] is not None:
for input_quantizer in ctx.input_quantizers:
if isinstance(
input_quantizer, (Float8Quantizer, Float8CurrentScalingQuantizer)
input_quantizer,
(Float8Quantizer, Float8CurrentScalingQuantizer),
):
input_quantizer.set_usage(rowwise=True, columnwise=True)
else:
@@ -446,7 +454,10 @@ def backward(ctx, grad_output: torch.Tensor) -> Tuple[Union[torch.Tensor, None],
inputmats = tex.split_quantize(inp_view, ctx.m_splits, ctx.input_quantizers)
elif ctx.debug:
inputmats = DebugQuantizer.multi_tensor_quantize(
inp_view, ctx.input_quantizers, ctx.m_splits, ctx.activation_dtype
inp_view,
ctx.input_quantizers,
ctx.m_splits,
ctx.activation_dtype,
)
else:
inputmats = torch.split(
@@ -616,7 +627,7 @@ def __init__(
) -> None:
super().__init__()

params_dtype = torch.get_default_dtype() if params_dtype is None else params_dtype
self.params_dtype = torch.get_default_dtype() if params_dtype is None else params_dtype
self.num_gemms = num_gemms
self.in_features = in_features
self.out_features = out_features
@@ -631,13 +642,20 @@ def __init__(
assert (
not ub_overlap_rs and not ub_overlap_ag
), "GroupedLinear doesn't support Userbuffer overlap."
self.init_method = init_method
self.get_rng_state_tracker = get_rng_state_tracker
self.rng_tracker_name = rng_tracker_name
self.name = name

self.wgrad_store = WeightGradStore(delay_wgrad_compute)

self._offsets = {"input": 0, "weight": 1, "output": 2, "grad_output": 0, "grad_input": 1}
self._offsets = {
"input": 0,
"weight": 1,
"output": 2,
"grad_output": 0,
"grad_input": 1,
}
self._num_fp8_tensors_per_gemm = {
"fwd": 3,
"bwd": 2,
@@ -679,7 +697,7 @@ def __init__(
self.out_features,
self.in_features,
device=device,
dtype=params_dtype,
dtype=self.params_dtype,
),
),
init_fn=init_method,
@@ -695,20 +713,21 @@ def __init__(
torch.empty(
self.out_features,
device=device,
dtype=params_dtype,
dtype=self.params_dtype,
),
),
init_fn=init_method_constant(0.0),
)
else:
bias = torch.Tensor().to(dtype=params_dtype, device=device)
bias = torch.Tensor().to(dtype=self.params_dtype, device=device)
setattr(self, f"bias{i}", bias)

if self.primary_weights_in_fp8:
self.init_fp8_metadata(num_gemms=self.num_gemms)

is_meta = torch.device(device).type == "meta"
self.reset_parameters(defer_init=is_meta)
self.make_grouped_weights(defer_init=is_meta)

if self.wgrad_store.delay_wgrad_compute():
for name, param in self.named_parameters():
@@ -729,8 +748,49 @@ def set_meta_tensor(self, fwd: bool, recipe: Recipe) -> None:
)
self._customize_quantizers_float8_current_scaling(fwd, recipe)

def make_grouped_weights(self, defer_init=False) -> None:
"""
Convert parameters into a GroupedTensor and re-register them as parameters.
"""

if defer_init:
return

weights = [getattr(self, f"weight{i}") for i in range(self.num_gemms)]
weight_quantizers = self._get_weight_quantizers()

# Create the weight storage.
grouped_weights = GroupedTensor.make_grouped_tensor(
num_tensors=self.num_gemms,
shape=[(self.out_features, self.in_features)] * self.num_gemms,
quantizers=weight_quantizers,
dtype=self.params_dtype,
)

# Copy existing params into storage.
# TODO(ksivamani): Verify correctness of copy for all recipes.
with torch.no_grad():
for i in range(self.num_gemms):
grouped_weights.quantized_tensors[i].copy_(weights[i])
Reviewer comment: check that the copy operation works correctly for all quantization recipes (FP8, MXFP8, NVFP4, block scaling); the TODO comment on line 771 acknowledges this needs verification.


# Re-register the grouped weights as parameters.
for i in range(self.num_gemms):
self.register_parameter(
f"weight{i}",
torch.nn.Parameter(grouped_weights.quantized_tensors[i]),
init_fn=self.init_method,
get_rng_state_tracker=self.get_rng_state_tracker,
fp8_meta_index=self._offsets["weight"] + i * self._num_fp8_tensors_per_gemm["fwd"],
)

self.set_tensor_parallel_attributes(defer_init=defer_init)

def reset_parameters(self, defer_init=False):
super().reset_parameters(defer_init=defer_init)
self.set_tensor_parallel_attributes(defer_init=defer_init)

def set_tensor_parallel_attributes(self, defer_init=False) -> None:
"""Set attributes needed for TP"""

if not defer_init:
# Set parallelism attributes for linear weights
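Net effect of `make_grouped_weights`: after construction, a GroupedLinear's per-GEMM weights are views into one contiguous GroupedTensor allocation, which is what the new sanity check verifies. A hedged usage sketch follows; the shapes and dtype are arbitrary, and the expectation that the plain-bf16 path also packs weights contiguously is inferred from the PR description rather than asserted by the sanity test for non-FP8 params.

# Sketch: observe the contiguous weight storage produced by make_grouped_weights.
import torch
from transformer_engine.pytorch import GroupedLinear

num_gemms, in_features, out_features = 4, 128, 256
layer = GroupedLinear(
    num_gemms, in_features, out_features, params_dtype=torch.bfloat16
).cuda()

weights = [getattr(layer, f"weight{i}") for i in range(num_gemms)]
for prev, curr in zip(weights[:-1], weights[1:]):
    # Each weight should begin exactly where the previous one ends.
    assert curr.data_ptr() == prev.data_ptr() + prev.numel() * prev.element_size()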