diff --git a/transformer_engine/common/recipe/__init__.py b/transformer_engine/common/recipe/__init__.py index 64ee2a5a16..d534ad883b 100644 --- a/transformer_engine/common/recipe/__init__.py +++ b/transformer_engine/common/recipe/__init__.py @@ -181,6 +181,11 @@ def scaling_factor_compute(amax: Tensor, `LayerNormLinear (BF16 output) -> (cast to FP8 ) FP8 DPA (cast to BF16) -> Linear`. When `fp8_mha = True, fp8_dpa = True`, it becomes `LayerNormLinear (FP8 output) -> FP8 DPA -> Linear`. + quantize_forward : bool, default = True + Whether to quantize tensors in the forward pass. + quantize_backward : bool, default = True + Whether to quantize tensors in the backward pass. Delayed scaling + always quantizes backward; setting this to False is not supported. Notes ----- @@ -204,9 +209,15 @@ def scaling_factor_compute(amax: Tensor, reduce_amax: bool = True fp8_dpa: bool = False fp8_mha: bool = False + quantize_forward: bool = True + quantize_backward: bool = not (os.getenv("NVTE_KEEP_BACKWARD_UNQUANTIZED", "0") == "1") def __post_init__(self) -> None: assert self.fp8_format != Format.E5M2, "Pure E5M2 training is not supported." + assert not ( + not self.quantize_forward and self.quantize_backward + ), "Invalid recipe configuration: quantize_backward=True requires quantize_forward=True." + assert self.quantize_backward, "Delayed scaling does not support quantize_backward=False." def __repr__(self) -> str: return ( @@ -216,7 +227,9 @@ def __repr__(self) -> str: f"amax_history_len={self.amax_history_len}, " f"reduce_amax={self.reduce_amax}, " f"fp8_dpa={self.fp8_dpa}, " - f"fp8_mha={self.fp8_mha}" + f"fp8_mha={self.fp8_mha}, " + f"quantize_forward={self.quantize_forward}, " + f"quantize_backward={self.quantize_backward}" ) @@ -230,6 +243,10 @@ class Float8CurrentScaling(Recipe): fp8_format : {Format.E4M3, Format.HYBRID}, default = Format.HYBRID Controls the FP8 data format used during forward and backward pass. + quantize_forward : bool, default = True + Whether to quantize tensors in the forward pass. + quantize_backward : bool, default = True + Whether to quantize tensors in the backward pass. """ use_power_2_scales: bool = os.getenv("NVTE_FP8_CURRENT_SCALING_POWER_2_SCALES", "0") == "1" @@ -242,9 +259,14 @@ class Float8CurrentScaling(Recipe): fp8_gemm_wgrad: MMParams = MMParams(use_split_accumulator=True) fp8_dpa: bool = False fp8_mha: bool = False + quantize_forward: bool = True + quantize_backward: bool = not (os.getenv("NVTE_KEEP_BACKWARD_UNQUANTIZED", "0") == "1") def __post_init__(self) -> None: assert self.fp8_format != Format.E5M2, "Pure E5M2 training is not supported." + assert not ( + not self.quantize_forward and self.quantize_backward + ), "Invalid recipe configuration: quantize_backward=True requires quantize_forward=True." def __repr__(self) -> str: return ( @@ -257,7 +279,9 @@ def __repr__(self) -> str: f"fp8_gemm_dgrad={self.fp8_gemm_dgrad}, " f"fp8_gemm_wgrad={self.fp8_gemm_wgrad}, " f"fp8_dpa={self.fp8_dpa}, " - f"fp8_mha={self.fp8_mha}" + f"fp8_mha={self.fp8_mha}, " + f"quantize_forward={self.quantize_forward}, " + f"quantize_backward={self.quantize_backward}" ) @@ -284,21 +308,32 @@ class MXFP8BlockScaling(Recipe): fp8_format : {Format.E4M3, Format.HYBRID}, default = Format.E4M3 Controls the FP8 data format used during forward and backward pass. + quantize_forward : bool, default = True + Whether to quantize tensors in the forward pass. + quantize_backward : bool, default = True + Whether to quantize tensors in the backward pass. 
""" margin: int = 0 fp8_format: Format = Format.E4M3 fp8_dpa: bool = False fp8_mha: bool = False + quantize_forward: bool = True + quantize_backward: bool = not (os.getenv("NVTE_KEEP_BACKWARD_UNQUANTIZED", "0") == "1") def __post_init__(self) -> None: assert self.fp8_format != Format.E5M2, "Pure E5M2 training is not supported." + assert not ( + not self.quantize_forward and self.quantize_backward + ), "Invalid recipe configuration: quantize_backward=True requires quantize_forward=True." def __repr__(self) -> str: return ( f"recipe_type={self.__class__.__name__}, " f"margin={self.margin}, " - f"format={str(self.fp8_format).split('.')[1]}" + f"format={str(self.fp8_format).split('.')[1]}, " + f"quantize_forward={self.quantize_forward}, " + f"quantize_backward={self.quantize_backward}" ) @@ -327,6 +362,10 @@ class Float8BlockScaling(Recipe): fp8_format : {Format.E4M3, Format.HYBRID}, default = Format.E4M3 Controls the FP8 data format used during forward and backward pass. + quantize_forward : bool, default = True + Whether to quantize tensors in the forward pass. + quantize_backward : bool, default = True + Whether to quantize tensors in the backward pass. """ use_f32_scales: bool = os.getenv("NVTE_FP8_BLOCK_SCALING_FP32_SCALES", "0") == "1" @@ -343,6 +382,8 @@ class Float8BlockScaling(Recipe): fp8_gemm_wgrad: MMParams = MMParams(use_split_accumulator=True) fp8_dpa: bool = False fp8_mha: bool = False + quantize_forward: bool = True + quantize_backward: bool = not (os.getenv("NVTE_KEEP_BACKWARD_UNQUANTIZED", "0") == "1") def __post_init__(self) -> None: assert self.x_block_scaling_dim in [1, 2], "Only 1D or 2D blocks supported for x" @@ -364,6 +405,9 @@ def __post_init__(self) -> None: not self.fp8_dpa and not self.fp8_mha ), "FP8 attention is not supported for Float8BlockScaling." assert self.fp8_format != Format.E5M2, "Pure E5M2 training is not supported." + assert not ( + not self.quantize_forward and self.quantize_backward + ), "Invalid recipe configuration: quantize_backward=True requires quantize_forward=True." def __repr__(self) -> str: return ( @@ -379,7 +423,9 @@ def __repr__(self) -> str: f"fp8_gemm_dgrad={self.fp8_gemm_dgrad}, " f"fp8_gemm_wgrad={self.fp8_gemm_wgrad}, " f"fp8_dpa={self.fp8_dpa}, " - f"fp8_mha={self.fp8_mha}" + f"fp8_mha={self.fp8_mha}, " + f"quantize_forward={self.quantize_forward}, " + f"quantize_backward={self.quantize_backward}" ) @@ -428,6 +474,10 @@ class NVFP4BlockScaling(Recipe): If set to `True`, stochastic rounding is disabled during quantization for all tensors. disable_2d_quantization : bool, default = False If set to `True`, 1D block scaling with block size 16 is used for all tensors. + quantize_forward : bool, default = True + Whether to quantize tensors in the forward pass. + quantize_backward : bool, default = True + Whether to quantize tensors in the backward pass. """ # Configuration envvars @@ -443,10 +493,15 @@ class NVFP4BlockScaling(Recipe): # Not applying quantization to attention for now fp8_dpa: bool = False fp8_mha: bool = False + quantize_forward: bool = True + quantize_backward: bool = not (os.getenv("NVTE_KEEP_BACKWARD_UNQUANTIZED", "0") == "1") def __post_init__(self) -> None: assert self.fp4_format == Format.E2M1, "Only E2M1 is supported for NVFP4 scaling" assert self.fp8_format == Format.E4M3, "Only E4M3 is supported for NVFP4 scaling" + assert not ( + not self.quantize_forward and self.quantize_backward + ), "Invalid recipe configuration: quantize_backward=True requires quantize_forward=True." 
# Quantization params # Note: RHT is currently only applied to column-wise usage so that @@ -474,6 +529,8 @@ def __repr__(self) -> str: f"fp8_format={str(self.fp8_format).split('.')[1]}, " f"fp8_dpa={self.fp8_dpa}, " f"fp8_mha={self.fp8_mha}, " + f"quantize_forward={self.quantize_forward}, " + f"quantize_backward={self.quantize_backward}, " f"fp4_quant_fwd_inp={self.fp4_quant_fwd_inp}, " f"fp4_quant_fwd_weight={self.fp4_quant_fwd_weight}, " f"fp4_quant_bwd_grad={self.fp4_quant_bwd_grad}, " @@ -505,12 +562,23 @@ class CustomRecipe(Recipe): - forward: "linear_input", "linear_weight", "linear_output" - backward: "linear_grad_output", "linear_grad_input" + quantize_forward : bool, default = True + Whether to quantize tensors in the forward pass. + quantize_backward : bool, default = True + Whether to quantize tensors in the backward pass. """ qfactory: Callable[..., Any] fp8_dpa: bool = False fp8_mha: bool = False + quantize_forward: bool = True + quantize_backward: bool = not (os.getenv("NVTE_KEEP_BACKWARD_UNQUANTIZED", "0") == "1") def __repr__(self) -> str: - return f"recipe_type={self.__class__.__name__}, qfactory={self.qfactory}" + return ( + f"recipe_type={self.__class__.__name__}, " + f"qfactory={self.qfactory}, " + f"quantize_forward={self.quantize_forward}, " + f"quantize_backward={self.quantize_backward}" + ) diff --git a/transformer_engine/pytorch/module/base.py b/transformer_engine/pytorch/module/base.py index 841cdf04ca..a878f2ace2 100644 --- a/transformer_engine/pytorch/module/base.py +++ b/transformer_engine/pytorch/module/base.py @@ -1135,9 +1135,10 @@ def grad_output_preprocess( grad_output = grad_output.reshape((-1, grad_output.shape[-1])) grad_output = grad_output.contiguous() gather_grad_output = row_parallel_mode and ctx.sequence_parallel + use_fp8_bwd = ctx.fp8 and not ctx.keep_backward_unquantized # Non-FP8 case: bgrad is fused with wgrad for this case. 
- if not ctx.fp8 and not ctx.debug: + if not use_fp8_bwd and not ctx.debug: if gather_grad_output: if not ctx.ub_overlap_ag: # Perform NCCL all-gather grad_output, _ = gather_along_first_dim(grad_output, ctx.tp_group) diff --git a/transformer_engine/pytorch/module/grouped_linear.py b/transformer_engine/pytorch/module/grouped_linear.py index c9ceb714e3..abe6df6875 100644 --- a/transformer_engine/pytorch/module/grouped_linear.py +++ b/transformer_engine/pytorch/module/grouped_linear.py @@ -96,6 +96,12 @@ def forward( save_original_input, debug, ) = non_tensor_args + keep_backward_unquantized = fp8 and ( + not FP8GlobalStateManager.get_fp8_recipe().quantize_backward + ) + if keep_backward_unquantized: + # Note, NVTE_KEEP_BACKWARD_UNQUANTIZED is ignored when delayed scaling is used + save_original_input = True num_gemms = len(m_splits) weights = weights_and_biases[:num_gemms] @@ -286,6 +292,7 @@ def forward( ctx.activation_dtype = activation_dtype ctx.fp8 = fp8 ctx.fp8_recipe = FP8GlobalStateManager.get_fp8_recipe() if fp8 else None + ctx.keep_backward_unquantized = keep_backward_unquantized ctx.fuse_wgrad_accumulation = fuse_wgrad_accumulation ctx.cpu_offloading = cpu_offloading ctx.is_first_microbatch = is_first_microbatch @@ -304,6 +311,17 @@ def forward( ctx.save_original_input = save_original_input ctx.input_quantizers = input_quantizers + # keep_backward_unquantized overrides + if keep_backward_unquantized: + ctx.fp8 = ctx.fp8 and not keep_backward_unquantized + ctx.ub_overlap_ag = False + ctx.ub_overlap_rs_dgrad = False + ctx.ub_bulk_dgrad = False + ctx.ub_bulk_wgrad = False + ctx.grad_input_quantizer = None + ctx.grad_weight_quantizer = None + ctx.grad_output_quantizer = None + # [*, in_features] -> [*, out_features] except first dimension changes for SP return out.view(-1, *inp.shape[1:-1], out.shape[-1]) @@ -395,13 +413,16 @@ def backward(ctx, grad_output: torch.Tensor) -> Tuple[Union[torch.Tensor, None], dtype=ctx.activation_dtype, device=ctx.device, ) + weights_for_dgrad = weights + if ctx.keep_backward_unquantized: + weights_for_dgrad = origin_weights # Make sure weights are available in column-wise format # for dgrad computation. 
- for weight in weights: + for weight in weights_for_dgrad: if isinstance(weight, QuantizedTensorStorage): weight.update_usage(columnwise_usage=True) general_grouped_gemm( - weights, + weights_for_dgrad, grad_output, [dgrad], ctx.grad_input_quantizers, diff --git a/transformer_engine/pytorch/module/layernorm_linear.py b/transformer_engine/pytorch/module/layernorm_linear.py index 702916696b..187fd70f92 100644 --- a/transformer_engine/pytorch/module/layernorm_linear.py +++ b/transformer_engine/pytorch/module/layernorm_linear.py @@ -141,6 +141,9 @@ def forward( symmetric_ar_type, debug, ) = non_tensor_args + keep_backward_unquantized = fp8 and ( + not FP8GlobalStateManager.get_fp8_recipe().quantize_backward + ) # NVTX label for profiling nvtx_label = "transformer_engine._LayerNormLinear.forward" @@ -200,7 +203,10 @@ def forward( if fp8: if input_quantizer is None: raise ValueError("Missing quantizer for input tensor") - input_quantizer.set_usage(rowwise=True, columnwise=backward_needs_input) + input_quantizer.set_usage( + rowwise=True, + columnwise=backward_needs_input and not keep_backward_unquantized, + ) if with_input_all_gather and input_quantizer.supports_only_rowwise_all_gather(): # All-gather is not supported with FP8 column-wise data input_quantizer.set_usage(columnwise=False) @@ -213,6 +219,7 @@ def forward( and not debug and not return_layernorm_output and not return_layernorm_output_gathered + and not keep_backward_unquantized and not custom # TODO(negvet): and not FP8GlobalStateManager.get_fp8_recipe().custom() ) @@ -236,6 +243,7 @@ def forward( ln_out_return = None if return_layernorm_output or return_layernorm_output_gathered: ln_out_return = ln_out + ln_out_hp = ln_out if keep_backward_unquantized else None # ------------------------------------------------------ # Prepare GEMM input tensor @@ -409,13 +417,16 @@ def forward( # ------------------------------------------------------ if is_grad_enabled: + ln_out_to_save = ln_out + if keep_backward_unquantized: + ln_out_to_save = ln_out_hp ctx.weight_quantizer = weight_quantizer ctx.ln_out_needs_gather = ( weight.requires_grad and parallel_mode == "column" and sequence_parallel ) # Input with column-wise usage is needed for wgrad GEMM. - if backward_needs_input: + if backward_needs_input and not keep_backward_unquantized: if isinstance(ln_out, QuantizedTensorStorage): # For sequence parallel in vanilla FP8, rowwise data is # to gather the input. 
For MXFP8, columnwise only data @@ -427,7 +438,7 @@ def forward( ln_out.update_usage(rowwise_usage=False) if cpu_offloading: - mark_activation_offload(inputmat, mu, rsigma, ln_out) + mark_activation_offload(inputmat, mu, rsigma, ln_out_to_save) # Scatter intermediate/activation tensors saved for the backward pass # NOTE: weight_fp8 = weight when ctx.fp8 == False and torch.disttributed.FSDP already @@ -439,7 +450,7 @@ def forward( mu, rsigma, weightmat if fp8 and not is_weight_param_quantized else None, - ln_out if weight.requires_grad else None, + ln_out_to_save if weight.requires_grad else None, ) nvtx_range_pop(f"{nvtx_label}.fsdp_scatter") @@ -466,7 +477,7 @@ def forward( weight, bias, ln_weight, - ln_out, + ln_out_to_save, mu, rsigma, ) @@ -493,6 +504,7 @@ def forward( ctx.activation_dtype = activation_dtype ctx.fp8 = fp8 ctx.fp8_recipe = FP8GlobalStateManager.get_fp8_recipe() if fp8 else None + ctx.keep_backward_unquantized = keep_backward_unquantized ctx.fuse_wgrad_accumulation = fuse_wgrad_accumulation ctx.cpu_offloading = cpu_offloading ctx.is_first_microbatch = is_first_microbatch @@ -523,6 +535,17 @@ def forward( ctx.wgrad_store = wgrad_store ctx.debug = debug + # keep_backward_unquantized overrides + if keep_backward_unquantized: + ctx.fp8 = ctx.fp8 and not keep_backward_unquantized + ctx.ub_overlap_ag = False + ctx.ub_overlap_rs_dgrad = False + ctx.ub_bulk_dgrad = False + ctx.ub_bulk_wgrad = False + ctx.grad_input_quantizer = None + ctx.grad_weight_quantizer = None + ctx.grad_output_quantizer = None + # ------------------------------------------------------ # Cached state for backward pass is ready... # ------------------------------------------------------ @@ -665,7 +688,7 @@ def backward( ln_out_total_work = None if ctx.ln_out_needs_gather: quantizer = None - if ctx.input_quantizer is not None: + if ctx.input_quantizer is not None and ctx.fp8: quantizer = ctx.input_quantizer if quantizer.supports_only_rowwise_all_gather(): # If data is in FP8, we compute FP8 transposes manually @@ -703,7 +726,11 @@ def backward( # Make sure required data is available if isinstance(grad_output, QuantizedTensorStorage): grad_output.update_usage(rowwise_usage=True) - if ctx.weight_quantizer is not None and isinstance(weight, QuantizedTensorStorage): + if ( + ctx.fp8 + and ctx.weight_quantizer is not None + and isinstance(weight, QuantizedTensorStorage) + ): weight.update_usage(columnwise_usage=True) # Choose whether to use GEMM kernel with split accumulator @@ -730,8 +757,11 @@ def backward( # dgrad GEMM # Note: dx = dy * w nvtx_range_push(f"{nvtx_label}.dgrad_gemm") + weight_for_dgrad = weight + if ctx.keep_backward_unquantized: + weight_for_dgrad = origin_weight gemm_out, *_, reduce_scatter_out = general_gemm( - weight, + weight_for_dgrad, grad_output, layout="NN", grad=True, diff --git a/transformer_engine/pytorch/module/layernorm_mlp.py b/transformer_engine/pytorch/module/layernorm_mlp.py index bec6744518..ac10534012 100644 --- a/transformer_engine/pytorch/module/layernorm_mlp.py +++ b/transformer_engine/pytorch/module/layernorm_mlp.py @@ -232,6 +232,12 @@ def _forward( debug, recompute_for_bwd, ) = non_tensor_args + keep_backward_unquantized = fp8 and ( + not FP8GlobalStateManager.get_fp8_recipe().quantize_backward + ) + assert ( + not keep_backward_unquantized + ), "NVTE_KEEP_BACKWARD_UNQUANTIZED is not implemented in LayerNormMLP" # if grad is enabled and this is not the bwd stage, we must save this so bwd knows which path to take if is_grad_enabled and not recompute_for_bwd: @@ -778,6 
+784,7 @@ def _forward( ctx.fc2_main_grad_func = lambda: fc2_weight.main_grad ctx.fp8_recipe = FP8GlobalStateManager.get_fp8_recipe() if fp8 else None + ctx.keep_backward_unquantized = keep_backward_unquantized ctx.fc1_grad_input_quantizer = fc1_grad_input_quantizer ctx.fc1_grad_weight_quantizer = fc1_grad_weight_quantizer ctx.fc1_grad_output_quantizer = fc1_grad_output_quantizer diff --git a/transformer_engine/pytorch/module/linear.py b/transformer_engine/pytorch/module/linear.py index 23ad8cacb0..7d960102ec 100644 --- a/transformer_engine/pytorch/module/linear.py +++ b/transformer_engine/pytorch/module/linear.py @@ -129,6 +129,12 @@ def forward( save_original_input, debug, ) = non_tensor_args + keep_backward_unquantized = fp8 and ( + not FP8GlobalStateManager.get_fp8_recipe().quantize_backward + ) + if keep_backward_unquantized: + # Note, NVTE_KEEP_BACKWARD_UNQUANTIZED is ignored when delayed scaling is used + save_original_input = True # NVTX label for profiling nvtx_label = "transformer_engine._Linear.forward" @@ -443,6 +449,7 @@ def forward( ctx.activation_dtype = activation_dtype ctx.fp8 = fp8 ctx.fp8_recipe = FP8GlobalStateManager.get_fp8_recipe() if fp8 else None + ctx.keep_backward_unquantized = keep_backward_unquantized ctx.input_quantizer = input_quantizer ctx.grad_input_quantizer = grad_input_quantizer ctx.grad_weight_quantizer = grad_weight_quantizer @@ -486,6 +493,17 @@ def forward( FP8GlobalStateManager.IS_FIRST_FP8_MODULE = _first_fp8_module ctx.wgrad_store = wgrad_store + # keep_backward_unquantized overrides + if keep_backward_unquantized: + ctx.fp8 = ctx.fp8 and not keep_backward_unquantized + ctx.ub_overlap_ag = False + ctx.ub_overlap_rs_dgrad = False + ctx.ub_bulk_dgrad = False + ctx.ub_bulk_wgrad = False + ctx.grad_input_quantizer = None + ctx.grad_weight_quantizer = None + ctx.grad_output_quantizer = None + # ------------------------------------------------------ # Cached state for backward pass is ready... # ------------------------------------------------------ @@ -690,8 +708,10 @@ def backward(ctx, grad_output: torch.Tensor) -> Tuple[Union[torch.Tensor, None], # Make sure required data is available if isinstance(grad_output, QuantizedTensorStorage): grad_output.update_usage(rowwise_usage=True) - if ctx.weight_quantizer is not None and isinstance( - weight_fp8, QuantizedTensorStorage + if ( + ctx.fp8 + and ctx.weight_quantizer is not None + and isinstance(weight_fp8, QuantizedTensorStorage) ): weight_fp8.update_usage(columnwise_usage=True) @@ -720,8 +740,11 @@ def backward(ctx, grad_output: torch.Tensor) -> Tuple[Union[torch.Tensor, None], # Note: dx = dy * w nvtx_range_push(f"{nvtx_label}.dgrad_gemm") + weight_for_dgrad = weight_fp8 + if ctx.keep_backward_unquantized: + weight_for_dgrad = weight gemm_out, *_, reduce_scatter_out = general_gemm( - weight_fp8, + weight_for_dgrad, grad_output, layout="NN", grad=True, diff --git a/transformer_engine/pytorch/ops/basic/basic_linear.py b/transformer_engine/pytorch/ops/basic/basic_linear.py index e640f3ffb1..16b7bcb7c5 100644 --- a/transformer_engine/pytorch/ops/basic/basic_linear.py +++ b/transformer_engine/pytorch/ops/basic/basic_linear.py @@ -332,12 +332,16 @@ def pre_fuser_forward(self, *, requires_grad: bool) -> None: # Note: We cache the quantized input for backward pass, # but discard the quantized weights. 
weight_requires_grad = requires_grad and self.weight.requires_grad + keep_backward_unquantized = FP8GlobalStateManager.is_fp8_enabled() and ( + not FP8GlobalStateManager.get_fp8_recipe().quantize_backward + ) + columnwise_usage = weight_requires_grad and not keep_backward_unquantized input_quantizer = self.get_quantizer("forward", 0) weight_quantizer = self.get_quantizer("forward", 1) grad_output_quantizer = self.get_quantizer("backward", 0) - input_quantizer.set_usage(rowwise=True, columnwise=weight_requires_grad) + input_quantizer.set_usage(rowwise=True, columnwise=columnwise_usage) weight_quantizer.set_usage(rowwise=True, columnwise=False) - grad_output_quantizer.set_usage(rowwise=True, columnwise=weight_requires_grad) + grad_output_quantizer.set_usage(rowwise=True, columnwise=columnwise_usage) def reset_recipe_state(self, *, recipe: Optional[Recipe]) -> None: super().reset_recipe_state(recipe=recipe) @@ -420,6 +424,7 @@ def _functional_forward( tensor_parallel_group: Optional[torch.distributed.ProcessGroup] = None, sequence_parallel: bool = False, with_quantized_compute: bool = False, + keep_backward_unquantized: bool = False, input_quantizer: Optional[Quantizer] = None, weight_quantizer: Optional[Quantizer] = None, output_quantizer: Optional[Quantizer] = None, @@ -459,6 +464,8 @@ def _functional_forward( distributing along inner dimension (embedding dim) with_quantized_compute: bool, default = `False` Whether to perform compute with quantized data. + keep_backward_unquantized: bool, default = `False` + Whether to skip quantized backward and use high precision. input_quantizer: Quantizer, optional Builder class for quantized input tensor. weight_quantizer: Quantizer, optional @@ -510,7 +517,10 @@ def _functional_forward( if with_quantized_compute: if input_quantizer is None: raise ValueError("Missing quantizer for input tensor") - input_quantizer.set_usage(rowwise=True, columnwise=weight_requires_grad) + input_quantizer.set_usage( + rowwise=True, + columnwise=weight_requires_grad and not keep_backward_unquantized, + ) if with_x_all_gather: input_quantizer.set_usage(columnwise=False) x, x_async = gather_along_first_dim( @@ -542,7 +552,10 @@ def _functional_forward( elif with_quantized_compute and not is_quantized_tensor(w): if weight_quantizer is None: raise ValueError("Missing quantizer for weight tensor") - weight_quantizer.set_usage(rowwise=True, columnwise=input_requires_grad) + weight_quantizer.set_usage( + rowwise=True, + columnwise=input_requires_grad and not keep_backward_unquantized, + ) w = weight_quantizer(w) # Check output tensor @@ -611,14 +624,23 @@ def _functional_forward( # Prepare weight tensor for backward pass if input_requires_grad: - if w is not weight and with_quantized_compute and is_quantized_tensor(w): + if ( + w is not weight + and with_quantized_compute + and is_quantized_tensor(w) + and not keep_backward_unquantized + ): w.update_usage(rowwise_usage=False, columnwise_usage=True) else: w = None # Prepare input tensor for backward pass if weight_requires_grad: - if with_quantized_compute and is_quantized_tensor(x_local): + if ( + with_quantized_compute + and is_quantized_tensor(x_local) + and not keep_backward_unquantized + ): if not (isinstance(x_local, Float8TensorStorage) and with_x_all_gather): # FP8 does not support all-gather of transpose data x_local.update_usage(rowwise_usage=False, columnwise_usage=True) @@ -968,6 +990,9 @@ def op_forward( grad_output_quantizer = self.get_quantizer("backward", 0) grad_input_quantizer = prev_op_grad_output_quantizer 
with_quantized_compute = FP8GlobalStateManager.is_fp8_enabled() + keep_backward_unquantized = with_quantized_compute and ( + not FP8GlobalStateManager.get_fp8_recipe().quantize_backward + ) # Get autocast dtype if needed if torch.is_autocast_enabled(): @@ -984,6 +1009,7 @@ def op_forward( tensor_parallel_group=self.tensor_parallel_group, sequence_parallel=self.sequence_parallel, with_quantized_compute=with_quantized_compute, + keep_backward_unquantized=keep_backward_unquantized, input_quantizer=input_quantizer, weight_quantizer=weight_quantizer, output_quantizer=output_quantizer, @@ -993,10 +1019,12 @@ def op_forward( # Save state for backward pass if ctx.requires_grad: + saved_input = input_ if keep_backward_unquantized else x_local + saved_weight = self.weight if keep_backward_unquantized else w if is_cpu_offload_enabled(): - mark_activation_offload(x_local) - ctx.save_for_backward(x_local, w) - ctx.with_quantized_compute = with_quantized_compute + mark_activation_offload(saved_input) + ctx.save_for_backward(saved_input, saved_weight) + ctx.with_quantized_compute = with_quantized_compute and not keep_backward_unquantized ctx.input_quantizer = input_quantizer ctx.weight_quantizer = weight_quantizer ctx.grad_output_quantizer = grad_output_quantizer diff --git a/transformer_engine/pytorch/ops/basic/quantize.py b/transformer_engine/pytorch/ops/basic/quantize.py index d126b554b5..33062d5b88 100644 --- a/transformer_engine/pytorch/ops/basic/quantize.py +++ b/transformer_engine/pytorch/ops/basic/quantize.py @@ -59,6 +59,15 @@ def op_forward( quantize_forward = fp8_enabled and self._quantize_forward quantize_backward = fp8_enabled and self._quantize_backward + # Recipe quantize overrides + if FP8GlobalStateManager.get_fp8_recipe() is not None: + quantize_forward = ( + quantize_forward and FP8GlobalStateManager.get_fp8_recipe().quantize_forward + ) + quantize_backward = ( + quantize_backward and FP8GlobalStateManager.get_fp8_recipe().quantize_backward + ) + # Quantize if needed out = input_ if quantize_forward and not is_quantized_tensor(out): diff --git a/transformer_engine/pytorch/ops/fused/forward_linear_bias_activation.py b/transformer_engine/pytorch/ops/fused/forward_linear_bias_activation.py index dfc11a19e7..860407904c 100644 --- a/transformer_engine/pytorch/ops/fused/forward_linear_bias_activation.py +++ b/transformer_engine/pytorch/ops/fused/forward_linear_bias_activation.py @@ -92,6 +92,9 @@ def fuser_forward( grad_output_quantizer = linear_op.get_quantizer("backward", 0) grad_input_quantizer = prev_op_grad_output_quantizer with_quantized_compute = FP8GlobalStateManager.is_fp8_enabled() + keep_backward_unquantized = with_quantized_compute and ( + not FP8GlobalStateManager.get_fp8_recipe().quantize_backward + ) # Get autocast dtype if needed if torch.is_autocast_enabled(): @@ -109,6 +112,7 @@ def fuser_forward( tensor_parallel_group=linear_op.tensor_parallel_group, sequence_parallel=linear_op.sequence_parallel, with_quantized_compute=with_quantized_compute, + keep_backward_unquantized=keep_backward_unquantized, input_quantizer=input_quantizer, weight_quantizer=weight_quantizer, output_quantizer=output_quantizer, @@ -118,10 +122,14 @@ def fuser_forward( # Save state for backward pass if linear_op_ctx.requires_grad: + saved_input = input_ if keep_backward_unquantized else x_local + saved_weight = linear_op.weight if keep_backward_unquantized else w if is_cpu_offload_enabled(): - mark_activation_offload(x_local) - linear_op_ctx.save_for_backward(x_local, w) - 
linear_op_ctx.with_quantized_compute = with_quantized_compute + mark_activation_offload(saved_input) + linear_op_ctx.save_for_backward(saved_input, saved_weight) + linear_op_ctx.with_quantized_compute = ( + with_quantized_compute and not keep_backward_unquantized + ) linear_op_ctx.input_quantizer = input_quantizer linear_op_ctx.weight_quantizer = weight_quantizer linear_op_ctx.grad_output_quantizer = grad_output_quantizer diff --git a/transformer_engine/pytorch/ops/fused/forward_linear_bias_add.py b/transformer_engine/pytorch/ops/fused/forward_linear_bias_add.py index 2dfc0566b7..0729291d55 100644 --- a/transformer_engine/pytorch/ops/fused/forward_linear_bias_add.py +++ b/transformer_engine/pytorch/ops/fused/forward_linear_bias_add.py @@ -86,6 +86,9 @@ def fuser_forward( grad_output_quantizer = linear_op.get_quantizer("backward", 0) grad_input_quantizer = prev_op_grad_output_quantizer with_quantized_compute = FP8GlobalStateManager.is_fp8_enabled() + keep_backward_unquantized = with_quantized_compute and ( + not FP8GlobalStateManager.get_fp8_recipe().quantize_backward + ) # Get autocast dtype if needed if torch.is_autocast_enabled(): @@ -106,6 +109,7 @@ def fuser_forward( tensor_parallel_group=linear_op.tensor_parallel_group, sequence_parallel=linear_op.sequence_parallel, with_quantized_compute=with_quantized_compute, + keep_backward_unquantized=keep_backward_unquantized, input_quantizer=input_quantizer, weight_quantizer=weight_quantizer, output_quantizer=output_quantizer, @@ -115,10 +119,14 @@ def fuser_forward( # Save state for backward pass if linear_op_ctx.requires_grad: + saved_input = input_ if keep_backward_unquantized else x_local + saved_weight = linear_op.weight if keep_backward_unquantized else w if is_cpu_offload_enabled(): - mark_activation_offload(x_local) - linear_op_ctx.save_for_backward(x_local, w) - linear_op_ctx.with_quantized_compute = with_quantized_compute + mark_activation_offload(saved_input) + linear_op_ctx.save_for_backward(saved_input, saved_weight) + linear_op_ctx.with_quantized_compute = ( + with_quantized_compute and not keep_backward_unquantized + ) linear_op_ctx.input_quantizer = input_quantizer linear_op_ctx.weight_quantizer = weight_quantizer linear_op_ctx.grad_output_quantizer = grad_output_quantizer diff --git a/transformer_engine/pytorch/ops/fused/forward_linear_scale_add.py b/transformer_engine/pytorch/ops/fused/forward_linear_scale_add.py index ae4bdd4b19..dfdd11a231 100644 --- a/transformer_engine/pytorch/ops/fused/forward_linear_scale_add.py +++ b/transformer_engine/pytorch/ops/fused/forward_linear_scale_add.py @@ -65,6 +65,9 @@ def fuser_forward( grad_output_quantizer = linear_op.get_quantizer("backward", 0) grad_input_quantizer = prev_op_grad_output_quantizer with_quantized_compute = FP8GlobalStateManager.is_fp8_enabled() + keep_backward_unquantized = with_quantized_compute and ( + not FP8GlobalStateManager.get_fp8_recipe().quantize_backward + ) # Get extra input tensor for add operation extra_input = basic_op_extra_inputs[2][0] @@ -87,6 +90,7 @@ def fuser_forward( tensor_parallel_group=linear_op.tensor_parallel_group, sequence_parallel=linear_op.sequence_parallel, with_quantized_compute=with_quantized_compute, + keep_backward_unquantized=keep_backward_unquantized, input_quantizer=input_quantizer, weight_quantizer=weight_quantizer, output_quantizer=output_quantizer, @@ -96,10 +100,14 @@ def fuser_forward( # Save state for backward pass if linear_op_ctx.requires_grad: + saved_input = input_ if keep_backward_unquantized else x_local + saved_weight = 
linear_op.weight if keep_backward_unquantized else w if is_cpu_offload_enabled(): - mark_activation_offload(x_local) - linear_op_ctx.save_for_backward(x_local, w) - linear_op_ctx.with_quantized_compute = with_quantized_compute + mark_activation_offload(saved_input) + linear_op_ctx.save_for_backward(saved_input, saved_weight) + linear_op_ctx.with_quantized_compute = ( + with_quantized_compute and not keep_backward_unquantized + ) linear_op_ctx.input_quantizer = input_quantizer linear_op_ctx.weight_quantizer = weight_quantizer linear_op_ctx.grad_output_quantizer = grad_output_quantizer diff --git a/transformer_engine/pytorch/quantization.py b/transformer_engine/pytorch/quantization.py index eba547afb0..00196c584f 100644 --- a/transformer_engine/pytorch/quantization.py +++ b/transformer_engine/pytorch/quantization.py @@ -842,14 +842,15 @@ def autocast( are reduced at the end of each training step. """ - if enabled: + effective_enabled = enabled and getattr(recipe, "quantize_forward", True) + if effective_enabled: check_recipe_support(recipe) # Save current state so we always restore it on exit. fp8_state = FP8GlobalStateManager.get_autocast_state() FP8GlobalStateManager.autocast_enter( - enabled=enabled, + enabled=effective_enabled, calibrating=calibrating, fp8_recipe=recipe, fp8_group=amax_reduction_group, @@ -859,7 +860,7 @@ def autocast( yield finally: FP8GlobalStateManager.set_autocast_state(fp8_state) - FP8GlobalStateManager.autocast_exit(enabled, _graph=_graph) + FP8GlobalStateManager.autocast_exit(effective_enabled, _graph=_graph) def _update_amax_history(amax_history: torch.Tensor) -> torch.Tensor:
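The sketch below is an illustrative usage note for this change, not part of the patch. It assumes the public `transformer_engine.pytorch` entry points (`te.Linear`, plus the `autocast` context manager defined in `transformer_engine/pytorch/quantization.py`, assumed here to be re-exported as `te.autocast`), the `Float8CurrentScaling` recipe touched above, and an FP8-capable GPU; the tensor shapes are arbitrary.

# --- Illustrative usage sketch (not part of this patch) ---
# Forward pass quantized as usual, backward pass kept in high precision.
import torch
import transformer_engine.pytorch as te
from transformer_engine.common.recipe import Float8CurrentScaling

recipe = Float8CurrentScaling(quantize_forward=True, quantize_backward=False)

# Equivalently, exporting NVTE_KEEP_BACKWARD_UNQUANTIZED=1 before
# transformer_engine is imported flips the quantize_backward default,
# since the recipe defaults read the environment variable at
# class-definition time.
#
# Rejected configurations, per the new __post_init__ checks:
#   Float8CurrentScaling(quantize_forward=False, quantize_backward=True)
#       -> "quantize_backward=True requires quantize_forward=True"
#   DelayedScaling(quantize_backward=False)
#       -> "Delayed scaling does not support quantize_backward=False"

linear = te.Linear(1024, 1024, bias=True).cuda()
inp = torch.randn(32, 1024, device="cuda", requires_grad=True)

with te.autocast(enabled=True, recipe=recipe):
    out = linear(inp)  # GEMM operands are quantized for the forward pass

# Because quantize_backward=False, the module saved the unquantized input
# and weight, so the dgrad/wgrad GEMMs triggered below run in high precision.
out.sum().backward()

Setting `quantize_forward=False` instead makes the patched `autocast` compute `effective_enabled=False`, so the whole region runs without quantization; `quantize_forward=False` combined with `quantize_backward=True` is rejected by the new asserts.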