10 changes: 6 additions & 4 deletions tests/cpp/operator/test_act.cu
@@ -124,6 +124,7 @@ void performTest(const size_t N, const size_t H) {
fillUniform(&input);
fillUniform(&ograd);
setRandomScale(&output);
+const float ref_scale = isFp8Type(otype) ? output.scale() : 1.0f;

std::unique_ptr<OType[]> ref_output = std::make_unique<OType[]>(N*H);
std::unique_ptr<IType[]> ref_igrad = std::make_unique<IType[]>(N*H);
@@ -132,7 +133,7 @@ void performTest(const size_t N, const size_t H) {

float ref_amax;
compute_ref_act_cast<ref_act>(input.rowwise_cpu_dptr<IType>(), ref_output.get(),
-output.scale(), &ref_amax, N, H);
+ref_scale, &ref_amax, N, H);

cudaDeviceSynchronize();
auto err = cudaGetLastError();
@@ -179,6 +180,7 @@ void performTestGLU(const size_t N, const size_t H) {
fillUniform(&input);
fillUniform(&ograd);
setRandomScale(&output);
+const float ref_scale = isFp8Type(otype) ? output.scale() : 1.0f;

std::unique_ptr<OType[]> ref_output = std::make_unique<OType[]>(N * H);
std::unique_ptr<IType[]> ref_igrad = std::make_unique<IType[]>(2 * N * H);
@@ -187,7 +189,7 @@ void performTestGLU(const size_t N, const size_t H) {

float ref_amax;
compute_ref_glu_act_cast<ref_act>(input.rowwise_cpu_dptr<IType>(), ref_output.get(),
-output.scale(), &ref_amax, N, H);
+ref_scale, &ref_amax, N, H);

cudaDeviceSynchronize();
auto err = cudaGetLastError();
@@ -197,8 +199,8 @@ void performTestGLU(const size_t N, const size_t H) {
auto [atol, rtol] = getTolerances(DType::kFloat32);
compareResults("amax", output.amax(), ref_amax, atol, rtol);
if (output.scaling_mode() == NVTE_DELAYED_TENSOR_SCALING) {
-const float ref_scale = 1.f / output.scale();
-compareResults("scale_inv", *output.rowwise_cpu_scale_inv_ptr<float>(), ref_scale, atol, rtol);
+const float ref_scale_inv = 1.f / ref_scale;
+compareResults("scale_inv", *output.rowwise_cpu_scale_inv_ptr<float>(), ref_scale_inv, atol, rtol);
}
}
auto [atol, rtol] = getTolerances(otype);
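Note on the recurring pattern: `setRandomScale()` writes a random scale into the output tensor, but that scale is only meaningful when the output is actually quantized to FP8. The fix, repeated across the files below, guards the reference scale with `isFp8Type(otype)` and falls back to 1.0f, so non-FP8 outputs are compared against a plain type conversion. A minimal sketch of what the guarded reference cast computes, assuming the usual scale-then-cast convention (`scaled_cast_ref` is a hypothetical stand-in for the `compute_ref_*` helpers, not a quote of them):

```cpp
#include <algorithm>
#include <cmath>
#include <cstddef>

// Hypothetical reference: record the amax of the input, then scale and cast.
// With scale == 1.0f (the non-FP8 fallback) this degenerates to a plain cast.
template <typename IType, typename OType>
void scaled_cast_ref(const IType* in, OType* out, size_t n,
                     float scale, float* amax) {
  float max_abs = 0.0f;
  for (size_t i = 0; i < n; ++i) {
    const float x = static_cast<float>(in[i]);
    max_abs = std::max(max_abs, std::fabs(x));
    out[i] = static_cast<OType>(x * scale);
  }
  *amax = max_abs;
}
```

Previously the reference path read `output.scale()` unconditionally; for non-FP8 output types that value is not meaningful, so the reference could disagree with the unscaled kernel output.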
5 changes: 3 additions & 2 deletions tests/cpp/operator/test_cast.cu
@@ -53,21 +53,22 @@ void performTest(const std::vector<size_t>& shape) {

fillUniform(&input);
setRandomScale(&output_c);
+const float ref_scale = isFp8Type(otype) ? output_c.scale() : 1.0f;

nvte_quantize(input.data(), output_c.data(), 0);

float ref_amax;

compute_ref<InputType, OutputType>(input.rowwise_cpu_dptr<InputType>(), ref_output_c.get(),
-full_size, &ref_amax, output_c.scale());
+full_size, &ref_amax, ref_scale);

cudaDeviceSynchronize();
auto err = cudaGetLastError();
ASSERT_EQ(err, cudaSuccess) << cudaGetErrorString(err);
if (isFp8Type(otype)) {
auto [atol_amax, rtol_amax] = getTolerances(DType::kFloat32);
compareResults("amax", output_c.amax(), ref_amax, atol_amax, rtol_amax);
-float ref_scale_inv = 1.f / output_c.scale();
+float ref_scale_inv = 1.f / ref_scale;
compareResults("scale_inv", output_c.rowwise_scale_inv(), ref_scale_inv, atol_amax, rtol_amax);
}
auto [atol, rtol] = getTolerances(otype);
7 changes: 4 additions & 3 deletions tests/cpp/operator/test_cast_current_scaling.cu
@@ -123,28 +123,29 @@ void performTest(const std::vector<size_t>& shape) {
nvte_compute_amax(input.data(), output_c.data(), 0);
QuantizationConfigWrapper config;
nvte_compute_scale_from_amax(output_c.data(), config, 0);

// avoid atomic amax update in cuda cast kernels because of current per-tensor scaling
amax_to_check = output_c.amax();
output_c.set_tensor_amax_nullptr();
}
nvte_quantize(input.data(), output_c.data(), 0);

float ref_amax;
-float ref_scale;
+float ref_scale = 1.0;
float ref_scale_inv;
if (is_out_fp8){
compute_amax_scale_ref<InputType, OutputType>(input.rowwise_cpu_dptr<InputType>(),
full_size, &ref_amax, &ref_scale, &ref_scale_inv, max_fp8, 0.0f);
}

compute_ref<InputType, OutputType>(input.rowwise_cpu_dptr<InputType>(), ref_output_c.get(),
-full_size, nullptr, is_out_fp8 ? output_c.scale() : 1.0f );
+full_size, nullptr, ref_scale);

cudaDeviceSynchronize();

auto err = cudaGetLastError();
ASSERT_EQ(err, cudaSuccess) << cudaGetErrorString(err);
-if (isFp8Type(otype)) {
+if (is_out_fp8) {
auto [atol_fp32, rtol_fp32] = getTolerances(DType::kFloat32);
compareResults("amax", amax_to_check, ref_amax, 0.0f, rtol_fp32);
compareResults("scale", output_c.scale(), ref_scale, 0.0f, rtol_fp32);
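In the current-scaling test the scale is not random: on the FP8 path it is derived from the tensor's own amax, so `ref_scale` is now initialized to 1.0 and only overwritten by `compute_amax_scale_ref` when the output is FP8. That keeps the non-FP8 branch from passing an undefined value into `compute_ref`. A sketch of the scale-from-amax rule this assumes (the actual logic lives in `compute_amax_scale_ref`; this is the conventional current-scaling formula, not a quote of it):

```cpp
// Assumed current-scaling rule: choose the scale so the largest input
// magnitude maps to the FP8 format's representable maximum (max_fp8).
float scale_from_amax(float amax, float max_fp8) {
  if (!(amax > 0.0f)) return 1.0f;  // degenerate input: identity scaling
  return max_fp8 / amax;
}
```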
5 changes: 3 additions & 2 deletions tests/cpp/operator/test_cast_dbias.cu
@@ -74,13 +74,14 @@ void performTest(const std::vector<size_t>& shape) {

fillUniform(&input);
setRandomScale(&output_c);
+const float ref_scale = isFp8Type(otype) ? output_c.scale() : 1.0f;

std::unique_ptr<OType[]> ref_output_c = std::make_unique<OType[]>(N*H);
std::unique_ptr<IType[]> ref_output_dbias = std::make_unique<IType[]>(H);

CType ref_amax;
compute_ref_cast_dbias(input.rowwise_cpu_dptr<IType>(),
-output_c.scale(),
+ref_scale,
ref_output_c.get(),
&ref_amax,
ref_output_dbias.get(),
@@ -109,7 +110,7 @@ void performTest(const std::vector<size_t>& shape) {
if (isFp8Type(otype)) {
auto [atol_amax, rtol_amax] = getTolerances(DType::kFloat32);
compareResults("amax", output_c.amax(), ref_amax, atol_amax, rtol_amax);
-float ref_scale_inv = 1.f / output_c.scale();
+float ref_scale_inv = 1.f / ref_scale;
compareResults("scale_inv", output_c.rowwise_scale_inv(), ref_scale_inv, atol_amax, rtol_amax);
}
auto [atol, rtol] = getTolerances(otype);
5 changes: 3 additions & 2 deletions tests/cpp/operator/test_cast_dbias_dgelu.cu
@@ -84,14 +84,15 @@ void performTest(const std::vector<size_t>& shape) {
fillUniform(&input);
fillUniform(&grad);
setRandomScale(&output_c);
+const float ref_scale = isFp8Type(otype) ? output_c.scale() : 1.0f;

std::unique_ptr<OType[]> ref_output_c = std::make_unique<OType[]>(N*H);
std::unique_ptr<IType[]> ref_output_dbias = std::make_unique<IType[]>(H);

CType ref_amax;
compute_ref_cast_dbias_dgelu(input.rowwise_cpu_dptr<IType>(),
grad.rowwise_cpu_dptr<IType>(),
-output_c.scale(),
+ref_scale,
ref_output_c.get(),
&ref_amax,
ref_output_dbias.get(),
@@ -123,7 +124,7 @@ void performTest(const std::vector<size_t>& shape) {
if (isFp8Type(otype)) {
auto [atol_amax, rtol_amax] = getTolerances(DType::kFloat32);
compareResults("amax", output_c.amax(), ref_amax, atol_amax, rtol_amax);
-float ref_scale_inv = 1.f / output_c.scale();
+float ref_scale_inv = 1.f / ref_scale;
compareResults("scale_inv", output_c.rowwise_scale_inv(), ref_scale_inv, atol_amax, rtol_amax);
}

5 changes: 3 additions & 2 deletions tests/cpp/operator/test_cast_gated_swiglu.cu
@@ -79,6 +79,7 @@ void performTest(const std::vector<size_t>& shape) {
fillUniform(&grad);
fillUniform(&input);
setRandomScale(&output_c);
+const float ref_scale = isFp8Type(otype) ? output_c.scale() : 1.0f;

std::unique_ptr<OType[]> ref_output_c = std::make_unique<OType[]>(input_size);

@@ -91,7 +92,7 @@ void performTest(const std::vector<size_t>& shape) {
float ref_amax;
compute_ref_cast_dgated_swiglu(grad.rowwise_cpu_dptr<IType>(),
input.rowwise_cpu_dptr<IType>(),
-output_c.scale(),
+ref_scale,
ref_output_c.get(),
&ref_amax,
rows,
@@ -100,7 +101,7 @@ void performTest(const std::vector<size_t>& shape) {
if (isFp8Type(otype)) {
auto [atol_amax, rtol_amax] = getTolerances(DType::kFloat32);
compareResults("amax", output_c.amax(), ref_amax, atol_amax, rtol_amax);
-float ref_scale_inv = 1.f / output_c.scale();
+float ref_scale_inv = 1.f / ref_scale;
compareResults("scale_inv", output_c.rowwise_scale_inv(), ref_scale_inv, atol_amax, rtol_amax);
}

2 changes: 1 addition & 1 deletion tests/cpp/operator/test_cast_nvfp4_transpose.cu
@@ -502,7 +502,7 @@ void print_detailed_tensor_comparison(const std::string& name,
printf("==================================\n");
}

-void compareResults_nvfp4(const Tensor &test,
+void compareResults_nvfp4(Tensor &test,
const void *ref, const void *ref_t, const int rows, const int cols,
double atol = 1e-5, double rtol = 1e-8, bool if_on_gpus = true, bool dump_data = false) {
if (if_on_gpus) test.to_cpu();
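The signature change above (and the matching one in test_dequantize_nvfp4.cu below) is a const-correctness fix rather than a behavioral one: `to_cpu()` copies device data into the tensor's host-side buffers, which mutates the object, so it cannot be called through a `const Tensor&`. A minimal sketch of the constraint, using a hypothetical stand-in for the test `Tensor`:

```cpp
// Hypothetical Tensor: to_cpu() updates internal host buffers, so it is a
// non-const member function.
struct Tensor {
  void to_cpu() { /* refresh host-side copy of device data */ }
};

void compare(Tensor& t) {  // must take a non-const reference...
  t.to_cpu();              // ...because this call mutates t
}

// void compare_bad(const Tensor& t) { t.to_cpu(); }  // error: call through const
```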
6 changes: 3 additions & 3 deletions tests/cpp/operator/test_cast_transpose.cu
@@ -55,21 +55,21 @@ void performTest(const size_t N, const size_t H) {

fillUniform(&input);
setRandomScale(&output);
+const float ref_scale = isFp8Type(otype) ? output.scale() : 1.0f;

nvte_quantize(input.data(), output.data(), 0);

float ref_amax;
compute_ref<InputType, OutputType>(input.rowwise_cpu_dptr<InputType>(), ref_output_c.get(),
-ref_output_t.get(), N, H, &ref_amax,
-output.scale());
+ref_output_t.get(), N, H, &ref_amax, ref_scale);

cudaDeviceSynchronize();
auto err = cudaGetLastError();
ASSERT_EQ(err, cudaSuccess) << cudaGetErrorString(err);
if (isFp8Type(otype)) {
auto [atol_amax, rtol_amax] = getTolerances(DType::kFloat32);
compareResults("amax", output.amax(), ref_amax, atol_amax, rtol_amax);
-float ref_scale_inv = 1.f / output.scale();
+float ref_scale_inv = 1.f / ref_scale;
compareResults("scale_inv", output.rowwise_scale_inv(), ref_scale_inv, atol_amax, rtol_amax);
}
auto [atol, rtol] = getTolerances(otype);
5 changes: 3 additions & 2 deletions tests/cpp/operator/test_cast_transpose_dbias.cu
@@ -73,14 +73,15 @@ void performTest(const size_t N, const size_t H) {

fillUniform(&input);
setRandomScale(&output);
+const float ref_scale = isFp8Type(otype) ? output.scale() : 1.0f;

std::unique_ptr<OType[]> ref_output_c = std::make_unique<OType[]>(N*H);
std::unique_ptr<OType[]> ref_output_t = std::make_unique<OType[]>(N*H);
std::unique_ptr<IType[]> ref_output_dbias = std::make_unique<IType[]>(H);

CType ref_amax;
compute_ref_cast_transpose_dbias(input.rowwise_cpu_dptr<IType>(),
-output.scale(),
+ref_scale,
ref_output_c.get(),
ref_output_t.get(),
&ref_amax,
@@ -111,7 +112,7 @@ void performTest(const size_t N, const size_t H) {
if (isFp8Type(otype)) {
auto [atol_amax, rtol_amax] = getTolerances(DType::kFloat32);
compareResults("amax", output.amax(), ref_amax, atol_amax, rtol_amax);
-float ref_scale_inv = 1.f / output.scale();
+float ref_scale_inv = 1.f / ref_scale;
compareResults("scale_inv", output.rowwise_scale_inv(), ref_scale_inv, atol_amax, rtol_amax);
}
auto [atol, rtol] = getTolerances(otype);
5 changes: 3 additions & 2 deletions tests/cpp/operator/test_cast_transpose_dbias_dgelu.cu
@@ -86,6 +86,7 @@ void performTest(const size_t N, const size_t H) {
fillUniform(&input);
fillUniform(&gelu_input);
setRandomScale(&output);
+const float ref_scale = isFp8Type(otype) ? output.scale() : 1.0f;

std::unique_ptr<OType[]> ref_output_c = std::make_unique<OType[]>(N*H);
std::unique_ptr<OType[]> ref_output_t = std::make_unique<OType[]>(N*H);
@@ -94,7 +95,7 @@ void performTest(const size_t N, const size_t H) {
CType ref_amax;
compute_ref_cast_transpose_dbias_dgelu(input.rowwise_cpu_dptr<IType>(),
gelu_input.rowwise_cpu_dptr<IType>(),
-output.scale(),
+ref_scale,
ref_output_c.get(),
ref_output_t.get(),
&ref_amax,
@@ -127,7 +128,7 @@ void performTest(const size_t N, const size_t H) {
if (isFp8Type(otype)) {
auto [atol_amax, rtol_amax] = getTolerances(DType::kFloat32);
compareResults("amax", output.amax(), ref_amax, atol_amax, rtol_amax);
-float ref_scale_inv = 1.f / output.scale();
+float ref_scale_inv = 1.f / ref_scale;
compareResults("scale_inv", output.rowwise_scale_inv(), ref_scale_inv, atol_amax, rtol_amax);
}

5 changes: 3 additions & 2 deletions tests/cpp/operator/test_cast_transpose_dgeglu.cu
@@ -81,6 +81,7 @@ void performTest(const size_t N, const size_t H) {
fillUniform(&grad);
fillUniform(&input);
setRandomScale(&output);
+const float ref_scale = isFp8Type(otype) ? output.scale() : 1.0f;

std::unique_ptr<OType[]> ref_output_c = std::make_unique<OType[]>(N * H * 2);
std::unique_ptr<OType[]> ref_output_t = std::make_unique<OType[]>(N * H * 2);
@@ -89,7 +90,7 @@ void performTest(const size_t N, const size_t H) {

CType ref_amax;
compute_ref_cast_transpose_dgated_gelu(grad.rowwise_cpu_dptr<IType>(), input.rowwise_cpu_dptr<IType>(),
-output.scale(), ref_output_c.get(), ref_output_t.get(),
+ref_scale, ref_output_c.get(), ref_output_t.get(),
&ref_amax, N, H);

cudaDeviceSynchronize();
@@ -99,7 +100,7 @@ void performTest(const size_t N, const size_t H) {
if (isFp8Type(otype)) {
auto [atol_amax, rtol_amax] = getTolerances(DType::kFloat32);
compareResults("amax", output.amax(), ref_amax, atol_amax, rtol_amax);
-float ref_scale_inv = 1.f / output.scale();
+float ref_scale_inv = 1.f / ref_scale;
compareResults("scale_inv", output.rowwise_scale_inv(), ref_scale_inv, atol_amax, rtol_amax);
}

2 changes: 1 addition & 1 deletion tests/cpp/operator/test_dequantize_nvfp4.cu
@@ -75,7 +75,7 @@ void compute_ref_dequantize_nvfp4(const uint8_t *packed_data,
}

template <typename OutputType>
-float compute_amax(const test::Tensor &t, size_t rows, size_t cols) {
+float compute_amax(test::Tensor &t, size_t rows, size_t cols) {
t.to_cpu();
const auto *data = t.rowwise_cpu_dptr<OutputType>();
float amax = 0.0f;
4 changes: 2 additions & 2 deletions tests/cpp/operator/test_multi_cast_transpose.cu
@@ -97,7 +97,7 @@ void performTest() {
std::copy(input.rowwise_cpu_dptr<InputType>(),
input.rowwise_cpu_dptr<InputType>() + height * width,
ref_input_list.back().begin());
-ref_scale_list[tensor_id] = output.scale();
+ref_scale_list[tensor_id] = isFp8Type(otype) ? output.scale() : 1.0f;
ref_height_list[tensor_id] = height;
ref_width_list[tensor_id] = width;
}
@@ -138,7 +138,7 @@ void performTest() {
atol_amax, rtol_amax);
compareResults("scale_inv",
output_list[tensor_id].rowwise_scale_inv(),
-1.f / output_list[tensor_id].scale(),
+1.f / ref_scale_list[tensor_id],
atol_amax, rtol_amax);
}
auto [atol, rtol] = getTolerances(otype);
2 changes: 1 addition & 1 deletion tests/cpp/operator/test_normalization.cu
@@ -208,7 +208,7 @@ void performTest(const size_t N, const size_t H, const bool zero_centered_gamma,
auto [atol_amax, rtol_amax] = getTolerances(DType::kFloat32);
if (isFp8Type(otype)) {
compareResults("amax", z.amax(), ref_amax, atol_amax, rtol_amax);
-float ref_scale_inv = 1.f / z.scale();
+float ref_scale_inv = 1.f / ref_scale;
compareResults("scale_inv", z.rowwise_scale_inv(), ref_scale_inv, atol_amax, rtol_amax);
}

3 changes: 2 additions & 1 deletion tests/cpp/operator/test_qdq.cu
@@ -65,12 +65,13 @@ void performTestQ(const size_t N) {

fillUniform(&input);
setRandomScale(&output);
+const float ref_scale = output.scale();

nvte_quantize(input.data(), output.data(), 0);

float ref_amax;
compute_ref_q<InputType, OutputType>(input.rowwise_cpu_dptr<InputType>(), ref_output.get(),
-N, &ref_amax, output.scale());
+N, &ref_amax, ref_scale);

cudaDeviceSynchronize();
auto err = cudaGetLastError();
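Here `ref_scale` is taken from `output.scale()` unconditionally, without the `isFp8Type` guard used elsewhere; presumably `performTestQ` is only instantiated with FP8 output types, so the guard would be redundant. The rename still keeps the reference call sites uniform with the rest of the test suite.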