From 45aa776d803244fefa0f7523f25fa716004bd393 Mon Sep 17 00:00:00 2001
From: Tim Moon <tmoon@nvidia.com>
Date: Thu, 30 Apr 2026 01:51:01 +0000
Subject: [PATCH 01/10] Fix bug in NVFP4 quantize test where we set scale
 instead of amax

Refactor test tensor wrapper by removing recipe-specific logic whenever possible.

Signed-off-by: Tim Moon <tmoon@nvidia.com>
---
 .../cpp/operator/test_cast_nvfp4_transpose.cu |   5 +-
 tests/cpp/operator/test_cast_transpose.cu     |   6 +-
 tests/cpp/test_common.cu                      | 307 ++++++++++--------
 tests/cpp/test_common.h                       |  83 ++---
 4 files changed, 216 insertions(+), 185 deletions(-)
diff --git a/tests/cpp/operator/test_cast_nvfp4_transpose.cu b/tests/cpp/operator/test_cast_nvfp4_transpose.cu
index d8d495d61f..08e43f3544 100644
--- a/tests/cpp/operator/test_cast_nvfp4_transpose.cu
+++ b/tests/cpp/operator/test_cast_nvfp4_transpose.cu
@@ -574,8 +574,7 @@ void performTest(float (*OP)(const float),
             amax = fmaxf(amax, static_cast<float>(input_dptr[idx]));
         }
     }
-    // Set 2nd stage NVFP4 scaling factor
-    output.set_scale(amax);
+    output.set_amax(amax);
 
     bool use_2d_quantization = false;
 
@@ -585,7 +584,7 @@ void performTest(float (*OP)(const float),
                            ref_output_t.get(),
                            ref_scales.get(),
                            ref_scales_t.get(),
-                           output.scale(),
+                           output.amax(),
                            rows,
                            cols,
                            scales_stride,
diff --git a/tests/cpp/operator/test_cast_transpose.cu b/tests/cpp/operator/test_cast_transpose.cu
index 44c78e4a09..ac05958b0b 100644
--- a/tests/cpp/operator/test_cast_transpose.cu
+++ b/tests/cpp/operator/test_cast_transpose.cu
@@ -55,13 +55,13 @@ void performTest(const size_t N, const size_t H) {
 
   fillUniform(&input);
   setRandomScale(&output);
+  const float scale = isFp8Type(otype) ? output.scale() : 1.0f;
 
   nvte_quantize(input.data(), output.data(), 0);
 
   float ref_amax;
   compute_ref<InputType, OutputType>(input.rowwise_cpu_dptr<InputType>(), ref_output_c.get(),
-                                     ref_output_t.get(), N, H, &ref_amax,
-                                     output.scale());
+                                     ref_output_t.get(), N, H, &ref_amax, scale);
 
   cudaDeviceSynchronize();
   auto err = cudaGetLastError();
@@ -69,7 +69,7 @@ void performTest(const size_t N, const size_t H) {
   if (isFp8Type(otype)) {
     auto [atol_amax, rtol_amax] = getTolerances(DType::kFloat32);
     compareResults("amax", output.amax(), ref_amax, atol_amax, rtol_amax);
-    float ref_scale_inv = 1.f / output.scale();
+    float ref_scale_inv = 1.f / scale;
     compareResults("scale_inv", output.rowwise_scale_inv(), ref_scale_inv, atol_amax, rtol_amax);
   }
   auto [atol, rtol] = getTolerances(otype);
diff --git a/tests/cpp/test_common.cu b/tests/cpp/test_common.cu
index 5196684118..04c1e8cf7e 100644
--- a/tests/cpp/test_common.cu
+++ b/tests/cpp/test_common.cu
@@ -8,13 +8,14 @@
 #include "test_common.h"
 
 #include <algorithm>
+#include <cassert>
+#include <cmath>
+#include <iostream>
 #include <memory>
 #include <numeric>
 #include <random>
-#include <iostream>
-#include <cassert>
-#include <cmath>
 #include <string>
+#include <unordered_set>
 
 #include <gtest/gtest.h>
 #include <omp.h>
@@ -391,11 +392,10 @@ Tensor::Tensor(const std::string& name,
       }
     } else {
       if (scaling_mode == NVTE_NVFP4_1D_SCALING) {
-        // Used for NVFP4 second stage scaling
-        cudaMalloc((void**)&scale, sizeof(float));  // NOLINT(*)
-        cudaMemset(scale, 0, sizeof(float));
-        scale_cpu_data_ = std::make_shared<float>(0);
-        tensor_.set_scale(scale, DType::kFloat32, std::vector<size_t>{1});
+        cudaMalloc((void**)&amax, sizeof(float));  // NOLINT(*)
+        cudaMemset(amax, 0, sizeof(float));
+        amax_cpu_data_ = std::make_shared<float>(0);
+        tensor_.set_amax(amax, DType::kFloat32, std::vector<size_t>{1});
       }
       auto [rowwise_scale_meta, colwise_scale_meta] = get_scales(flattened_shape, tensor_.scaling_mode());
       auto rowwise_scale_size = rowwise_scale_meta.bytes();
@@ -420,138 +420,180 @@ Tensor::Tensor(const std::string& name,
       }
     }
   }
+
+  // Sanity check that CPU and GPU have corresponding buffers
+  NVTE_CHECK((cpu_data_rowwise_ == nullptr) == (tensor_.dptr() == nullptr));
+  NVTE_CHECK((cpu_data_columnwise_ == nullptr) == (tensor_.columnwise_dptr() == nullptr));
+  NVTE_CHECK((rowwise_scale_inv_cpu_data_ == nullptr) == (tensor_.scale_inv() == nullptr));
+  NVTE_CHECK((columnwise_scale_inv_cpu_data_ == nullptr)
+             == (tensor_.get_columnwise_scale_inv().data_ptr == nullptr));
+  NVTE_CHECK((amax_cpu_data_ == nullptr) == (tensor_.amax() == nullptr));
+  NVTE_CHECK((scale_cpu_data_ == nullptr) == (tensor_.scale() == nullptr));
 }
 
-void Tensor::to_cpu() const {
-  const NVTEShape s = tensor_.shape();
-  const size_t size = bytes(s, tensor_.dtype());
-  if (rowwise_) {
-    cudaMemcpy(cpu_data_rowwise_.get(),
-               tensor_.get_rowwise_data().data_ptr,
-               size,
-               cudaMemcpyDeviceToHost);
-  }
-  if (columnwise_) {
-    const DType colwise_type = tensor_.dtype();
-
-    const size_t colwise_size = bytes(s, colwise_type);
-    cudaMemcpy(cpu_data_columnwise_.get(),
-                tensor_.get_columnwise_data().data_ptr,
-                colwise_size,
-                cudaMemcpyDeviceToHost);
-  }
-  if (isFp8Type(dtype()) || isFp4Type(dtype())) {
-    if ((tensor_.scaling_mode() == NVTE_DELAYED_TENSOR_SCALING)) {
-      if (tensor_.amax() != nullptr){
-        cudaMemcpy(amax_cpu_data_.get(),
-                  tensor_.amax(),
-                  sizeof(float),
-                  cudaMemcpyDeviceToHost);
-      }
-      cudaMemcpy(scale_cpu_data_.get(),
-                 tensor_.scale(),
-                 sizeof(float),
-                 cudaMemcpyDeviceToHost);
-    }
-    auto [rowwise_scale_meta, colwise_scale_meta] = get_scales(s, tensor_.scaling_mode());
-    if (rowwise_) {
-      auto scale_size = rowwise_scale_meta.bytes();
-      cudaMemcpy(rowwise_scale_inv_cpu_data_.get(),
-                 tensor_.get_rowwise_scale_inv().data_ptr,
-                 scale_size,
-                 cudaMemcpyDeviceToHost);
+Tensor::~Tensor() {
+  std::unordered_set<void *> freed_ptrs;
+  auto free_cuda_buffer = [&freed_ptrs] (void *ptr) -> void {
+    if (ptr != nullptr && freed_ptrs.count(ptr) > 0) {
+      cudaFree(ptr);
+      freed_ptrs.insert(ptr);
     }
-    if (columnwise_) {
-      auto scale_size = colwise_scale_meta.bytes();
-      cudaMemcpy(columnwise_scale_inv_cpu_data_.get(),
-                 tensor_.get_columnwise_scale_inv().data_ptr,
-                 scale_size,
-                 cudaMemcpyDeviceToHost);
+  };
+  free_cuda_buffer(tensor_.dptr());
+  free_cuda_buffer(tensor_.scale_inv());
+  free_cuda_buffer(tensor_.scale());
+  free_cuda_buffer(tensor_.amax());
+  free_cuda_buffer(tensor_.columnwise_dptr());
+  free_cuda_buffer(tensor_.get_columnwise_scale_inv().data_ptr);
+}
+
+void Tensor::to_cpu() const {
+  auto from_basic_tensor = [] (const NVTEBasicTensor src, void *dst) -> void {
+    if (dst != nullptr) {
+      const size_t copy_size = bytes(src.shape, static_cast<DType>(src.dtype));
+      cudaMemcpy(dst, src.data_ptr, copy_size, cudaMemcpyDeviceToHost);
     }
-  }
+  };
+  from_basic_tensor(tensor_.get_rowwise_data(), cpu_data_rowwise_.get());
+  from_basic_tensor(tensor_.get_columnwise_data(), cpu_data_columnwise_.get());
+  from_basic_tensor(tensor_.get_rowwise_scale_inv(), rowwise_scale_inv_cpu_data_.get());
+  from_basic_tensor(tensor_.get_columnwise_scale_inv(), columnwise_scale_inv_cpu_data_.get());
+  from_basic_tensor(tensor_.get_amax(), amax_cpu_data_.get());
+  from_basic_tensor(tensor_.get_scale(), scale_cpu_data_.get());
 }
 
 void Tensor::from_cpu() const {
-  const NVTEShape s = tensor_.shape();
-  const size_t size = bytes(s, tensor_.dtype());
-  if (rowwise_) {
-    cudaMemcpy(tensor_.get_rowwise_data().data_ptr, cpu_data_rowwise_.get(), size,
-               cudaMemcpyHostToDevice);
-  }
-  if (columnwise_) {
-    cudaMemcpy(tensor_.get_columnwise_data().data_ptr, cpu_data_columnwise_.get(), size,
-               cudaMemcpyHostToDevice);
-  }
-  if (isFp8Type(dtype()) || isFp4Type(dtype())) {
-    if ((tensor_.scaling_mode() == NVTE_DELAYED_TENSOR_SCALING)
-        || (tensor_.scaling_mode() == NVTE_NVFP4_1D_SCALING)) {
-      if (tensor_.amax() != nullptr){
-        cudaMemcpy(tensor_.amax(), amax_cpu_data_.get(), sizeof(float), cudaMemcpyHostToDevice);
-      }
-      cudaMemcpy(tensor_.scale(), scale_cpu_data_.get(), sizeof(float), cudaMemcpyHostToDevice);
-    }
-    auto [rowwise_scale_meta, colwise_scale_meta] = get_scales(s, tensor_.scaling_mode());
-    if (rowwise_) {
-      auto scale_size = rowwise_scale_meta.bytes();
-      cudaMemcpy(tensor_.get_rowwise_scale_inv().data_ptr,
-                 rowwise_scale_inv_cpu_data_.get(), scale_size,
-                 cudaMemcpyHostToDevice);
+  auto to_basic_tensor = [] (const void *src, NVTEBasicTensor dst) -> void {
+    if (src != nullptr) {
+      const size_t copy_size = bytes(dst.shape, static_cast<DType>(dst.dtype));
+      cudaMemcpy(dst.data_ptr, src, copy_size, cudaMemcpyHostToDevice);
     }
-    if (columnwise_) {
-      auto scale_size = colwise_scale_meta.bytes();
-      cudaMemcpy(tensor_.get_columnwise_scale_inv().data_ptr,
-                 columnwise_scale_inv_cpu_data_.get(), scale_size,
-                 cudaMemcpyHostToDevice);
-    }
-  }
+  };
+  to_basic_tensor(cpu_data_rowwise_.get(), tensor_.get_rowwise_data());
+  to_basic_tensor(cpu_data_columnwise_.get(), tensor_.get_columnwise_data());
+  to_basic_tensor(rowwise_scale_inv_cpu_data_.get(), tensor_.get_rowwise_scale_inv());
+  to_basic_tensor(columnwise_scale_inv_cpu_data_.get(), tensor_.get_columnwise_scale_inv());
+  to_basic_tensor(amax_cpu_data_.get(), tensor_.get_amax());
+  to_basic_tensor(scale_cpu_data_.get(), tensor_.get_scale());
+}
+
+void Tensor::set_amax(float amax) {
+  NVTE_CHECK(amax_cpu_data_);
+  NVTE_CHECK(tensor_.get_amax().dtype == kNVTEFloat32);
+  NVTE_CHECK(product(tensor_.get_amax().shape) == 1);
+  *amax_cpu_data_ = amax;
+  from_cpu();
 }
 
 void Tensor::set_scale(float scale) {
-  if (isFp8Type(dtype()) || isFp4Type(dtype())) {
-    NVTE_CHECK(scale_cpu_data_);
-    if (tensor_.scaling_mode() == NVTE_DELAYED_TENSOR_SCALING) {
-      *scale_cpu_data_ = scale;
-      from_cpu();
-    }
-  }
+  NVTE_CHECK(scale_cpu_data_);
+  NVTE_CHECK(tensor_.get_scale().dtype == kNVTEFloat32);
+  NVTE_CHECK(product(tensor_.get_scale().shape) == 1);
+  *scale_cpu_data_ = scale;
+  from_cpu();
 }
 
 void Tensor::set_scale_inv(float scale_inv) {
-  if (isFp8Type(dtype()) || isFp4Type(dtype())) {
-    if (rowwise_) {
-      NVTE_CHECK(rowwise_scale_inv_cpu_data_);
+  NVTE_CHECK(rowwise_scale_inv_cpu_data_);
+  NVTE_CHECK(columnwise_scale_inv_cpu_data_ == nullptr);  // TODO Not needed. Remove once all incorrect function calls have been removed
+  NVTE_CHECK(product(tensor_.get_rowwise_scale_inv().shape) == 1);
+  NVTE_CHECK(tensor_.get_rowwise_scale_inv().dtype == kNVTEFloat32);
+  *reinterpret_cast<float *>(rowwise_scale_inv_cpu_data_.get()) = scale_inv;
+  from_cpu();
+}
+
+void Tensor::fill_uniform_rowwise_scale_inv() {
+  if (rowwise_scale_inv_cpu_data_ == nullptr) {
+    return;
+  }
+
+  // Generate random scales on CPU
+  const auto numel = product(tensor_.get_rowwise_scale_inv().shape);
+  const auto dtype = tensor_.get_rowwise_scale_inv().dtype;
+  switch (dtype) {
+  case kNVTEFloat32:
+    {
+      auto *cpu_data = reinterpret_cast<uint8_t *>(scale_cpu_data_.get());
+      std::uniform_real_distribution<> dis(-2.0, 1.0);
+      for (size_t i = 0; i < numel; ++i) {
+        cpu_data[i] = dis(gen_);
+      }
     }
-    if (columnwise_) {
-      NVTE_CHECK(columnwise_scale_inv_cpu_data_);
+    break;
+  case kNVTEFloat8E4M3:
+  case kNVTEFloat8E8M0:
+  case kNVTEByte:
+    {
+      auto *cpu_data = reinterpret_cast<uint8_t *>(scale_cpu_data_.get());
+      std::uniform_int_distribution<uint8_t> dis(0, 127);
+      for (size_t i = 0; i < numel; ++i) {
+        cpu_data[i] = dis(gen_);
+      }
     }
+    break;
+  default:
+    NVTE_ERROR("Unsupported rowwise scale-inv dtype (",
+               static_cast<int>(dtype), ").");
+  }
 
-    auto [rowwise_scale_meta, colwise_scale_meta] = get_scales(tensor_.shape(), tensor_.scaling_mode());
-    if (rowwise_) {
-      auto num_scales = product(rowwise_scale_meta.shape);
-      if (num_scales == 1) {
-        rowwise_cpu_scale_inv_ptr<float>()[0] = scale_inv;
-      } else {
-        std::uniform_int_distribution<uint8_t> dis(0, 127);
-        auto *scale_inv_ptr = rowwise_cpu_scale_inv_ptr<uint8_t>();
-        for (size_t i = 0; i < num_scales; i++) {
-          scale_inv_ptr[i] = dis(gen_);
-        }
+  // Update GPU tensor
+  from_cpu();
+}
+
+void Tensor::fill_uniform_columnwise_scale_inv() {
+  if (columnwise_scale_inv_cpu_data_ == nullptr) {
+    return;
+  }
+
+  // Generate random scales on CPU
+  const auto numel = product(tensor_.get_columnwise_scale_inv().shape);
+  const auto dtype = tensor_.get_columnwise_scale_inv().dtype;
+  switch (dtype) {
+  case kNVTEFloat32:
+    {
+      auto *cpu_data = reinterpret_cast<float *>(scale_cpu_data_.get());
+      std::uniform_real_distribution<> dis(-2.0, 1.0);
+      for (size_t i = 0; i < numel; ++i) {
+        cpu_data[i] = dis(gen_);
       }
     }
-    if (columnwise_) {
-      auto num_scales = product(colwise_scale_meta.shape);
-      if (num_scales == 1) {
-        columnwise_cpu_scale_inv_ptr<float>()[0] = scale_inv;
-      } else {
-        std::uniform_int_distribution<uint8_t> dis(0, 127);
-        auto *scale_inv_ptr = columnwise_cpu_scale_inv_ptr<uint8_t>();
-        for (size_t i = 0; i < num_scales; i++) {
-          scale_inv_ptr[i] = dis(gen_);
-        }
+    break;
+  case kNVTEFloat8E4M3:
+  case kNVTEFloat8E8M0:
+  case kNVTEByte:
+    {
+      auto *cpu_data = reinterpret_cast<uint8_t *>(scale_cpu_data_.get());
+      std::uniform_int_distribution<uint8_t> dis(0, 127);
+      for (size_t i = 0; i < numel; ++i) {
+        cpu_data[i] = dis(gen_);
       }
     }
-    from_cpu();
+    break;
+  default:
+    NVTE_ERROR("Unsupported columnwise scale-inv dtype (",
+               static_cast<int>(dtype), ").");
   }
+
+  // Update GPU tensor
+  from_cpu();
+}
+
+void Tensor::fill_uniform_scale() {
+  if (scale_cpu_data_ == nullptr) {
+    return;
+  }
+
+  // Generate random scales on CPU
+  auto *cpu_data = reinterpret_cast<float *>(scale_cpu_data_.get());
+  const auto numel = product(tensor_.get_columnwise_scale_inv().shape);
+  NVTE_CHECK(tensor_.get_scale().dtype == kNVTEFloat32);
+  std::uniform_real_distribution<> dis(-2.0, 1.0);
+  for (size_t i = 0; i < numel; ++i) {
+    cpu_data[i] = dis(gen_);
+  }
+
+  // Update GPU tensor
+  from_cpu();
 }
 
 void Tensor::shareFP8Meta(const Tensor &other) {
@@ -912,6 +954,7 @@ void generate_data_uniformly(T* data, const size_t size, std::mt19937* gen) {
 }
 
 void fillUniform(Tensor *t) {
+  // Generate random row-wise data and column-wise data
   if (t->rowwise()) {
     const size_t size = product(t->rowwise_shape());
     TRANSFORMER_ENGINE_TYPE_SWITCH_ALL(t->dtype(), T,
@@ -929,8 +972,12 @@ void fillUniform(Tensor *t) {
       }
     );
   }
-  std::uniform_real_distribution<> dis(-2.0, 1.0);
-  t->set_scale_inv(dis(t->gen()));
+
+  // Generate random scales
+  t->fill_uniform_rowwise_scale_inv();
+  t->fill_uniform_columnwise_scale_inv();
+
+  // Update data on GPU
   t->from_cpu();
 }
 
@@ -966,7 +1013,16 @@ void fillCase_special(Tensor *t) {
       }
     });
   }
-  t->set_scale_inv(1.0);
+
+  // Try setting scale to 1, fallback to random scales
+  try {
+    t->set_scale_inv(1.0);
+  } catch (...) {
+    t->fill_uniform_rowwise_scale_inv();
+    t->fill_uniform_columnwise_scale_inv();
+  }
+
+  // Update GPU tensor data
   t->from_cpu();
 }
 
@@ -1000,15 +1056,12 @@ template void fillCase<fp4e2m1>(Tensor *t, const InputsFillCase fill_case);
 #endif
 
 void setRandomScale(Tensor *t) {
-  std::uniform_real_distribution<> dis(-2.0, 1.0);
-  const float scale = dis(t->gen());
-  t->set_scale(scale);
+  t->fill_uniform_scale();
 }
 
 void setRandomScaleInv(Tensor *t) {
-  std::uniform_real_distribution<> dis(-2.0, 1.0);
-  const float scale_inv = dis(t->gen());
-  t->set_scale_inv(scale_inv);
+  t->fill_uniform_rowwise_scale_inv();
+  t->fill_uniform_columnwise_scale_inv();
 }
 
 bool isFp8Type(DType type) {
diff --git a/tests/cpp/test_common.h b/tests/cpp/test_common.h
index b5a7f26d14..7696d9a36e 100644
--- a/tests/cpp/test_common.h
+++ b/tests/cpp/test_common.h
@@ -6,10 +6,11 @@
 
 #pragma once
 
-#include <memory>
-#include <vector>
 #include <array>
+#include <memory>
 #include <random>
+#include <vector>
+
 #include <cudaTypedefs.h>
 #define FP4_TYPE_SUPPORTED (CUDA_VERSION >= 12080)
 
@@ -27,6 +28,11 @@
 namespace test {
 using namespace transformer_engine;
 
+size_t typeToNumBits(DType type);
+size_t product(const NVTEShape &shape);
+size_t product(const std::vector<size_t> &shape);
+size_t bytes(const NVTEShape& shape, const DType type);
+
 template <size_t i>
 struct BytesToType {};
 
@@ -141,30 +147,7 @@ class Tensor {
   Tensor(Tensor &&other) = default;
   Tensor& operator=(Tensor &&other) = default;
 
-  ~Tensor() {
-    void *data_ptr = tensor_.dptr();
-    void *scale_inv = tensor_.scale_inv();
-    void *columnwise_data_ptr = tensor_.get_columnwise_data().data_ptr;
-    void *columnwise_scale_inv = tensor_.get_columnwise_scale_inv().data_ptr;
-    if (columnwise_data_ptr == data_ptr) {
-      columnwise_data_ptr = nullptr;
-    }
-    if (columnwise_scale_inv == scale_inv) {
-      columnwise_scale_inv = nullptr;
-    }
-    if (data_ptr != nullptr) {
-      cudaFree(data_ptr);
-    }
-    if (scale_inv != nullptr) {
-      cudaFree(scale_inv);
-    }
-    if (columnwise_data_ptr != nullptr) {
-      cudaFree(columnwise_data_ptr);
-    }
-    if (columnwise_scale_inv != nullptr) {
-      cudaFree(columnwise_scale_inv);
-    }
-  }
+  ~Tensor();
 
   NVTETensor data() const noexcept { return tensor_.data(); }
 
@@ -215,24 +198,19 @@ class Tensor {
   }
 
   float amax() const {
-    if(amax_cpu_data_) {
-      to_cpu();
-      return *amax_cpu_data_;
-    } else {
-      return 0;
-    }
+    NVTE_CHECK(amax_cpu_data_);
+    NVTE_CHECK(tensor_.get_amax().dtype == kNVTEFloat32);
+    NVTE_CHECK(product(tensor_.get_amax().shape) == 1);
+    to_cpu();
+    return *amax_cpu_data_;
   }
 
   float scale() const {
-    if(scale_cpu_data_) {
-      NVTE_CHECK((tensor_.scaling_mode() == NVTE_DELAYED_TENSOR_SCALING)
-                 || (tensor_.scaling_mode() == NVTE_NVFP4_1D_SCALING),
-                 "Invalid scaling_mode!");
-      to_cpu();
-      return *scale_cpu_data_;
-    } else {
-      return 1;
-    }
+    NVTE_CHECK(scale_cpu_data_);
+    NVTE_CHECK(tensor_.get_scale().dtype == kNVTEFloat32);
+    NVTE_CHECK(product(tensor_.get_scale().shape) == 1);
+    to_cpu();
+    return *scale_cpu_data_;
   }
 
   template <typename T>
@@ -266,12 +244,12 @@ class Tensor {
   }
 
   float rowwise_scale_inv(){
-    if(rowwise_scale_inv_cpu_data_) {
-      float scale_inv = rowwise_cpu_scale_inv_ptr<float>()[0];
-      return scale_inv;
-    } else {
-      return 1;
-    }
+    to_cpu();
+    NVTE_CHECK(rowwise_scale_inv_cpu_data_);
+    auto scale_inv_tensor = tensor_.get_rowwise_scale_inv();
+    NVTE_CHECK(product(scale_inv_tensor.shape) == 1);
+    NVTE_CHECK(scale_inv_tensor.dtype == kNVTEFloat32);
+    return *reinterpret_cast<const float *>(rowwise_scale_inv_cpu_data_.get());
   }
 
   bool rowwise() const {
@@ -292,8 +270,14 @@ class Tensor {
 
   void to_cpu() const;
   void from_cpu() const;
+
+  void set_amax(float amax);
   void set_scale(float scale);
   void set_scale_inv(float scale_inv);
+
+  void fill_uniform_rowwise_scale_inv();
+  void fill_uniform_columnwise_scale_inv();
+  void fill_uniform_scale();
   void shareFP8Meta(const Tensor &other);
 
   std::mt19937& gen() { return gen_; }
@@ -455,11 +439,6 @@ inline float dsilu(const float x)    { return x * dsigmoid(x) + sigmoid(x); }
 inline float srelu(const float x)    { return x > 0 ? x * x : 0; }
 inline float dsrelu(const float x)   { return fmaxf(0, 2 * x); }
 
-size_t typeToNumBits(DType type);
-size_t product(const NVTEShape &shape);
-size_t product(const std::vector<size_t> &shape);
-size_t bytes(const NVTEShape& shape, const DType type);
-
 size_t first_dimension(const std::vector<size_t> &shape);
 size_t last_dimension(const std::vector<size_t> &shape);
 

From 1acb3c3cb43c7cf72f120ef67a9bb7581525688c Mon Sep 17 00:00:00 2001
From: Tim Moon <tmoon@nvidia.com>
Date: Fri, 1 May 2026 02:53:23 +0000
Subject: [PATCH 02/10] Only get fp32 scale when tensor is expected to have
 fp32 scale

Signed-off-by: Tim Moon <tmoon@nvidia.com>
---
 tests/cpp/operator/test_act.cu                        | 10 ++++++----
 tests/cpp/operator/test_cast.cu                       |  5 +++--
 tests/cpp/operator/test_cast_current_scaling.cu       |  7 ++++---
 tests/cpp/operator/test_cast_dbias.cu                 |  5 +++--
 tests/cpp/operator/test_cast_dbias_dgelu.cu           |  5 +++--
 tests/cpp/operator/test_cast_gated_swiglu.cu          |  5 +++--
 tests/cpp/operator/test_cast_transpose.cu             |  6 +++---
 tests/cpp/operator/test_cast_transpose_dbias.cu       |  5 +++--
 tests/cpp/operator/test_cast_transpose_dbias_dgelu.cu |  5 +++--
 tests/cpp/operator/test_cast_transpose_dgeglu.cu      |  5 +++--
 tests/cpp/operator/test_multi_cast_transpose.cu       |  4 ++--
 tests/cpp/operator/test_normalization.cu              |  2 +-
 tests/cpp/operator/test_qdq.cu                        |  3 ++-
 tests/cpp/test_common.cu                              |  3 +++
 14 files changed, 42 insertions(+), 28 deletions(-)

diff --git a/tests/cpp/operator/test_act.cu b/tests/cpp/operator/test_act.cu
index ca5ccdc4ce..6edc6bd63b 100644
--- a/tests/cpp/operator/test_act.cu
+++ b/tests/cpp/operator/test_act.cu
@@ -124,6 +124,7 @@ void performTest(const size_t N, const size_t H) {
   fillUniform(&input);
   fillUniform(&ograd);
   setRandomScale(&output);
+  const float ref_scale = isFp8Type(otype) ? output.scale() : 1.0f;
 
   std::unique_ptr<OType[]> ref_output = std::make_unique<OType[]>(N*H);
   std::unique_ptr<IType[]> ref_igrad = std::make_unique<IType[]>(N*H);
@@ -132,7 +133,7 @@ void performTest(const size_t N, const size_t H) {
 
   float ref_amax;
   compute_ref_act_cast<ref_act>(input.rowwise_cpu_dptr<IType>(), ref_output.get(),
-                                output.scale(), &ref_amax, N, H);
+                                ref_scale, &ref_amax, N, H);
 
   cudaDeviceSynchronize();
   auto err = cudaGetLastError();
@@ -179,6 +180,7 @@ void performTestGLU(const size_t N, const size_t H) {
   fillUniform(&input);
   fillUniform(&ograd);
   setRandomScale(&output);
+  const float ref_scale = isFp8Type(otype) ? output.scale() : 1.0f;
 
   std::unique_ptr<OType[]> ref_output = std::make_unique<OType[]>(N * H);
   std::unique_ptr<IType[]> ref_igrad = std::make_unique<IType[]>(2 * N * H);
@@ -187,7 +189,7 @@ void performTestGLU(const size_t N, const size_t H) {
 
   float ref_amax;
   compute_ref_glu_act_cast<ref_act>(input.rowwise_cpu_dptr<IType>(), ref_output.get(),
-                                    output.scale(), &ref_amax, N, H);
+                                    ref_scale, &ref_amax, N, H);
 
   cudaDeviceSynchronize();
   auto err = cudaGetLastError();
@@ -197,8 +199,8 @@ void performTestGLU(const size_t N, const size_t H) {
     auto [atol, rtol] = getTolerances(DType::kFloat32);
     compareResults("amax", output.amax(), ref_amax, atol, rtol);
     if (output.scaling_mode() == NVTE_DELAYED_TENSOR_SCALING) {
-      const float ref_scale = 1.f / output.scale();
-      compareResults("scale_inv", *output.rowwise_cpu_scale_inv_ptr<float>(), ref_scale, atol, rtol);
+      const float ref_scale_inv = 1.f / ref_scale;
+      compareResults("scale_inv", *output.rowwise_cpu_scale_inv_ptr<float>(), ref_scale_inv, atol, rtol);
     }
   }
   auto [atol, rtol] = getTolerances(otype);
diff --git a/tests/cpp/operator/test_cast.cu b/tests/cpp/operator/test_cast.cu
index 35d9dd2efd..e8f48feef8 100644
--- a/tests/cpp/operator/test_cast.cu
+++ b/tests/cpp/operator/test_cast.cu
@@ -53,13 +53,14 @@ void performTest(const std::vector<size_t>& shape) {
 
   fillUniform(&input);
   setRandomScale(&output_c);
+  const float ref_scale = isFp8Type(otype) ? output_c.scale() : 1.0f;
 
   nvte_quantize(input.data(), output_c.data(), 0);
 
   float ref_amax;
 
   compute_ref<InputType, OutputType>(input.rowwise_cpu_dptr<InputType>(), ref_output_c.get(),
-                                     full_size, &ref_amax, output_c.scale());
+                                     full_size, &ref_amax, ref_scale);
 
   cudaDeviceSynchronize();
   auto err = cudaGetLastError();
@@ -67,7 +68,7 @@ void performTest(const std::vector<size_t>& shape) {
   if (isFp8Type(otype)) {
     auto [atol_amax, rtol_amax] = getTolerances(DType::kFloat32);
     compareResults("amax", output_c.amax(), ref_amax, atol_amax, rtol_amax);
-    float ref_scale_inv = 1.f / output_c.scale();
+    float ref_scale_inv = 1.f / ref_scale;
     compareResults("scale_inv", output_c.rowwise_scale_inv(), ref_scale_inv, atol_amax, rtol_amax);
   }
   auto [atol, rtol] = getTolerances(otype);
diff --git a/tests/cpp/operator/test_cast_current_scaling.cu b/tests/cpp/operator/test_cast_current_scaling.cu
index 4dd6cd2d58..7cca0d72e0 100644
--- a/tests/cpp/operator/test_cast_current_scaling.cu
+++ b/tests/cpp/operator/test_cast_current_scaling.cu
@@ -123,6 +123,7 @@ void performTest(const std::vector<size_t>& shape) {
     nvte_compute_amax(input.data(), output_c.data(), 0);
     QuantizationConfigWrapper config;
     nvte_compute_scale_from_amax(output_c.data(), config, 0);
+
     // avoid atomic amax update in cuda cast kernels because of current per-tensor scaling
     amax_to_check = output_c.amax();
     output_c.set_tensor_amax_nullptr();
@@ -130,7 +131,7 @@ void performTest(const std::vector<size_t>& shape) {
   nvte_quantize(input.data(), output_c.data(), 0);
 
   float ref_amax;
-  float ref_scale;
+  float ref_scale = 1.0;
   float ref_scale_inv;
   if (is_out_fp8){
     compute_amax_scale_ref<InputType, OutputType>(input.rowwise_cpu_dptr<InputType>(),
@@ -138,13 +139,13 @@ void performTest(const std::vector<size_t>& shape) {
   }
 
   compute_ref<InputType, OutputType>(input.rowwise_cpu_dptr<InputType>(), ref_output_c.get(),
-                                    full_size, nullptr, is_out_fp8 ? output_c.scale() : 1.0f );
+                                    full_size, nullptr, ref_scale);
 
   cudaDeviceSynchronize();
 
   auto err = cudaGetLastError();
   ASSERT_EQ(err, cudaSuccess) << cudaGetErrorString(err);
-  if (isFp8Type(otype)) {
+  if (is_out_fp8) {
     auto [atol_fp32, rtol_fp32] = getTolerances(DType::kFloat32);
     compareResults("amax", amax_to_check, ref_amax, 0.0f, rtol_fp32);
     compareResults("scale", output_c.scale(), ref_scale, 0.0f, rtol_fp32);
diff --git a/tests/cpp/operator/test_cast_dbias.cu b/tests/cpp/operator/test_cast_dbias.cu
index 18f07153c6..b7b5db48c3 100644
--- a/tests/cpp/operator/test_cast_dbias.cu
+++ b/tests/cpp/operator/test_cast_dbias.cu
@@ -74,13 +74,14 @@ void performTest(const std::vector<size_t>& shape) {
 
   fillUniform(&input);
   setRandomScale(&output_c);
+  const float ref_scale = isFp8Type(otype) ? output_c.scale() : 1.0f;
 
   std::unique_ptr<OType[]> ref_output_c = std::make_unique<OType[]>(N*H);
   std::unique_ptr<IType[]> ref_output_dbias = std::make_unique<IType[]>(H);
 
   CType ref_amax;
   compute_ref_cast_dbias(input.rowwise_cpu_dptr<IType>(),
-                         output_c.scale(),
+                         ref_scale,
                          ref_output_c.get(),
                          &ref_amax,
                          ref_output_dbias.get(),
@@ -109,7 +110,7 @@ void performTest(const std::vector<size_t>& shape) {
   if (isFp8Type(otype)) {
     auto [atol_amax, rtol_amax] = getTolerances(DType::kFloat32);
     compareResults("amax", output_c.amax(), ref_amax, atol_amax, rtol_amax);
-    float ref_scale_inv = 1.f / output_c.scale();
+    float ref_scale_inv = 1.f / ref_scale;
     compareResults("scale_inv", output_c.rowwise_scale_inv(), ref_scale_inv, atol_amax, rtol_amax);
   }
   auto [atol, rtol] = getTolerances(otype);
diff --git a/tests/cpp/operator/test_cast_dbias_dgelu.cu b/tests/cpp/operator/test_cast_dbias_dgelu.cu
index 8213e5665a..d8b8a20e6f 100644
--- a/tests/cpp/operator/test_cast_dbias_dgelu.cu
+++ b/tests/cpp/operator/test_cast_dbias_dgelu.cu
@@ -84,6 +84,7 @@ void performTest(const std::vector<size_t>& shape) {
   fillUniform(&input);
   fillUniform(&grad);
   setRandomScale(&output_c);
+  const float ref_scale = isFp8Type(otype) ? output_c.scale() : 1.0f;
 
   std::unique_ptr<OType[]> ref_output_c = std::make_unique<OType[]>(N*H);
   std::unique_ptr<IType[]> ref_output_dbias = std::make_unique<IType[]>(H);
@@ -91,7 +92,7 @@ void performTest(const std::vector<size_t>& shape) {
   CType ref_amax;
   compute_ref_cast_dbias_dgelu(input.rowwise_cpu_dptr<IType>(),
                                grad.rowwise_cpu_dptr<IType>(),
-                               output_c.scale(),
+                               ref_scale,
                                ref_output_c.get(),
                                &ref_amax,
                                ref_output_dbias.get(),
@@ -123,7 +124,7 @@ void performTest(const std::vector<size_t>& shape) {
   if (isFp8Type(otype)) {
     auto [atol_amax, rtol_amax] = getTolerances(DType::kFloat32);
     compareResults("amax", output_c.amax(), ref_amax, atol_amax, rtol_amax);
-    float ref_scale_inv = 1.f / output_c.scale();
+    float ref_scale_inv = 1.f / ref_scale;
     compareResults("scale_inv", output_c.rowwise_scale_inv(), ref_scale_inv, atol_amax, rtol_amax);
   }
 
diff --git a/tests/cpp/operator/test_cast_gated_swiglu.cu b/tests/cpp/operator/test_cast_gated_swiglu.cu
index 298b978f2a..5298cc7577 100644
--- a/tests/cpp/operator/test_cast_gated_swiglu.cu
+++ b/tests/cpp/operator/test_cast_gated_swiglu.cu
@@ -79,6 +79,7 @@ void performTest(const std::vector<size_t>& shape) {
   fillUniform(&grad);
   fillUniform(&input);
   setRandomScale(&output_c);
+  const float ref_scale = isFp8Type(otype) ? output_c.scale() : 1.0f;
 
   std::unique_ptr<OType[]> ref_output_c = std::make_unique<OType[]>(input_size);
 
@@ -91,7 +92,7 @@ void performTest(const std::vector<size_t>& shape) {
   float ref_amax;
   compute_ref_cast_dgated_swiglu(grad.rowwise_cpu_dptr<IType>(),
                                  input.rowwise_cpu_dptr<IType>(),
-                                 output_c.scale(),
+                                 ref_scale,
                                  ref_output_c.get(),
                                  &ref_amax,
                                  rows,
@@ -100,7 +101,7 @@ void performTest(const std::vector<size_t>& shape) {
   if (isFp8Type(otype)) {
     auto [atol_amax, rtol_amax] = getTolerances(DType::kFloat32);
     compareResults("amax", output_c.amax(), ref_amax, atol_amax, rtol_amax);
-    float ref_scale_inv = 1.f / output_c.scale();
+    float ref_scale_inv = 1.f / ref_scale;
     compareResults("scale_inv", output_c.rowwise_scale_inv(), ref_scale_inv, atol_amax, rtol_amax);
   }
 
diff --git a/tests/cpp/operator/test_cast_transpose.cu b/tests/cpp/operator/test_cast_transpose.cu
index ac05958b0b..9a5dc959da 100644
--- a/tests/cpp/operator/test_cast_transpose.cu
+++ b/tests/cpp/operator/test_cast_transpose.cu
@@ -55,13 +55,13 @@ void performTest(const size_t N, const size_t H) {
 
   fillUniform(&input);
   setRandomScale(&output);
-  const float scale = isFp8Type(otype) ? output.scale() : 1.0f;
+  const float ref_scale = isFp8Type(otype) ? output.scale() : 1.0f;
 
   nvte_quantize(input.data(), output.data(), 0);
 
   float ref_amax;
   compute_ref<InputType, OutputType>(input.rowwise_cpu_dptr<InputType>(), ref_output_c.get(),
-                                     ref_output_t.get(), N, H, &ref_amax, scale);
+                                     ref_output_t.get(), N, H, &ref_amax, ref_scale);
 
   cudaDeviceSynchronize();
   auto err = cudaGetLastError();
@@ -69,7 +69,7 @@ void performTest(const size_t N, const size_t H) {
   if (isFp8Type(otype)) {
     auto [atol_amax, rtol_amax] = getTolerances(DType::kFloat32);
     compareResults("amax", output.amax(), ref_amax, atol_amax, rtol_amax);
-    float ref_scale_inv = 1.f / scale;
+    float ref_scale_inv = 1.f / ref_scale;
     compareResults("scale_inv", output.rowwise_scale_inv(), ref_scale_inv, atol_amax, rtol_amax);
   }
   auto [atol, rtol] = getTolerances(otype);
diff --git a/tests/cpp/operator/test_cast_transpose_dbias.cu b/tests/cpp/operator/test_cast_transpose_dbias.cu
index 5b06b28327..f9303d34f5 100644
--- a/tests/cpp/operator/test_cast_transpose_dbias.cu
+++ b/tests/cpp/operator/test_cast_transpose_dbias.cu
@@ -73,6 +73,7 @@ void performTest(const size_t N, const size_t H) {
 
   fillUniform(&input);
   setRandomScale(&output);
+  const float ref_scale = isFp8Type(otype) ? output.scale() : 1.0f;
 
   std::unique_ptr<OType[]> ref_output_c = std::make_unique<OType[]>(N*H);
   std::unique_ptr<OType[]> ref_output_t = std::make_unique<OType[]>(N*H);
@@ -80,7 +81,7 @@ void performTest(const size_t N, const size_t H) {
 
   CType ref_amax;
   compute_ref_cast_transpose_dbias(input.rowwise_cpu_dptr<IType>(),
-                                   output.scale(),
+                                   ref_scale,
                                    ref_output_c.get(),
                                    ref_output_t.get(),
                                    &ref_amax,
@@ -111,7 +112,7 @@ void performTest(const size_t N, const size_t H) {
   if (isFp8Type(otype)) {
     auto [atol_amax, rtol_amax] = getTolerances(DType::kFloat32);
     compareResults("amax", output.amax(), ref_amax, atol_amax, rtol_amax);
-    float ref_scale_inv = 1.f / output.scale();
+    float ref_scale_inv = 1.f / ref_scale;
     compareResults("scale_inv", output.rowwise_scale_inv(), ref_scale_inv, atol_amax, rtol_amax);
   }
   auto [atol, rtol] = getTolerances(otype);
diff --git a/tests/cpp/operator/test_cast_transpose_dbias_dgelu.cu b/tests/cpp/operator/test_cast_transpose_dbias_dgelu.cu
index 9a4a2fa080..31eafff80f 100644
--- a/tests/cpp/operator/test_cast_transpose_dbias_dgelu.cu
+++ b/tests/cpp/operator/test_cast_transpose_dbias_dgelu.cu
@@ -86,6 +86,7 @@ void performTest(const size_t N, const size_t H) {
   fillUniform(&input);
   fillUniform(&gelu_input);
   setRandomScale(&output);
+  const float ref_scale = isFp8Type(otype) ? output.scale() : 1.0f;
 
   std::unique_ptr<OType[]> ref_output_c = std::make_unique<OType[]>(N*H);
   std::unique_ptr<OType[]> ref_output_t = std::make_unique<OType[]>(N*H);
@@ -94,7 +95,7 @@ void performTest(const size_t N, const size_t H) {
   CType ref_amax;
   compute_ref_cast_transpose_dbias_dgelu(input.rowwise_cpu_dptr<IType>(),
                                          gelu_input.rowwise_cpu_dptr<IType>(),
-                                         output.scale(),
+                                         ref_scale,
                                          ref_output_c.get(),
                                          ref_output_t.get(),
                                          &ref_amax,
@@ -127,7 +128,7 @@ void performTest(const size_t N, const size_t H) {
   if (isFp8Type(otype)) {
     auto [atol_amax, rtol_amax] = getTolerances(DType::kFloat32);
     compareResults("amax", output.amax(), ref_amax, atol_amax, rtol_amax);
-    float ref_scale_inv = 1.f / output.scale();
+    float ref_scale_inv = 1.f / ref_scale;
     compareResults("scale_inv", output.rowwise_scale_inv(), ref_scale_inv, atol_amax, rtol_amax);
   }
 
diff --git a/tests/cpp/operator/test_cast_transpose_dgeglu.cu b/tests/cpp/operator/test_cast_transpose_dgeglu.cu
index a87c0c5a42..15ecd3ab66 100644
--- a/tests/cpp/operator/test_cast_transpose_dgeglu.cu
+++ b/tests/cpp/operator/test_cast_transpose_dgeglu.cu
@@ -81,6 +81,7 @@ void performTest(const size_t N, const size_t H) {
   fillUniform(&grad);
   fillUniform(&input);
   setRandomScale(&output);
+  const float ref_scale = isFp8Type(otype) ? output.scale() : 1.0f;
 
   std::unique_ptr<OType[]> ref_output_c = std::make_unique<OType[]>(N * H * 2);
   std::unique_ptr<OType[]> ref_output_t = std::make_unique<OType[]>(N * H * 2);
@@ -89,7 +90,7 @@ void performTest(const size_t N, const size_t H) {
 
   CType ref_amax;
   compute_ref_cast_transpose_dgated_gelu(grad.rowwise_cpu_dptr<IType>(), input.rowwise_cpu_dptr<IType>(),
-                                         output.scale(), ref_output_c.get(), ref_output_t.get(),
+                                         ref_scale, ref_output_c.get(), ref_output_t.get(),
                                          &ref_amax, N, H);
 
   cudaDeviceSynchronize();
@@ -99,7 +100,7 @@ void performTest(const size_t N, const size_t H) {
   if (isFp8Type(otype)) {
     auto [atol_amax, rtol_amax] = getTolerances(DType::kFloat32);
     compareResults("amax", output.amax(), ref_amax, atol_amax, rtol_amax);
-    float ref_scale_inv = 1.f / output.scale();
+    float ref_scale_inv = 1.f / ref_scale;
     compareResults("scale_inv", output.rowwise_scale_inv(), ref_scale_inv, atol_amax, rtol_amax);
   }
 
diff --git a/tests/cpp/operator/test_multi_cast_transpose.cu b/tests/cpp/operator/test_multi_cast_transpose.cu
index 2bb35c4b89..0271c9dc6b 100644
--- a/tests/cpp/operator/test_multi_cast_transpose.cu
+++ b/tests/cpp/operator/test_multi_cast_transpose.cu
@@ -97,7 +97,7 @@ void performTest() {
     std::copy(input.rowwise_cpu_dptr<InputType>(),
               input.rowwise_cpu_dptr<InputType>() + height * width,
               ref_input_list.back().begin());
-    ref_scale_list[tensor_id] = output.scale();
+    ref_scale_list[tensor_id] = isFp8Type(otype) ? output.scale() : 1.0f;
     ref_height_list[tensor_id] = height;
     ref_width_list[tensor_id] = width;
   }
@@ -138,7 +138,7 @@ void performTest() {
                      atol_amax, rtol_amax);
       compareResults("scale_inv",
                      output_list[tensor_id].rowwise_scale_inv(),
-                     1.f / output_list[tensor_id].scale(),
+                     1.f / ref_scale_list[tensor_id],
                      atol_amax, rtol_amax);
     }
     auto [atol, rtol] = getTolerances(otype);
diff --git a/tests/cpp/operator/test_normalization.cu b/tests/cpp/operator/test_normalization.cu
index f737005e26..ea6692dba4 100644
--- a/tests/cpp/operator/test_normalization.cu
+++ b/tests/cpp/operator/test_normalization.cu
@@ -208,7 +208,7 @@ void performTest(const size_t N, const size_t H, const bool zero_centered_gamma,
   auto [atol_amax, rtol_amax] = getTolerances(DType::kFloat32);
   if (isFp8Type(otype)) {
     compareResults("amax", z.amax(), ref_amax, atol_amax, rtol_amax);
-    float ref_scale_inv = 1.f / z.scale();
+    float ref_scale_inv = 1.f / ref_scale;
     compareResults("scale_inv", z.rowwise_scale_inv(), ref_scale_inv, atol_amax, rtol_amax);
   }
 
diff --git a/tests/cpp/operator/test_qdq.cu b/tests/cpp/operator/test_qdq.cu
index 4e364fffa4..034280aa9a 100644
--- a/tests/cpp/operator/test_qdq.cu
+++ b/tests/cpp/operator/test_qdq.cu
@@ -65,12 +65,13 @@ void performTestQ(const size_t N) {
 
   fillUniform(&input);
   setRandomScale(&output);
+  const float ref_scale = output.scale();
 
   nvte_quantize(input.data(), output.data(), 0);
 
   float ref_amax;
   compute_ref_q<InputType, OutputType>(input.rowwise_cpu_dptr<InputType>(), ref_output.get(),
-                                       N, &ref_amax, output.scale());
+                                       N, &ref_amax, ref_scale);
 
   cudaDeviceSynchronize();
   auto err = cudaGetLastError();
diff --git a/tests/cpp/test_common.cu b/tests/cpp/test_common.cu
index 04c1e8cf7e..bd81af100b 100644
--- a/tests/cpp/test_common.cu
+++ b/tests/cpp/test_common.cu
@@ -1015,6 +1015,9 @@ void fillCase_special(Tensor *t) {
   }
 
   // Try setting scale to 1, fallback to random scales
+  // Note: This is a hack to match behavior of an earlier
+  // implementation. Consider filling block scales with constant
+  // value.
   try {
     t->set_scale_inv(1.0);
   } catch (...) {

From 97945bce86797764db37f52d1021993af2f97434 Mon Sep 17 00:00:00 2001
From: Tim Moon <tmoon@nvidia.com>
Date: Tue, 5 May 2026 01:48:09 +0000
Subject: [PATCH 03/10] Create dedicated class for managing GPU/CPU buffers

Signed-off-by: Tim Moon <tmoon@nvidia.com>
---
 tests/cpp/test_common.cu | 303 +++++++++++++++------------------------
 tests/cpp/test_common.h  | 145 ++++++++++++-------
 2 files changed, 208 insertions(+), 240 deletions(-)

diff --git a/tests/cpp/test_common.cu b/tests/cpp/test_common.cu
index bd81af100b..070ce924ad 100644
--- a/tests/cpp/test_common.cu
+++ b/tests/cpp/test_common.cu
@@ -277,6 +277,36 @@ std::pair<scale_inv_meta, scale_inv_meta> get_scales(const NVTEShape& shape,
   NVTE_ERROR("Invalid scaling mode!");
 }
 
+Tensor::Buffer::Buffer(size_t size, DType dtype)
+  : size_{size}, dtype_{dtype}, bytes_{size * typeToNumBits(dtype) / 8} {
+  if (bytes_ > 0) {
+    cpu_buffer_ = new unsigned char[bytes_];
+    std::memset(cpu_buffer_.get(), 0, bytes_);
+    void *gpu_buffer = nullptr;
+    NVTE_CHECK_CUDA(cudaMalloc(&gpu_buffer, bytes_));
+    gpu_buffer_ = gpu_buffer;
+    NVTE_CHECK_CUDA(cudaMemset(gpu_buffer_.get(), 0, bytes_));
+  }
+}
+
+void Tensor::Buffer::to_cpu() {
+  if (bytes_ > 0) {
+    NVTE_CHECK_CUDA(cudaMemcpy(cpu_buffer_.get(), gpu_buffer_.get(), bytes_, cudaMemcpyDeviceToHost));
+  }
+}
+
+void Tensor::Buffer::from_cpu() {
+  if (bytes_ > 0) {
+    NVTE_CHECK_CUDA(cudaMemcpy(gpu_buffer_.get(), cpu_buffer_.get(), bytes_, cudaMemcpyHostToDevice));
+  }
+}
+
+void Tensor::Buffer::GPUDeleter::operator() (void *ptr) {
+  if (ptr != nullptr) {
+    cudaFree(ptr);
+  }
+}
+
 Tensor::Tensor(const std::string& name,
                const NVTEShape &shape, const DType type,
                const bool rowwise, const bool columnwise,
@@ -304,31 +334,13 @@ Tensor::Tensor(const std::string& name,
     flattened_shape = convertShape(flattened_shape_vec);
   }
 
-  // Allocate and initialize data
-  void *dptr_rowwise = nullptr, *dptr_columnwise = nullptr;
-  const size_t total_size = bytes(shape, type);
-  if (total_size != 0) {
-    if (rowwise) {
-      cudaMalloc((void**)&dptr_rowwise, total_size);  // NOLINT(*)
-      cudaMemset(dptr_rowwise, 0, total_size);
-      cpu_data_rowwise_ = std::make_unique<unsigned char[]>(total_size);
-      std::fill_n(cpu_data_rowwise_.get(), total_size, 0);
-    }
-    if (columnwise) {
-      cudaMalloc((void**)&dptr_columnwise, total_size);  // NOLINT(*)
-      cudaMemset(dptr_columnwise, 0, total_size);
-      cpu_data_columnwise_ = std::make_unique<unsigned char[]>(total_size);
-      std::fill_n(cpu_data_columnwise_.get(), total_size, 0);
-    }
-  }
-
-  // Set tensor row-wise data
+  // Allocate row-wise data
   if (rowwise) {
-    const DType rowwise_type = (scaling_mode == NVTE_NVFP4_1D_SCALING) ? DType::kFloat4E2M1 : type;
-    tensor_.set_rowwise_data(dptr_rowwise, rowwise_type, shape);
+    data_rowwise_ = Tensor::Buffer(product(shape), type);
+    tensor_.set_rowwise_data(data_rowwise_.gpu_buffer(), type, shape);
   }
 
-  // Set tensor column-wise data
+  // Allocate column-wise data
   if (columnwise) {
     // Determine shape of column-wise data
     std::vector<size_t> columnwise_shape_vec;
@@ -359,162 +371,113 @@ Tensor::Tensor(const std::string& name,
     const auto columnwise_shape = nvte_make_shape(columnwise_shape_vec.data(),
                                                   columnwise_shape_vec.size());
 
-    // Set column-wise data buffer
-    const DType colwise_type = (scaling_mode == NVTE_NVFP4_1D_SCALING) ? DType::kFloat4E2M1 : type;
-    tensor_.set_columnwise_data(dptr_columnwise, colwise_type, columnwise_shape);
+    // Allocate buffer
+    data_columnwise_ = Tensor::Buffer(product(columnwise_shape), type);
+
+    // Configure TE tensor
+    tensor_.set_columnwise_data(data_columnwise_.gpu_buffer(), type, columnwise_shape);
   }
 
-  // Configure scales, amaxes, and other tensor buffers
-  float *amax = nullptr, *scale = nullptr;
-  float *rowwise_scale_inv = nullptr, *columnwise_scale_inv = nullptr;
-  if (isFp8Type(type) || isFp4Type(type)) {
-    if (scaling_mode == NVTE_DELAYED_TENSOR_SCALING) {
-      cudaMalloc((void**)&amax, sizeof(float));  // NOLINT(*)
-      cudaMemset(amax, 0, sizeof(float));
-      cudaMalloc((void**)&scale, sizeof(float));  // NOLINT(*)
-      cudaMemset(scale, 0, sizeof(float));
-      amax_cpu_data_ = std::make_shared<float>(0);
-      scale_cpu_data_ = std::make_shared<float>(0);
-      tensor_.set_amax(amax, DType::kFloat32, std::vector<size_t>{1});
-      tensor_.set_scale(scale, DType::kFloat32, std::vector<size_t>{1});
-      cudaMalloc((void**)&rowwise_scale_inv, sizeof(float));  // NOLINT(*)
+  // Allocate recipe-specific buffers
+  switch (scaling_mode) {
+  case NVTE_DELAYED_TENSOR_SCALING:
+    if (isFp8Type(type)) {
+      amax_ = Tensor::Buffer(1, DType::kFloat32);
+      scale_ = Tensor::Buffer(1, DType::kFloat32);
+      scale_inv_rowwise_ = Tensor::Buffer(1, DType::kFloat32);
+      tensor_.set_amax(amax_.gpu_buffer(), DType::kFloat32, std::vector<size_t>{1});
+      tensor_.set_scale(scale_.gpu_buffer(), DType::kFloat32, std::vector<size_t>{1});
       if (rowwise) {
-        tensor_.set_rowwise_scale_inv(rowwise_scale_inv, DType::kFloat32,
-                                      std::vector<size_t>{1});
-        rowwise_scale_inv_cpu_data_ = std::make_unique<unsigned char[]>(sizeof(float));
-        std::fill_n(rowwise_scale_inv_cpu_data_.get(), sizeof(float), 0);
+        tensor_.set_rowwise_scale_inv(scale_inv_rowwise_.gpu_buffer(), DType::kFloat32, std::vector<size_t>{1});
       }
       if (columnwise) {
-        tensor_.set_columnwise_scale_inv(rowwise_scale_inv, DType::kFloat32,
-                                          std::vector<size_t>{1});
-        columnwise_scale_inv_cpu_data_ = std::make_unique<unsigned char[]>(sizeof(float));
-        std::fill_n(columnwise_scale_inv_cpu_data_.get(), sizeof(float), 0);
-      }
-    } else {
-      if (scaling_mode == NVTE_NVFP4_1D_SCALING) {
-        cudaMalloc((void**)&amax, sizeof(float));  // NOLINT(*)
-        cudaMemset(amax, 0, sizeof(float));
-        amax_cpu_data_ = std::make_shared<float>(0);
-        tensor_.set_amax(amax, DType::kFloat32, std::vector<size_t>{1});
+        tensor_.set_columnwise_scale_inv(scale_inv_rowwise_.gpu_buffer(), DType::kFloat32, std::vector<size_t>{1});
       }
+    }
+    break;
+  case NVTE_MXFP8_1D_SCALING:
+  case NVTE_BLOCK_SCALING_1D:
+  case NVTE_BLOCK_SCALING_2D:
+  case NVTE_NVFP4_1D_SCALING::
+    {
+      // Block scaling factors
       auto [rowwise_scale_meta, colwise_scale_meta] = get_scales(flattened_shape, tensor_.scaling_mode());
-      auto rowwise_scale_size = rowwise_scale_meta.bytes();
-      auto columnwise_scale_size = colwise_scale_meta.bytes();
-      auto scale_shape = rowwise_scale_meta.shape;
-      auto columnwise_scale_shape = colwise_scale_meta.shape;
       if (rowwise) {
-        cudaMalloc((void **)&rowwise_scale_inv, rowwise_scale_size);  // NOLINT(*)
-        cudaMemset(rowwise_scale_inv, 0, rowwise_scale_size);
-        rowwise_scale_inv_cpu_data_ = std::make_unique<unsigned char[]>(rowwise_scale_size);
-        std::fill_n(rowwise_scale_inv_cpu_data_.get(), rowwise_scale_size, 0);
-        auto scale_dtype = rowwise_scale_meta.type;
-        tensor_.set_rowwise_scale_inv(rowwise_scale_inv, scale_dtype, scale_shape);
+        const auto scale_shape = rowwise_scale_meta.shape;
+        const auto scale_dtype = rowwise_scale_meta.dtype;
+        scale_inv_rowwise_ = Tensor::Buffer(product(scale_shape), scale_dtype);
+        tensor_.set_rowwise_scale_inv(scale_inv_rowwise_.gpu_buffer(), scale_dtype, scale_shape);
       }
       if (columnwise) {
-        cudaMalloc((void**)&columnwise_scale_inv, columnwise_scale_size);  // NOLINT(*)
-        cudaMemset(columnwise_scale_inv, 0, columnwise_scale_size);
-        columnwise_scale_inv_cpu_data_ = std::make_unique<unsigned char[]>(columnwise_scale_size);
-        std::fill_n(columnwise_scale_inv_cpu_data_.get(), columnwise_scale_size, 0);
-        auto scale_dtype = colwise_scale_meta.type;
-        tensor_.set_columnwise_scale_inv(columnwise_scale_inv, scale_dtype, columnwise_scale_shape);
+        const auto scale_shape = columnwise_scale_meta.shape;
+        const auto scale_dtype = columnwise_scale_meta.dtype;
+        scale_inv_columnwise_ = Tensor::Buffer(product(scale_shape), scale_dtype);
+        tensor_.set_columnwise_scale_inv(scale_inv_columnwise_.gpu_buffer(), scale_dtype, scale_shape);
       }
-    }
-  }
 
-  // Sanity check that CPU and GPU have corresponding buffers
-  NVTE_CHECK((cpu_data_rowwise_ == nullptr) == (tensor_.dptr() == nullptr));
-  NVTE_CHECK((cpu_data_columnwise_ == nullptr) == (tensor_.columnwise_dptr() == nullptr));
-  NVTE_CHECK((rowwise_scale_inv_cpu_data_ == nullptr) == (tensor_.scale_inv() == nullptr));
-  NVTE_CHECK((columnwise_scale_inv_cpu_data_ == nullptr)
-             == (tensor_.get_columnwise_scale_inv().data_ptr == nullptr));
-  NVTE_CHECK((amax_cpu_data_ == nullptr) == (tensor_.amax() == nullptr));
-  NVTE_CHECK((scale_cpu_data_ == nullptr) == (tensor_.scale() == nullptr));
-}
-
-Tensor::~Tensor() {
-  std::unordered_set<void *> freed_ptrs;
-  auto free_cuda_buffer = [&freed_ptrs] (void *ptr) -> void {
-    if (ptr != nullptr && freed_ptrs.count(ptr) > 0) {
-      cudaFree(ptr);
-      freed_ptrs.insert(ptr);
+      // NVFP4 uses amax for tensor scaling
+      if (scaling_mode == NVTE_NVFP4_1D_SCALING) {
+        amax_ = Tensor::Buffer(1, DType::kFloat32);
+      }
     }
-  };
-  free_cuda_buffer(tensor_.dptr());
-  free_cuda_buffer(tensor_.scale_inv());
-  free_cuda_buffer(tensor_.scale());
-  free_cuda_buffer(tensor_.amax());
-  free_cuda_buffer(tensor_.columnwise_dptr());
-  free_cuda_buffer(tensor_.get_columnwise_scale_inv().data_ptr);
+    break;
+  default:
+    NVTE_ERROR("Unsupported tensor format (", static_cast<int>(scaling_mode), ")");
+  }
 }
 
 void Tensor::to_cpu() const {
-  auto from_basic_tensor = [] (const NVTEBasicTensor src, void *dst) -> void {
-    if (dst != nullptr) {
-      const size_t copy_size = bytes(src.shape, static_cast<DType>(src.dtype));
-      cudaMemcpy(dst, src.data_ptr, copy_size, cudaMemcpyDeviceToHost);
-    }
-  };
-  from_basic_tensor(tensor_.get_rowwise_data(), cpu_data_rowwise_.get());
-  from_basic_tensor(tensor_.get_columnwise_data(), cpu_data_columnwise_.get());
-  from_basic_tensor(tensor_.get_rowwise_scale_inv(), rowwise_scale_inv_cpu_data_.get());
-  from_basic_tensor(tensor_.get_columnwise_scale_inv(), columnwise_scale_inv_cpu_data_.get());
-  from_basic_tensor(tensor_.get_amax(), amax_cpu_data_.get());
-  from_basic_tensor(tensor_.get_scale(), scale_cpu_data_.get());
+  data_rowwise_.to_cpu();
+  data_columnwise_.to_cpu();
+  scale_inv_rowwise_.to_cpu();
+  scale_inv_columnwise_.to_cpu();
+  amax_.to_cpu();
+  scale_.to_cpu();
 }
 
 void Tensor::from_cpu() const {
-  auto to_basic_tensor = [] (const void *src, NVTEBasicTensor dst) -> void {
-    if (src != nullptr) {
-      const size_t copy_size = bytes(dst.shape, static_cast<DType>(dst.dtype));
-      cudaMemcpy(dst.data_ptr, src, copy_size, cudaMemcpyHostToDevice);
-    }
-  };
-  to_basic_tensor(cpu_data_rowwise_.get(), tensor_.get_rowwise_data());
-  to_basic_tensor(cpu_data_columnwise_.get(), tensor_.get_columnwise_data());
-  to_basic_tensor(rowwise_scale_inv_cpu_data_.get(), tensor_.get_rowwise_scale_inv());
-  to_basic_tensor(columnwise_scale_inv_cpu_data_.get(), tensor_.get_columnwise_scale_inv());
-  to_basic_tensor(amax_cpu_data_.get(), tensor_.get_amax());
-  to_basic_tensor(scale_cpu_data_.get(), tensor_.get_scale());
+  data_rowwise_.from_cpu();
+  data_columnwise_.from_cpu();
+  scale_inv_rowwise_.from_cpu();
+  scale_inv_columnwise_.from_cpu();
+  amax_.from_cpu();
+  scale_.from_cpu();
 }
 
 void Tensor::set_amax(float amax) {
-  NVTE_CHECK(amax_cpu_data_);
-  NVTE_CHECK(tensor_.get_amax().dtype == kNVTEFloat32);
-  NVTE_CHECK(product(tensor_.get_amax().shape) == 1);
-  *amax_cpu_data_ = amax;
-  from_cpu();
+  NVTE_CHECK(amax_.size() == 1);
+  NVTE_CHECK(amax_.dtype() == kNVTEFloat32);
+  *amax_.cpu_data<float>() = amax;
+  amax_.from_gpu();
 }
 
 void Tensor::set_scale(float scale) {
-  NVTE_CHECK(scale_cpu_data_);
-  NVTE_CHECK(tensor_.get_scale().dtype == kNVTEFloat32);
-  NVTE_CHECK(product(tensor_.get_scale().shape) == 1);
-  *scale_cpu_data_ = scale;
-  from_cpu();
+  NVTE_CHECK(scale_.size() == 1);
+  NVTE_CHECK(scale_.dtype() == kNVTEFloat32);
+  *scale_.cpu_data<float>() = scale;
+  scale_.from_gpu();
 }
 
 void Tensor::set_scale_inv(float scale_inv) {
-  NVTE_CHECK(rowwise_scale_inv_cpu_data_);
-  NVTE_CHECK(columnwise_scale_inv_cpu_data_ == nullptr);  // TODO Not needed. Remove once all incorrect function calls have been removed
-  NVTE_CHECK(product(tensor_.get_rowwise_scale_inv().shape) == 1);
-  NVTE_CHECK(tensor_.get_rowwise_scale_inv().dtype == kNVTEFloat32);
-  *reinterpret_cast<float *>(rowwise_scale_inv_cpu_data_.get()) = scale_inv;
-  from_cpu();
+  NVTE_CHECK(scale_inv_rowwise_.size() == 1);
+  NVTE_CHECK(scale_inv_rowwise_.dtype() == kNVTEFloat32);
+  *scale_inv_rowwise_.cpu_data<float>() = scale_inv;
+  scale_inv_rowwise_.from_gpu();
 }
 
 void Tensor::fill_uniform_rowwise_scale_inv() {
-  if (rowwise_scale_inv_cpu_data_ == nullptr) {
+  if (scale_inv_rowwise_.size() == 0) {
     return;
   }
 
   // Generate random scales on CPU
-  const auto numel = product(tensor_.get_rowwise_scale_inv().shape);
-  const auto dtype = tensor_.get_rowwise_scale_inv().dtype;
+  const auto numel = scale_inv_rowwise_.size();
+  const auto dtype = scale_inv_rowwise_.dtype();
   switch (dtype) {
   case kNVTEFloat32:
     {
-      auto *cpu_data = reinterpret_cast<uint8_t *>(scale_cpu_data_.get());
-      std::uniform_real_distribution<> dis(-2.0, 1.0);
+      auto *cpu_data = scale_inv_rowwise_.cpu_data<float>();
+      std::uniform_real_distribution<float> dis(-2.0, 1.0);
       for (size_t i = 0; i < numel; ++i) {
         cpu_data[i] = dis(gen_);
       }
@@ -524,7 +487,7 @@ void Tensor::fill_uniform_rowwise_scale_inv() {
   case kNVTEFloat8E8M0:
   case kNVTEByte:
     {
-      auto *cpu_data = reinterpret_cast<uint8_t *>(scale_cpu_data_.get());
+      auto *cpu_data = reinterpret_cast<uint8_t *>(scale_inv_rowwise_.cpu_data());
       std::uniform_int_distribution<uint8_t> dis(0, 127);
       for (size_t i = 0; i < numel; ++i) {
         cpu_data[i] = dis(gen_);
@@ -537,22 +500,22 @@ void Tensor::fill_uniform_rowwise_scale_inv() {
   }
 
   // Update GPU tensor
-  from_cpu();
+  scale_inv_rowwise_.from_cpu();
 }
 
 void Tensor::fill_uniform_columnwise_scale_inv() {
-  if (columnwise_scale_inv_cpu_data_ == nullptr) {
+  if (scale_inv_columnwise_.size() == 0) {
     return;
   }
 
   // Generate random scales on CPU
-  const auto numel = product(tensor_.get_columnwise_scale_inv().shape);
-  const auto dtype = tensor_.get_columnwise_scale_inv().dtype;
+  const auto numel = scale_inv_columnwise_.size();
+  const auto dtype = scale_inv_columnwise_.dtype();
   switch (dtype) {
   case kNVTEFloat32:
     {
-      auto *cpu_data = reinterpret_cast<float *>(scale_cpu_data_.get());
-      std::uniform_real_distribution<> dis(-2.0, 1.0);
+      auto *cpu_data = scale_inv_columnwise_.cpu_data<float>();
+      std::uniform_real_distribution<float> dis(-2.0, 1.0);
       for (size_t i = 0; i < numel; ++i) {
         cpu_data[i] = dis(gen_);
       }
@@ -562,7 +525,7 @@ void Tensor::fill_uniform_columnwise_scale_inv() {
   case kNVTEFloat8E8M0:
   case kNVTEByte:
     {
-      auto *cpu_data = reinterpret_cast<uint8_t *>(scale_cpu_data_.get());
+      auto *cpu_data = reinterpret_cast<uint8_t *>(scale_inv_columnwise_.cpu_data());
       std::uniform_int_distribution<uint8_t> dis(0, 127);
       for (size_t i = 0; i < numel; ++i) {
         cpu_data[i] = dis(gen_);
@@ -575,55 +538,25 @@ void Tensor::fill_uniform_columnwise_scale_inv() {
   }
 
   // Update GPU tensor
-  from_cpu();
+  scale_inv_columnwise_.from_cpu();
 }
 
 void Tensor::fill_uniform_scale() {
-  if (scale_cpu_data_ == nullptr) {
+  if (scale_.size() == 0) {
     return;
   }
 
   // Generate random scales on CPU
-  auto *cpu_data = reinterpret_cast<float *>(scale_cpu_data_.get());
-  const auto numel = product(tensor_.get_columnwise_scale_inv().shape);
-  NVTE_CHECK(tensor_.get_scale().dtype == kNVTEFloat32);
-  std::uniform_real_distribution<> dis(-2.0, 1.0);
+  auto *cpu_data = scale_.cpu_data<float>();
+  const auto numel = scale_.size();
+  NVTE_CHECK(scale_.dtype() == kNVTEFloat32);
+  std::uniform_real_distribution<float> dis(-2.0, 1.0);
   for (size_t i = 0; i < numel; ++i) {
     cpu_data[i] = dis(gen_);
   }
 
   // Update GPU tensor
-  from_cpu();
-}
-
-void Tensor::shareFP8Meta(const Tensor &other) {
-  if ((isFp8Type(dtype()) && isFp8Type(other.dtype()))
-      || isFp4Type(dtype()) && isFp4Type(other.dtype())) {
-    auto new_tensor = TensorWrapper(other.tensor_.scaling_mode());
-    auto my_rowwise_data = tensor_.get_rowwise_data();
-    new_tensor.set_rowwise_data(my_rowwise_data.data_ptr, static_cast<DType>(my_rowwise_data.dtype),
-                                my_rowwise_data.shape);
-    auto my_columnwise_data = tensor_.get_columnwise_data();
-    new_tensor.set_columnwise_data(my_columnwise_data.data_ptr,
-                                   static_cast<DType>(my_columnwise_data.dtype),
-                                   my_columnwise_data.shape);
-    auto other_amax = other.tensor_.get_amax();
-    new_tensor.set_amax(other_amax.data_ptr, static_cast<DType>(other_amax.dtype),
-                        other_amax.shape);
-    auto other_scale = other.tensor_.get_scale();
-    new_tensor.set_scale(other_scale.data_ptr, static_cast<DType>(other_scale.dtype),
-                         other_scale.shape);
-    auto other_row_scale_inv = other.tensor_.get_rowwise_scale_inv();
-    new_tensor.set_rowwise_scale_inv(other_row_scale_inv.data_ptr,
-                                     static_cast<DType>(other_row_scale_inv.dtype),
-                                     other_row_scale_inv.shape);
-    auto other_col_scale_inv = other.tensor_.get_columnwise_scale_inv();
-    new_tensor.set_columnwise_scale_inv(other_col_scale_inv.data_ptr,
-                                        static_cast<DType>(other_col_scale_inv.dtype),
-                                        other_col_scale_inv.shape);
-    tensor_ = std::move(new_tensor);
-    to_cpu();
-  }
+  scale_.from_cpu();
 }
 
 using std::to_string;
diff --git a/tests/cpp/test_common.h b/tests/cpp/test_common.h
index 7696d9a36e..8137c184a4 100644
--- a/tests/cpp/test_common.h
+++ b/tests/cpp/test_common.h
@@ -139,7 +139,7 @@ class Tensor {
          const NVTEScalingMode &mode = NVTE_DELAYED_TENSOR_SCALING) :
     Tensor(name, nvte_make_shape(shape.data(), shape.size()), type, rowwise, columnwise, mode) {}
 
-  Tensor() {}
+  Tensor() = default;
 
   Tensor& operator=(const Tensor &other) = delete;
   Tensor(const Tensor &other) = delete;
@@ -147,7 +147,7 @@ class Tensor {
   Tensor(Tensor &&other) = default;
   Tensor& operator=(Tensor &&other) = default;
 
-  ~Tensor();
+  ~Tensor() = default;
 
   NVTETensor data() const noexcept { return tensor_.data(); }
 
@@ -185,71 +185,45 @@ class Tensor {
 
   template <typename T>
   T *rowwise_cpu_dptr() const {
-    NVTE_CHECK(TypeInfo<T>::dtype == tensor_.dtype(), "Invalid type!");
-    NVTE_CHECK(rowwise_, "Tensor does not have rowwise data!");
-    return reinterpret_cast<T *>(cpu_data_rowwise_.get());
+    return data_rowwise_.cpu_buffer<T>();
   }
 
   template <typename T>
   T *columnwise_cpu_dptr() const {
-    NVTE_CHECK(TypeInfo<T>::dtype == tensor_.dtype(), "Invalid type!");
-    NVTE_CHECK(columnwise_, "Tensor does not have columnwise data!");
-    return reinterpret_cast<T *>(cpu_data_columnwise_.get());
+    return data_columnwise_.cpu_buffer<T>();
   }
 
-  float amax() const {
-    NVTE_CHECK(amax_cpu_data_);
-    NVTE_CHECK(tensor_.get_amax().dtype == kNVTEFloat32);
-    NVTE_CHECK(product(tensor_.get_amax().shape) == 1);
-    to_cpu();
-    return *amax_cpu_data_;
+  float amax() {
+    NVTE_CHECK(amax_.size() == 1);
+    NVTE_CHECK(amax_.dtype() == kNVTEFloat32);
+    amax_.to_cpu();
+    return *amax_.cpu_data<float>();
   }
 
-  float scale() const {
-    NVTE_CHECK(scale_cpu_data_);
-    NVTE_CHECK(tensor_.get_scale().dtype == kNVTEFloat32);
-    NVTE_CHECK(product(tensor_.get_scale().shape) == 1);
-    to_cpu();
-    return *scale_cpu_data_;
+  float scale() {
+    NVTE_CHECK(scale_.size() == 1);
+    NVTE_CHECK(scale_.dtype() == kNVTEFloat32);
+    scale_.to_cpu();
+    return *scale_.cpu_data<float>();
   }
 
   template <typename T>
   T *rowwise_cpu_scale_inv_ptr(){
-    if (tensor_.scaling_mode() == NVTE_DELAYED_TENSOR_SCALING){
-      NVTE_CHECK(TypeInfo<T>::dtype == DType::kFloat32, "Invalid type!");
-    } else if (tensor_.scaling_mode() == NVTE_BLOCK_SCALING_1D || tensor_.scaling_mode() == NVTE_BLOCK_SCALING_2D) {
-      NVTE_CHECK(TypeInfo<T>::dtype == DType::kFloat32, "Invalid type!");
-    } else if (tensor_.scaling_mode() == NVTE_NVFP4_1D_SCALING) {
-      NVTE_CHECK(TypeInfo<T>::dtype == DType::kFloat8E4M3, "Invalid type!");
-    } else {
-      NVTE_CHECK(TypeInfo<T>::dtype == DType::kByte, "Invalid type!");
-    }
-    to_cpu();
-    return reinterpret_cast<T*>(rowwise_scale_inv_cpu_data_.get());
+    scale_inv_rowwise_.to_cpu();
+    return scale_inv_rowwise_.cpu_buffer<T>();
   }
 
   template <typename T>
   T *columnwise_cpu_scale_inv_ptr(){
-    if (tensor_.scaling_mode() == NVTE_DELAYED_TENSOR_SCALING){
-      NVTE_CHECK(TypeInfo<T>::dtype == DType::kFloat32, "Invalid type!");
-    } else if (tensor_.scaling_mode() == NVTE_BLOCK_SCALING_1D || tensor_.scaling_mode() == NVTE_BLOCK_SCALING_2D) {
-      NVTE_CHECK(TypeInfo<T>::dtype == DType::kFloat32, "Invalid type!");
-    } else if (tensor_.scaling_mode() == NVTE_NVFP4_1D_SCALING) {
-      NVTE_CHECK(TypeInfo<T>::dtype == DType::kFloat8E4M3, "Invalid type!");
-    } else {
-      NVTE_CHECK(TypeInfo<T>::dtype == DType::kByte, "Invalid type!");
-    }
-    to_cpu();
-    return reinterpret_cast<T*>(columnwise_scale_inv_cpu_data_.get());
+    scale_inv_columnwise_.to_cpu();
+    return scale_inv_columnwise_.cpu_buffer<T>();
   }
 
   float rowwise_scale_inv(){
-    to_cpu();
-    NVTE_CHECK(rowwise_scale_inv_cpu_data_);
-    auto scale_inv_tensor = tensor_.get_rowwise_scale_inv();
-    NVTE_CHECK(product(scale_inv_tensor.shape) == 1);
-    NVTE_CHECK(scale_inv_tensor.dtype == kNVTEFloat32);
-    return *reinterpret_cast<const float *>(rowwise_scale_inv_cpu_data_.get());
+    NVTE_CHECK(scale_inv_rowwise_.size() == 1);
+    NVTE_CHECK(scale_inv_rowwise_.dtype() == kNVTEFloat32);
+    scale_inv_rowwise_.to_cpu();
+    return *scale_inv_rowwise_.cpu_data<float>();
   }
 
   bool rowwise() const {
@@ -278,18 +252,79 @@ class Tensor {
   void fill_uniform_rowwise_scale_inv();
   void fill_uniform_columnwise_scale_inv();
   void fill_uniform_scale();
-  void shareFP8Meta(const Tensor &other);
 
   std::mt19937& gen() { return gen_; }
 
  private:
+
+  /* Manages matching GPU and CPU buffers. */
+  class Buffer {
+  public:
+
+    Buffer(size_t size = 0, DType dtype = DType::kByte);
+    ~Buffer() = default;
+    Buffer(const Buffer&) = delete;
+    Buffer& operator=(const Buffer&) = delete;
+    Buffer(Buffer&&) = default;
+    Buffer& operator=(Buffer&&) = default;
+
+    size_t size() const noexcept { return size_; }
+    DType dtype() const noexcept { return dtype_; }
+
+    // Void pointer accessors
+    void *cpu_buffer() { return cpu_buffer_.get(); }
+    const void *cpu_buffer() const { return cpu_buffer_.get(); }
+    void *gpu_buffer() { return gpu_buffer_.get(); }
+    const void *gpu_buffer() const { return gpu_buffer_.get(); }
+
+    // Templated pointer accessors
+    template <typename T>
+    T *cpu_buffer() {
+      NVTE_CHECK(TypeInfo<T>::dtype == dtype_, "Invalid type.");
+      return reinterpret_cast<T *>(cpu_buffer());;
+    }
+    template <typename T>
+    const T *cpu_buffer() const {
+      return const_cast<Buffer *>(this)->cpu_buffer<T>();
+    }
+    template <typename T>
+    T *gpu_buffer() {
+      NVTE_CHECK(TypeInfo<T>::dtype == dtype_, "Invalid type.");
+      return reinterpret_cast<T *>(gpu_buffer());;
+    }
+    template <typename T>
+    const T *gpu_buffer() const {
+      return const_cast<Buffer *>(this)->gpu_buffer<T>();
+    }
+
+    // Memory transfers between CPU and GPU
+    void to_cpu();
+    void from_cpu();
+
+  private:
+
+    struct GPUDeleter {
+      void operator()(void *ptr);
+    };
+
+    std::unique_ptr<unsigned char[]> cpu_buffer_;
+    std::unique_ptr<unsigned char[], GPUDeleter> gpu_buffer_;
+    size_t size_;
+    DType dtype_;
+    size_t bytes_;
+  };
+
+  // Transformer Engine tensor
   TensorWrapper tensor_;
-  std::unique_ptr<unsigned char[]> cpu_data_rowwise_;
-  std::unique_ptr<unsigned char[]> cpu_data_columnwise_;
-  std::shared_ptr<float> amax_cpu_data_;
-  std::shared_ptr<float> scale_cpu_data_;
-  std::unique_ptr<unsigned char[]> rowwise_scale_inv_cpu_data_;
-  std::unique_ptr<unsigned char[]> columnwise_scale_inv_cpu_data_;
+
+  // Data buffers
+  Buffer data_rowwise_;
+  Buffer data_columnwise_;
+  Buffer scale_inv_rowwise_;
+  Buffer scale_inv_columnwise_;
+  Buffer amax_;
+  Buffer scale_;
+
   bool rowwise_;
   bool columnwise_;
   std::string name_;

From ea63f052bf7659b2822d043e43710254a9c737be Mon Sep 17 00:00:00 2001
From: Tim Moon <tmoon@nvidia.com>
Date: Tue, 5 May 2026 04:37:08 +0000
Subject: [PATCH 04/10] Fix bugs in C++ test tensor infrastructure

- Fix syntax error in switch case (:: -> :)
- Fix double-underscore typo in variable name
- Fix wrong buffer passed to set_amax_columnwise
- Fix unique_ptr assignment from raw pointer (use reset())
- Remove dead duplicate NVTE_MXFP8_1D_SCALING branch in get_scales()
- Rename cpu_data -> cpu_buffer to match Buffer class API
- Remove const from Tensor::to_cpu/from_cpu and their callers,
  since both methods write to the CPU buffer

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
Signed-off-by: Tim Moon <tmoon@nvidia.com>
---
 .../cpp/operator/test_cast_nvfp4_transpose.cu |  2 +-
 tests/cpp/test_common.cu                      | 67 ++++++-------------
 tests/cpp/test_common.h                       | 20 +++---
 3 files changed, 31 insertions(+), 58 deletions(-)

diff --git a/tests/cpp/operator/test_cast_nvfp4_transpose.cu b/tests/cpp/operator/test_cast_nvfp4_transpose.cu
index 15d7c695c9..4ab06abcde 100644
--- a/tests/cpp/operator/test_cast_nvfp4_transpose.cu
+++ b/tests/cpp/operator/test_cast_nvfp4_transpose.cu
@@ -502,7 +502,7 @@ void print_detailed_tensor_comparison(const std::string& name,
     printf("==================================\n");
 }
 
-void compareResults_nvfp4(const Tensor &test,
+void compareResults_nvfp4(Tensor &test,
                           const void *ref, const void *ref_t, const int rows, const int cols,
                           double atol = 1e-5, double rtol = 1e-8, bool if_on_gpus = true, bool dump_data = false) {
     if (if_on_gpus) test.to_cpu();
diff --git a/tests/cpp/test_common.cu b/tests/cpp/test_common.cu
index 186f80d85e..4902989fbb 100644
--- a/tests/cpp/test_common.cu
+++ b/tests/cpp/test_common.cu
@@ -194,33 +194,6 @@ std::pair<scale_inv_meta, scale_inv_meta> get_scales(const NVTEShape& shape,
 
     return {ret_rowwise, ret_colwise};
   }
-  if (scaling_mode == NVTE_MXFP8_1D_SCALING) {
-    std::vector<size_t> shape_vec;
-    for (size_t i = 0; i < shape.ndim; ++i) {
-      shape_vec.push_back(shape.data[i]);
-    }
-    size_t first_dim = first_dimension(shape_vec);
-    size_t last_dim = last_dimension(shape_vec);
-
-    scale_inv_meta ret_rowwise, ret_colwise;
-
-    const size_t block_size_X_rowwise = 32;
-    size_t scale_dim_Y_rowwise = DIVUP_TO_MULTIPLE(first_dim, scale_tensor_alignment_Y_rowwise);
-    size_t scale_dim_X_rowwise = DIVUP_TO_MULTIPLE(DIVUP(last_dim, block_size_X_rowwise), scale_tensor_alignment_X_rowwise);
-    ret_rowwise.shape = {scale_dim_Y_rowwise, scale_dim_X_rowwise};
-
-    const size_t block_size_Y_colwise = 32;
-    size_t scale_dim_Y_colwise = DIVUP_TO_MULTIPLE(DIVUP(first_dim, block_size_Y_colwise), scale_tensor_alignment_Y_colwise);
-    size_t scale_dim_X_colwise = DIVUP_TO_MULTIPLE(last_dim, scale_tensor_alignment_X_colwise);
-    ret_colwise.shape = {scale_dim_Y_colwise, scale_dim_X_colwise};
-
-    ret_rowwise.type = DType::kFloat8E8M0;
-    ret_colwise.type = DType::kFloat8E8M0;
-    ret_rowwise.type_size_bits = typeToNumBits(DType::kFloat8E8M0);
-    ret_colwise.type_size_bits = typeToNumBits(DType::kFloat8E8M0);
-
-    return {ret_rowwise, ret_colwise};
-  }
   if (scaling_mode == NVTE_BLOCK_SCALING_2D) {
     std::vector<size_t> shape_vec;
     for (size_t i = 0; i < shape.ndim; ++i) {
@@ -280,11 +253,11 @@ std::pair<scale_inv_meta, scale_inv_meta> get_scales(const NVTEShape& shape,
 Tensor::Buffer::Buffer(size_t size, DType dtype)
   : size_{size}, dtype_{dtype}, bytes_{size * typeToNumBits(dtype) / 8} {
   if (bytes_ > 0) {
-    cpu_buffer_ = new unsigned char[bytes_];
+    cpu_buffer_.reset(new unsigned char[bytes_]);
     std::memset(cpu_buffer_.get(), 0, bytes_);
-    void *gpu_buffer = nullptr;
+    unsigned char *gpu_buffer = nullptr;
     NVTE_CHECK_CUDA(cudaMalloc(&gpu_buffer, bytes_));
-    gpu_buffer_ = gpu_buffer;
+    gpu_buffer_.reset(gpu_buffer);
     NVTE_CHECK_CUDA(cudaMemset(gpu_buffer_.get(), 0, bytes_));
   }
 }
@@ -398,7 +371,7 @@ Tensor::Tensor(const std::string& name,
   case NVTE_MXFP8_1D_SCALING:
   case NVTE_BLOCK_SCALING_1D:
   case NVTE_BLOCK_SCALING_2D:
-  case NVTE_NVFP4_1D_SCALING::
+  case NVTE_NVFP4_1D_SCALING:
     {
       // Block scaling factors
       auto [rowwise_scale_meta, colwise_scale_meta] = get_scales(flattened_shape, tensor_.scaling_mode());
@@ -418,9 +391,9 @@ Tensor::Tensor(const std::string& name,
       // NVFP4 uses amax for tensor scaling
       if (scaling_mode == NVTE_NVFP4_1D_SCALING) {
         amax_rowwise_ = Tensor::Buffer(1, DType::kFloat32);
-        amax_columnwise__ = Tensor::Buffer(1, DType::kFloat32);
+        amax_columnwise_ = Tensor::Buffer(1, DType::kFloat32);
         tensor_.set_amax(amax_rowwise_.gpu_buffer(), DType::kFloat32, std::vector<size_t>{1});
-        tensor_.set_amax_columnwise(amax_rowwise_.gpu_buffer(), DType::kFloat32, std::vector<size_t>{1});
+        tensor_.set_amax_columnwise(amax_columnwise_.gpu_buffer(), DType::kFloat32, std::vector<size_t>{1});
       }
     }
     break;
@@ -429,7 +402,7 @@ Tensor::Tensor(const std::string& name,
   }
 }
 
-void Tensor::to_cpu() const {
+void Tensor::to_cpu() {
   data_rowwise_.to_cpu();
   data_columnwise_.to_cpu();
   scale_inv_rowwise_.to_cpu();
@@ -439,7 +412,7 @@ void Tensor::to_cpu() const {
   scale_.to_cpu();
 }
 
-void Tensor::from_cpu() const {
+void Tensor::from_cpu() {
   data_rowwise_.from_cpu();
   data_columnwise_.from_cpu();
   scale_inv_rowwise_.from_cpu();
@@ -452,21 +425,21 @@ void Tensor::from_cpu() const {
 void Tensor::set_amax(float amax) {
   NVTE_CHECK(amax_rowwise_.size() == 1);
   NVTE_CHECK(amax_rowwise_.dtype() == kNVTEFloat32);
-  *amax_rowwise_.cpu_data<float>() = amax;
+  *amax_rowwise_.cpu_buffer<float>() = amax;
   amax_rowwise_.from_cpu();
 }
 
 void Tensor::set_scale(float scale) {
   NVTE_CHECK(scale_.size() == 1);
   NVTE_CHECK(scale_.dtype() == kNVTEFloat32);
-  *scale_.cpu_data<float>() = scale;
+  *scale_.cpu_buffer<float>() = scale;
   scale_.from_cpu();
 }
 
 void Tensor::set_scale_inv(float scale_inv) {
   NVTE_CHECK(scale_inv_rowwise_.size() == 1);
   NVTE_CHECK(scale_inv_rowwise_.dtype() == kNVTEFloat32);
-  *scale_inv_rowwise_.cpu_data<float>() = scale_inv;
+  *scale_inv_rowwise_.cpu_buffer<float>() = scale_inv;
   scale_inv_rowwise_.from_cpu();
 }
 
@@ -477,7 +450,7 @@ void Tensor::set_tensor_amax(float amax) {
 void Tensor::set_tensor_amax_columnwise(float amax) {
   NVTE_CHECK(amax_columnwise_.size() == 1);
   NVTE_CHECK(amax_columnwise_.dtype() == kNVTEFloat32);
-  *amax_columnwise_.cpu_data<float>() = amax;
+  *amax_columnwise_.cpu_buffer<float>() = amax;
   amax_columnwise_.from_cpu();
 }
 
@@ -492,7 +465,7 @@ void Tensor::fill_uniform_rowwise_scale_inv() {
   switch (dtype) {
   case kNVTEFloat32:
     {
-      auto *cpu_data = scale_inv_rowwise_.cpu_data<float>();
+      auto *cpu_data = scale_inv_rowwise_.cpu_buffer<float>();
       std::uniform_real_distribution<float> dis(-2.0, 1.0);
       for (size_t i = 0; i < numel; ++i) {
         cpu_data[i] = dis(gen_);
@@ -503,7 +476,7 @@ void Tensor::fill_uniform_rowwise_scale_inv() {
   case kNVTEFloat8E8M0:
   case kNVTEByte:
     {
-      auto *cpu_data = reinterpret_cast<uint8_t *>(scale_inv_rowwise_.cpu_data());
+      auto *cpu_data = reinterpret_cast<uint8_t *>(scale_inv_rowwise_.cpu_buffer());
       std::uniform_int_distribution<uint8_t> dis(0, 127);
       for (size_t i = 0; i < numel; ++i) {
         cpu_data[i] = dis(gen_);
@@ -530,7 +503,7 @@ void Tensor::fill_uniform_columnwise_scale_inv() {
   switch (dtype) {
   case kNVTEFloat32:
     {
-      auto *cpu_data = scale_inv_columnwise_.cpu_data<float>();
+      auto *cpu_data = scale_inv_columnwise_.cpu_buffer<float>();
       std::uniform_real_distribution<float> dis(-2.0, 1.0);
       for (size_t i = 0; i < numel; ++i) {
         cpu_data[i] = dis(gen_);
@@ -541,7 +514,7 @@ void Tensor::fill_uniform_columnwise_scale_inv() {
   case kNVTEFloat8E8M0:
   case kNVTEByte:
     {
-      auto *cpu_data = reinterpret_cast<uint8_t *>(scale_inv_columnwise_.cpu_data());
+      auto *cpu_data = reinterpret_cast<uint8_t *>(scale_inv_columnwise_.cpu_buffer());
       std::uniform_int_distribution<uint8_t> dis(0, 127);
       for (size_t i = 0; i < numel; ++i) {
         cpu_data[i] = dis(gen_);
@@ -563,7 +536,7 @@ void Tensor::fill_uniform_scale() {
   }
 
   // Generate random scales on CPU
-  auto *cpu_data = scale_.cpu_data<float>();
+  auto *cpu_data = scale_.cpu_buffer<float>();
   const auto numel = scale_.size();
   NVTE_CHECK(scale_.dtype() == kNVTEFloat32);
   std::uniform_real_distribution<float> dis(-2.0, 1.0);
@@ -600,7 +573,7 @@ std::vector<size_t> unravel(const size_t i, const NVTEShape &shape) {
   return ret;
 }
 
-void compareResults_sequential(const std::string &name, const Tensor &test,
+void compareResults_sequential(const std::string &name, Tensor &test,
                                const void *ref, const bool rowwise,
                                double atol, double rtol, bool if_on_gpus,
                                const size_t tolerable_mismatches_limit) {
@@ -690,7 +663,7 @@ static size_t getFirstMismatchIdx(const DType data_type, const T* test_data, con
   return first_mismatch_idx;
 }
 
-void compareResults_parallel(const std::string &name, const Tensor &test, const void *ref,
+void compareResults_parallel(const std::string &name, Tensor &test, const void *ref,
                              const bool rowwise, double atol, double rtol, bool if_on_gpus,
                              const size_t tolerable_mismatches_limit) {
   if (if_on_gpus) test.to_cpu();
@@ -717,7 +690,7 @@ void compareResults_parallel(const std::string &name, const Tensor &test, const
   );
 }
 
-void compareResults(const std::string &name, const Tensor &test, const void *ref,
+void compareResults(const std::string &name, Tensor &test, const void *ref,
                     const bool rowwise, double atol, double rtol, bool if_on_gpus,
                     const size_t tolerable_mismatches_limit) {
   constexpr bool sequential = false;
diff --git a/tests/cpp/test_common.h b/tests/cpp/test_common.h
index 4ea31b50b0..52138bd4c4 100644
--- a/tests/cpp/test_common.h
+++ b/tests/cpp/test_common.h
@@ -120,7 +120,7 @@ struct TypeInfo {
     }
 
     constexpr static DType dtype = getType<T>();
-    constexpr static size_t size = BitsNumber<T>::num_bits;;
+    constexpr static size_t size = BitsNumber<T>::num_bits;
 };
 
 class Tensor {
@@ -197,21 +197,21 @@ class Tensor {
     NVTE_CHECK(amax_rowwise_.size() == 1);
     NVTE_CHECK(amax_rowwise_.dtype() == kNVTEFloat32);
     amax_rowwise_.to_cpu();
-    return *amax_rowwise_.cpu_data<float>();
+    return *amax_rowwise_.cpu_buffer<float>();
   }
 
   float amax_columnwise() {
     NVTE_CHECK(amax_columnwise_.size() == 1);
     NVTE_CHECK(amax_columnwise_.dtype() == kNVTEFloat32);
     amax_columnwise_.to_cpu();
-    return *amax_columnwise_.cpu_data<float>();
+    return *amax_columnwise_.cpu_buffer<float>();
   }
 
   float scale() {
     NVTE_CHECK(scale_.size() == 1);
     NVTE_CHECK(scale_.dtype() == kNVTEFloat32);
     scale_.to_cpu();
-    return *scale_.cpu_data<float>();
+    return *scale_.cpu_buffer<float>();
   }
 
   template <typename T>
@@ -230,7 +230,7 @@ class Tensor {
     NVTE_CHECK(scale_inv_rowwise_.size() == 1);
     NVTE_CHECK(scale_inv_rowwise_.dtype() == kNVTEFloat32);
     scale_inv_rowwise_.to_cpu();
-    return *scale_inv_rowwise_.cpu_data<float>();
+    return *scale_inv_rowwise_.cpu_buffer<float>();
   }
 
   bool rowwise() const {
@@ -249,8 +249,8 @@ class Tensor {
     tensor_.set_with_gemm_swizzled_scales(with_gemm_swizzled_scales);
   }
 
-  void to_cpu() const;
-  void from_cpu() const;
+  void to_cpu();
+  void from_cpu();
 
   void set_amax(float amax);
   void set_scale(float scale);
@@ -290,7 +290,7 @@ class Tensor {
     template <typename T>
     T *cpu_buffer() {
       NVTE_CHECK(TypeInfo<T>::dtype == dtype_, "Invalid type.");
-      return reinterpret_cast<T *>(cpu_buffer());;
+      return reinterpret_cast<T *>(cpu_buffer());
     }
     template <typename T>
     const T *cpu_buffer() const {
@@ -299,7 +299,7 @@ class Tensor {
     template <typename T>
     T *gpu_buffer() {
       NVTE_CHECK(TypeInfo<T>::dtype == dtype_, "Invalid type.");
-      return reinterpret_cast<T *>(gpu_buffer());;
+      return reinterpret_cast<T *>(gpu_buffer());
     }
     template <typename T>
     const T *gpu_buffer() const {
@@ -489,7 +489,7 @@ size_t last_dimension(const std::vector<size_t> &shape);
 
 bool areShapesEqual(const NVTEShape &s1, const NVTEShape &s2);
 
-void compareResults(const std::string &name, const Tensor &test, const void *ref,
+void compareResults(const std::string &name, Tensor &test, const void *ref,
                     bool rowwise, double atol = 1e-5, double rtol = 1e-8, bool if_on_gpus = true,
                     const size_t tolerable_mismatches_limit = 0);
 void compareResults(const std::string &name, const float test, const float ref,

From 6d3e1db9eb4b954dd9684119f3141b180683ba53 Mon Sep 17 00:00:00 2001
From: Tim Moon <tmoon@nvidia.com>
Date: Tue, 5 May 2026 05:32:35 +0000
Subject: [PATCH 05/10] Debug compilation errors

Signed-off-by: Tim Moon <tmoon@nvidia.com>
---
 tests/cpp/operator/test_dequantize_nvfp4.cu |  2 +-
 tests/cpp/test_common.cu                    | 35 +++++++++++----------
 tests/cpp/test_common.h                     | 12 +++----
 3 files changed, 25 insertions(+), 24 deletions(-)

diff --git a/tests/cpp/operator/test_dequantize_nvfp4.cu b/tests/cpp/operator/test_dequantize_nvfp4.cu
index 96e85cb5ed..20efc943b6 100644
--- a/tests/cpp/operator/test_dequantize_nvfp4.cu
+++ b/tests/cpp/operator/test_dequantize_nvfp4.cu
@@ -75,7 +75,7 @@ void compute_ref_dequantize_nvfp4(const uint8_t *packed_data,
 }
 
 template <typename OutputType>
-float compute_amax(const test::Tensor &t, size_t rows, size_t cols) {
+float compute_amax(test::Tensor &t, size_t rows, size_t cols) {
     t.to_cpu();
     const auto *data = t.rowwise_cpu_dptr<OutputType>();
     float amax = 0.0f;
diff --git a/tests/cpp/test_common.cu b/tests/cpp/test_common.cu
index 4902989fbb..de3fc76554 100644
--- a/tests/cpp/test_common.cu
+++ b/tests/cpp/test_common.cu
@@ -10,6 +10,7 @@
 #include <algorithm>
 #include <cassert>
 #include <cmath>
+#include <cstring>
 #include <iostream>
 #include <memory>
 #include <numeric>
@@ -377,13 +378,13 @@ Tensor::Tensor(const std::string& name,
       auto [rowwise_scale_meta, colwise_scale_meta] = get_scales(flattened_shape, tensor_.scaling_mode());
       if (rowwise) {
         const auto scale_shape = rowwise_scale_meta.shape;
-        const auto scale_dtype = rowwise_scale_meta.dtype;
+        const auto scale_dtype = rowwise_scale_meta.type;
         scale_inv_rowwise_ = Tensor::Buffer(product(scale_shape), scale_dtype);
         tensor_.set_rowwise_scale_inv(scale_inv_rowwise_.gpu_buffer(), scale_dtype, scale_shape);
       }
       if (columnwise) {
-        const auto scale_shape = columnwise_scale_meta.shape;
-        const auto scale_dtype = columnwise_scale_meta.dtype;
+        const auto scale_shape = colwise_scale_meta.shape;
+        const auto scale_dtype = colwise_scale_meta.type;
         scale_inv_columnwise_ = Tensor::Buffer(product(scale_shape), scale_dtype);
         tensor_.set_columnwise_scale_inv(scale_inv_columnwise_.gpu_buffer(), scale_dtype, scale_shape);
       }
@@ -393,7 +394,7 @@ Tensor::Tensor(const std::string& name,
         amax_rowwise_ = Tensor::Buffer(1, DType::kFloat32);
         amax_columnwise_ = Tensor::Buffer(1, DType::kFloat32);
         tensor_.set_amax(amax_rowwise_.gpu_buffer(), DType::kFloat32, std::vector<size_t>{1});
-        tensor_.set_amax_columnwise(amax_columnwise_.gpu_buffer(), DType::kFloat32, std::vector<size_t>{1});
+        tensor_.set_columnwise_amax(amax_columnwise_.gpu_buffer(), DType::kFloat32, std::vector<size_t>{1});
       }
     }
     break;
@@ -424,21 +425,21 @@ void Tensor::from_cpu() {
 
 void Tensor::set_amax(float amax) {
   NVTE_CHECK(amax_rowwise_.size() == 1);
-  NVTE_CHECK(amax_rowwise_.dtype() == kNVTEFloat32);
+  NVTE_CHECK(amax_rowwise_.dtype() == DType::kFloat32);
   *amax_rowwise_.cpu_buffer<float>() = amax;
   amax_rowwise_.from_cpu();
 }
 
 void Tensor::set_scale(float scale) {
   NVTE_CHECK(scale_.size() == 1);
-  NVTE_CHECK(scale_.dtype() == kNVTEFloat32);
+  NVTE_CHECK(scale_.dtype() == DType::kFloat32);
   *scale_.cpu_buffer<float>() = scale;
   scale_.from_cpu();
 }
 
 void Tensor::set_scale_inv(float scale_inv) {
   NVTE_CHECK(scale_inv_rowwise_.size() == 1);
-  NVTE_CHECK(scale_inv_rowwise_.dtype() == kNVTEFloat32);
+  NVTE_CHECK(scale_inv_rowwise_.dtype() == DType::kFloat32);
   *scale_inv_rowwise_.cpu_buffer<float>() = scale_inv;
   scale_inv_rowwise_.from_cpu();
 }
@@ -449,7 +450,7 @@ void Tensor::set_tensor_amax(float amax) {
 
 void Tensor::set_tensor_amax_columnwise(float amax) {
   NVTE_CHECK(amax_columnwise_.size() == 1);
-  NVTE_CHECK(amax_columnwise_.dtype() == kNVTEFloat32);
+  NVTE_CHECK(amax_columnwise_.dtype() == DType::kFloat32);
   *amax_columnwise_.cpu_buffer<float>() = amax;
   amax_columnwise_.from_cpu();
 }
@@ -463,7 +464,7 @@ void Tensor::fill_uniform_rowwise_scale_inv() {
   const auto numel = scale_inv_rowwise_.size();
   const auto dtype = scale_inv_rowwise_.dtype();
   switch (dtype) {
-  case kNVTEFloat32:
+  case DType::kFloat32:
     {
       auto *cpu_data = scale_inv_rowwise_.cpu_buffer<float>();
       std::uniform_real_distribution<float> dis(-2.0, 1.0);
@@ -472,9 +473,9 @@ void Tensor::fill_uniform_rowwise_scale_inv() {
       }
     }
     break;
-  case kNVTEFloat8E4M3:
-  case kNVTEFloat8E8M0:
-  case kNVTEByte:
+  case DType::kFloat8E4M3:
+  case DType::kFloat8E8M0:
+  case DType::kByte:
     {
       auto *cpu_data = reinterpret_cast<uint8_t *>(scale_inv_rowwise_.cpu_buffer());
       std::uniform_int_distribution<uint8_t> dis(0, 127);
@@ -501,7 +502,7 @@ void Tensor::fill_uniform_columnwise_scale_inv() {
   const auto numel = scale_inv_columnwise_.size();
   const auto dtype = scale_inv_columnwise_.dtype();
   switch (dtype) {
-  case kNVTEFloat32:
+  case DType::kFloat32:
     {
       auto *cpu_data = scale_inv_columnwise_.cpu_buffer<float>();
       std::uniform_real_distribution<float> dis(-2.0, 1.0);
@@ -510,9 +511,9 @@ void Tensor::fill_uniform_columnwise_scale_inv() {
       }
     }
     break;
-  case kNVTEFloat8E4M3:
-  case kNVTEFloat8E8M0:
-  case kNVTEByte:
+  case DType::kFloat8E4M3:
+  case DType::kFloat8E8M0:
+  case DType::kByte:
     {
       auto *cpu_data = reinterpret_cast<uint8_t *>(scale_inv_columnwise_.cpu_buffer());
       std::uniform_int_distribution<uint8_t> dis(0, 127);
@@ -538,7 +539,7 @@ void Tensor::fill_uniform_scale() {
   // Generate random scales on CPU
   auto *cpu_data = scale_.cpu_buffer<float>();
   const auto numel = scale_.size();
-  NVTE_CHECK(scale_.dtype() == kNVTEFloat32);
+  NVTE_CHECK(scale_.dtype() == DType::kFloat32);
   std::uniform_real_distribution<float> dis(-2.0, 1.0);
   for (size_t i = 0; i < numel; ++i) {
     cpu_data[i] = dis(gen_);
diff --git a/tests/cpp/test_common.h b/tests/cpp/test_common.h
index 52138bd4c4..355865ef41 100644
--- a/tests/cpp/test_common.h
+++ b/tests/cpp/test_common.h
@@ -184,32 +184,32 @@ class Tensor {
   }
 
   template <typename T>
-  T *rowwise_cpu_dptr() const {
+  T *rowwise_cpu_dptr() {
     return data_rowwise_.cpu_buffer<T>();
   }
 
   template <typename T>
-  T *columnwise_cpu_dptr() const {
+  T *columnwise_cpu_dptr() {
     return data_columnwise_.cpu_buffer<T>();
   }
 
   float amax() {
     NVTE_CHECK(amax_rowwise_.size() == 1);
-    NVTE_CHECK(amax_rowwise_.dtype() == kNVTEFloat32);
+    NVTE_CHECK(amax_rowwise_.dtype() == DType::kFloat32);
     amax_rowwise_.to_cpu();
     return *amax_rowwise_.cpu_buffer<float>();
   }
 
   float amax_columnwise() {
     NVTE_CHECK(amax_columnwise_.size() == 1);
-    NVTE_CHECK(amax_columnwise_.dtype() == kNVTEFloat32);
+    NVTE_CHECK(amax_columnwise_.dtype() == DType::kFloat32);
     amax_columnwise_.to_cpu();
     return *amax_columnwise_.cpu_buffer<float>();
   }
 
   float scale() {
     NVTE_CHECK(scale_.size() == 1);
-    NVTE_CHECK(scale_.dtype() == kNVTEFloat32);
+    NVTE_CHECK(scale_.dtype() == DType::kFloat32);
     scale_.to_cpu();
     return *scale_.cpu_buffer<float>();
   }
@@ -228,7 +228,7 @@ class Tensor {
 
   float rowwise_scale_inv(){
     NVTE_CHECK(scale_inv_rowwise_.size() == 1);
-    NVTE_CHECK(scale_inv_rowwise_.dtype() == kNVTEFloat32);
+    NVTE_CHECK(scale_inv_rowwise_.dtype() == DType::kFloat32);
     scale_inv_rowwise_.to_cpu();
     return *scale_inv_rowwise_.cpu_buffer<float>();
   }

From e324ae271cbe7d3316c76b07cbd7a983dfbf295a Mon Sep 17 00:00:00 2001
From: Tim Moon <tmoon@nvidia.com>
Date: Tue, 5 May 2026 23:58:01 +0000
Subject: [PATCH 06/10] Remove type check when accessing raw pointers

CPU and GPU types are inconsistent, so the type checks cause too many problems.

Signed-off-by: Tim Moon <tmoon@nvidia.com>
---
 tests/cpp/test_common.h | 2 --
 1 file changed, 2 deletions(-)

diff --git a/tests/cpp/test_common.h b/tests/cpp/test_common.h
index 355865ef41..99db7f5874 100644
--- a/tests/cpp/test_common.h
+++ b/tests/cpp/test_common.h
@@ -289,7 +289,6 @@ class Tensor {
     // Templated pointer accessors
     template <typename T>
     T *cpu_buffer() {
-      NVTE_CHECK(TypeInfo<T>::dtype == dtype_, "Invalid type.");
       return reinterpret_cast<T *>(cpu_buffer());
     }
     template <typename T>
@@ -298,7 +297,6 @@ class Tensor {
     }
     template <typename T>
     T *gpu_buffer() {
-      NVTE_CHECK(TypeInfo<T>::dtype == dtype_, "Invalid type.");
       return reinterpret_cast<T *>(gpu_buffer());
     }
     template <typename T>

From 346e92a1f1b04f46cddb6fa6f353e07d22bc0684 Mon Sep 17 00:00:00 2001
From: Tim Moon <tmoon@nvidia.com>
Date: Wed, 6 May 2026 00:45:51 +0000
Subject: [PATCH 07/10] Debug distributed C++ tests

Also adopt review suggestions from @greptile-apps.

Signed-off-by: Tim Moon <tmoon@nvidia.com>
---
 tests/cpp/test_common.cu                | 15 ++++++++-------
 tests/cpp/test_common.h                 |  4 ++++
 tests/cpp_distributed/test_comm_gemm.cu | 12 ++++++++----
 3 files changed, 20 insertions(+), 11 deletions(-)

diff --git a/tests/cpp/test_common.cu b/tests/cpp/test_common.cu
index de3fc76554..d59f7e36ca 100644
--- a/tests/cpp/test_common.cu
+++ b/tests/cpp/test_common.cu
@@ -937,13 +937,14 @@ void fillCase_special(Tensor *t) {
     });
   }
 
-  // Try setting scale to 1, fallback to random scales
-  // Note: This is a hack to match behavior of an earlier
-  // implementation. Consider filling block scales with constant
-  // value.
-  try {
-    t->set_scale_inv(1.0);
-  } catch (...) {
+  // Fill scales
+  if (t->scaling_mode() == NVTE_DELAYED_TENSOR_SCALING) {
+    if (isFp8Type(t->dtype())) {
+      // FP8 tensor scale is set to 1
+      t->set_scale_inv(1.0);
+    }
+  } else {
+    // Block scales are filled randomly
     t->fill_uniform_rowwise_scale_inv();
     t->fill_uniform_columnwise_scale_inv();
   }
diff --git a/tests/cpp/test_common.h b/tests/cpp/test_common.h
index 99db7f5874..1b5ceed068 100644
--- a/tests/cpp/test_common.h
+++ b/tests/cpp/test_common.h
@@ -185,11 +185,15 @@ class Tensor {
 
   template <typename T>
   T *rowwise_cpu_dptr() {
+    NVTE_CHECK(TypeInfo<T>::dtype == data_rowwise_.dtype(), "Invalid type!");
+    NVTE_CHECK(rowwise_, "Tensor does not have columnwise data!");
     return data_rowwise_.cpu_buffer<T>();
   }
 
   template <typename T>
   T *columnwise_cpu_dptr() {
+    NVTE_CHECK(TypeInfo<T>::dtype == data_columnwise_.dtype(), "Invalid type!");
+    NVTE_CHECK(columnwise_, "Tensor does not have columnwise data!");
     return data_columnwise_.cpu_buffer<T>();
   }
 
diff --git a/tests/cpp_distributed/test_comm_gemm.cu b/tests/cpp_distributed/test_comm_gemm.cu
index cc0d760a39..45f6664567 100644
--- a/tests/cpp_distributed/test_comm_gemm.cu
+++ b/tests/cpp_distributed/test_comm_gemm.cu
@@ -107,8 +107,10 @@ std::vector<T> CopyMatrix(const std::vector<T>& data, size_t mstart, size_t nsta
 template <typename T>
 test::Tensor Make(size_t m, size_t n, float scale) {
   test::Tensor ret("", std::vector{n, m}, TypeInfo<T>::dtype);
-  ret.set_scale(scale);
-  ret.set_scale_inv(1.0 / scale);
+  if (test::isFp8Type(TypeInfo<T>::dtype)) {
+    ret.set_scale(scale);
+    ret.set_scale_inv(1.0 / scale);
+  }
   return ret;
 }
 
@@ -116,8 +118,10 @@ template <typename T>
 test::Tensor MakeFromData(const std::vector<T>& data, size_t mstart, size_t nstart, size_t msize,
                           size_t nsize, size_t ld, float scale) {
   test::Tensor ret("", std::vector{nsize, msize}, TypeInfo<T>::dtype);
-  ret.set_scale(scale);
-  ret.set_scale_inv(1.0 / scale);
+  if (test::isFp8Type(TypeInfo<T>::dtype)) {
+    ret.set_scale(scale);
+    ret.set_scale_inv(1.0 / scale);
+  }
   auto local = CopyMatrix(data, mstart, nstart, msize, nsize, ld);
   NVTE_CHECK_CUDA(cudaMemcpy(ret.rowwise_dptr(), local.data(), local.size() * sizeof local[0],
                              cudaMemcpyDefault));

From 181cfabf67f581b6451ea8fbada6eddbde5e393d Mon Sep 17 00:00:00 2001
From: Tim Moon <4406448+timmoon10@users.noreply.github.com>
Date: Tue, 5 May 2026 17:50:04 -0700
Subject: [PATCH 08/10] Remove unused header

Signed-off-by: Tim Moon <4406448+timmoon10@users.noreply.github.com>
---
 tests/cpp/test_common.cu | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/cpp/test_common.cu b/tests/cpp/test_common.cu
index d59f7e36ca..2becbc7302 100644
--- a/tests/cpp/test_common.cu
+++ b/tests/cpp/test_common.cu
@@ -16,7 +16,6 @@
 #include <numeric>
 #include <random>
 #include <string>
-#include <unordered_set>
 
 #include <gtest/gtest.h>
 #include <omp.h>

From b193fa7a8646cdb87b015c1bbdd75856641abb40 Mon Sep 17 00:00:00 2001
From: Tim Moon <4406448+timmoon10@users.noreply.github.com>
Date: Tue, 5 May 2026 17:58:23 -0700
Subject: [PATCH 09/10] Copy-paste error

Signed-off-by: Tim Moon <4406448+timmoon10@users.noreply.github.com>
---
 tests/cpp/test_common.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/cpp/test_common.h b/tests/cpp/test_common.h
index 1b5ceed068..860fc7d7eb 100644
--- a/tests/cpp/test_common.h
+++ b/tests/cpp/test_common.h
@@ -186,7 +186,7 @@ class Tensor {
   template <typename T>
   T *rowwise_cpu_dptr() {
     NVTE_CHECK(TypeInfo<T>::dtype == data_rowwise_.dtype(), "Invalid type!");
-    NVTE_CHECK(rowwise_, "Tensor does not have columnwise data!");
+    NVTE_CHECK(rowwise_, "Tensor does not have rowwise data!");
     return data_rowwise_.cpu_buffer<T>();
   }
 

From 67b730f34c44aeaf70130b73f41341e2e73583c3 Mon Sep 17 00:00:00 2001
From: Tim Moon <tmoon@nvidia.com>
Date: Thu, 7 May 2026 03:11:10 +0000
Subject: [PATCH 10/10] Use shared buffer for FP8 row-wise scale-inv and
 col-wise scale-inv

Signed-off-by: Tim Moon <tmoon@nvidia.com>
---
 tests/cpp/test_common.cu | 141 +++++++++++++++++++++------------------
 tests/cpp/test_common.h  |  71 +++++++++++---------
 2 files changed, 115 insertions(+), 97 deletions(-)

diff --git a/tests/cpp/test_common.cu b/tests/cpp/test_common.cu
index 2becbc7302..925fa8b25f 100644
--- a/tests/cpp/test_common.cu
+++ b/tests/cpp/test_common.cu
@@ -14,6 +14,7 @@
 #include <iostream>
 #include <memory>
 #include <numeric>
+#include <optional>
 #include <random>
 #include <string>
 
@@ -309,8 +310,8 @@ Tensor::Tensor(const std::string& name,
 
   // Allocate row-wise data
   if (rowwise) {
-    data_rowwise_ = Tensor::Buffer(product(shape), type);
-    tensor_.set_rowwise_data(data_rowwise_.gpu_buffer(), type, shape);
+    data_rowwise_.emplace(product(shape), type);
+    tensor_.set_rowwise_data(data_rowwise_->gpu_buffer(), type, shape);
   }
 
   // Allocate column-wise data
@@ -345,26 +346,30 @@ Tensor::Tensor(const std::string& name,
                                                   columnwise_shape_vec.size());
 
     // Allocate buffer
-    data_columnwise_ = Tensor::Buffer(product(columnwise_shape), type);
+    data_columnwise_.emplace(product(columnwise_shape), type);
 
     // Configure TE tensor
-    tensor_.set_columnwise_data(data_columnwise_.gpu_buffer(), type, columnwise_shape);
+    tensor_.set_columnwise_data(data_columnwise_->gpu_buffer(), type, columnwise_shape);
   }
 
   // Allocate recipe-specific buffers
   switch (scaling_mode) {
   case NVTE_DELAYED_TENSOR_SCALING:
     if (isFp8Type(type)) {
-      amax_rowwise_ = Tensor::Buffer(1, DType::kFloat32);
-      scale_ = Tensor::Buffer(1, DType::kFloat32);
-      scale_inv_rowwise_ = Tensor::Buffer(1, DType::kFloat32);
-      tensor_.set_amax(amax_rowwise_.gpu_buffer(), DType::kFloat32, std::vector<size_t>{1});
-      tensor_.set_scale(scale_.gpu_buffer(), DType::kFloat32, std::vector<size_t>{1});
+      amax_rowwise_.emplace(1, DType::kFloat32);
+      scale_.emplace(1, DType::kFloat32);
+      tensor_.set_amax(amax_rowwise_->gpu_buffer(), DType::kFloat32, std::vector<size_t>{1});
+      tensor_.set_scale(scale_->gpu_buffer(), DType::kFloat32, std::vector<size_t>{1});
+
+      // Use same buffer for row-wise and column-wise scale-inverse
+      auto scale_inv = std::make_shared<Buffer>(1, DType::kFloat32);
       if (rowwise) {
-        tensor_.set_rowwise_scale_inv(scale_inv_rowwise_.gpu_buffer(), DType::kFloat32, std::vector<size_t>{1});
+        scale_inv_rowwise_ = scale_inv;
+        tensor_.set_rowwise_scale_inv(scale_inv_rowwise_->gpu_buffer(), DType::kFloat32, std::vector<size_t>{1});
       }
       if (columnwise) {
-        tensor_.set_columnwise_scale_inv(scale_inv_rowwise_.gpu_buffer(), DType::kFloat32, std::vector<size_t>{1});
+        scale_inv_columnwise_ = scale_inv;
+        tensor_.set_columnwise_scale_inv(scale_inv_rowwise_->gpu_buffer(), DType::kFloat32, std::vector<size_t>{1});
       }
     }
     break;
@@ -378,22 +383,22 @@ Tensor::Tensor(const std::string& name,
       if (rowwise) {
         const auto scale_shape = rowwise_scale_meta.shape;
         const auto scale_dtype = rowwise_scale_meta.type;
-        scale_inv_rowwise_ = Tensor::Buffer(product(scale_shape), scale_dtype);
-        tensor_.set_rowwise_scale_inv(scale_inv_rowwise_.gpu_buffer(), scale_dtype, scale_shape);
+        scale_inv_rowwise_ = std::make_shared<Buffer>(product(scale_shape), scale_dtype);
+        tensor_.set_rowwise_scale_inv(scale_inv_rowwise_->gpu_buffer(), scale_dtype, scale_shape);
       }
       if (columnwise) {
         const auto scale_shape = colwise_scale_meta.shape;
         const auto scale_dtype = colwise_scale_meta.type;
-        scale_inv_columnwise_ = Tensor::Buffer(product(scale_shape), scale_dtype);
-        tensor_.set_columnwise_scale_inv(scale_inv_columnwise_.gpu_buffer(), scale_dtype, scale_shape);
+        scale_inv_columnwise_ = std::make_shared<Buffer>(product(scale_shape), scale_dtype);
+        tensor_.set_columnwise_scale_inv(scale_inv_columnwise_->gpu_buffer(), scale_dtype, scale_shape);
       }
 
       // NVFP4 uses amax for tensor scaling
       if (scaling_mode == NVTE_NVFP4_1D_SCALING) {
-        amax_rowwise_ = Tensor::Buffer(1, DType::kFloat32);
-        amax_columnwise_ = Tensor::Buffer(1, DType::kFloat32);
-        tensor_.set_amax(amax_rowwise_.gpu_buffer(), DType::kFloat32, std::vector<size_t>{1});
-        tensor_.set_columnwise_amax(amax_columnwise_.gpu_buffer(), DType::kFloat32, std::vector<size_t>{1});
+        amax_rowwise_.emplace(1, DType::kFloat32);
+        amax_columnwise_.emplace(1, DType::kFloat32);
+        tensor_.set_amax(amax_rowwise_->gpu_buffer(), DType::kFloat32, std::vector<size_t>{1});
+        tensor_.set_columnwise_amax(amax_columnwise_->gpu_buffer(), DType::kFloat32, std::vector<size_t>{1});
       }
     }
     break;
@@ -403,44 +408,47 @@ Tensor::Tensor(const std::string& name,
 }
 
 void Tensor::to_cpu() {
-  data_rowwise_.to_cpu();
-  data_columnwise_.to_cpu();
-  scale_inv_rowwise_.to_cpu();
-  scale_inv_columnwise_.to_cpu();
-  amax_rowwise_.to_cpu();
-  amax_columnwise_.to_cpu();
-  scale_.to_cpu();
+  if (data_rowwise_) { data_rowwise_->to_cpu(); }
+  if (data_columnwise_) { data_columnwise_->to_cpu(); }
+  if (scale_inv_rowwise_) { scale_inv_rowwise_->to_cpu(); }
+  if (scale_inv_columnwise_) { scale_inv_columnwise_->to_cpu(); }
+  if (amax_rowwise_) { amax_rowwise_->to_cpu(); }
+  if (amax_columnwise_) { amax_columnwise_->to_cpu(); }
+  if (scale_) { scale_->to_cpu(); }
 }
 
 void Tensor::from_cpu() {
-  data_rowwise_.from_cpu();
-  data_columnwise_.from_cpu();
-  scale_inv_rowwise_.from_cpu();
-  scale_inv_columnwise_.from_cpu();
-  amax_rowwise_.from_cpu();
-  amax_columnwise_.from_cpu();
-  scale_.from_cpu();
+  if (data_rowwise_) { data_rowwise_->from_cpu(); }
+  if (data_columnwise_) { data_columnwise_->from_cpu(); }
+  if (scale_inv_rowwise_) { scale_inv_rowwise_->from_cpu(); }
+  if (scale_inv_columnwise_) { scale_inv_columnwise_->from_cpu(); }
+  if (amax_rowwise_) { amax_rowwise_->from_cpu(); }
+  if (amax_columnwise_) { amax_columnwise_->from_cpu(); }
+  if (scale_) { scale_->from_cpu(); }
 }
 
 void Tensor::set_amax(float amax) {
-  NVTE_CHECK(amax_rowwise_.size() == 1);
-  NVTE_CHECK(amax_rowwise_.dtype() == DType::kFloat32);
-  *amax_rowwise_.cpu_buffer<float>() = amax;
-  amax_rowwise_.from_cpu();
+  NVTE_CHECK(amax_rowwise_);
+  NVTE_CHECK(amax_rowwise_->size() == 1);
+  NVTE_CHECK(amax_rowwise_->dtype() == DType::kFloat32);
+  *amax_rowwise_->cpu_buffer<float>() = amax;
+  amax_rowwise_->from_cpu();
 }
 
 void Tensor::set_scale(float scale) {
-  NVTE_CHECK(scale_.size() == 1);
-  NVTE_CHECK(scale_.dtype() == DType::kFloat32);
-  *scale_.cpu_buffer<float>() = scale;
-  scale_.from_cpu();
+  NVTE_CHECK(scale_);
+  NVTE_CHECK(scale_->size() == 1);
+  NVTE_CHECK(scale_->dtype() == DType::kFloat32);
+  *scale_->cpu_buffer<float>() = scale;
+  scale_->from_cpu();
 }
 
 void Tensor::set_scale_inv(float scale_inv) {
-  NVTE_CHECK(scale_inv_rowwise_.size() == 1);
-  NVTE_CHECK(scale_inv_rowwise_.dtype() == DType::kFloat32);
-  *scale_inv_rowwise_.cpu_buffer<float>() = scale_inv;
-  scale_inv_rowwise_.from_cpu();
+  NVTE_CHECK(scale_inv_rowwise_);
+  NVTE_CHECK(scale_inv_rowwise_->size() == 1);
+  NVTE_CHECK(scale_inv_rowwise_->dtype() == DType::kFloat32);
+  *scale_inv_rowwise_->cpu_buffer<float>() = scale_inv;
+  scale_inv_rowwise_->from_cpu();
 }
 
 void Tensor::set_tensor_amax(float amax) {
@@ -448,24 +456,25 @@ void Tensor::set_tensor_amax(float amax) {
 }
 
 void Tensor::set_tensor_amax_columnwise(float amax) {
-  NVTE_CHECK(amax_columnwise_.size() == 1);
-  NVTE_CHECK(amax_columnwise_.dtype() == DType::kFloat32);
-  *amax_columnwise_.cpu_buffer<float>() = amax;
-  amax_columnwise_.from_cpu();
+  NVTE_CHECK(amax_columnwise_);
+  NVTE_CHECK(amax_columnwise_->size() == 1);
+  NVTE_CHECK(amax_columnwise_->dtype() == DType::kFloat32);
+  *amax_columnwise_->cpu_buffer<float>() = amax;
+  amax_columnwise_->from_cpu();
 }
 
 void Tensor::fill_uniform_rowwise_scale_inv() {
-  if (scale_inv_rowwise_.size() == 0) {
+  if (!scale_inv_rowwise_ || scale_inv_rowwise_->size() == 0) {
     return;
   }
 
   // Generate random scales on CPU
-  const auto numel = scale_inv_rowwise_.size();
-  const auto dtype = scale_inv_rowwise_.dtype();
+  const auto numel = scale_inv_rowwise_->size();
+  const auto dtype = scale_inv_rowwise_->dtype();
   switch (dtype) {
   case DType::kFloat32:
     {
-      auto *cpu_data = scale_inv_rowwise_.cpu_buffer<float>();
+      auto *cpu_data = scale_inv_rowwise_->cpu_buffer<float>();
       std::uniform_real_distribution<float> dis(-2.0, 1.0);
       for (size_t i = 0; i < numel; ++i) {
         cpu_data[i] = dis(gen_);
@@ -476,7 +485,7 @@ void Tensor::fill_uniform_rowwise_scale_inv() {
   case DType::kFloat8E8M0:
   case DType::kByte:
     {
-      auto *cpu_data = reinterpret_cast<uint8_t *>(scale_inv_rowwise_.cpu_buffer());
+      auto *cpu_data = reinterpret_cast<uint8_t *>(scale_inv_rowwise_->cpu_buffer());
       std::uniform_int_distribution<uint8_t> dis(0, 127);
       for (size_t i = 0; i < numel; ++i) {
         cpu_data[i] = dis(gen_);
@@ -489,21 +498,21 @@ void Tensor::fill_uniform_rowwise_scale_inv() {
   }
 
   // Update GPU tensor
-  scale_inv_rowwise_.from_cpu();
+  scale_inv_rowwise_->from_cpu();
 }
 
 void Tensor::fill_uniform_columnwise_scale_inv() {
-  if (scale_inv_columnwise_.size() == 0) {
+  if (!scale_inv_columnwise_ || scale_inv_columnwise_->size() == 0) {
     return;
   }
 
   // Generate random scales on CPU
-  const auto numel = scale_inv_columnwise_.size();
-  const auto dtype = scale_inv_columnwise_.dtype();
+  const auto numel = scale_inv_columnwise_->size();
+  const auto dtype = scale_inv_columnwise_->dtype();
   switch (dtype) {
   case DType::kFloat32:
     {
-      auto *cpu_data = scale_inv_columnwise_.cpu_buffer<float>();
+      auto *cpu_data = scale_inv_columnwise_->cpu_buffer<float>();
       std::uniform_real_distribution<float> dis(-2.0, 1.0);
       for (size_t i = 0; i < numel; ++i) {
         cpu_data[i] = dis(gen_);
@@ -514,7 +523,7 @@ void Tensor::fill_uniform_columnwise_scale_inv() {
   case DType::kFloat8E8M0:
   case DType::kByte:
     {
-      auto *cpu_data = reinterpret_cast<uint8_t *>(scale_inv_columnwise_.cpu_buffer());
+      auto *cpu_data = reinterpret_cast<uint8_t *>(scale_inv_columnwise_->cpu_buffer());
       std::uniform_int_distribution<uint8_t> dis(0, 127);
       for (size_t i = 0; i < numel; ++i) {
         cpu_data[i] = dis(gen_);
@@ -527,25 +536,25 @@ void Tensor::fill_uniform_columnwise_scale_inv() {
   }
 
   // Update GPU tensor
-  scale_inv_columnwise_.from_cpu();
+  scale_inv_columnwise_->from_cpu();
 }
 
 void Tensor::fill_uniform_scale() {
-  if (scale_.size() == 0) {
+  if (!scale_ || scale_->size() == 0) {
     return;
   }
 
   // Generate random scales on CPU
-  auto *cpu_data = scale_.cpu_buffer<float>();
-  const auto numel = scale_.size();
-  NVTE_CHECK(scale_.dtype() == DType::kFloat32);
+  auto *cpu_data = scale_->cpu_buffer<float>();
+  const auto numel = scale_->size();
+  NVTE_CHECK(scale_->dtype() == DType::kFloat32);
   std::uniform_real_distribution<float> dis(-2.0, 1.0);
   for (size_t i = 0; i < numel; ++i) {
     cpu_data[i] = dis(gen_);
   }
 
   // Update GPU tensor
-  scale_.from_cpu();
+  scale_->from_cpu();
 }
 
 using std::to_string;
diff --git a/tests/cpp/test_common.h b/tests/cpp/test_common.h
index 860fc7d7eb..da467465d7 100644
--- a/tests/cpp/test_common.h
+++ b/tests/cpp/test_common.h
@@ -8,6 +8,7 @@
 
 #include <array>
 #include <memory>
+#include <optional>
 #include <random>
 #include <vector>
 
@@ -185,56 +186,64 @@ class Tensor {
 
   template <typename T>
   T *rowwise_cpu_dptr() {
-    NVTE_CHECK(TypeInfo<T>::dtype == data_rowwise_.dtype(), "Invalid type!");
+    NVTE_CHECK(data_rowwise_, "Tensor does not have rowwise data!");
+    NVTE_CHECK(TypeInfo<T>::dtype == data_rowwise_->dtype(), "Invalid type!");
     NVTE_CHECK(rowwise_, "Tensor does not have rowwise data!");
-    return data_rowwise_.cpu_buffer<T>();
+    return data_rowwise_->cpu_buffer<T>();
   }
 
   template <typename T>
   T *columnwise_cpu_dptr() {
-    NVTE_CHECK(TypeInfo<T>::dtype == data_columnwise_.dtype(), "Invalid type!");
+    NVTE_CHECK(data_columnwise_, "Tensor does not have columnwise data!");
+    NVTE_CHECK(TypeInfo<T>::dtype == data_columnwise_->dtype(), "Invalid type!");
     NVTE_CHECK(columnwise_, "Tensor does not have columnwise data!");
-    return data_columnwise_.cpu_buffer<T>();
+    return data_columnwise_->cpu_buffer<T>();
   }
 
   float amax() {
-    NVTE_CHECK(amax_rowwise_.size() == 1);
-    NVTE_CHECK(amax_rowwise_.dtype() == DType::kFloat32);
-    amax_rowwise_.to_cpu();
-    return *amax_rowwise_.cpu_buffer<float>();
+    NVTE_CHECK(amax_rowwise_);
+    NVTE_CHECK(amax_rowwise_->size() == 1);
+    NVTE_CHECK(amax_rowwise_->dtype() == DType::kFloat32);
+    amax_rowwise_->to_cpu();
+    return *amax_rowwise_->cpu_buffer<float>();
   }
 
   float amax_columnwise() {
-    NVTE_CHECK(amax_columnwise_.size() == 1);
-    NVTE_CHECK(amax_columnwise_.dtype() == DType::kFloat32);
-    amax_columnwise_.to_cpu();
-    return *amax_columnwise_.cpu_buffer<float>();
+    NVTE_CHECK(amax_columnwise_);
+    NVTE_CHECK(amax_columnwise_->size() == 1);
+    NVTE_CHECK(amax_columnwise_->dtype() == DType::kFloat32);
+    amax_columnwise_->to_cpu();
+    return *amax_columnwise_->cpu_buffer<float>();
   }
 
   float scale() {
-    NVTE_CHECK(scale_.size() == 1);
-    NVTE_CHECK(scale_.dtype() == DType::kFloat32);
-    scale_.to_cpu();
-    return *scale_.cpu_buffer<float>();
+    NVTE_CHECK(scale_);
+    NVTE_CHECK(scale_->size() == 1);
+    NVTE_CHECK(scale_->dtype() == DType::kFloat32);
+    scale_->to_cpu();
+    return *scale_->cpu_buffer<float>();
   }
 
   template <typename T>
   T *rowwise_cpu_scale_inv_ptr(){
-    scale_inv_rowwise_.to_cpu();
-    return scale_inv_rowwise_.cpu_buffer<T>();
+    NVTE_CHECK(scale_inv_rowwise_);
+    scale_inv_rowwise_->to_cpu();
+    return scale_inv_rowwise_->cpu_buffer<T>();
   }
 
   template <typename T>
   T *columnwise_cpu_scale_inv_ptr(){
-    scale_inv_columnwise_.to_cpu();
-    return scale_inv_columnwise_.cpu_buffer<T>();
+    NVTE_CHECK(scale_inv_columnwise_);
+    scale_inv_columnwise_->to_cpu();
+    return scale_inv_columnwise_->cpu_buffer<T>();
   }
 
   float rowwise_scale_inv(){
-    NVTE_CHECK(scale_inv_rowwise_.size() == 1);
-    NVTE_CHECK(scale_inv_rowwise_.dtype() == DType::kFloat32);
-    scale_inv_rowwise_.to_cpu();
-    return *scale_inv_rowwise_.cpu_buffer<float>();
+    NVTE_CHECK(scale_inv_rowwise_);
+    NVTE_CHECK(scale_inv_rowwise_->size() == 1);
+    NVTE_CHECK(scale_inv_rowwise_->dtype() == DType::kFloat32);
+    scale_inv_rowwise_->to_cpu();
+    return *scale_inv_rowwise_->cpu_buffer<float>();
   }
 
   bool rowwise() const {
@@ -329,13 +338,13 @@ class Tensor {
   TensorWrapper tensor_;
 
   // Data buffers
-  Buffer data_rowwise_;
-  Buffer data_columnwise_;
-  Buffer scale_inv_rowwise_;
-  Buffer scale_inv_columnwise_;
-  Buffer amax_rowwise_;
-  Buffer amax_columnwise_;
-  Buffer scale_;
+  std::optional<Buffer> data_rowwise_;
+  std::optional<Buffer> data_columnwise_;
+  std::shared_ptr<Buffer> scale_inv_rowwise_;
+  std::shared_ptr<Buffer> scale_inv_columnwise_;
+  std::optional<Buffer> amax_rowwise_;
+  std::optional<Buffer> amax_columnwise_;
+  std::optional<Buffer> scale_;
 
   bool rowwise_;
   bool columnwise_;