From 45aa776d803244fefa0f7523f25fa716004bd393 Mon Sep 17 00:00:00 2001 From: Tim Moon Date: Thu, 30 Apr 2026 01:51:01 +0000 Subject: [PATCH 01/10] Fix bug in NVFP4 quantize test where we set scale instead of amax Refactor test tensor wrapper by removing recipe-specific logic whenever possible. Signed-off-by: Tim Moon --- .../cpp/operator/test_cast_nvfp4_transpose.cu | 5 +- tests/cpp/operator/test_cast_transpose.cu | 6 +- tests/cpp/test_common.cu | 307 ++++++++++-------- tests/cpp/test_common.h | 83 ++--- 4 files changed, 216 insertions(+), 185 deletions(-) diff --git a/tests/cpp/operator/test_cast_nvfp4_transpose.cu b/tests/cpp/operator/test_cast_nvfp4_transpose.cu index d8d495d61f..08e43f3544 100644 --- a/tests/cpp/operator/test_cast_nvfp4_transpose.cu +++ b/tests/cpp/operator/test_cast_nvfp4_transpose.cu @@ -574,8 +574,7 @@ void performTest(float (*OP)(const float), amax = fmaxf(amax, static_cast(input_dptr[idx])); } } - // Set 2nd stage NVFP4 scaling factor - output.set_scale(amax); + output.set_amax(amax); bool use_2d_quantization = false; @@ -585,7 +584,7 @@ void performTest(float (*OP)(const float), ref_output_t.get(), ref_scales.get(), ref_scales_t.get(), - output.scale(), + output.amax(), rows, cols, scales_stride, diff --git a/tests/cpp/operator/test_cast_transpose.cu b/tests/cpp/operator/test_cast_transpose.cu index 44c78e4a09..ac05958b0b 100644 --- a/tests/cpp/operator/test_cast_transpose.cu +++ b/tests/cpp/operator/test_cast_transpose.cu @@ -55,13 +55,13 @@ void performTest(const size_t N, const size_t H) { fillUniform(&input); setRandomScale(&output); + const float scale = isFp8Type(otype) ? output.scale() : 1.0f; nvte_quantize(input.data(), output.data(), 0); float ref_amax; compute_ref(input.rowwise_cpu_dptr(), ref_output_c.get(), - ref_output_t.get(), N, H, &ref_amax, - output.scale()); + ref_output_t.get(), N, H, &ref_amax, scale); cudaDeviceSynchronize(); auto err = cudaGetLastError(); @@ -69,7 +69,7 @@ void performTest(const size_t N, const size_t H) { if (isFp8Type(otype)) { auto [atol_amax, rtol_amax] = getTolerances(DType::kFloat32); compareResults("amax", output.amax(), ref_amax, atol_amax, rtol_amax); - float ref_scale_inv = 1.f / output.scale(); + float ref_scale_inv = 1.f / scale; compareResults("scale_inv", output.rowwise_scale_inv(), ref_scale_inv, atol_amax, rtol_amax); } auto [atol, rtol] = getTolerances(otype); diff --git a/tests/cpp/test_common.cu b/tests/cpp/test_common.cu index 5196684118..04c1e8cf7e 100644 --- a/tests/cpp/test_common.cu +++ b/tests/cpp/test_common.cu @@ -8,13 +8,14 @@ #include "test_common.h" #include +#include +#include +#include #include #include #include -#include -#include -#include #include +#include #include #include @@ -391,11 +392,10 @@ Tensor::Tensor(const std::string& name, } } else { if (scaling_mode == NVTE_NVFP4_1D_SCALING) { - // Used for NVFP4 second stage scaling - cudaMalloc((void**)&scale, sizeof(float)); // NOLINT(*) - cudaMemset(scale, 0, sizeof(float)); - scale_cpu_data_ = std::make_shared(0); - tensor_.set_scale(scale, DType::kFloat32, std::vector{1}); + cudaMalloc((void**)&amax, sizeof(float)); // NOLINT(*) + cudaMemset(amax, 0, sizeof(float)); + amax_cpu_data_ = std::make_shared(0); + tensor_.set_amax(amax, DType::kFloat32, std::vector{1}); } auto [rowwise_scale_meta, colwise_scale_meta] = get_scales(flattened_shape, tensor_.scaling_mode()); auto rowwise_scale_size = rowwise_scale_meta.bytes(); @@ -420,138 +420,180 @@ Tensor::Tensor(const std::string& name, } } } + + // Sanity check that CPU and GPU have corresponding buffers + NVTE_CHECK((cpu_data_rowwise_ == nullptr) == (tensor_.dptr() == nullptr)); + NVTE_CHECK((cpu_data_columnwise_ == nullptr) == (tensor_.columnwise_dptr() == nullptr)); + NVTE_CHECK((rowwise_scale_inv_cpu_data_ == nullptr) == (tensor_.scale_inv() == nullptr)); + NVTE_CHECK((columnwise_scale_inv_cpu_data_ == nullptr) + == (tensor_.get_columnwise_scale_inv().data_ptr == nullptr)); + NVTE_CHECK((amax_cpu_data_ == nullptr) == (tensor_.amax() == nullptr)); + NVTE_CHECK((scale_cpu_data_ == nullptr) == (tensor_.scale() == nullptr)); } -void Tensor::to_cpu() const { - const NVTEShape s = tensor_.shape(); - const size_t size = bytes(s, tensor_.dtype()); - if (rowwise_) { - cudaMemcpy(cpu_data_rowwise_.get(), - tensor_.get_rowwise_data().data_ptr, - size, - cudaMemcpyDeviceToHost); - } - if (columnwise_) { - const DType colwise_type = tensor_.dtype(); - - const size_t colwise_size = bytes(s, colwise_type); - cudaMemcpy(cpu_data_columnwise_.get(), - tensor_.get_columnwise_data().data_ptr, - colwise_size, - cudaMemcpyDeviceToHost); - } - if (isFp8Type(dtype()) || isFp4Type(dtype())) { - if ((tensor_.scaling_mode() == NVTE_DELAYED_TENSOR_SCALING)) { - if (tensor_.amax() != nullptr){ - cudaMemcpy(amax_cpu_data_.get(), - tensor_.amax(), - sizeof(float), - cudaMemcpyDeviceToHost); - } - cudaMemcpy(scale_cpu_data_.get(), - tensor_.scale(), - sizeof(float), - cudaMemcpyDeviceToHost); - } - auto [rowwise_scale_meta, colwise_scale_meta] = get_scales(s, tensor_.scaling_mode()); - if (rowwise_) { - auto scale_size = rowwise_scale_meta.bytes(); - cudaMemcpy(rowwise_scale_inv_cpu_data_.get(), - tensor_.get_rowwise_scale_inv().data_ptr, - scale_size, - cudaMemcpyDeviceToHost); +Tensor::~Tensor() { + std::unordered_set freed_ptrs; + auto free_cuda_buffer = [&freed_ptrs] (void *ptr) -> void { + if (ptr != nullptr && freed_ptrs.count(ptr) > 0) { + cudaFree(ptr); + freed_ptrs.insert(ptr); } - if (columnwise_) { - auto scale_size = colwise_scale_meta.bytes(); - cudaMemcpy(columnwise_scale_inv_cpu_data_.get(), - tensor_.get_columnwise_scale_inv().data_ptr, - scale_size, - cudaMemcpyDeviceToHost); + }; + free_cuda_buffer(tensor_.dptr()); + free_cuda_buffer(tensor_.scale_inv()); + free_cuda_buffer(tensor_.scale()); + free_cuda_buffer(tensor_.amax()); + free_cuda_buffer(tensor_.columnwise_dptr()); + free_cuda_buffer(tensor_.get_columnwise_scale_inv().data_ptr); +} + +void Tensor::to_cpu() const { + auto from_basic_tensor = [] (const NVTEBasicTensor src, void *dst) -> void { + if (dst != nullptr) { + const size_t copy_size = bytes(src.shape, static_cast(src.dtype)); + cudaMemcpy(dst, src.data_ptr, copy_size, cudaMemcpyDeviceToHost); } - } + }; + from_basic_tensor(tensor_.get_rowwise_data(), cpu_data_rowwise_.get()); + from_basic_tensor(tensor_.get_columnwise_data(), cpu_data_columnwise_.get()); + from_basic_tensor(tensor_.get_rowwise_scale_inv(), rowwise_scale_inv_cpu_data_.get()); + from_basic_tensor(tensor_.get_columnwise_scale_inv(), columnwise_scale_inv_cpu_data_.get()); + from_basic_tensor(tensor_.get_amax(), amax_cpu_data_.get()); + from_basic_tensor(tensor_.get_scale(), scale_cpu_data_.get()); } void Tensor::from_cpu() const { - const NVTEShape s = tensor_.shape(); - const size_t size = bytes(s, tensor_.dtype()); - if (rowwise_) { - cudaMemcpy(tensor_.get_rowwise_data().data_ptr, cpu_data_rowwise_.get(), size, - cudaMemcpyHostToDevice); - } - if (columnwise_) { - cudaMemcpy(tensor_.get_columnwise_data().data_ptr, cpu_data_columnwise_.get(), size, - cudaMemcpyHostToDevice); - } - if (isFp8Type(dtype()) || isFp4Type(dtype())) { - if ((tensor_.scaling_mode() == NVTE_DELAYED_TENSOR_SCALING) - || (tensor_.scaling_mode() == NVTE_NVFP4_1D_SCALING)) { - if (tensor_.amax() != nullptr){ - cudaMemcpy(tensor_.amax(), amax_cpu_data_.get(), sizeof(float), cudaMemcpyHostToDevice); - } - cudaMemcpy(tensor_.scale(), scale_cpu_data_.get(), sizeof(float), cudaMemcpyHostToDevice); - } - auto [rowwise_scale_meta, colwise_scale_meta] = get_scales(s, tensor_.scaling_mode()); - if (rowwise_) { - auto scale_size = rowwise_scale_meta.bytes(); - cudaMemcpy(tensor_.get_rowwise_scale_inv().data_ptr, - rowwise_scale_inv_cpu_data_.get(), scale_size, - cudaMemcpyHostToDevice); + auto to_basic_tensor = [] (const void *src, NVTEBasicTensor dst) -> void { + if (src != nullptr) { + const size_t copy_size = bytes(dst.shape, static_cast(dst.dtype)); + cudaMemcpy(dst.data_ptr, src, copy_size, cudaMemcpyHostToDevice); } - if (columnwise_) { - auto scale_size = colwise_scale_meta.bytes(); - cudaMemcpy(tensor_.get_columnwise_scale_inv().data_ptr, - columnwise_scale_inv_cpu_data_.get(), scale_size, - cudaMemcpyHostToDevice); - } - } + }; + to_basic_tensor(cpu_data_rowwise_.get(), tensor_.get_rowwise_data()); + to_basic_tensor(cpu_data_columnwise_.get(), tensor_.get_columnwise_data()); + to_basic_tensor(rowwise_scale_inv_cpu_data_.get(), tensor_.get_rowwise_scale_inv()); + to_basic_tensor(columnwise_scale_inv_cpu_data_.get(), tensor_.get_columnwise_scale_inv()); + to_basic_tensor(amax_cpu_data_.get(), tensor_.get_amax()); + to_basic_tensor(scale_cpu_data_.get(), tensor_.get_scale()); +} + +void Tensor::set_amax(float amax) { + NVTE_CHECK(amax_cpu_data_); + NVTE_CHECK(tensor_.get_amax().dtype == kNVTEFloat32); + NVTE_CHECK(product(tensor_.get_amax().shape) == 1); + *amax_cpu_data_ = amax; + from_cpu(); } void Tensor::set_scale(float scale) { - if (isFp8Type(dtype()) || isFp4Type(dtype())) { - NVTE_CHECK(scale_cpu_data_); - if (tensor_.scaling_mode() == NVTE_DELAYED_TENSOR_SCALING) { - *scale_cpu_data_ = scale; - from_cpu(); - } - } + NVTE_CHECK(scale_cpu_data_); + NVTE_CHECK(tensor_.get_scale().dtype == kNVTEFloat32); + NVTE_CHECK(product(tensor_.get_scale().shape) == 1); + *scale_cpu_data_ = scale; + from_cpu(); } void Tensor::set_scale_inv(float scale_inv) { - if (isFp8Type(dtype()) || isFp4Type(dtype())) { - if (rowwise_) { - NVTE_CHECK(rowwise_scale_inv_cpu_data_); + NVTE_CHECK(rowwise_scale_inv_cpu_data_); + NVTE_CHECK(columnwise_scale_inv_cpu_data_ == nullptr); // TODO Not needed. Remove once all incorrect function calls have been removed + NVTE_CHECK(product(tensor_.get_rowwise_scale_inv().shape) == 1); + NVTE_CHECK(tensor_.get_rowwise_scale_inv().dtype == kNVTEFloat32); + *reinterpret_cast(rowwise_scale_inv_cpu_data_.get()) = scale_inv; + from_cpu(); +} + +void Tensor::fill_uniform_rowwise_scale_inv() { + if (rowwise_scale_inv_cpu_data_ == nullptr) { + return; + } + + // Generate random scales on CPU + const auto numel = product(tensor_.get_rowwise_scale_inv().shape); + const auto dtype = tensor_.get_rowwise_scale_inv().dtype; + switch (dtype) { + case kNVTEFloat32: + { + auto *cpu_data = reinterpret_cast(scale_cpu_data_.get()); + std::uniform_real_distribution<> dis(-2.0, 1.0); + for (size_t i = 0; i < numel; ++i) { + cpu_data[i] = dis(gen_); + } } - if (columnwise_) { - NVTE_CHECK(columnwise_scale_inv_cpu_data_); + break; + case kNVTEFloat8E4M3: + case kNVTEFloat8E8M0: + case kNVTEByte: + { + auto *cpu_data = reinterpret_cast(scale_cpu_data_.get()); + std::uniform_int_distribution dis(0, 127); + for (size_t i = 0; i < numel; ++i) { + cpu_data[i] = dis(gen_); + } } + break; + default: + NVTE_ERROR("Unsupported rowwise scale-inv dtype (", + static_cast(dtype), ")."); + } - auto [rowwise_scale_meta, colwise_scale_meta] = get_scales(tensor_.shape(), tensor_.scaling_mode()); - if (rowwise_) { - auto num_scales = product(rowwise_scale_meta.shape); - if (num_scales == 1) { - rowwise_cpu_scale_inv_ptr()[0] = scale_inv; - } else { - std::uniform_int_distribution dis(0, 127); - auto *scale_inv_ptr = rowwise_cpu_scale_inv_ptr(); - for (size_t i = 0; i < num_scales; i++) { - scale_inv_ptr[i] = dis(gen_); - } + // Update GPU tensor + from_cpu(); +} + +void Tensor::fill_uniform_columnwise_scale_inv() { + if (columnwise_scale_inv_cpu_data_ == nullptr) { + return; + } + + // Generate random scales on CPU + const auto numel = product(tensor_.get_columnwise_scale_inv().shape); + const auto dtype = tensor_.get_columnwise_scale_inv().dtype; + switch (dtype) { + case kNVTEFloat32: + { + auto *cpu_data = reinterpret_cast(scale_cpu_data_.get()); + std::uniform_real_distribution<> dis(-2.0, 1.0); + for (size_t i = 0; i < numel; ++i) { + cpu_data[i] = dis(gen_); } } - if (columnwise_) { - auto num_scales = product(colwise_scale_meta.shape); - if (num_scales == 1) { - columnwise_cpu_scale_inv_ptr()[0] = scale_inv; - } else { - std::uniform_int_distribution dis(0, 127); - auto *scale_inv_ptr = columnwise_cpu_scale_inv_ptr(); - for (size_t i = 0; i < num_scales; i++) { - scale_inv_ptr[i] = dis(gen_); - } + break; + case kNVTEFloat8E4M3: + case kNVTEFloat8E8M0: + case kNVTEByte: + { + auto *cpu_data = reinterpret_cast(scale_cpu_data_.get()); + std::uniform_int_distribution dis(0, 127); + for (size_t i = 0; i < numel; ++i) { + cpu_data[i] = dis(gen_); } } - from_cpu(); + break; + default: + NVTE_ERROR("Unsupported columnwise scale-inv dtype (", + static_cast(dtype), ")."); } + + // Update GPU tensor + from_cpu(); +} + +void Tensor::fill_uniform_scale() { + if (scale_cpu_data_ == nullptr) { + return; + } + + // Generate random scales on CPU + auto *cpu_data = reinterpret_cast(scale_cpu_data_.get()); + const auto numel = product(tensor_.get_columnwise_scale_inv().shape); + NVTE_CHECK(tensor_.get_scale().dtype == kNVTEFloat32); + std::uniform_real_distribution<> dis(-2.0, 1.0); + for (size_t i = 0; i < numel; ++i) { + cpu_data[i] = dis(gen_); + } + + // Update GPU tensor + from_cpu(); } void Tensor::shareFP8Meta(const Tensor &other) { @@ -912,6 +954,7 @@ void generate_data_uniformly(T* data, const size_t size, std::mt19937* gen) { } void fillUniform(Tensor *t) { + // Generate random row-wise data and column-wise data if (t->rowwise()) { const size_t size = product(t->rowwise_shape()); TRANSFORMER_ENGINE_TYPE_SWITCH_ALL(t->dtype(), T, @@ -929,8 +972,12 @@ void fillUniform(Tensor *t) { } ); } - std::uniform_real_distribution<> dis(-2.0, 1.0); - t->set_scale_inv(dis(t->gen())); + + // Generate random scales + t->fill_uniform_rowwise_scale_inv(); + t->fill_uniform_columnwise_scale_inv(); + + // Update data on GPU t->from_cpu(); } @@ -966,7 +1013,16 @@ void fillCase_special(Tensor *t) { } }); } - t->set_scale_inv(1.0); + + // Try setting scale to 1, fallback to random scales + try { + t->set_scale_inv(1.0); + } catch (...) { + t->fill_uniform_rowwise_scale_inv(); + t->fill_uniform_columnwise_scale_inv(); + } + + // Update GPU tensor data t->from_cpu(); } @@ -1000,15 +1056,12 @@ template void fillCase(Tensor *t, const InputsFillCase fill_case); #endif void setRandomScale(Tensor *t) { - std::uniform_real_distribution<> dis(-2.0, 1.0); - const float scale = dis(t->gen()); - t->set_scale(scale); + t->fill_uniform_scale(); } void setRandomScaleInv(Tensor *t) { - std::uniform_real_distribution<> dis(-2.0, 1.0); - const float scale_inv = dis(t->gen()); - t->set_scale_inv(scale_inv); + t->fill_uniform_rowwise_scale_inv(); + t->fill_uniform_columnwise_scale_inv(); } bool isFp8Type(DType type) { diff --git a/tests/cpp/test_common.h b/tests/cpp/test_common.h index b5a7f26d14..7696d9a36e 100644 --- a/tests/cpp/test_common.h +++ b/tests/cpp/test_common.h @@ -6,10 +6,11 @@ #pragma once -#include -#include #include +#include #include +#include + #include #define FP4_TYPE_SUPPORTED (CUDA_VERSION >= 12080) @@ -27,6 +28,11 @@ namespace test { using namespace transformer_engine; +size_t typeToNumBits(DType type); +size_t product(const NVTEShape &shape); +size_t product(const std::vector &shape); +size_t bytes(const NVTEShape& shape, const DType type); + template struct BytesToType {}; @@ -141,30 +147,7 @@ class Tensor { Tensor(Tensor &&other) = default; Tensor& operator=(Tensor &&other) = default; - ~Tensor() { - void *data_ptr = tensor_.dptr(); - void *scale_inv = tensor_.scale_inv(); - void *columnwise_data_ptr = tensor_.get_columnwise_data().data_ptr; - void *columnwise_scale_inv = tensor_.get_columnwise_scale_inv().data_ptr; - if (columnwise_data_ptr == data_ptr) { - columnwise_data_ptr = nullptr; - } - if (columnwise_scale_inv == scale_inv) { - columnwise_scale_inv = nullptr; - } - if (data_ptr != nullptr) { - cudaFree(data_ptr); - } - if (scale_inv != nullptr) { - cudaFree(scale_inv); - } - if (columnwise_data_ptr != nullptr) { - cudaFree(columnwise_data_ptr); - } - if (columnwise_scale_inv != nullptr) { - cudaFree(columnwise_scale_inv); - } - } + ~Tensor(); NVTETensor data() const noexcept { return tensor_.data(); } @@ -215,24 +198,19 @@ class Tensor { } float amax() const { - if(amax_cpu_data_) { - to_cpu(); - return *amax_cpu_data_; - } else { - return 0; - } + NVTE_CHECK(amax_cpu_data_); + NVTE_CHECK(tensor_.get_amax().dtype == kNVTEFloat32); + NVTE_CHECK(product(tensor_.get_amax().shape) == 1); + to_cpu(); + return *amax_cpu_data_; } float scale() const { - if(scale_cpu_data_) { - NVTE_CHECK((tensor_.scaling_mode() == NVTE_DELAYED_TENSOR_SCALING) - || (tensor_.scaling_mode() == NVTE_NVFP4_1D_SCALING), - "Invalid scaling_mode!"); - to_cpu(); - return *scale_cpu_data_; - } else { - return 1; - } + NVTE_CHECK(scale_cpu_data_); + NVTE_CHECK(tensor_.get_scale().dtype == kNVTEFloat32); + NVTE_CHECK(product(tensor_.get_scale().shape) == 1); + to_cpu(); + return *scale_cpu_data_; } template @@ -266,12 +244,12 @@ class Tensor { } float rowwise_scale_inv(){ - if(rowwise_scale_inv_cpu_data_) { - float scale_inv = rowwise_cpu_scale_inv_ptr()[0]; - return scale_inv; - } else { - return 1; - } + to_cpu(); + NVTE_CHECK(rowwise_scale_inv_cpu_data_); + auto scale_inv_tensor = tensor_.get_rowwise_scale_inv(); + NVTE_CHECK(product(scale_inv_tensor.shape) == 1); + NVTE_CHECK(scale_inv_tensor.dtype == kNVTEFloat32); + return *reinterpret_cast(rowwise_scale_inv_cpu_data_.get()); } bool rowwise() const { @@ -292,8 +270,14 @@ class Tensor { void to_cpu() const; void from_cpu() const; + + void set_amax(float amax); void set_scale(float scale); void set_scale_inv(float scale_inv); + + void fill_uniform_rowwise_scale_inv(); + void fill_uniform_columnwise_scale_inv(); + void fill_uniform_scale(); void shareFP8Meta(const Tensor &other); std::mt19937& gen() { return gen_; } @@ -455,11 +439,6 @@ inline float dsilu(const float x) { return x * dsigmoid(x) + sigmoid(x); } inline float srelu(const float x) { return x > 0 ? x * x : 0; } inline float dsrelu(const float x) { return fmaxf(0, 2 * x); } -size_t typeToNumBits(DType type); -size_t product(const NVTEShape &shape); -size_t product(const std::vector &shape); -size_t bytes(const NVTEShape& shape, const DType type); - size_t first_dimension(const std::vector &shape); size_t last_dimension(const std::vector &shape); From 1acb3c3cb43c7cf72f120ef67a9bb7581525688c Mon Sep 17 00:00:00 2001 From: Tim Moon Date: Fri, 1 May 2026 02:53:23 +0000 Subject: [PATCH 02/10] Only get fp32 scale when tensor is expected to have fp32 scale Signed-off-by: Tim Moon --- tests/cpp/operator/test_act.cu | 10 ++++++---- tests/cpp/operator/test_cast.cu | 5 +++-- tests/cpp/operator/test_cast_current_scaling.cu | 7 ++++--- tests/cpp/operator/test_cast_dbias.cu | 5 +++-- tests/cpp/operator/test_cast_dbias_dgelu.cu | 5 +++-- tests/cpp/operator/test_cast_gated_swiglu.cu | 5 +++-- tests/cpp/operator/test_cast_transpose.cu | 6 +++--- tests/cpp/operator/test_cast_transpose_dbias.cu | 5 +++-- tests/cpp/operator/test_cast_transpose_dbias_dgelu.cu | 5 +++-- tests/cpp/operator/test_cast_transpose_dgeglu.cu | 5 +++-- tests/cpp/operator/test_multi_cast_transpose.cu | 4 ++-- tests/cpp/operator/test_normalization.cu | 2 +- tests/cpp/operator/test_qdq.cu | 3 ++- tests/cpp/test_common.cu | 3 +++ 14 files changed, 42 insertions(+), 28 deletions(-) diff --git a/tests/cpp/operator/test_act.cu b/tests/cpp/operator/test_act.cu index ca5ccdc4ce..6edc6bd63b 100644 --- a/tests/cpp/operator/test_act.cu +++ b/tests/cpp/operator/test_act.cu @@ -124,6 +124,7 @@ void performTest(const size_t N, const size_t H) { fillUniform(&input); fillUniform(&ograd); setRandomScale(&output); + const float ref_scale = isFp8Type(otype) ? output.scale() : 1.0f; std::unique_ptr ref_output = std::make_unique(N*H); std::unique_ptr ref_igrad = std::make_unique(N*H); @@ -132,7 +133,7 @@ void performTest(const size_t N, const size_t H) { float ref_amax; compute_ref_act_cast(input.rowwise_cpu_dptr(), ref_output.get(), - output.scale(), &ref_amax, N, H); + ref_scale, &ref_amax, N, H); cudaDeviceSynchronize(); auto err = cudaGetLastError(); @@ -179,6 +180,7 @@ void performTestGLU(const size_t N, const size_t H) { fillUniform(&input); fillUniform(&ograd); setRandomScale(&output); + const float ref_scale = isFp8Type(otype) ? output.scale() : 1.0f; std::unique_ptr ref_output = std::make_unique(N * H); std::unique_ptr ref_igrad = std::make_unique(2 * N * H); @@ -187,7 +189,7 @@ void performTestGLU(const size_t N, const size_t H) { float ref_amax; compute_ref_glu_act_cast(input.rowwise_cpu_dptr(), ref_output.get(), - output.scale(), &ref_amax, N, H); + ref_scale, &ref_amax, N, H); cudaDeviceSynchronize(); auto err = cudaGetLastError(); @@ -197,8 +199,8 @@ void performTestGLU(const size_t N, const size_t H) { auto [atol, rtol] = getTolerances(DType::kFloat32); compareResults("amax", output.amax(), ref_amax, atol, rtol); if (output.scaling_mode() == NVTE_DELAYED_TENSOR_SCALING) { - const float ref_scale = 1.f / output.scale(); - compareResults("scale_inv", *output.rowwise_cpu_scale_inv_ptr(), ref_scale, atol, rtol); + const float ref_scale_inv = 1.f / ref_scale; + compareResults("scale_inv", *output.rowwise_cpu_scale_inv_ptr(), ref_scale_inv, atol, rtol); } } auto [atol, rtol] = getTolerances(otype); diff --git a/tests/cpp/operator/test_cast.cu b/tests/cpp/operator/test_cast.cu index 35d9dd2efd..e8f48feef8 100644 --- a/tests/cpp/operator/test_cast.cu +++ b/tests/cpp/operator/test_cast.cu @@ -53,13 +53,14 @@ void performTest(const std::vector& shape) { fillUniform(&input); setRandomScale(&output_c); + const float ref_scale = isFp8Type(otype) ? output_c.scale() : 1.0f; nvte_quantize(input.data(), output_c.data(), 0); float ref_amax; compute_ref(input.rowwise_cpu_dptr(), ref_output_c.get(), - full_size, &ref_amax, output_c.scale()); + full_size, &ref_amax, ref_scale); cudaDeviceSynchronize(); auto err = cudaGetLastError(); @@ -67,7 +68,7 @@ void performTest(const std::vector& shape) { if (isFp8Type(otype)) { auto [atol_amax, rtol_amax] = getTolerances(DType::kFloat32); compareResults("amax", output_c.amax(), ref_amax, atol_amax, rtol_amax); - float ref_scale_inv = 1.f / output_c.scale(); + float ref_scale_inv = 1.f / ref_scale; compareResults("scale_inv", output_c.rowwise_scale_inv(), ref_scale_inv, atol_amax, rtol_amax); } auto [atol, rtol] = getTolerances(otype); diff --git a/tests/cpp/operator/test_cast_current_scaling.cu b/tests/cpp/operator/test_cast_current_scaling.cu index 4dd6cd2d58..7cca0d72e0 100644 --- a/tests/cpp/operator/test_cast_current_scaling.cu +++ b/tests/cpp/operator/test_cast_current_scaling.cu @@ -123,6 +123,7 @@ void performTest(const std::vector& shape) { nvte_compute_amax(input.data(), output_c.data(), 0); QuantizationConfigWrapper config; nvte_compute_scale_from_amax(output_c.data(), config, 0); + // avoid atomic amax update in cuda cast kernels because of current per-tensor scaling amax_to_check = output_c.amax(); output_c.set_tensor_amax_nullptr(); @@ -130,7 +131,7 @@ void performTest(const std::vector& shape) { nvte_quantize(input.data(), output_c.data(), 0); float ref_amax; - float ref_scale; + float ref_scale = 1.0; float ref_scale_inv; if (is_out_fp8){ compute_amax_scale_ref(input.rowwise_cpu_dptr(), @@ -138,13 +139,13 @@ void performTest(const std::vector& shape) { } compute_ref(input.rowwise_cpu_dptr(), ref_output_c.get(), - full_size, nullptr, is_out_fp8 ? output_c.scale() : 1.0f ); + full_size, nullptr, ref_scale); cudaDeviceSynchronize(); auto err = cudaGetLastError(); ASSERT_EQ(err, cudaSuccess) << cudaGetErrorString(err); - if (isFp8Type(otype)) { + if (is_out_fp8) { auto [atol_fp32, rtol_fp32] = getTolerances(DType::kFloat32); compareResults("amax", amax_to_check, ref_amax, 0.0f, rtol_fp32); compareResults("scale", output_c.scale(), ref_scale, 0.0f, rtol_fp32); diff --git a/tests/cpp/operator/test_cast_dbias.cu b/tests/cpp/operator/test_cast_dbias.cu index 18f07153c6..b7b5db48c3 100644 --- a/tests/cpp/operator/test_cast_dbias.cu +++ b/tests/cpp/operator/test_cast_dbias.cu @@ -74,13 +74,14 @@ void performTest(const std::vector& shape) { fillUniform(&input); setRandomScale(&output_c); + const float ref_scale = isFp8Type(otype) ? output_c.scale() : 1.0f; std::unique_ptr ref_output_c = std::make_unique(N*H); std::unique_ptr ref_output_dbias = std::make_unique(H); CType ref_amax; compute_ref_cast_dbias(input.rowwise_cpu_dptr(), - output_c.scale(), + ref_scale, ref_output_c.get(), &ref_amax, ref_output_dbias.get(), @@ -109,7 +110,7 @@ void performTest(const std::vector& shape) { if (isFp8Type(otype)) { auto [atol_amax, rtol_amax] = getTolerances(DType::kFloat32); compareResults("amax", output_c.amax(), ref_amax, atol_amax, rtol_amax); - float ref_scale_inv = 1.f / output_c.scale(); + float ref_scale_inv = 1.f / ref_scale; compareResults("scale_inv", output_c.rowwise_scale_inv(), ref_scale_inv, atol_amax, rtol_amax); } auto [atol, rtol] = getTolerances(otype); diff --git a/tests/cpp/operator/test_cast_dbias_dgelu.cu b/tests/cpp/operator/test_cast_dbias_dgelu.cu index 8213e5665a..d8b8a20e6f 100644 --- a/tests/cpp/operator/test_cast_dbias_dgelu.cu +++ b/tests/cpp/operator/test_cast_dbias_dgelu.cu @@ -84,6 +84,7 @@ void performTest(const std::vector& shape) { fillUniform(&input); fillUniform(&grad); setRandomScale(&output_c); + const float ref_scale = isFp8Type(otype) ? output_c.scale() : 1.0f; std::unique_ptr ref_output_c = std::make_unique(N*H); std::unique_ptr ref_output_dbias = std::make_unique(H); @@ -91,7 +92,7 @@ void performTest(const std::vector& shape) { CType ref_amax; compute_ref_cast_dbias_dgelu(input.rowwise_cpu_dptr(), grad.rowwise_cpu_dptr(), - output_c.scale(), + ref_scale, ref_output_c.get(), &ref_amax, ref_output_dbias.get(), @@ -123,7 +124,7 @@ void performTest(const std::vector& shape) { if (isFp8Type(otype)) { auto [atol_amax, rtol_amax] = getTolerances(DType::kFloat32); compareResults("amax", output_c.amax(), ref_amax, atol_amax, rtol_amax); - float ref_scale_inv = 1.f / output_c.scale(); + float ref_scale_inv = 1.f / ref_scale; compareResults("scale_inv", output_c.rowwise_scale_inv(), ref_scale_inv, atol_amax, rtol_amax); } diff --git a/tests/cpp/operator/test_cast_gated_swiglu.cu b/tests/cpp/operator/test_cast_gated_swiglu.cu index 298b978f2a..5298cc7577 100644 --- a/tests/cpp/operator/test_cast_gated_swiglu.cu +++ b/tests/cpp/operator/test_cast_gated_swiglu.cu @@ -79,6 +79,7 @@ void performTest(const std::vector& shape) { fillUniform(&grad); fillUniform(&input); setRandomScale(&output_c); + const float ref_scale = isFp8Type(otype) ? output_c.scale() : 1.0f; std::unique_ptr ref_output_c = std::make_unique(input_size); @@ -91,7 +92,7 @@ void performTest(const std::vector& shape) { float ref_amax; compute_ref_cast_dgated_swiglu(grad.rowwise_cpu_dptr(), input.rowwise_cpu_dptr(), - output_c.scale(), + ref_scale, ref_output_c.get(), &ref_amax, rows, @@ -100,7 +101,7 @@ void performTest(const std::vector& shape) { if (isFp8Type(otype)) { auto [atol_amax, rtol_amax] = getTolerances(DType::kFloat32); compareResults("amax", output_c.amax(), ref_amax, atol_amax, rtol_amax); - float ref_scale_inv = 1.f / output_c.scale(); + float ref_scale_inv = 1.f / ref_scale; compareResults("scale_inv", output_c.rowwise_scale_inv(), ref_scale_inv, atol_amax, rtol_amax); } diff --git a/tests/cpp/operator/test_cast_transpose.cu b/tests/cpp/operator/test_cast_transpose.cu index ac05958b0b..9a5dc959da 100644 --- a/tests/cpp/operator/test_cast_transpose.cu +++ b/tests/cpp/operator/test_cast_transpose.cu @@ -55,13 +55,13 @@ void performTest(const size_t N, const size_t H) { fillUniform(&input); setRandomScale(&output); - const float scale = isFp8Type(otype) ? output.scale() : 1.0f; + const float ref_scale = isFp8Type(otype) ? output.scale() : 1.0f; nvte_quantize(input.data(), output.data(), 0); float ref_amax; compute_ref(input.rowwise_cpu_dptr(), ref_output_c.get(), - ref_output_t.get(), N, H, &ref_amax, scale); + ref_output_t.get(), N, H, &ref_amax, ref_scale); cudaDeviceSynchronize(); auto err = cudaGetLastError(); @@ -69,7 +69,7 @@ void performTest(const size_t N, const size_t H) { if (isFp8Type(otype)) { auto [atol_amax, rtol_amax] = getTolerances(DType::kFloat32); compareResults("amax", output.amax(), ref_amax, atol_amax, rtol_amax); - float ref_scale_inv = 1.f / scale; + float ref_scale_inv = 1.f / ref_scale; compareResults("scale_inv", output.rowwise_scale_inv(), ref_scale_inv, atol_amax, rtol_amax); } auto [atol, rtol] = getTolerances(otype); diff --git a/tests/cpp/operator/test_cast_transpose_dbias.cu b/tests/cpp/operator/test_cast_transpose_dbias.cu index 5b06b28327..f9303d34f5 100644 --- a/tests/cpp/operator/test_cast_transpose_dbias.cu +++ b/tests/cpp/operator/test_cast_transpose_dbias.cu @@ -73,6 +73,7 @@ void performTest(const size_t N, const size_t H) { fillUniform(&input); setRandomScale(&output); + const float ref_scale = isFp8Type(otype) ? output.scale() : 1.0f; std::unique_ptr ref_output_c = std::make_unique(N*H); std::unique_ptr ref_output_t = std::make_unique(N*H); @@ -80,7 +81,7 @@ void performTest(const size_t N, const size_t H) { CType ref_amax; compute_ref_cast_transpose_dbias(input.rowwise_cpu_dptr(), - output.scale(), + ref_scale, ref_output_c.get(), ref_output_t.get(), &ref_amax, @@ -111,7 +112,7 @@ void performTest(const size_t N, const size_t H) { if (isFp8Type(otype)) { auto [atol_amax, rtol_amax] = getTolerances(DType::kFloat32); compareResults("amax", output.amax(), ref_amax, atol_amax, rtol_amax); - float ref_scale_inv = 1.f / output.scale(); + float ref_scale_inv = 1.f / ref_scale; compareResults("scale_inv", output.rowwise_scale_inv(), ref_scale_inv, atol_amax, rtol_amax); } auto [atol, rtol] = getTolerances(otype); diff --git a/tests/cpp/operator/test_cast_transpose_dbias_dgelu.cu b/tests/cpp/operator/test_cast_transpose_dbias_dgelu.cu index 9a4a2fa080..31eafff80f 100644 --- a/tests/cpp/operator/test_cast_transpose_dbias_dgelu.cu +++ b/tests/cpp/operator/test_cast_transpose_dbias_dgelu.cu @@ -86,6 +86,7 @@ void performTest(const size_t N, const size_t H) { fillUniform(&input); fillUniform(&gelu_input); setRandomScale(&output); + const float ref_scale = isFp8Type(otype) ? output.scale() : 1.0f; std::unique_ptr ref_output_c = std::make_unique(N*H); std::unique_ptr ref_output_t = std::make_unique(N*H); @@ -94,7 +95,7 @@ void performTest(const size_t N, const size_t H) { CType ref_amax; compute_ref_cast_transpose_dbias_dgelu(input.rowwise_cpu_dptr(), gelu_input.rowwise_cpu_dptr(), - output.scale(), + ref_scale, ref_output_c.get(), ref_output_t.get(), &ref_amax, @@ -127,7 +128,7 @@ void performTest(const size_t N, const size_t H) { if (isFp8Type(otype)) { auto [atol_amax, rtol_amax] = getTolerances(DType::kFloat32); compareResults("amax", output.amax(), ref_amax, atol_amax, rtol_amax); - float ref_scale_inv = 1.f / output.scale(); + float ref_scale_inv = 1.f / ref_scale; compareResults("scale_inv", output.rowwise_scale_inv(), ref_scale_inv, atol_amax, rtol_amax); } diff --git a/tests/cpp/operator/test_cast_transpose_dgeglu.cu b/tests/cpp/operator/test_cast_transpose_dgeglu.cu index a87c0c5a42..15ecd3ab66 100644 --- a/tests/cpp/operator/test_cast_transpose_dgeglu.cu +++ b/tests/cpp/operator/test_cast_transpose_dgeglu.cu @@ -81,6 +81,7 @@ void performTest(const size_t N, const size_t H) { fillUniform(&grad); fillUniform(&input); setRandomScale(&output); + const float ref_scale = isFp8Type(otype) ? output.scale() : 1.0f; std::unique_ptr ref_output_c = std::make_unique(N * H * 2); std::unique_ptr ref_output_t = std::make_unique(N * H * 2); @@ -89,7 +90,7 @@ void performTest(const size_t N, const size_t H) { CType ref_amax; compute_ref_cast_transpose_dgated_gelu(grad.rowwise_cpu_dptr(), input.rowwise_cpu_dptr(), - output.scale(), ref_output_c.get(), ref_output_t.get(), + ref_scale, ref_output_c.get(), ref_output_t.get(), &ref_amax, N, H); cudaDeviceSynchronize(); @@ -99,7 +100,7 @@ void performTest(const size_t N, const size_t H) { if (isFp8Type(otype)) { auto [atol_amax, rtol_amax] = getTolerances(DType::kFloat32); compareResults("amax", output.amax(), ref_amax, atol_amax, rtol_amax); - float ref_scale_inv = 1.f / output.scale(); + float ref_scale_inv = 1.f / ref_scale; compareResults("scale_inv", output.rowwise_scale_inv(), ref_scale_inv, atol_amax, rtol_amax); } diff --git a/tests/cpp/operator/test_multi_cast_transpose.cu b/tests/cpp/operator/test_multi_cast_transpose.cu index 2bb35c4b89..0271c9dc6b 100644 --- a/tests/cpp/operator/test_multi_cast_transpose.cu +++ b/tests/cpp/operator/test_multi_cast_transpose.cu @@ -97,7 +97,7 @@ void performTest() { std::copy(input.rowwise_cpu_dptr(), input.rowwise_cpu_dptr() + height * width, ref_input_list.back().begin()); - ref_scale_list[tensor_id] = output.scale(); + ref_scale_list[tensor_id] = isFp8Type(otype) ? output.scale() : 1.0f; ref_height_list[tensor_id] = height; ref_width_list[tensor_id] = width; } @@ -138,7 +138,7 @@ void performTest() { atol_amax, rtol_amax); compareResults("scale_inv", output_list[tensor_id].rowwise_scale_inv(), - 1.f / output_list[tensor_id].scale(), + 1.f / ref_scale_list[tensor_id], atol_amax, rtol_amax); } auto [atol, rtol] = getTolerances(otype); diff --git a/tests/cpp/operator/test_normalization.cu b/tests/cpp/operator/test_normalization.cu index f737005e26..ea6692dba4 100644 --- a/tests/cpp/operator/test_normalization.cu +++ b/tests/cpp/operator/test_normalization.cu @@ -208,7 +208,7 @@ void performTest(const size_t N, const size_t H, const bool zero_centered_gamma, auto [atol_amax, rtol_amax] = getTolerances(DType::kFloat32); if (isFp8Type(otype)) { compareResults("amax", z.amax(), ref_amax, atol_amax, rtol_amax); - float ref_scale_inv = 1.f / z.scale(); + float ref_scale_inv = 1.f / ref_scale; compareResults("scale_inv", z.rowwise_scale_inv(), ref_scale_inv, atol_amax, rtol_amax); } diff --git a/tests/cpp/operator/test_qdq.cu b/tests/cpp/operator/test_qdq.cu index 4e364fffa4..034280aa9a 100644 --- a/tests/cpp/operator/test_qdq.cu +++ b/tests/cpp/operator/test_qdq.cu @@ -65,12 +65,13 @@ void performTestQ(const size_t N) { fillUniform(&input); setRandomScale(&output); + const float ref_scale = output.scale(); nvte_quantize(input.data(), output.data(), 0); float ref_amax; compute_ref_q(input.rowwise_cpu_dptr(), ref_output.get(), - N, &ref_amax, output.scale()); + N, &ref_amax, ref_scale); cudaDeviceSynchronize(); auto err = cudaGetLastError(); diff --git a/tests/cpp/test_common.cu b/tests/cpp/test_common.cu index 04c1e8cf7e..bd81af100b 100644 --- a/tests/cpp/test_common.cu +++ b/tests/cpp/test_common.cu @@ -1015,6 +1015,9 @@ void fillCase_special(Tensor *t) { } // Try setting scale to 1, fallback to random scales + // Note: This is a hack to match behavior of an earlier + // implementation. Consider filling block scales with constant + // value. try { t->set_scale_inv(1.0); } catch (...) { From 97945bce86797764db37f52d1021993af2f97434 Mon Sep 17 00:00:00 2001 From: Tim Moon Date: Tue, 5 May 2026 01:48:09 +0000 Subject: [PATCH 03/10] Create dedicated class for managing GPU/CPU buffers Signed-off-by: Tim Moon --- tests/cpp/test_common.cu | 303 +++++++++++++++------------------------ tests/cpp/test_common.h | 145 ++++++++++++------- 2 files changed, 208 insertions(+), 240 deletions(-) diff --git a/tests/cpp/test_common.cu b/tests/cpp/test_common.cu index bd81af100b..070ce924ad 100644 --- a/tests/cpp/test_common.cu +++ b/tests/cpp/test_common.cu @@ -277,6 +277,36 @@ std::pair get_scales(const NVTEShape& shape, NVTE_ERROR("Invalid scaling mode!"); } +Tensor::Buffer::Buffer(size_t size, DType dtype) + : size_{size}, dtype_{dtype}, bytes_{size * typeToNumBits(dtype) / 8} { + if (bytes_ > 0) { + cpu_buffer_ = new unsigned char[bytes_]; + std::memset(cpu_buffer_.get(), 0, bytes_); + void *gpu_buffer = nullptr; + NVTE_CHECK_CUDA(cudaMalloc(&gpu_buffer, bytes_)); + gpu_buffer_ = gpu_buffer; + NVTE_CHECK_CUDA(cudaMemset(gpu_buffer_.get(), 0, bytes_)); + } +} + +void Tensor::Buffer::to_cpu() { + if (bytes_ > 0) { + NVTE_CHECK_CUDA(cudaMemcpy(cpu_buffer_.get(), gpu_buffer_.get(), bytes_, cudaMemcpyDeviceToHost)); + } +} + +void Tensor::Buffer::from_cpu() { + if (bytes_ > 0) { + NVTE_CHECK_CUDA(cudaMemcpy(gpu_buffer_.get(), cpu_buffer_.get(), bytes_, cudaMemcpyHostToDevice)); + } +} + +void Tensor::Buffer::GPUDeleter::operator() (void *ptr) { + if (ptr != nullptr) { + cudaFree(ptr); + } +} + Tensor::Tensor(const std::string& name, const NVTEShape &shape, const DType type, const bool rowwise, const bool columnwise, @@ -304,31 +334,13 @@ Tensor::Tensor(const std::string& name, flattened_shape = convertShape(flattened_shape_vec); } - // Allocate and initialize data - void *dptr_rowwise = nullptr, *dptr_columnwise = nullptr; - const size_t total_size = bytes(shape, type); - if (total_size != 0) { - if (rowwise) { - cudaMalloc((void**)&dptr_rowwise, total_size); // NOLINT(*) - cudaMemset(dptr_rowwise, 0, total_size); - cpu_data_rowwise_ = std::make_unique(total_size); - std::fill_n(cpu_data_rowwise_.get(), total_size, 0); - } - if (columnwise) { - cudaMalloc((void**)&dptr_columnwise, total_size); // NOLINT(*) - cudaMemset(dptr_columnwise, 0, total_size); - cpu_data_columnwise_ = std::make_unique(total_size); - std::fill_n(cpu_data_columnwise_.get(), total_size, 0); - } - } - - // Set tensor row-wise data + // Allocate row-wise data if (rowwise) { - const DType rowwise_type = (scaling_mode == NVTE_NVFP4_1D_SCALING) ? DType::kFloat4E2M1 : type; - tensor_.set_rowwise_data(dptr_rowwise, rowwise_type, shape); + data_rowwise_ = Tensor::Buffer(product(shape), type); + tensor_.set_rowwise_data(data_rowwise_.gpu_buffer(), type, shape); } - // Set tensor column-wise data + // Allocate column-wise data if (columnwise) { // Determine shape of column-wise data std::vector columnwise_shape_vec; @@ -359,162 +371,113 @@ Tensor::Tensor(const std::string& name, const auto columnwise_shape = nvte_make_shape(columnwise_shape_vec.data(), columnwise_shape_vec.size()); - // Set column-wise data buffer - const DType colwise_type = (scaling_mode == NVTE_NVFP4_1D_SCALING) ? DType::kFloat4E2M1 : type; - tensor_.set_columnwise_data(dptr_columnwise, colwise_type, columnwise_shape); + // Allocate buffer + data_columnwise_ = Tensor::Buffer(product(columnwise_shape), type); + + // Configure TE tensor + tensor_.set_columnwise_data(data_columnwise_.gpu_buffer(), type, columnwise_shape); } - // Configure scales, amaxes, and other tensor buffers - float *amax = nullptr, *scale = nullptr; - float *rowwise_scale_inv = nullptr, *columnwise_scale_inv = nullptr; - if (isFp8Type(type) || isFp4Type(type)) { - if (scaling_mode == NVTE_DELAYED_TENSOR_SCALING) { - cudaMalloc((void**)&amax, sizeof(float)); // NOLINT(*) - cudaMemset(amax, 0, sizeof(float)); - cudaMalloc((void**)&scale, sizeof(float)); // NOLINT(*) - cudaMemset(scale, 0, sizeof(float)); - amax_cpu_data_ = std::make_shared(0); - scale_cpu_data_ = std::make_shared(0); - tensor_.set_amax(amax, DType::kFloat32, std::vector{1}); - tensor_.set_scale(scale, DType::kFloat32, std::vector{1}); - cudaMalloc((void**)&rowwise_scale_inv, sizeof(float)); // NOLINT(*) + // Allocate recipe-specific buffers + switch (scaling_mode) { + case NVTE_DELAYED_TENSOR_SCALING: + if (isFp8Type(type)) { + amax_ = Tensor::Buffer(1, DType::kFloat32); + scale_ = Tensor::Buffer(1, DType::kFloat32); + scale_inv_rowwise_ = Tensor::Buffer(1, DType::kFloat32); + tensor_.set_amax(amax_.gpu_buffer(), DType::kFloat32, std::vector{1}); + tensor_.set_scale(scale_.gpu_buffer(), DType::kFloat32, std::vector{1}); if (rowwise) { - tensor_.set_rowwise_scale_inv(rowwise_scale_inv, DType::kFloat32, - std::vector{1}); - rowwise_scale_inv_cpu_data_ = std::make_unique(sizeof(float)); - std::fill_n(rowwise_scale_inv_cpu_data_.get(), sizeof(float), 0); + tensor_.set_rowwise_scale_inv(scale_inv_rowwise_.gpu_buffer(), DType::kFloat32, std::vector{1}); } if (columnwise) { - tensor_.set_columnwise_scale_inv(rowwise_scale_inv, DType::kFloat32, - std::vector{1}); - columnwise_scale_inv_cpu_data_ = std::make_unique(sizeof(float)); - std::fill_n(columnwise_scale_inv_cpu_data_.get(), sizeof(float), 0); - } - } else { - if (scaling_mode == NVTE_NVFP4_1D_SCALING) { - cudaMalloc((void**)&amax, sizeof(float)); // NOLINT(*) - cudaMemset(amax, 0, sizeof(float)); - amax_cpu_data_ = std::make_shared(0); - tensor_.set_amax(amax, DType::kFloat32, std::vector{1}); + tensor_.set_columnwise_scale_inv(scale_inv_rowwise_.gpu_buffer(), DType::kFloat32, std::vector{1}); } + } + break; + case NVTE_MXFP8_1D_SCALING: + case NVTE_BLOCK_SCALING_1D: + case NVTE_BLOCK_SCALING_2D: + case NVTE_NVFP4_1D_SCALING:: + { + // Block scaling factors auto [rowwise_scale_meta, colwise_scale_meta] = get_scales(flattened_shape, tensor_.scaling_mode()); - auto rowwise_scale_size = rowwise_scale_meta.bytes(); - auto columnwise_scale_size = colwise_scale_meta.bytes(); - auto scale_shape = rowwise_scale_meta.shape; - auto columnwise_scale_shape = colwise_scale_meta.shape; if (rowwise) { - cudaMalloc((void **)&rowwise_scale_inv, rowwise_scale_size); // NOLINT(*) - cudaMemset(rowwise_scale_inv, 0, rowwise_scale_size); - rowwise_scale_inv_cpu_data_ = std::make_unique(rowwise_scale_size); - std::fill_n(rowwise_scale_inv_cpu_data_.get(), rowwise_scale_size, 0); - auto scale_dtype = rowwise_scale_meta.type; - tensor_.set_rowwise_scale_inv(rowwise_scale_inv, scale_dtype, scale_shape); + const auto scale_shape = rowwise_scale_meta.shape; + const auto scale_dtype = rowwise_scale_meta.dtype; + scale_inv_rowwise_ = Tensor::Buffer(product(scale_shape), scale_dtype); + tensor_.set_rowwise_scale_inv(scale_inv_rowwise_.gpu_buffer(), scale_dtype, scale_shape); } if (columnwise) { - cudaMalloc((void**)&columnwise_scale_inv, columnwise_scale_size); // NOLINT(*) - cudaMemset(columnwise_scale_inv, 0, columnwise_scale_size); - columnwise_scale_inv_cpu_data_ = std::make_unique(columnwise_scale_size); - std::fill_n(columnwise_scale_inv_cpu_data_.get(), columnwise_scale_size, 0); - auto scale_dtype = colwise_scale_meta.type; - tensor_.set_columnwise_scale_inv(columnwise_scale_inv, scale_dtype, columnwise_scale_shape); + const auto scale_shape = columnwise_scale_meta.shape; + const auto scale_dtype = columnwise_scale_meta.dtype; + scale_inv_columnwise_ = Tensor::Buffer(product(scale_shape), scale_dtype); + tensor_.set_columnwise_scale_inv(scale_inv_columnwise_.gpu_buffer(), scale_dtype, scale_shape); } - } - } - // Sanity check that CPU and GPU have corresponding buffers - NVTE_CHECK((cpu_data_rowwise_ == nullptr) == (tensor_.dptr() == nullptr)); - NVTE_CHECK((cpu_data_columnwise_ == nullptr) == (tensor_.columnwise_dptr() == nullptr)); - NVTE_CHECK((rowwise_scale_inv_cpu_data_ == nullptr) == (tensor_.scale_inv() == nullptr)); - NVTE_CHECK((columnwise_scale_inv_cpu_data_ == nullptr) - == (tensor_.get_columnwise_scale_inv().data_ptr == nullptr)); - NVTE_CHECK((amax_cpu_data_ == nullptr) == (tensor_.amax() == nullptr)); - NVTE_CHECK((scale_cpu_data_ == nullptr) == (tensor_.scale() == nullptr)); -} - -Tensor::~Tensor() { - std::unordered_set freed_ptrs; - auto free_cuda_buffer = [&freed_ptrs] (void *ptr) -> void { - if (ptr != nullptr && freed_ptrs.count(ptr) > 0) { - cudaFree(ptr); - freed_ptrs.insert(ptr); + // NVFP4 uses amax for tensor scaling + if (scaling_mode == NVTE_NVFP4_1D_SCALING) { + amax_ = Tensor::Buffer(1, DType::kFloat32); + } } - }; - free_cuda_buffer(tensor_.dptr()); - free_cuda_buffer(tensor_.scale_inv()); - free_cuda_buffer(tensor_.scale()); - free_cuda_buffer(tensor_.amax()); - free_cuda_buffer(tensor_.columnwise_dptr()); - free_cuda_buffer(tensor_.get_columnwise_scale_inv().data_ptr); + break; + default: + NVTE_ERROR("Unsupported tensor format (", static_cast(scaling_mode), ")"); + } } void Tensor::to_cpu() const { - auto from_basic_tensor = [] (const NVTEBasicTensor src, void *dst) -> void { - if (dst != nullptr) { - const size_t copy_size = bytes(src.shape, static_cast(src.dtype)); - cudaMemcpy(dst, src.data_ptr, copy_size, cudaMemcpyDeviceToHost); - } - }; - from_basic_tensor(tensor_.get_rowwise_data(), cpu_data_rowwise_.get()); - from_basic_tensor(tensor_.get_columnwise_data(), cpu_data_columnwise_.get()); - from_basic_tensor(tensor_.get_rowwise_scale_inv(), rowwise_scale_inv_cpu_data_.get()); - from_basic_tensor(tensor_.get_columnwise_scale_inv(), columnwise_scale_inv_cpu_data_.get()); - from_basic_tensor(tensor_.get_amax(), amax_cpu_data_.get()); - from_basic_tensor(tensor_.get_scale(), scale_cpu_data_.get()); + data_rowwise_.to_cpu(); + data_columnwise_.to_cpu(); + scale_inv_rowwise_.to_cpu(); + scale_inv_columnwise_.to_cpu(); + amax_.to_cpu(); + scale_.to_cpu(); } void Tensor::from_cpu() const { - auto to_basic_tensor = [] (const void *src, NVTEBasicTensor dst) -> void { - if (src != nullptr) { - const size_t copy_size = bytes(dst.shape, static_cast(dst.dtype)); - cudaMemcpy(dst.data_ptr, src, copy_size, cudaMemcpyHostToDevice); - } - }; - to_basic_tensor(cpu_data_rowwise_.get(), tensor_.get_rowwise_data()); - to_basic_tensor(cpu_data_columnwise_.get(), tensor_.get_columnwise_data()); - to_basic_tensor(rowwise_scale_inv_cpu_data_.get(), tensor_.get_rowwise_scale_inv()); - to_basic_tensor(columnwise_scale_inv_cpu_data_.get(), tensor_.get_columnwise_scale_inv()); - to_basic_tensor(amax_cpu_data_.get(), tensor_.get_amax()); - to_basic_tensor(scale_cpu_data_.get(), tensor_.get_scale()); + data_rowwise_.from_cpu(); + data_columnwise_.from_cpu(); + scale_inv_rowwise_.from_cpu(); + scale_inv_columnwise_.from_cpu(); + amax_.from_cpu(); + scale_.from_cpu(); } void Tensor::set_amax(float amax) { - NVTE_CHECK(amax_cpu_data_); - NVTE_CHECK(tensor_.get_amax().dtype == kNVTEFloat32); - NVTE_CHECK(product(tensor_.get_amax().shape) == 1); - *amax_cpu_data_ = amax; - from_cpu(); + NVTE_CHECK(amax_.size() == 1); + NVTE_CHECK(amax_.dtype() == kNVTEFloat32); + *amax_.cpu_data() = amax; + amax_.from_gpu(); } void Tensor::set_scale(float scale) { - NVTE_CHECK(scale_cpu_data_); - NVTE_CHECK(tensor_.get_scale().dtype == kNVTEFloat32); - NVTE_CHECK(product(tensor_.get_scale().shape) == 1); - *scale_cpu_data_ = scale; - from_cpu(); + NVTE_CHECK(scale_.size() == 1); + NVTE_CHECK(scale_.dtype() == kNVTEFloat32); + *scale_.cpu_data() = scale; + scale_.from_gpu(); } void Tensor::set_scale_inv(float scale_inv) { - NVTE_CHECK(rowwise_scale_inv_cpu_data_); - NVTE_CHECK(columnwise_scale_inv_cpu_data_ == nullptr); // TODO Not needed. Remove once all incorrect function calls have been removed - NVTE_CHECK(product(tensor_.get_rowwise_scale_inv().shape) == 1); - NVTE_CHECK(tensor_.get_rowwise_scale_inv().dtype == kNVTEFloat32); - *reinterpret_cast(rowwise_scale_inv_cpu_data_.get()) = scale_inv; - from_cpu(); + NVTE_CHECK(scale_inv_rowwise_.size() == 1); + NVTE_CHECK(scale_inv_rowwise_.dtype() == kNVTEFloat32); + *scale_inv_rowwise_.cpu_data() = scale_inv; + scale_inv_rowwise_.from_gpu(); } void Tensor::fill_uniform_rowwise_scale_inv() { - if (rowwise_scale_inv_cpu_data_ == nullptr) { + if (scale_inv_rowwise_.size() == 0) { return; } // Generate random scales on CPU - const auto numel = product(tensor_.get_rowwise_scale_inv().shape); - const auto dtype = tensor_.get_rowwise_scale_inv().dtype; + const auto numel = scale_inv_rowwise_.size(); + const auto dtype = scale_inv_rowwise_.dtype(); switch (dtype) { case kNVTEFloat32: { - auto *cpu_data = reinterpret_cast(scale_cpu_data_.get()); - std::uniform_real_distribution<> dis(-2.0, 1.0); + auto *cpu_data = scale_inv_rowwise_.cpu_data(); + std::uniform_real_distribution dis(-2.0, 1.0); for (size_t i = 0; i < numel; ++i) { cpu_data[i] = dis(gen_); } @@ -524,7 +487,7 @@ void Tensor::fill_uniform_rowwise_scale_inv() { case kNVTEFloat8E8M0: case kNVTEByte: { - auto *cpu_data = reinterpret_cast(scale_cpu_data_.get()); + auto *cpu_data = reinterpret_cast(scale_inv_rowwise_.cpu_data()); std::uniform_int_distribution dis(0, 127); for (size_t i = 0; i < numel; ++i) { cpu_data[i] = dis(gen_); @@ -537,22 +500,22 @@ void Tensor::fill_uniform_rowwise_scale_inv() { } // Update GPU tensor - from_cpu(); + scale_inv_rowwise_.from_cpu(); } void Tensor::fill_uniform_columnwise_scale_inv() { - if (columnwise_scale_inv_cpu_data_ == nullptr) { + if (scale_inv_columnwise_.size() == 0) { return; } // Generate random scales on CPU - const auto numel = product(tensor_.get_columnwise_scale_inv().shape); - const auto dtype = tensor_.get_columnwise_scale_inv().dtype; + const auto numel = scale_inv_columnwise_.size(); + const auto dtype = scale_inv_columnwise_.dtype(); switch (dtype) { case kNVTEFloat32: { - auto *cpu_data = reinterpret_cast(scale_cpu_data_.get()); - std::uniform_real_distribution<> dis(-2.0, 1.0); + auto *cpu_data = scale_inv_columnwise_.cpu_data(); + std::uniform_real_distribution dis(-2.0, 1.0); for (size_t i = 0; i < numel; ++i) { cpu_data[i] = dis(gen_); } @@ -562,7 +525,7 @@ void Tensor::fill_uniform_columnwise_scale_inv() { case kNVTEFloat8E8M0: case kNVTEByte: { - auto *cpu_data = reinterpret_cast(scale_cpu_data_.get()); + auto *cpu_data = reinterpret_cast(scale_inv_columnwise_.cpu_data()); std::uniform_int_distribution dis(0, 127); for (size_t i = 0; i < numel; ++i) { cpu_data[i] = dis(gen_); @@ -575,55 +538,25 @@ void Tensor::fill_uniform_columnwise_scale_inv() { } // Update GPU tensor - from_cpu(); + scale_inv_columnwise_.from_cpu(); } void Tensor::fill_uniform_scale() { - if (scale_cpu_data_ == nullptr) { + if (scale_.size() == 0) { return; } // Generate random scales on CPU - auto *cpu_data = reinterpret_cast(scale_cpu_data_.get()); - const auto numel = product(tensor_.get_columnwise_scale_inv().shape); - NVTE_CHECK(tensor_.get_scale().dtype == kNVTEFloat32); - std::uniform_real_distribution<> dis(-2.0, 1.0); + auto *cpu_data = scale_.cpu_data(); + const auto numel = scale_.size(); + NVTE_CHECK(scale_.dtype() == kNVTEFloat32); + std::uniform_real_distribution dis(-2.0, 1.0); for (size_t i = 0; i < numel; ++i) { cpu_data[i] = dis(gen_); } // Update GPU tensor - from_cpu(); -} - -void Tensor::shareFP8Meta(const Tensor &other) { - if ((isFp8Type(dtype()) && isFp8Type(other.dtype())) - || isFp4Type(dtype()) && isFp4Type(other.dtype())) { - auto new_tensor = TensorWrapper(other.tensor_.scaling_mode()); - auto my_rowwise_data = tensor_.get_rowwise_data(); - new_tensor.set_rowwise_data(my_rowwise_data.data_ptr, static_cast(my_rowwise_data.dtype), - my_rowwise_data.shape); - auto my_columnwise_data = tensor_.get_columnwise_data(); - new_tensor.set_columnwise_data(my_columnwise_data.data_ptr, - static_cast(my_columnwise_data.dtype), - my_columnwise_data.shape); - auto other_amax = other.tensor_.get_amax(); - new_tensor.set_amax(other_amax.data_ptr, static_cast(other_amax.dtype), - other_amax.shape); - auto other_scale = other.tensor_.get_scale(); - new_tensor.set_scale(other_scale.data_ptr, static_cast(other_scale.dtype), - other_scale.shape); - auto other_row_scale_inv = other.tensor_.get_rowwise_scale_inv(); - new_tensor.set_rowwise_scale_inv(other_row_scale_inv.data_ptr, - static_cast(other_row_scale_inv.dtype), - other_row_scale_inv.shape); - auto other_col_scale_inv = other.tensor_.get_columnwise_scale_inv(); - new_tensor.set_columnwise_scale_inv(other_col_scale_inv.data_ptr, - static_cast(other_col_scale_inv.dtype), - other_col_scale_inv.shape); - tensor_ = std::move(new_tensor); - to_cpu(); - } + scale_.from_cpu(); } using std::to_string; diff --git a/tests/cpp/test_common.h b/tests/cpp/test_common.h index 7696d9a36e..8137c184a4 100644 --- a/tests/cpp/test_common.h +++ b/tests/cpp/test_common.h @@ -139,7 +139,7 @@ class Tensor { const NVTEScalingMode &mode = NVTE_DELAYED_TENSOR_SCALING) : Tensor(name, nvte_make_shape(shape.data(), shape.size()), type, rowwise, columnwise, mode) {} - Tensor() {} + Tensor() = default; Tensor& operator=(const Tensor &other) = delete; Tensor(const Tensor &other) = delete; @@ -147,7 +147,7 @@ class Tensor { Tensor(Tensor &&other) = default; Tensor& operator=(Tensor &&other) = default; - ~Tensor(); + ~Tensor() = default; NVTETensor data() const noexcept { return tensor_.data(); } @@ -185,71 +185,45 @@ class Tensor { template T *rowwise_cpu_dptr() const { - NVTE_CHECK(TypeInfo::dtype == tensor_.dtype(), "Invalid type!"); - NVTE_CHECK(rowwise_, "Tensor does not have rowwise data!"); - return reinterpret_cast(cpu_data_rowwise_.get()); + return data_rowwise_.cpu_buffer(); } template T *columnwise_cpu_dptr() const { - NVTE_CHECK(TypeInfo::dtype == tensor_.dtype(), "Invalid type!"); - NVTE_CHECK(columnwise_, "Tensor does not have columnwise data!"); - return reinterpret_cast(cpu_data_columnwise_.get()); + return data_columnwise_.cpu_buffer(); } - float amax() const { - NVTE_CHECK(amax_cpu_data_); - NVTE_CHECK(tensor_.get_amax().dtype == kNVTEFloat32); - NVTE_CHECK(product(tensor_.get_amax().shape) == 1); - to_cpu(); - return *amax_cpu_data_; + float amax() { + NVTE_CHECK(amax_.size() == 1); + NVTE_CHECK(amax_.dtype() == kNVTEFloat32); + amax_.to_cpu(); + return *amax_.cpu_data(); } - float scale() const { - NVTE_CHECK(scale_cpu_data_); - NVTE_CHECK(tensor_.get_scale().dtype == kNVTEFloat32); - NVTE_CHECK(product(tensor_.get_scale().shape) == 1); - to_cpu(); - return *scale_cpu_data_; + float scale() { + NVTE_CHECK(scale_.size() == 1); + NVTE_CHECK(scale_.dtype() == kNVTEFloat32); + scale_.to_cpu(); + return *scale_.cpu_data(); } template T *rowwise_cpu_scale_inv_ptr(){ - if (tensor_.scaling_mode() == NVTE_DELAYED_TENSOR_SCALING){ - NVTE_CHECK(TypeInfo::dtype == DType::kFloat32, "Invalid type!"); - } else if (tensor_.scaling_mode() == NVTE_BLOCK_SCALING_1D || tensor_.scaling_mode() == NVTE_BLOCK_SCALING_2D) { - NVTE_CHECK(TypeInfo::dtype == DType::kFloat32, "Invalid type!"); - } else if (tensor_.scaling_mode() == NVTE_NVFP4_1D_SCALING) { - NVTE_CHECK(TypeInfo::dtype == DType::kFloat8E4M3, "Invalid type!"); - } else { - NVTE_CHECK(TypeInfo::dtype == DType::kByte, "Invalid type!"); - } - to_cpu(); - return reinterpret_cast(rowwise_scale_inv_cpu_data_.get()); + scale_inv_rowwise_.to_cpu(); + return scale_inv_rowwise_.cpu_buffer(); } template T *columnwise_cpu_scale_inv_ptr(){ - if (tensor_.scaling_mode() == NVTE_DELAYED_TENSOR_SCALING){ - NVTE_CHECK(TypeInfo::dtype == DType::kFloat32, "Invalid type!"); - } else if (tensor_.scaling_mode() == NVTE_BLOCK_SCALING_1D || tensor_.scaling_mode() == NVTE_BLOCK_SCALING_2D) { - NVTE_CHECK(TypeInfo::dtype == DType::kFloat32, "Invalid type!"); - } else if (tensor_.scaling_mode() == NVTE_NVFP4_1D_SCALING) { - NVTE_CHECK(TypeInfo::dtype == DType::kFloat8E4M3, "Invalid type!"); - } else { - NVTE_CHECK(TypeInfo::dtype == DType::kByte, "Invalid type!"); - } - to_cpu(); - return reinterpret_cast(columnwise_scale_inv_cpu_data_.get()); + scale_inv_columnwise_.to_cpu(); + return scale_inv_columnwise_.cpu_buffer(); } float rowwise_scale_inv(){ - to_cpu(); - NVTE_CHECK(rowwise_scale_inv_cpu_data_); - auto scale_inv_tensor = tensor_.get_rowwise_scale_inv(); - NVTE_CHECK(product(scale_inv_tensor.shape) == 1); - NVTE_CHECK(scale_inv_tensor.dtype == kNVTEFloat32); - return *reinterpret_cast(rowwise_scale_inv_cpu_data_.get()); + NVTE_CHECK(scale_inv_rowwise_.size() == 1); + NVTE_CHECK(scale_inv_rowwise_.dtype() == kNVTEFloat32); + scale_inv_rowwise_.to_cpu(); + return *scale_inv_rowwise_.cpu_data(); } bool rowwise() const { @@ -278,18 +252,79 @@ class Tensor { void fill_uniform_rowwise_scale_inv(); void fill_uniform_columnwise_scale_inv(); void fill_uniform_scale(); - void shareFP8Meta(const Tensor &other); std::mt19937& gen() { return gen_; } private: + + /* Manages matching GPU and CPU buffers. */ + class Buffer { + public: + + Buffer(size_t size = 0, DType dtype = DType::kByte); + ~Buffer() = default; + Buffer(const Buffer&) = delete; + Buffer& operator=(const Buffer&) = delete; + Buffer(Buffer&&) = default; + Buffer& operator=(Buffer&&) = default; + + size_t size() const noexcept { return size_; } + DType dtype() const noexcept { return dtype_; } + + // Void pointer accessors + void *cpu_buffer() { return cpu_buffer_.get(); } + const void *cpu_buffer() const { return cpu_buffer_.get(); } + void *gpu_buffer() { return gpu_buffer_.get(); } + const void *gpu_buffer() const { return gpu_buffer_.get(); } + + // Templated pointer accessors + template + T *cpu_buffer() { + NVTE_CHECK(TypeInfo::dtype == dtype_, "Invalid type."); + return reinterpret_cast(cpu_buffer());; + } + template + const T *cpu_buffer() const { + return const_cast(this)->cpu_buffer(); + } + template + T *gpu_buffer() { + NVTE_CHECK(TypeInfo::dtype == dtype_, "Invalid type."); + return reinterpret_cast(gpu_buffer());; + } + template + const T *gpu_buffer() const { + return const_cast(this)->gpu_buffer(); + } + + // Memory transfers between CPU and GPU + void to_cpu(); + void from_cpu(); + + private: + + struct GPUDeleter { + void operator()(void *ptr); + }; + + std::unique_ptr cpu_buffer_; + std::unique_ptr gpu_buffer_; + size_t size_; + DType dtype_; + size_t bytes_; + }; + + // Transformer Engine tensor TensorWrapper tensor_; - std::unique_ptr cpu_data_rowwise_; - std::unique_ptr cpu_data_columnwise_; - std::shared_ptr amax_cpu_data_; - std::shared_ptr scale_cpu_data_; - std::unique_ptr rowwise_scale_inv_cpu_data_; - std::unique_ptr columnwise_scale_inv_cpu_data_; + + // Data buffers + Buffer data_rowwise_; + Buffer data_columnwise_; + Buffer scale_inv_rowwise_; + Buffer scale_inv_columnwise_; + Buffer amax_; + Buffer scale_; + bool rowwise_; bool columnwise_; std::string name_; From ea63f052bf7659b2822d043e43710254a9c737be Mon Sep 17 00:00:00 2001 From: Tim Moon Date: Tue, 5 May 2026 04:37:08 +0000 Subject: [PATCH 04/10] Fix bugs in C++ test tensor infrastructure - Fix syntax error in switch case (:: -> :) - Fix double-underscore typo in variable name - Fix wrong buffer passed to set_amax_columnwise - Fix unique_ptr assignment from raw pointer (use reset()) - Remove dead duplicate NVTE_MXFP8_1D_SCALING branch in get_scales() - Rename cpu_data -> cpu_buffer to match Buffer class API - Remove const from Tensor::to_cpu/from_cpu and their callers, since both methods write to the CPU buffer Co-Authored-By: Claude Sonnet 4.6 Signed-off-by: Tim Moon --- .../cpp/operator/test_cast_nvfp4_transpose.cu | 2 +- tests/cpp/test_common.cu | 67 ++++++------------- tests/cpp/test_common.h | 20 +++--- 3 files changed, 31 insertions(+), 58 deletions(-) diff --git a/tests/cpp/operator/test_cast_nvfp4_transpose.cu b/tests/cpp/operator/test_cast_nvfp4_transpose.cu index 15d7c695c9..4ab06abcde 100644 --- a/tests/cpp/operator/test_cast_nvfp4_transpose.cu +++ b/tests/cpp/operator/test_cast_nvfp4_transpose.cu @@ -502,7 +502,7 @@ void print_detailed_tensor_comparison(const std::string& name, printf("==================================\n"); } -void compareResults_nvfp4(const Tensor &test, +void compareResults_nvfp4(Tensor &test, const void *ref, const void *ref_t, const int rows, const int cols, double atol = 1e-5, double rtol = 1e-8, bool if_on_gpus = true, bool dump_data = false) { if (if_on_gpus) test.to_cpu(); diff --git a/tests/cpp/test_common.cu b/tests/cpp/test_common.cu index 186f80d85e..4902989fbb 100644 --- a/tests/cpp/test_common.cu +++ b/tests/cpp/test_common.cu @@ -194,33 +194,6 @@ std::pair get_scales(const NVTEShape& shape, return {ret_rowwise, ret_colwise}; } - if (scaling_mode == NVTE_MXFP8_1D_SCALING) { - std::vector shape_vec; - for (size_t i = 0; i < shape.ndim; ++i) { - shape_vec.push_back(shape.data[i]); - } - size_t first_dim = first_dimension(shape_vec); - size_t last_dim = last_dimension(shape_vec); - - scale_inv_meta ret_rowwise, ret_colwise; - - const size_t block_size_X_rowwise = 32; - size_t scale_dim_Y_rowwise = DIVUP_TO_MULTIPLE(first_dim, scale_tensor_alignment_Y_rowwise); - size_t scale_dim_X_rowwise = DIVUP_TO_MULTIPLE(DIVUP(last_dim, block_size_X_rowwise), scale_tensor_alignment_X_rowwise); - ret_rowwise.shape = {scale_dim_Y_rowwise, scale_dim_X_rowwise}; - - const size_t block_size_Y_colwise = 32; - size_t scale_dim_Y_colwise = DIVUP_TO_MULTIPLE(DIVUP(first_dim, block_size_Y_colwise), scale_tensor_alignment_Y_colwise); - size_t scale_dim_X_colwise = DIVUP_TO_MULTIPLE(last_dim, scale_tensor_alignment_X_colwise); - ret_colwise.shape = {scale_dim_Y_colwise, scale_dim_X_colwise}; - - ret_rowwise.type = DType::kFloat8E8M0; - ret_colwise.type = DType::kFloat8E8M0; - ret_rowwise.type_size_bits = typeToNumBits(DType::kFloat8E8M0); - ret_colwise.type_size_bits = typeToNumBits(DType::kFloat8E8M0); - - return {ret_rowwise, ret_colwise}; - } if (scaling_mode == NVTE_BLOCK_SCALING_2D) { std::vector shape_vec; for (size_t i = 0; i < shape.ndim; ++i) { @@ -280,11 +253,11 @@ std::pair get_scales(const NVTEShape& shape, Tensor::Buffer::Buffer(size_t size, DType dtype) : size_{size}, dtype_{dtype}, bytes_{size * typeToNumBits(dtype) / 8} { if (bytes_ > 0) { - cpu_buffer_ = new unsigned char[bytes_]; + cpu_buffer_.reset(new unsigned char[bytes_]); std::memset(cpu_buffer_.get(), 0, bytes_); - void *gpu_buffer = nullptr; + unsigned char *gpu_buffer = nullptr; NVTE_CHECK_CUDA(cudaMalloc(&gpu_buffer, bytes_)); - gpu_buffer_ = gpu_buffer; + gpu_buffer_.reset(gpu_buffer); NVTE_CHECK_CUDA(cudaMemset(gpu_buffer_.get(), 0, bytes_)); } } @@ -398,7 +371,7 @@ Tensor::Tensor(const std::string& name, case NVTE_MXFP8_1D_SCALING: case NVTE_BLOCK_SCALING_1D: case NVTE_BLOCK_SCALING_2D: - case NVTE_NVFP4_1D_SCALING:: + case NVTE_NVFP4_1D_SCALING: { // Block scaling factors auto [rowwise_scale_meta, colwise_scale_meta] = get_scales(flattened_shape, tensor_.scaling_mode()); @@ -418,9 +391,9 @@ Tensor::Tensor(const std::string& name, // NVFP4 uses amax for tensor scaling if (scaling_mode == NVTE_NVFP4_1D_SCALING) { amax_rowwise_ = Tensor::Buffer(1, DType::kFloat32); - amax_columnwise__ = Tensor::Buffer(1, DType::kFloat32); + amax_columnwise_ = Tensor::Buffer(1, DType::kFloat32); tensor_.set_amax(amax_rowwise_.gpu_buffer(), DType::kFloat32, std::vector{1}); - tensor_.set_amax_columnwise(amax_rowwise_.gpu_buffer(), DType::kFloat32, std::vector{1}); + tensor_.set_amax_columnwise(amax_columnwise_.gpu_buffer(), DType::kFloat32, std::vector{1}); } } break; @@ -429,7 +402,7 @@ Tensor::Tensor(const std::string& name, } } -void Tensor::to_cpu() const { +void Tensor::to_cpu() { data_rowwise_.to_cpu(); data_columnwise_.to_cpu(); scale_inv_rowwise_.to_cpu(); @@ -439,7 +412,7 @@ void Tensor::to_cpu() const { scale_.to_cpu(); } -void Tensor::from_cpu() const { +void Tensor::from_cpu() { data_rowwise_.from_cpu(); data_columnwise_.from_cpu(); scale_inv_rowwise_.from_cpu(); @@ -452,21 +425,21 @@ void Tensor::from_cpu() const { void Tensor::set_amax(float amax) { NVTE_CHECK(amax_rowwise_.size() == 1); NVTE_CHECK(amax_rowwise_.dtype() == kNVTEFloat32); - *amax_rowwise_.cpu_data() = amax; + *amax_rowwise_.cpu_buffer() = amax; amax_rowwise_.from_cpu(); } void Tensor::set_scale(float scale) { NVTE_CHECK(scale_.size() == 1); NVTE_CHECK(scale_.dtype() == kNVTEFloat32); - *scale_.cpu_data() = scale; + *scale_.cpu_buffer() = scale; scale_.from_cpu(); } void Tensor::set_scale_inv(float scale_inv) { NVTE_CHECK(scale_inv_rowwise_.size() == 1); NVTE_CHECK(scale_inv_rowwise_.dtype() == kNVTEFloat32); - *scale_inv_rowwise_.cpu_data() = scale_inv; + *scale_inv_rowwise_.cpu_buffer() = scale_inv; scale_inv_rowwise_.from_cpu(); } @@ -477,7 +450,7 @@ void Tensor::set_tensor_amax(float amax) { void Tensor::set_tensor_amax_columnwise(float amax) { NVTE_CHECK(amax_columnwise_.size() == 1); NVTE_CHECK(amax_columnwise_.dtype() == kNVTEFloat32); - *amax_columnwise_.cpu_data() = amax; + *amax_columnwise_.cpu_buffer() = amax; amax_columnwise_.from_cpu(); } @@ -492,7 +465,7 @@ void Tensor::fill_uniform_rowwise_scale_inv() { switch (dtype) { case kNVTEFloat32: { - auto *cpu_data = scale_inv_rowwise_.cpu_data(); + auto *cpu_data = scale_inv_rowwise_.cpu_buffer(); std::uniform_real_distribution dis(-2.0, 1.0); for (size_t i = 0; i < numel; ++i) { cpu_data[i] = dis(gen_); @@ -503,7 +476,7 @@ void Tensor::fill_uniform_rowwise_scale_inv() { case kNVTEFloat8E8M0: case kNVTEByte: { - auto *cpu_data = reinterpret_cast(scale_inv_rowwise_.cpu_data()); + auto *cpu_data = reinterpret_cast(scale_inv_rowwise_.cpu_buffer()); std::uniform_int_distribution dis(0, 127); for (size_t i = 0; i < numel; ++i) { cpu_data[i] = dis(gen_); @@ -530,7 +503,7 @@ void Tensor::fill_uniform_columnwise_scale_inv() { switch (dtype) { case kNVTEFloat32: { - auto *cpu_data = scale_inv_columnwise_.cpu_data(); + auto *cpu_data = scale_inv_columnwise_.cpu_buffer(); std::uniform_real_distribution dis(-2.0, 1.0); for (size_t i = 0; i < numel; ++i) { cpu_data[i] = dis(gen_); @@ -541,7 +514,7 @@ void Tensor::fill_uniform_columnwise_scale_inv() { case kNVTEFloat8E8M0: case kNVTEByte: { - auto *cpu_data = reinterpret_cast(scale_inv_columnwise_.cpu_data()); + auto *cpu_data = reinterpret_cast(scale_inv_columnwise_.cpu_buffer()); std::uniform_int_distribution dis(0, 127); for (size_t i = 0; i < numel; ++i) { cpu_data[i] = dis(gen_); @@ -563,7 +536,7 @@ void Tensor::fill_uniform_scale() { } // Generate random scales on CPU - auto *cpu_data = scale_.cpu_data(); + auto *cpu_data = scale_.cpu_buffer(); const auto numel = scale_.size(); NVTE_CHECK(scale_.dtype() == kNVTEFloat32); std::uniform_real_distribution dis(-2.0, 1.0); @@ -600,7 +573,7 @@ std::vector unravel(const size_t i, const NVTEShape &shape) { return ret; } -void compareResults_sequential(const std::string &name, const Tensor &test, +void compareResults_sequential(const std::string &name, Tensor &test, const void *ref, const bool rowwise, double atol, double rtol, bool if_on_gpus, const size_t tolerable_mismatches_limit) { @@ -690,7 +663,7 @@ static size_t getFirstMismatchIdx(const DType data_type, const T* test_data, con return first_mismatch_idx; } -void compareResults_parallel(const std::string &name, const Tensor &test, const void *ref, +void compareResults_parallel(const std::string &name, Tensor &test, const void *ref, const bool rowwise, double atol, double rtol, bool if_on_gpus, const size_t tolerable_mismatches_limit) { if (if_on_gpus) test.to_cpu(); @@ -717,7 +690,7 @@ void compareResults_parallel(const std::string &name, const Tensor &test, const ); } -void compareResults(const std::string &name, const Tensor &test, const void *ref, +void compareResults(const std::string &name, Tensor &test, const void *ref, const bool rowwise, double atol, double rtol, bool if_on_gpus, const size_t tolerable_mismatches_limit) { constexpr bool sequential = false; diff --git a/tests/cpp/test_common.h b/tests/cpp/test_common.h index 4ea31b50b0..52138bd4c4 100644 --- a/tests/cpp/test_common.h +++ b/tests/cpp/test_common.h @@ -120,7 +120,7 @@ struct TypeInfo { } constexpr static DType dtype = getType(); - constexpr static size_t size = BitsNumber::num_bits;; + constexpr static size_t size = BitsNumber::num_bits; }; class Tensor { @@ -197,21 +197,21 @@ class Tensor { NVTE_CHECK(amax_rowwise_.size() == 1); NVTE_CHECK(amax_rowwise_.dtype() == kNVTEFloat32); amax_rowwise_.to_cpu(); - return *amax_rowwise_.cpu_data(); + return *amax_rowwise_.cpu_buffer(); } float amax_columnwise() { NVTE_CHECK(amax_columnwise_.size() == 1); NVTE_CHECK(amax_columnwise_.dtype() == kNVTEFloat32); amax_columnwise_.to_cpu(); - return *amax_columnwise_.cpu_data(); + return *amax_columnwise_.cpu_buffer(); } float scale() { NVTE_CHECK(scale_.size() == 1); NVTE_CHECK(scale_.dtype() == kNVTEFloat32); scale_.to_cpu(); - return *scale_.cpu_data(); + return *scale_.cpu_buffer(); } template @@ -230,7 +230,7 @@ class Tensor { NVTE_CHECK(scale_inv_rowwise_.size() == 1); NVTE_CHECK(scale_inv_rowwise_.dtype() == kNVTEFloat32); scale_inv_rowwise_.to_cpu(); - return *scale_inv_rowwise_.cpu_data(); + return *scale_inv_rowwise_.cpu_buffer(); } bool rowwise() const { @@ -249,8 +249,8 @@ class Tensor { tensor_.set_with_gemm_swizzled_scales(with_gemm_swizzled_scales); } - void to_cpu() const; - void from_cpu() const; + void to_cpu(); + void from_cpu(); void set_amax(float amax); void set_scale(float scale); @@ -290,7 +290,7 @@ class Tensor { template T *cpu_buffer() { NVTE_CHECK(TypeInfo::dtype == dtype_, "Invalid type."); - return reinterpret_cast(cpu_buffer());; + return reinterpret_cast(cpu_buffer()); } template const T *cpu_buffer() const { @@ -299,7 +299,7 @@ class Tensor { template T *gpu_buffer() { NVTE_CHECK(TypeInfo::dtype == dtype_, "Invalid type."); - return reinterpret_cast(gpu_buffer());; + return reinterpret_cast(gpu_buffer()); } template const T *gpu_buffer() const { @@ -489,7 +489,7 @@ size_t last_dimension(const std::vector &shape); bool areShapesEqual(const NVTEShape &s1, const NVTEShape &s2); -void compareResults(const std::string &name, const Tensor &test, const void *ref, +void compareResults(const std::string &name, Tensor &test, const void *ref, bool rowwise, double atol = 1e-5, double rtol = 1e-8, bool if_on_gpus = true, const size_t tolerable_mismatches_limit = 0); void compareResults(const std::string &name, const float test, const float ref, From 6d3e1db9eb4b954dd9684119f3141b180683ba53 Mon Sep 17 00:00:00 2001 From: Tim Moon Date: Tue, 5 May 2026 05:32:35 +0000 Subject: [PATCH 05/10] Debug compilation errors Signed-off-by: Tim Moon --- tests/cpp/operator/test_dequantize_nvfp4.cu | 2 +- tests/cpp/test_common.cu | 35 +++++++++++---------- tests/cpp/test_common.h | 12 +++---- 3 files changed, 25 insertions(+), 24 deletions(-) diff --git a/tests/cpp/operator/test_dequantize_nvfp4.cu b/tests/cpp/operator/test_dequantize_nvfp4.cu index 96e85cb5ed..20efc943b6 100644 --- a/tests/cpp/operator/test_dequantize_nvfp4.cu +++ b/tests/cpp/operator/test_dequantize_nvfp4.cu @@ -75,7 +75,7 @@ void compute_ref_dequantize_nvfp4(const uint8_t *packed_data, } template -float compute_amax(const test::Tensor &t, size_t rows, size_t cols) { +float compute_amax(test::Tensor &t, size_t rows, size_t cols) { t.to_cpu(); const auto *data = t.rowwise_cpu_dptr(); float amax = 0.0f; diff --git a/tests/cpp/test_common.cu b/tests/cpp/test_common.cu index 4902989fbb..de3fc76554 100644 --- a/tests/cpp/test_common.cu +++ b/tests/cpp/test_common.cu @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -377,13 +378,13 @@ Tensor::Tensor(const std::string& name, auto [rowwise_scale_meta, colwise_scale_meta] = get_scales(flattened_shape, tensor_.scaling_mode()); if (rowwise) { const auto scale_shape = rowwise_scale_meta.shape; - const auto scale_dtype = rowwise_scale_meta.dtype; + const auto scale_dtype = rowwise_scale_meta.type; scale_inv_rowwise_ = Tensor::Buffer(product(scale_shape), scale_dtype); tensor_.set_rowwise_scale_inv(scale_inv_rowwise_.gpu_buffer(), scale_dtype, scale_shape); } if (columnwise) { - const auto scale_shape = columnwise_scale_meta.shape; - const auto scale_dtype = columnwise_scale_meta.dtype; + const auto scale_shape = colwise_scale_meta.shape; + const auto scale_dtype = colwise_scale_meta.type; scale_inv_columnwise_ = Tensor::Buffer(product(scale_shape), scale_dtype); tensor_.set_columnwise_scale_inv(scale_inv_columnwise_.gpu_buffer(), scale_dtype, scale_shape); } @@ -393,7 +394,7 @@ Tensor::Tensor(const std::string& name, amax_rowwise_ = Tensor::Buffer(1, DType::kFloat32); amax_columnwise_ = Tensor::Buffer(1, DType::kFloat32); tensor_.set_amax(amax_rowwise_.gpu_buffer(), DType::kFloat32, std::vector{1}); - tensor_.set_amax_columnwise(amax_columnwise_.gpu_buffer(), DType::kFloat32, std::vector{1}); + tensor_.set_columnwise_amax(amax_columnwise_.gpu_buffer(), DType::kFloat32, std::vector{1}); } } break; @@ -424,21 +425,21 @@ void Tensor::from_cpu() { void Tensor::set_amax(float amax) { NVTE_CHECK(amax_rowwise_.size() == 1); - NVTE_CHECK(amax_rowwise_.dtype() == kNVTEFloat32); + NVTE_CHECK(amax_rowwise_.dtype() == DType::kFloat32); *amax_rowwise_.cpu_buffer() = amax; amax_rowwise_.from_cpu(); } void Tensor::set_scale(float scale) { NVTE_CHECK(scale_.size() == 1); - NVTE_CHECK(scale_.dtype() == kNVTEFloat32); + NVTE_CHECK(scale_.dtype() == DType::kFloat32); *scale_.cpu_buffer() = scale; scale_.from_cpu(); } void Tensor::set_scale_inv(float scale_inv) { NVTE_CHECK(scale_inv_rowwise_.size() == 1); - NVTE_CHECK(scale_inv_rowwise_.dtype() == kNVTEFloat32); + NVTE_CHECK(scale_inv_rowwise_.dtype() == DType::kFloat32); *scale_inv_rowwise_.cpu_buffer() = scale_inv; scale_inv_rowwise_.from_cpu(); } @@ -449,7 +450,7 @@ void Tensor::set_tensor_amax(float amax) { void Tensor::set_tensor_amax_columnwise(float amax) { NVTE_CHECK(amax_columnwise_.size() == 1); - NVTE_CHECK(amax_columnwise_.dtype() == kNVTEFloat32); + NVTE_CHECK(amax_columnwise_.dtype() == DType::kFloat32); *amax_columnwise_.cpu_buffer() = amax; amax_columnwise_.from_cpu(); } @@ -463,7 +464,7 @@ void Tensor::fill_uniform_rowwise_scale_inv() { const auto numel = scale_inv_rowwise_.size(); const auto dtype = scale_inv_rowwise_.dtype(); switch (dtype) { - case kNVTEFloat32: + case DType::kFloat32: { auto *cpu_data = scale_inv_rowwise_.cpu_buffer(); std::uniform_real_distribution dis(-2.0, 1.0); @@ -472,9 +473,9 @@ void Tensor::fill_uniform_rowwise_scale_inv() { } } break; - case kNVTEFloat8E4M3: - case kNVTEFloat8E8M0: - case kNVTEByte: + case DType::kFloat8E4M3: + case DType::kFloat8E8M0: + case DType::kByte: { auto *cpu_data = reinterpret_cast(scale_inv_rowwise_.cpu_buffer()); std::uniform_int_distribution dis(0, 127); @@ -501,7 +502,7 @@ void Tensor::fill_uniform_columnwise_scale_inv() { const auto numel = scale_inv_columnwise_.size(); const auto dtype = scale_inv_columnwise_.dtype(); switch (dtype) { - case kNVTEFloat32: + case DType::kFloat32: { auto *cpu_data = scale_inv_columnwise_.cpu_buffer(); std::uniform_real_distribution dis(-2.0, 1.0); @@ -510,9 +511,9 @@ void Tensor::fill_uniform_columnwise_scale_inv() { } } break; - case kNVTEFloat8E4M3: - case kNVTEFloat8E8M0: - case kNVTEByte: + case DType::kFloat8E4M3: + case DType::kFloat8E8M0: + case DType::kByte: { auto *cpu_data = reinterpret_cast(scale_inv_columnwise_.cpu_buffer()); std::uniform_int_distribution dis(0, 127); @@ -538,7 +539,7 @@ void Tensor::fill_uniform_scale() { // Generate random scales on CPU auto *cpu_data = scale_.cpu_buffer(); const auto numel = scale_.size(); - NVTE_CHECK(scale_.dtype() == kNVTEFloat32); + NVTE_CHECK(scale_.dtype() == DType::kFloat32); std::uniform_real_distribution dis(-2.0, 1.0); for (size_t i = 0; i < numel; ++i) { cpu_data[i] = dis(gen_); diff --git a/tests/cpp/test_common.h b/tests/cpp/test_common.h index 52138bd4c4..355865ef41 100644 --- a/tests/cpp/test_common.h +++ b/tests/cpp/test_common.h @@ -184,32 +184,32 @@ class Tensor { } template - T *rowwise_cpu_dptr() const { + T *rowwise_cpu_dptr() { return data_rowwise_.cpu_buffer(); } template - T *columnwise_cpu_dptr() const { + T *columnwise_cpu_dptr() { return data_columnwise_.cpu_buffer(); } float amax() { NVTE_CHECK(amax_rowwise_.size() == 1); - NVTE_CHECK(amax_rowwise_.dtype() == kNVTEFloat32); + NVTE_CHECK(amax_rowwise_.dtype() == DType::kFloat32); amax_rowwise_.to_cpu(); return *amax_rowwise_.cpu_buffer(); } float amax_columnwise() { NVTE_CHECK(amax_columnwise_.size() == 1); - NVTE_CHECK(amax_columnwise_.dtype() == kNVTEFloat32); + NVTE_CHECK(amax_columnwise_.dtype() == DType::kFloat32); amax_columnwise_.to_cpu(); return *amax_columnwise_.cpu_buffer(); } float scale() { NVTE_CHECK(scale_.size() == 1); - NVTE_CHECK(scale_.dtype() == kNVTEFloat32); + NVTE_CHECK(scale_.dtype() == DType::kFloat32); scale_.to_cpu(); return *scale_.cpu_buffer(); } @@ -228,7 +228,7 @@ class Tensor { float rowwise_scale_inv(){ NVTE_CHECK(scale_inv_rowwise_.size() == 1); - NVTE_CHECK(scale_inv_rowwise_.dtype() == kNVTEFloat32); + NVTE_CHECK(scale_inv_rowwise_.dtype() == DType::kFloat32); scale_inv_rowwise_.to_cpu(); return *scale_inv_rowwise_.cpu_buffer(); } From e324ae271cbe7d3316c76b07cbd7a983dfbf295a Mon Sep 17 00:00:00 2001 From: Tim Moon Date: Tue, 5 May 2026 23:58:01 +0000 Subject: [PATCH 06/10] Remove type check when accessing raw pointers CPU and GPU types are inconsistent, so the type checks cause too many problems. Signed-off-by: Tim Moon --- tests/cpp/test_common.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/cpp/test_common.h b/tests/cpp/test_common.h index 355865ef41..99db7f5874 100644 --- a/tests/cpp/test_common.h +++ b/tests/cpp/test_common.h @@ -289,7 +289,6 @@ class Tensor { // Templated pointer accessors template T *cpu_buffer() { - NVTE_CHECK(TypeInfo::dtype == dtype_, "Invalid type."); return reinterpret_cast(cpu_buffer()); } template @@ -298,7 +297,6 @@ class Tensor { } template T *gpu_buffer() { - NVTE_CHECK(TypeInfo::dtype == dtype_, "Invalid type."); return reinterpret_cast(gpu_buffer()); } template From 346e92a1f1b04f46cddb6fa6f353e07d22bc0684 Mon Sep 17 00:00:00 2001 From: Tim Moon Date: Wed, 6 May 2026 00:45:51 +0000 Subject: [PATCH 07/10] Debug distributed C++ tests Also adopt review suggestions from @greptile-apps. Signed-off-by: Tim Moon --- tests/cpp/test_common.cu | 15 ++++++++------- tests/cpp/test_common.h | 4 ++++ tests/cpp_distributed/test_comm_gemm.cu | 12 ++++++++---- 3 files changed, 20 insertions(+), 11 deletions(-) diff --git a/tests/cpp/test_common.cu b/tests/cpp/test_common.cu index de3fc76554..d59f7e36ca 100644 --- a/tests/cpp/test_common.cu +++ b/tests/cpp/test_common.cu @@ -937,13 +937,14 @@ void fillCase_special(Tensor *t) { }); } - // Try setting scale to 1, fallback to random scales - // Note: This is a hack to match behavior of an earlier - // implementation. Consider filling block scales with constant - // value. - try { - t->set_scale_inv(1.0); - } catch (...) { + // Fill scales + if (t->scaling_mode() == NVTE_DELAYED_TENSOR_SCALING) { + if (isFp8Type(t->dtype())) { + // FP8 tensor scale is set to 1 + t->set_scale_inv(1.0); + } + } else { + // Block scales are filled randomly t->fill_uniform_rowwise_scale_inv(); t->fill_uniform_columnwise_scale_inv(); } diff --git a/tests/cpp/test_common.h b/tests/cpp/test_common.h index 99db7f5874..1b5ceed068 100644 --- a/tests/cpp/test_common.h +++ b/tests/cpp/test_common.h @@ -185,11 +185,15 @@ class Tensor { template T *rowwise_cpu_dptr() { + NVTE_CHECK(TypeInfo::dtype == data_rowwise_.dtype(), "Invalid type!"); + NVTE_CHECK(rowwise_, "Tensor does not have columnwise data!"); return data_rowwise_.cpu_buffer(); } template T *columnwise_cpu_dptr() { + NVTE_CHECK(TypeInfo::dtype == data_columnwise_.dtype(), "Invalid type!"); + NVTE_CHECK(columnwise_, "Tensor does not have columnwise data!"); return data_columnwise_.cpu_buffer(); } diff --git a/tests/cpp_distributed/test_comm_gemm.cu b/tests/cpp_distributed/test_comm_gemm.cu index cc0d760a39..45f6664567 100644 --- a/tests/cpp_distributed/test_comm_gemm.cu +++ b/tests/cpp_distributed/test_comm_gemm.cu @@ -107,8 +107,10 @@ std::vector CopyMatrix(const std::vector& data, size_t mstart, size_t nsta template test::Tensor Make(size_t m, size_t n, float scale) { test::Tensor ret("", std::vector{n, m}, TypeInfo::dtype); - ret.set_scale(scale); - ret.set_scale_inv(1.0 / scale); + if (test::isFp8Type(TypeInfo::dtype)) { + ret.set_scale(scale); + ret.set_scale_inv(1.0 / scale); + } return ret; } @@ -116,8 +118,10 @@ template test::Tensor MakeFromData(const std::vector& data, size_t mstart, size_t nstart, size_t msize, size_t nsize, size_t ld, float scale) { test::Tensor ret("", std::vector{nsize, msize}, TypeInfo::dtype); - ret.set_scale(scale); - ret.set_scale_inv(1.0 / scale); + if (test::isFp8Type(TypeInfo::dtype)) { + ret.set_scale(scale); + ret.set_scale_inv(1.0 / scale); + } auto local = CopyMatrix(data, mstart, nstart, msize, nsize, ld); NVTE_CHECK_CUDA(cudaMemcpy(ret.rowwise_dptr(), local.data(), local.size() * sizeof local[0], cudaMemcpyDefault)); From 181cfabf67f581b6451ea8fbada6eddbde5e393d Mon Sep 17 00:00:00 2001 From: Tim Moon <4406448+timmoon10@users.noreply.github.com> Date: Tue, 5 May 2026 17:50:04 -0700 Subject: [PATCH 08/10] Remove unused header Signed-off-by: Tim Moon <4406448+timmoon10@users.noreply.github.com> --- tests/cpp/test_common.cu | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/cpp/test_common.cu b/tests/cpp/test_common.cu index d59f7e36ca..2becbc7302 100644 --- a/tests/cpp/test_common.cu +++ b/tests/cpp/test_common.cu @@ -16,7 +16,6 @@ #include #include #include -#include #include #include From b193fa7a8646cdb87b015c1bbdd75856641abb40 Mon Sep 17 00:00:00 2001 From: Tim Moon <4406448+timmoon10@users.noreply.github.com> Date: Tue, 5 May 2026 17:58:23 -0700 Subject: [PATCH 09/10] Copy-paste error Signed-off-by: Tim Moon <4406448+timmoon10@users.noreply.github.com> --- tests/cpp/test_common.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/cpp/test_common.h b/tests/cpp/test_common.h index 1b5ceed068..860fc7d7eb 100644 --- a/tests/cpp/test_common.h +++ b/tests/cpp/test_common.h @@ -186,7 +186,7 @@ class Tensor { template T *rowwise_cpu_dptr() { NVTE_CHECK(TypeInfo::dtype == data_rowwise_.dtype(), "Invalid type!"); - NVTE_CHECK(rowwise_, "Tensor does not have columnwise data!"); + NVTE_CHECK(rowwise_, "Tensor does not have rowwise data!"); return data_rowwise_.cpu_buffer(); } From 67b730f34c44aeaf70130b73f41341e2e73583c3 Mon Sep 17 00:00:00 2001 From: Tim Moon Date: Thu, 7 May 2026 03:11:10 +0000 Subject: [PATCH 10/10] Use shared buffer for FP8 row-wise scale-inv and col-wise scale-inv Signed-off-by: Tim Moon --- tests/cpp/test_common.cu | 141 +++++++++++++++++++++------------------ tests/cpp/test_common.h | 71 +++++++++++--------- 2 files changed, 115 insertions(+), 97 deletions(-) diff --git a/tests/cpp/test_common.cu b/tests/cpp/test_common.cu index 2becbc7302..925fa8b25f 100644 --- a/tests/cpp/test_common.cu +++ b/tests/cpp/test_common.cu @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -309,8 +310,8 @@ Tensor::Tensor(const std::string& name, // Allocate row-wise data if (rowwise) { - data_rowwise_ = Tensor::Buffer(product(shape), type); - tensor_.set_rowwise_data(data_rowwise_.gpu_buffer(), type, shape); + data_rowwise_.emplace(product(shape), type); + tensor_.set_rowwise_data(data_rowwise_->gpu_buffer(), type, shape); } // Allocate column-wise data @@ -345,26 +346,30 @@ Tensor::Tensor(const std::string& name, columnwise_shape_vec.size()); // Allocate buffer - data_columnwise_ = Tensor::Buffer(product(columnwise_shape), type); + data_columnwise_.emplace(product(columnwise_shape), type); // Configure TE tensor - tensor_.set_columnwise_data(data_columnwise_.gpu_buffer(), type, columnwise_shape); + tensor_.set_columnwise_data(data_columnwise_->gpu_buffer(), type, columnwise_shape); } // Allocate recipe-specific buffers switch (scaling_mode) { case NVTE_DELAYED_TENSOR_SCALING: if (isFp8Type(type)) { - amax_rowwise_ = Tensor::Buffer(1, DType::kFloat32); - scale_ = Tensor::Buffer(1, DType::kFloat32); - scale_inv_rowwise_ = Tensor::Buffer(1, DType::kFloat32); - tensor_.set_amax(amax_rowwise_.gpu_buffer(), DType::kFloat32, std::vector{1}); - tensor_.set_scale(scale_.gpu_buffer(), DType::kFloat32, std::vector{1}); + amax_rowwise_.emplace(1, DType::kFloat32); + scale_.emplace(1, DType::kFloat32); + tensor_.set_amax(amax_rowwise_->gpu_buffer(), DType::kFloat32, std::vector{1}); + tensor_.set_scale(scale_->gpu_buffer(), DType::kFloat32, std::vector{1}); + + // Use same buffer for row-wise and column-wise scale-inverse + auto scale_inv = std::make_shared(1, DType::kFloat32); if (rowwise) { - tensor_.set_rowwise_scale_inv(scale_inv_rowwise_.gpu_buffer(), DType::kFloat32, std::vector{1}); + scale_inv_rowwise_ = scale_inv; + tensor_.set_rowwise_scale_inv(scale_inv_rowwise_->gpu_buffer(), DType::kFloat32, std::vector{1}); } if (columnwise) { - tensor_.set_columnwise_scale_inv(scale_inv_rowwise_.gpu_buffer(), DType::kFloat32, std::vector{1}); + scale_inv_columnwise_ = scale_inv; + tensor_.set_columnwise_scale_inv(scale_inv_rowwise_->gpu_buffer(), DType::kFloat32, std::vector{1}); } } break; @@ -378,22 +383,22 @@ Tensor::Tensor(const std::string& name, if (rowwise) { const auto scale_shape = rowwise_scale_meta.shape; const auto scale_dtype = rowwise_scale_meta.type; - scale_inv_rowwise_ = Tensor::Buffer(product(scale_shape), scale_dtype); - tensor_.set_rowwise_scale_inv(scale_inv_rowwise_.gpu_buffer(), scale_dtype, scale_shape); + scale_inv_rowwise_ = std::make_shared(product(scale_shape), scale_dtype); + tensor_.set_rowwise_scale_inv(scale_inv_rowwise_->gpu_buffer(), scale_dtype, scale_shape); } if (columnwise) { const auto scale_shape = colwise_scale_meta.shape; const auto scale_dtype = colwise_scale_meta.type; - scale_inv_columnwise_ = Tensor::Buffer(product(scale_shape), scale_dtype); - tensor_.set_columnwise_scale_inv(scale_inv_columnwise_.gpu_buffer(), scale_dtype, scale_shape); + scale_inv_columnwise_ = std::make_shared(product(scale_shape), scale_dtype); + tensor_.set_columnwise_scale_inv(scale_inv_columnwise_->gpu_buffer(), scale_dtype, scale_shape); } // NVFP4 uses amax for tensor scaling if (scaling_mode == NVTE_NVFP4_1D_SCALING) { - amax_rowwise_ = Tensor::Buffer(1, DType::kFloat32); - amax_columnwise_ = Tensor::Buffer(1, DType::kFloat32); - tensor_.set_amax(amax_rowwise_.gpu_buffer(), DType::kFloat32, std::vector{1}); - tensor_.set_columnwise_amax(amax_columnwise_.gpu_buffer(), DType::kFloat32, std::vector{1}); + amax_rowwise_.emplace(1, DType::kFloat32); + amax_columnwise_.emplace(1, DType::kFloat32); + tensor_.set_amax(amax_rowwise_->gpu_buffer(), DType::kFloat32, std::vector{1}); + tensor_.set_columnwise_amax(amax_columnwise_->gpu_buffer(), DType::kFloat32, std::vector{1}); } } break; @@ -403,44 +408,47 @@ Tensor::Tensor(const std::string& name, } void Tensor::to_cpu() { - data_rowwise_.to_cpu(); - data_columnwise_.to_cpu(); - scale_inv_rowwise_.to_cpu(); - scale_inv_columnwise_.to_cpu(); - amax_rowwise_.to_cpu(); - amax_columnwise_.to_cpu(); - scale_.to_cpu(); + if (data_rowwise_) { data_rowwise_->to_cpu(); } + if (data_columnwise_) { data_columnwise_->to_cpu(); } + if (scale_inv_rowwise_) { scale_inv_rowwise_->to_cpu(); } + if (scale_inv_columnwise_) { scale_inv_columnwise_->to_cpu(); } + if (amax_rowwise_) { amax_rowwise_->to_cpu(); } + if (amax_columnwise_) { amax_columnwise_->to_cpu(); } + if (scale_) { scale_->to_cpu(); } } void Tensor::from_cpu() { - data_rowwise_.from_cpu(); - data_columnwise_.from_cpu(); - scale_inv_rowwise_.from_cpu(); - scale_inv_columnwise_.from_cpu(); - amax_rowwise_.from_cpu(); - amax_columnwise_.from_cpu(); - scale_.from_cpu(); + if (data_rowwise_) { data_rowwise_->from_cpu(); } + if (data_columnwise_) { data_columnwise_->from_cpu(); } + if (scale_inv_rowwise_) { scale_inv_rowwise_->from_cpu(); } + if (scale_inv_columnwise_) { scale_inv_columnwise_->from_cpu(); } + if (amax_rowwise_) { amax_rowwise_->from_cpu(); } + if (amax_columnwise_) { amax_columnwise_->from_cpu(); } + if (scale_) { scale_->from_cpu(); } } void Tensor::set_amax(float amax) { - NVTE_CHECK(amax_rowwise_.size() == 1); - NVTE_CHECK(amax_rowwise_.dtype() == DType::kFloat32); - *amax_rowwise_.cpu_buffer() = amax; - amax_rowwise_.from_cpu(); + NVTE_CHECK(amax_rowwise_); + NVTE_CHECK(amax_rowwise_->size() == 1); + NVTE_CHECK(amax_rowwise_->dtype() == DType::kFloat32); + *amax_rowwise_->cpu_buffer() = amax; + amax_rowwise_->from_cpu(); } void Tensor::set_scale(float scale) { - NVTE_CHECK(scale_.size() == 1); - NVTE_CHECK(scale_.dtype() == DType::kFloat32); - *scale_.cpu_buffer() = scale; - scale_.from_cpu(); + NVTE_CHECK(scale_); + NVTE_CHECK(scale_->size() == 1); + NVTE_CHECK(scale_->dtype() == DType::kFloat32); + *scale_->cpu_buffer() = scale; + scale_->from_cpu(); } void Tensor::set_scale_inv(float scale_inv) { - NVTE_CHECK(scale_inv_rowwise_.size() == 1); - NVTE_CHECK(scale_inv_rowwise_.dtype() == DType::kFloat32); - *scale_inv_rowwise_.cpu_buffer() = scale_inv; - scale_inv_rowwise_.from_cpu(); + NVTE_CHECK(scale_inv_rowwise_); + NVTE_CHECK(scale_inv_rowwise_->size() == 1); + NVTE_CHECK(scale_inv_rowwise_->dtype() == DType::kFloat32); + *scale_inv_rowwise_->cpu_buffer() = scale_inv; + scale_inv_rowwise_->from_cpu(); } void Tensor::set_tensor_amax(float amax) { @@ -448,24 +456,25 @@ void Tensor::set_tensor_amax(float amax) { } void Tensor::set_tensor_amax_columnwise(float amax) { - NVTE_CHECK(amax_columnwise_.size() == 1); - NVTE_CHECK(amax_columnwise_.dtype() == DType::kFloat32); - *amax_columnwise_.cpu_buffer() = amax; - amax_columnwise_.from_cpu(); + NVTE_CHECK(amax_columnwise_); + NVTE_CHECK(amax_columnwise_->size() == 1); + NVTE_CHECK(amax_columnwise_->dtype() == DType::kFloat32); + *amax_columnwise_->cpu_buffer() = amax; + amax_columnwise_->from_cpu(); } void Tensor::fill_uniform_rowwise_scale_inv() { - if (scale_inv_rowwise_.size() == 0) { + if (!scale_inv_rowwise_ || scale_inv_rowwise_->size() == 0) { return; } // Generate random scales on CPU - const auto numel = scale_inv_rowwise_.size(); - const auto dtype = scale_inv_rowwise_.dtype(); + const auto numel = scale_inv_rowwise_->size(); + const auto dtype = scale_inv_rowwise_->dtype(); switch (dtype) { case DType::kFloat32: { - auto *cpu_data = scale_inv_rowwise_.cpu_buffer(); + auto *cpu_data = scale_inv_rowwise_->cpu_buffer(); std::uniform_real_distribution dis(-2.0, 1.0); for (size_t i = 0; i < numel; ++i) { cpu_data[i] = dis(gen_); @@ -476,7 +485,7 @@ void Tensor::fill_uniform_rowwise_scale_inv() { case DType::kFloat8E8M0: case DType::kByte: { - auto *cpu_data = reinterpret_cast(scale_inv_rowwise_.cpu_buffer()); + auto *cpu_data = reinterpret_cast(scale_inv_rowwise_->cpu_buffer()); std::uniform_int_distribution dis(0, 127); for (size_t i = 0; i < numel; ++i) { cpu_data[i] = dis(gen_); @@ -489,21 +498,21 @@ void Tensor::fill_uniform_rowwise_scale_inv() { } // Update GPU tensor - scale_inv_rowwise_.from_cpu(); + scale_inv_rowwise_->from_cpu(); } void Tensor::fill_uniform_columnwise_scale_inv() { - if (scale_inv_columnwise_.size() == 0) { + if (!scale_inv_columnwise_ || scale_inv_columnwise_->size() == 0) { return; } // Generate random scales on CPU - const auto numel = scale_inv_columnwise_.size(); - const auto dtype = scale_inv_columnwise_.dtype(); + const auto numel = scale_inv_columnwise_->size(); + const auto dtype = scale_inv_columnwise_->dtype(); switch (dtype) { case DType::kFloat32: { - auto *cpu_data = scale_inv_columnwise_.cpu_buffer(); + auto *cpu_data = scale_inv_columnwise_->cpu_buffer(); std::uniform_real_distribution dis(-2.0, 1.0); for (size_t i = 0; i < numel; ++i) { cpu_data[i] = dis(gen_); @@ -514,7 +523,7 @@ void Tensor::fill_uniform_columnwise_scale_inv() { case DType::kFloat8E8M0: case DType::kByte: { - auto *cpu_data = reinterpret_cast(scale_inv_columnwise_.cpu_buffer()); + auto *cpu_data = reinterpret_cast(scale_inv_columnwise_->cpu_buffer()); std::uniform_int_distribution dis(0, 127); for (size_t i = 0; i < numel; ++i) { cpu_data[i] = dis(gen_); @@ -527,25 +536,25 @@ void Tensor::fill_uniform_columnwise_scale_inv() { } // Update GPU tensor - scale_inv_columnwise_.from_cpu(); + scale_inv_columnwise_->from_cpu(); } void Tensor::fill_uniform_scale() { - if (scale_.size() == 0) { + if (!scale_ || scale_->size() == 0) { return; } // Generate random scales on CPU - auto *cpu_data = scale_.cpu_buffer(); - const auto numel = scale_.size(); - NVTE_CHECK(scale_.dtype() == DType::kFloat32); + auto *cpu_data = scale_->cpu_buffer(); + const auto numel = scale_->size(); + NVTE_CHECK(scale_->dtype() == DType::kFloat32); std::uniform_real_distribution dis(-2.0, 1.0); for (size_t i = 0; i < numel; ++i) { cpu_data[i] = dis(gen_); } // Update GPU tensor - scale_.from_cpu(); + scale_->from_cpu(); } using std::to_string; diff --git a/tests/cpp/test_common.h b/tests/cpp/test_common.h index 860fc7d7eb..da467465d7 100644 --- a/tests/cpp/test_common.h +++ b/tests/cpp/test_common.h @@ -8,6 +8,7 @@ #include #include +#include #include #include @@ -185,56 +186,64 @@ class Tensor { template T *rowwise_cpu_dptr() { - NVTE_CHECK(TypeInfo::dtype == data_rowwise_.dtype(), "Invalid type!"); + NVTE_CHECK(data_rowwise_, "Tensor does not have rowwise data!"); + NVTE_CHECK(TypeInfo::dtype == data_rowwise_->dtype(), "Invalid type!"); NVTE_CHECK(rowwise_, "Tensor does not have rowwise data!"); - return data_rowwise_.cpu_buffer(); + return data_rowwise_->cpu_buffer(); } template T *columnwise_cpu_dptr() { - NVTE_CHECK(TypeInfo::dtype == data_columnwise_.dtype(), "Invalid type!"); + NVTE_CHECK(data_columnwise_, "Tensor does not have columnwise data!"); + NVTE_CHECK(TypeInfo::dtype == data_columnwise_->dtype(), "Invalid type!"); NVTE_CHECK(columnwise_, "Tensor does not have columnwise data!"); - return data_columnwise_.cpu_buffer(); + return data_columnwise_->cpu_buffer(); } float amax() { - NVTE_CHECK(amax_rowwise_.size() == 1); - NVTE_CHECK(amax_rowwise_.dtype() == DType::kFloat32); - amax_rowwise_.to_cpu(); - return *amax_rowwise_.cpu_buffer(); + NVTE_CHECK(amax_rowwise_); + NVTE_CHECK(amax_rowwise_->size() == 1); + NVTE_CHECK(amax_rowwise_->dtype() == DType::kFloat32); + amax_rowwise_->to_cpu(); + return *amax_rowwise_->cpu_buffer(); } float amax_columnwise() { - NVTE_CHECK(amax_columnwise_.size() == 1); - NVTE_CHECK(amax_columnwise_.dtype() == DType::kFloat32); - amax_columnwise_.to_cpu(); - return *amax_columnwise_.cpu_buffer(); + NVTE_CHECK(amax_columnwise_); + NVTE_CHECK(amax_columnwise_->size() == 1); + NVTE_CHECK(amax_columnwise_->dtype() == DType::kFloat32); + amax_columnwise_->to_cpu(); + return *amax_columnwise_->cpu_buffer(); } float scale() { - NVTE_CHECK(scale_.size() == 1); - NVTE_CHECK(scale_.dtype() == DType::kFloat32); - scale_.to_cpu(); - return *scale_.cpu_buffer(); + NVTE_CHECK(scale_); + NVTE_CHECK(scale_->size() == 1); + NVTE_CHECK(scale_->dtype() == DType::kFloat32); + scale_->to_cpu(); + return *scale_->cpu_buffer(); } template T *rowwise_cpu_scale_inv_ptr(){ - scale_inv_rowwise_.to_cpu(); - return scale_inv_rowwise_.cpu_buffer(); + NVTE_CHECK(scale_inv_rowwise_); + scale_inv_rowwise_->to_cpu(); + return scale_inv_rowwise_->cpu_buffer(); } template T *columnwise_cpu_scale_inv_ptr(){ - scale_inv_columnwise_.to_cpu(); - return scale_inv_columnwise_.cpu_buffer(); + NVTE_CHECK(scale_inv_columnwise_); + scale_inv_columnwise_->to_cpu(); + return scale_inv_columnwise_->cpu_buffer(); } float rowwise_scale_inv(){ - NVTE_CHECK(scale_inv_rowwise_.size() == 1); - NVTE_CHECK(scale_inv_rowwise_.dtype() == DType::kFloat32); - scale_inv_rowwise_.to_cpu(); - return *scale_inv_rowwise_.cpu_buffer(); + NVTE_CHECK(scale_inv_rowwise_); + NVTE_CHECK(scale_inv_rowwise_->size() == 1); + NVTE_CHECK(scale_inv_rowwise_->dtype() == DType::kFloat32); + scale_inv_rowwise_->to_cpu(); + return *scale_inv_rowwise_->cpu_buffer(); } bool rowwise() const { @@ -329,13 +338,13 @@ class Tensor { TensorWrapper tensor_; // Data buffers - Buffer data_rowwise_; - Buffer data_columnwise_; - Buffer scale_inv_rowwise_; - Buffer scale_inv_columnwise_; - Buffer amax_rowwise_; - Buffer amax_columnwise_; - Buffer scale_; + std::optional data_rowwise_; + std::optional data_columnwise_; + std::shared_ptr scale_inv_rowwise_; + std::shared_ptr scale_inv_columnwise_; + std::optional amax_rowwise_; + std::optional amax_columnwise_; + std::optional scale_; bool rowwise_; bool columnwise_;