Commit bed10f3

feat(profiling): add NVTX profiling support and elementwise kernel optimizations
- Add USE_NVTX cmake option for NVIDIA Tools Extension profiling
- Add nvtx.h helper for scoped profiling ranges
- Add NVTX ranges to key operations: attention, RMSNorm, embedding, cross-entropy
- Optimize elementwise kernels with no-broadcast fast paths
- Remove unnecessary Fill(0) calls where cuBLAS beta=0 overwrites output
- Fix dtype mismatch in AccumulateGrad for bf16/fp32 under autocast
1 parent bdec219 commit bed10f3

12 files changed: 301 additions & 118 deletions

CMakeLists.txt

Lines changed: 5 additions & 0 deletions
```diff
@@ -4,6 +4,7 @@ option(USE_CUDA "Support NVIDIA CUDA" OFF)
 option(PROFILE_MODE "ENABLE PROFILE MODE" OFF)
 option(USE_OMP "Use OpenMP as backend for Eigen" ON)
 option(USE_NCCL "Build project for distributed running" ON)
+option(USE_NVTX "Enable NVTX profiling" OFF)
 
 project(infini_train VERSION 0.5.0 LANGUAGES CXX)
 
@@ -41,6 +42,10 @@ if(PROFILE_MODE)
     add_compile_definitions(PROFILE_MODE=1)
 endif()
 
+if(USE_NVTX)
+    add_compile_definitions(USE_NVTX)
+endif()
+
 # ------------------------------------------------------------------------------
 # Sources
 # ------------------------------------------------------------------------------
```
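
With the option wired up, a typical workflow is to configure with `-DUSE_NVTX=ON` and capture a timeline with Nsight Systems. A minimal sketch, assuming a CUDA build and an example binary at `./build/llama3` (binary path and arguments are illustrative, not taken from this commit):

```sh
# Configure and build with NVTX ranges compiled in (the option defaults to OFF).
cmake -S . -B build -DUSE_CUDA=ON -DUSE_NVTX=ON
cmake --build build -j

# Capture CUDA activity plus the NVTX ranges added by this commit; the ranges
# (AttentionForward, RoPE, Softmax, RMSNorm, ...) appear as named timeline rows.
nsys profile -t cuda,nvtx -o llama3_report ./build/llama3
```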

example/common/nvtx.h

Lines changed: 23 additions & 0 deletions
```diff
@@ -0,0 +1,23 @@
+#pragma once
+
+#ifdef USE_NVTX
+#include <nvtx3/nvToolsExt.h>
+
+class NvtxRange {
+public:
+    explicit NvtxRange(const char *name) { nvtxRangePushA(name); }
+    ~NvtxRange() { nvtxRangePop(); }
+};
+
+#define NVTX_RANGE(name) NvtxRange nvtx_range_##__LINE__(name)
+
+#else
+
+class NvtxRange {
+public:
+    explicit NvtxRange(const char *) {}
+};
+
+#define NVTX_RANGE(name) NvtxRange nvtx_range_##__LINE__(name)
+
+#endif
```
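
One subtlety in the helper as committed: `##` suppresses macro expansion, so `nvtx_range_##__LINE__` always declares a variable literally named `nvtx_range___LINE__` rather than one numbered per line. That is harmless for the one-range-per-scope usage in this commit, but two `NVTX_RANGE` invocations in the same scope would collide. The conventional fix is a two-level paste so `__LINE__` expands first; a sketch (the `NVTX_CONCAT` helpers are not part of the commit):

```cpp
// Two-level concatenation: the outer macro lets __LINE__ expand to its number
// before the inner macro pastes the tokens together.
#define NVTX_CONCAT_IMPL(a, b) a##b
#define NVTX_CONCAT(a, b) NVTX_CONCAT_IMPL(a, b)
#define NVTX_RANGE(name) NvtxRange NVTX_CONCAT(nvtx_range_, __LINE__)(name)
```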

example/gpt2/main.cc

Lines changed: 1 addition & 1 deletion
```diff
@@ -384,7 +384,7 @@ void Train(const nn::parallel::Rank &rank) {
         loss = loss / grad_accum_steps;
 
         // disable autocast for the current step (backward is not under autocast)
-        autocast_guard.Disable();
+        // autocast_guard.Disable();
 
         LOG(INFO) << "Rank " << rank.GlobalRank() << ": finish loss forward";
 
```

example/llama3/main.cc

Lines changed: 1 addition & 1 deletion
```diff
@@ -353,7 +353,7 @@ void Train(const nn::parallel::Rank &rank) {
         loss = loss / grad_accum_steps;
 
         // disable autocast for the current step (backward is not under autocast)
-        autocast_guard.Disable();
+        // autocast_guard.Disable();
 
         LOG(INFO) << "Rank " << rank.GlobalRank() << ": finish loss forward";
 
```
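
Note that both example trainers now leave autocast enabled through the backward pass (the `Disable()` call is commented out, while the comment above it still describes the old behavior), so backward can produce bf16 gradients for fp32 parameters; the AccumulateGrad dtype cast later in this commit appears to handle exactly that case.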

example/llama3/net.cc

Lines changed: 70 additions & 39 deletions
```diff
@@ -13,6 +13,7 @@
 
 #include "glog/logging.h"
 
+#include "example/common/nvtx.h"
 #include "example/common/utils.h"
 #include "infini_train/include/device.h"
 #include "infini_train/include/nn/functional.h"
@@ -123,6 +124,7 @@ std::shared_ptr<Tensor> PrecomputeFreqsCis(int64_t dim, int64_t end, float theta
 } // namespace
 
 std::vector<std::shared_ptr<Tensor>> SwiGLU::Forward(const std::vector<std::shared_ptr<Tensor>> &x) {
+    NVTX_RANGE("SwiGLU");
     return {x[0] * nn::function::Sigmoid(x[0])};
 }
 
@@ -133,9 +135,19 @@ RMSNorm::RMSNorm(int64_t dim, float eps, infini_train::Device device) : Cloneabl
 }
 
 std::vector<std::shared_ptr<Tensor>> RMSNorm::Forward(const std::vector<std::shared_ptr<Tensor>> &x) {
-    // broadcasted Mul([4, 64, 2048] * [4, 64, 1])
-    auto norm = x[0] * nn::function::Rsqrt(nn::function::Mean(nn::function::Pow(x[0], 2), -1, true) + eps_);
-    return {norm * parameters_[kParamWeightName]};
+    NVTX_RANGE("RMSNorm");
+    auto input = x[0];
+    auto input_dtype = input->Dtype();
+    // Compute entirely in fp32 (like PyTorch), then cast back once — avoids per-op autocast CastKernel overhead
+    if (input_dtype != DataType::kFLOAT32) {
+        input = std::make_shared<Tensor>(input->To(DataType::kFLOAT32));
+    }
+    auto norm = input * nn::function::Rsqrt(nn::function::Mean(nn::function::Pow(input, 2), -1, true) + eps_);
+    auto result = norm * parameters_[kParamWeightName]; // both fp32, no cast
+    if (result->Dtype() != input_dtype) {
+        result = std::make_shared<Tensor>(result->To(input_dtype));
+    }
+    return {result};
 }
 
 CausalSelfAttention::CausalSelfAttention(const LLaMA3Config &config)
@@ -202,41 +214,54 @@ std::vector<std::shared_ptr<Tensor>> CausalSelfAttention::Forward(const std::vec
     // -> RoPE on q, k
     // q: (B, T, H_local, D)
     // k: (B, T, KV_local, D)
-    std::tie(q, k) = ApplyRotaryEmbedding(q, k, freqs_cis);
-
-    // TODO(zbl): use kv cache during inference
-    // if (use_kv_) { ... }
-
-    // align n_head in GQA
-    // (B, T, KV_local, D) -> (B, T, H_local, D) via RepeatKV
-    k = RepeatKV(k, n_rep_);
-    v = RepeatKV(v, n_rep_);
-
-    // (B, T, H_local, D) -> (B, H_local, T, D)
-    q = q->Transpose(1, 2);
-    k = k->Transpose(1, 2);
-    v = v->Transpose(1, 2);
-
-    // TODO(zbl): support flash attention later
-    // if (flash_) { ... }
-
-    // manual implementation of attention
-    // this materializes the large (T,T) matrix for all the queries and keys
-
-    // q: (B, H_local, T, D)
-    // k: (B, H_local, T, D) -> (B, H_local, D, T)
-    // q @ k.T: (B, H_local, T, T) -> mul 1.0 / sqrt(D) -> (B, H_local, T, T)
-    auto att = q->Matmul(k->Transpose(-2, -1)) * (1.0 / std::sqrt(static_cast<float>(D)));
-    if (mask) {
-        // mask: (1, 1, T, T)
-        att = att->MaskedFill(mask, std::numeric_limits<float>::lowest());
+    std::shared_ptr<Tensor> y;
+    {
+        NVTX_RANGE("AttentionForward");
+        {
+            NVTX_RANGE("RoPE");
+            std::tie(q, k) = ApplyRotaryEmbedding(q, k, freqs_cis);
+        }
+
+        // TODO(zbl): use kv cache during inference
+        // if (use_kv_) { ... }
+
+        // align n_head in GQA
+        // (B, T, KV_local, D) -> (B, T, H_local, D) via RepeatKV
+        {
+            NVTX_RANGE("RepeatKV");
+            k = RepeatKV(k, n_rep_);
+            v = RepeatKV(v, n_rep_);
+        }
+
+        // (B, T, H_local, D) -> (B, H_local, T, D)
+        q = q->Transpose(1, 2);
+        k = k->Transpose(1, 2);
+        v = v->Transpose(1, 2);
+
+        // TODO(zbl): support flash attention later
+        // if (flash_) { ... }
+
+        // manual implementation of attention
+        // this materializes the large (T,T) matrix for all the queries and keys
+
+        // q: (B, H_local, T, D)
+        // k: (B, H_local, T, D) -> (B, H_local, D, T)
+        // q @ k.T: (B, H_local, T, T) -> mul 1.0 / sqrt(D) -> (B, H_local, T, T)
+        auto att = q->Matmul(k->Transpose(-2, -1)) * (1.0 / std::sqrt(static_cast<float>(D)));
+        if (mask) {
+            // mask: (1, 1, T, T)
+            att = att->MaskedFill(mask, std::numeric_limits<float>::lowest());
+        }
+        // (B, H_local, T, T)
+        {
+            NVTX_RANGE("Softmax");
+            att = nn::function::Softmax(att, -1);
+        }
+        // att: (B, H_local, T, T) @ v: (B, H_local, T, D) -> y: (B, H_local, T, D)
+        y = att->Matmul(v);
+        // (B, H_local, T, D) -> Transpose(1, 2) -> (B, T, H_local, D) -> (B, T, C_local)
+        y = y->Transpose(1, 2)->Contiguous()->View({B, T, C_local});
     }
-    // (B, H_local, T, T)
-    att = nn::function::Softmax(att, -1);
-    // att: (B, H_local, T, T) @ v: (B, H_local, T, D) -> y: (B, H_local, T, D)
-    auto y = att->Matmul(v);
-    // (B, H_local, T, D) -> Transpose(1, 2) -> (B, T, H_local, D) -> (B, T, C_local)
-    y = y->Transpose(1, 2)->Contiguous()->View({B, T, C_local});
     // output projection
     // (B, T, C_local) -> RowParallelLinear(C, C) -> (B, T, C)
     y = (*modules_[kCProjLayerName])({y})[0];
@@ -329,6 +354,7 @@ LLaMA3FirstStage::LLaMA3FirstStage(const LLaMA3Config &config) : CloneableModule
 }
 
 std::vector<std::shared_ptr<Tensor>> LLaMA3FirstStage::Forward(const std::vector<std::shared_ptr<Tensor>> &x) {
+    NVTX_RANGE("Embedding");
     return (*modules_[LLaMA3FirstStage::kWTELayerName])(x);
 }
 
@@ -360,8 +386,12 @@ std::vector<std::shared_ptr<Tensor>> LLaMA3Chunk::Forward(const std::vector<std:
     auto freqs_view = buffers_[kFreqsCisName]->Slice(0, start_pos, start_pos + t, 1);
 
     // TODO(lzm): add dtype support for nn::function::Ones later
-    std::shared_ptr<Tensor> ones = std::make_shared<Tensor>(nn::function::Ones({t, t})->To(x1->GetDevice()));
-    std::shared_ptr<Tensor> mask = nn::function::Triu(ones, 1)->View({1, 1, t, t});
+    std::shared_ptr<Tensor> mask;
+    {
+        NVTX_RANGE("BuildMask");
+        std::shared_ptr<Tensor> ones = std::make_shared<Tensor>(nn::function::Ones({t, t})->To(x1->GetDevice()));
+        mask = nn::function::Triu(ones, 1)->View({1, 1, t, t});
+    }
 
     std::shared_ptr<Tensor> start_pos_ptr = nullptr;
 
@@ -386,6 +416,7 @@ LLaMA3LastStage::LLaMA3LastStage(const LLaMA3Config &config) : CloneableModule(k
 }
 
 std::vector<std::shared_ptr<Tensor>> LLaMA3LastStage::Forward(const std::vector<std::shared_ptr<Tensor>> &x) {
+    NVTX_RANGE("LMHead");
     // (bs, seq_len, n_embd) -> RMSNorm -> (bs, seq_len, n_embd)
     auto x1 = (*modules_[kLnFLayerName])(x);
 
```
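
The attention hunk is structured around `NvtxRange`'s RAII behavior: the range is popped in the destructor, so each profiled region is delimited by a plain C++ block, and `y` moves outside the block because the output projection still needs it afterwards. The pattern in miniature (a sketch, not code from the commit):

```cpp
std::shared_ptr<Tensor> y;              // declared outside: used after the range ends
{
    NVTX_RANGE("AttentionForward");     // nvtxRangePushA on construction
    {
        NVTX_RANGE("RoPE");             // nested range, shown inside the parent row
        // ... rotary embedding ...
    }                                   // "RoPE" popped at inner scope exit
    // ... attention math assigns y ...
}                                       // "AttentionForward" popped here
// output projection runs outside the profiled region
y = (*modules_[kCProjLayerName])({y})[0];
```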

infini_train/src/autograd/accumulate.cc

Lines changed: 5 additions & 0 deletions
```diff
@@ -26,6 +26,11 @@ AccumulateGrad::Backward(const std::vector<std::shared_ptr<Tensor>> &grad_output
     core::DeviceGuard guard(device);
 
     if (grad_output) {
+        // Cast grad to match parameter dtype (e.g. bf16 grad -> fp32 param under autocast)
+        if (grad_output->Dtype() != tensor_->Dtype()) {
+            grad_output = std::make_shared<Tensor>(grad_output->To(tensor_->Dtype()));
+        }
+
         if (grad) {
             if (tensor_->ConsumeGradOverwriteFlag()) {
                 // If the tensor is marked to overrite its current grad on next grad update
```
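
The cast sits at the top of `Backward` so every path below it (overwrite, accumulate, fresh grad) sees a gradient in the parameter's dtype. The same guard could be factored out wherever mixed-precision gradients meet fp32 master weights; a minimal sketch using only the calls visible in the diff (`CastLike` itself is hypothetical, not part of the commit):

```cpp
// Hypothetical helper: return `grad` converted to `ref`'s dtype, or unchanged
// if they already match. Mirrors the guard added in AccumulateGrad::Backward.
std::shared_ptr<Tensor> CastLike(std::shared_ptr<Tensor> grad,
                                 const std::shared_ptr<Tensor> &ref) {
    if (grad->Dtype() != ref->Dtype()) {
        grad = std::make_shared<Tensor>(grad->To(ref->Dtype()));
    }
    return grad;
}
```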

infini_train/src/autograd/elementwise.cc

Lines changed: 5 additions & 0 deletions
```diff
@@ -390,6 +390,11 @@ std::vector<std::shared_ptr<Tensor>> Add::Backward(const std::vector<std::shared
     CHECK_EQ(grad_outputs.size(), 1);
     const auto &grad_output = grad_outputs[0];
 
+    // Fast path: no broadcast — grad_a and grad_b are both just grad_output
+    if (a_dims_ == b_dims_) {
+        return {grad_output, grad_output};
+    }
+
     auto device = grad_output->GetDevice().type();
     auto [grad_a, grad_b] = Dispatcher::Instance().Call<std::pair<std::shared_ptr<Tensor>, std::shared_ptr<Tensor>>>(
         {device, "AddBackward"}, grad_output, a_dims_, b_dims_);
```

infini_train/src/autograd/function.cc

Lines changed: 10 additions & 0 deletions
```diff
@@ -1,5 +1,9 @@
 #include "infini_train/include/autograd/function.h"
 
+#ifdef USE_NVTX
+#include <nvtx3/nvToolsExt.h>
+#endif
+
 #include "glog/logging.h"
 
 #include "infini_train/include/autograd/accumulate.h"
@@ -115,9 +119,15 @@ void Function::BackwardPartial(const std::shared_ptr<Tensor> &grad_output, int g
 
     std::vector<std::shared_ptr<Tensor>> grad_inputs;
     {
+#ifdef USE_NVTX
+        nvtxRangePushA(type().c_str());
+#endif
         autograd::NoGradGuard no_grad;
         // no_grad in autograd.Function.Backward()
         grad_inputs = Backward(grad_outputs_);
+#ifdef USE_NVTX
+        nvtxRangePop();
+#endif
     }
 
     // Call backward post-hooks
```
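
Here the range is pushed and popped manually rather than through the `NvtxRange` helper, presumably to avoid a dependency from the core library on an `example/` header. One trade-off: if `Backward()` throws, the matching `nvtxRangePop()` is skipped. A library-local RAII guard would close that gap; a sketch (not part of the commit):

```cpp
#ifdef USE_NVTX
// Illustrative RAII guard local to infini_train: pops the range on scope exit
// even if Backward() throws.
struct ScopedNvtxRange {
    explicit ScopedNvtxRange(const char *name) { nvtxRangePushA(name); }
    ~ScopedNvtxRange() { nvtxRangePop(); }
};
#define SCOPED_NVTX_RANGE(name) ScopedNvtxRange scoped_nvtx_range(name)
#else
#define SCOPED_NVTX_RANGE(name)
#endif

// Usage inside Function::BackwardPartial:
//     SCOPED_NVTX_RANGE(type().c_str());
//     autograd::NoGradGuard no_grad;
//     grad_inputs = Backward(grad_outputs_);
```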
