Skip to content

Commit b263936

Browse files
committed
Vectorized vector subtraction in fp32
1 parent e904536 commit b263936

14 files changed

Lines changed: 126 additions & 249 deletions

File tree

Makefile

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,12 +19,12 @@ release:
1919
.PHONY: prepare_profile
2020
prepare_profile:
2121
@cmake --preset ninja-nvcc -DCMAKE_BUILD_TYPE=Release && cmake --build build --parallel --target test_tensor_cuda
22-
@echo 'sudo ncu ctest --kernel-name "add_kernel" --test-dir build -R "^TensorCUDATest.AddBF16"'
22+
@echo 'sudo ncu --kernel-name "add_kernel" ctest --test-dir build -R "^TensorCUDATest.AddBF16"'
2323

2424
.PHONY: profile
2525
profile:
2626
@cmake --build build --parallel --target test_tensor_cuda
27-
@echo 'sudo ncu ctest --kernel-name "add_kernel" --test-dir build -R "^TensorCUDATest.AddBF16"'
27+
@echo 'sudo ncu --kernel-name "add_kernel" ctest --test-dir build -R "^TensorCUDATest.AddBF16"'
2828

2929
.PHONY: app
3030
app:

include/tensor/storage.hpp

Lines changed: 14 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -18,8 +18,8 @@ template <typename T, typename D> class TensorStorage;
1818
// Mutable CPU storage - owns or borrows mutable data
1919
template <typename T> class TensorStorage<T, CPU> {
2020
private:
21-
std::shared_ptr<T[]> data_;
22-
size_t size_ = 0;
21+
std::shared_ptr<T[]> data_; // NOLINT
22+
unsigned int size_ = 0;
2323

2424
public:
2525
using pointer = T*;
@@ -38,7 +38,7 @@ template <typename T> class TensorStorage<T, CPU> {
3838
// Non-owning storage - borrows external mutable memory
3939
static TensorStorage borrow(T* ptr, size_t size) {
4040
TensorStorage storage;
41-
storage.data_ = std::shared_ptr<T[]>(ptr, [](T*) {}); // no-op deleter
41+
storage.data_ = std::shared_ptr<T[]>(ptr, [](T*) {}); // no-op deleter // NOLINT
4242
storage.size_ = size;
4343
return storage;
4444
}
@@ -54,7 +54,7 @@ template <typename T> class TensorStorage<T, CPU> {
5454
}
5555

5656
void resize(size_t size) {
57-
data_ = std::shared_ptr<T[]>(new T[size]);
57+
data_ = std::shared_ptr<T[]>(new T[size]); // NOLINT
5858
size_ = size;
5959
}
6060
void fill(T value) {
@@ -72,7 +72,7 @@ template <typename T> class TensorStorage<T, CPU> {
7272
// Const CPU storage - borrows read-only data (e.g., mmap)
7373
template <typename T> class TensorStorage<const T, CPU> {
7474
private:
75-
std::shared_ptr<const T[]> data_;
75+
std::shared_ptr<const T[]> data_; // NOLINT
7676
size_t size_ = 0;
7777

7878
public:
@@ -84,7 +84,7 @@ template <typename T> class TensorStorage<const T, CPU> {
8484
// Non-owning storage - borrows external read-only memory (e.g., mmap)
8585
static TensorStorage borrow(const T* ptr, size_t size) {
8686
TensorStorage storage;
87-
storage.data_ = std::shared_ptr<const T[]>(ptr, [](const T*) {}); // no-op deleter
87+
storage.data_ = std::shared_ptr<const T[]>(ptr, [](const T*) {}); // no-op deleter // NOLINT
8888
storage.size_ = size;
8989
return storage;
9090
}
@@ -106,14 +106,14 @@ template <typename T> class TensorStorage<const T, CPU> {
106106
template <typename T> class TensorStorage<T, CUDA> {
107107
private:
108108
T* data_ = nullptr;
109-
unsigned int size_ = 0;
109+
size_t size_ = 0;
110110

111111
public:
112112
using pointer = T*;
113113
using const_pointer = const T*;
114114

115115
TensorStorage() = default;
116-
explicit TensorStorage(int size);
116+
explicit TensorStorage(size_t size);
117117
~TensorStorage();
118118

119119
// no copy, move only
@@ -122,7 +122,7 @@ template <typename T> class TensorStorage<T, CUDA> {
122122
TensorStorage(TensorStorage&& other) noexcept;
123123
TensorStorage& operator=(TensorStorage&& other) noexcept;
124124

125-
[[nodiscard]] int size() const {
125+
[[nodiscard]] size_t size() const {
126126
return size_;
127127
}
128128
pointer data() {
@@ -132,7 +132,7 @@ template <typename T> class TensorStorage<T, CUDA> {
132132
return data_;
133133
}
134134

135-
void resize(int size);
135+
void resize(size_t size);
136136
void fill(T value);
137137
};
138138

@@ -141,14 +141,14 @@ template <typename T> class TensorStorage<T, CUDA> {
141141
template <typename T> class TensorStorage<const T, CUDA> {
142142
private:
143143
T* data_ = nullptr;
144-
int size_ = 0;
144+
size_t size_ = 0;
145145

146146
public:
147147
using pointer = const T*;
148148
using const_pointer = const T*;
149149

150150
TensorStorage() = default;
151-
explicit TensorStorage(int size);
151+
explicit TensorStorage(size_t size);
152152
~TensorStorage();
153153

154154
// no copy, move only
@@ -157,7 +157,7 @@ template <typename T> class TensorStorage<const T, CUDA> {
157157
TensorStorage(TensorStorage&& other) noexcept;
158158
TensorStorage& operator=(TensorStorage&& other) noexcept;
159159

160-
[[nodiscard]] int size() const {
160+
[[nodiscard]] size_t size() const {
161161
return size_;
162162
}
163163
const_pointer data() const {
@@ -169,7 +169,7 @@ template <typename T> class TensorStorage<const T, CUDA> {
169169
return data_;
170170
}
171171

172-
void resize(int size);
172+
void resize(size_t size);
173173
};
174174
#endif
175175

src/llama/rope.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ precompute_rope_values(size_t head_dim, float theta_base, size_t context_length)
3535
// For each frequency, compute wavelength and apply scaling
3636
for (size_t i = 0; i < inv_freq_.size(); ++i) {
3737
float inv_f = inv_freq_.span()[i];
38-
float wavelen = 2.0 * M_PI / inv_f;
38+
float wavelen = M_PI * 2.0 / inv_f;
3939

4040
if (wavelen < high_freq_wavelen) {
4141
// High frequency: no scaling
@@ -47,7 +47,7 @@ precompute_rope_values(size_t head_dim, float theta_base, size_t context_length)
4747
// Medium frequency: smooth interpolation
4848
float smooth =
4949
(old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor);
50-
float scaled_inv_freq = (1.0 - smooth) * (inv_f / factor) + smooth * inv_f;
50+
float scaled_inv_freq = ((1.0 - smooth) * (inv_f / factor)) + (smooth * inv_f);
5151
inv_freq_.span()[i] = scaled_inv_freq;
5252
}
5353
}

src/tensor/cpu/ops.cpp

Lines changed: 15 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -74,11 +74,12 @@ Tensor<std::remove_const_t<T>, D> add(const TensorView<T, D>& tensor_a,
7474
[](T val_a, T val_b) { return val_a + val_b; });
7575
}
7676

77-
template Tensor<bfloat16, CPU> add(const TensorView<const bfloat16, CPU>&,
78-
const TensorView<const bfloat16, CPU>&);
79-
template Tensor<float, CPU> add(const TensorView<const float, CPU>&,
80-
const TensorView<const float, CPU>&);
81-
template Tensor<int, CPU> add(const TensorView<const int, CPU>&, const TensorView<const int, CPU>&);
77+
// template Tensor<bfloat16, CPU> add(const TensorView<const bfloat16, CPU>&,
78+
// const TensorView<const bfloat16, CPU>&);
79+
// template Tensor<float, CPU> add(const TensorView<const float, CPU>&,
80+
// const TensorView<const float, CPU>&);
81+
// template Tensor<int, CPU> add(const TensorView<const int, CPU>&, const TensorView<const int,
82+
// CPU>&);
8283

8384
template <typename T, typename D>
8485
Tensor<std::remove_const_t<T>, D> sub(const TensorView<T, D>& tensor_a,
@@ -566,24 +567,22 @@ template void replace_from_(Tensor<float, CPU>& destination, const TensorView<fl
566567
// Explicit instantiations for non-const T
567568
template Tensor<bfloat16, CPU> add(const TensorView<bfloat16, CPU>&,
568569
const TensorView<bfloat16, CPU>&);
569-
template Tensor<int, CPU> add(const TensorView<int, CPU>&, const TensorView<int, CPU>&);
570-
template Tensor<float, CPU> add(const TensorView<float, CPU>&, const TensorView<float, CPU>&);
571-
template Tensor<bfloat16, CPU> sub(const TensorView<bfloat16, CPU>&,
572-
const TensorView<bfloat16, CPU>&);
570+
// template Tensor<bfloat16, CPU> sub(const TensorView<bfloat16, CPU>&,
571+
// const TensorView<bfloat16, CPU>&);
573572
template Tensor<float, CPU> sub(const TensorView<float, CPU>&, const TensorView<float, CPU>&);
574-
template Tensor<bfloat16, CPU> div(const TensorView<bfloat16, CPU>&,
575-
const TensorView<bfloat16, CPU>&);
573+
// template Tensor<bfloat16, CPU> div(const TensorView<bfloat16, CPU>&,
574+
// const TensorView<bfloat16, CPU>&);
576575
template Tensor<float, CPU> div(const TensorView<float, CPU>&, const TensorView<float, CPU>&);
577-
template Tensor<bfloat16, CPU> div(const TensorView<bfloat16, CPU>&, bfloat16);
576+
// template Tensor<bfloat16, CPU> div(const TensorView<bfloat16, CPU>&, bfloat16);
578577
template Tensor<float, CPU> div(const TensorView<float, CPU>&, float);
579578
template Tensor<bfloat16, CPU> mul(const TensorView<bfloat16, CPU>&, bfloat16);
580579
template Tensor<bfloat16, CPU> mul(const TensorView<bfloat16, CPU>&,
581580
const TensorView<bfloat16, CPU>&);
582-
template Tensor<float, CPU> mul(const TensorView<float, CPU>&, float);
583-
template Tensor<float, CPU> mul(const TensorView<float, CPU>&, const TensorView<float, CPU>&);
584-
template Tensor<bfloat16, CPU> sum(const TensorView<bfloat16, CPU>&, int, bool);
581+
// template Tensor<float, CPU> mul(const TensorView<float, CPU>&, float);
582+
// template Tensor<float, CPU> mul(const TensorView<float, CPU>&, const TensorView<float, CPU>&);
583+
// template Tensor<bfloat16, CPU> sum(const TensorView<bfloat16, CPU>&, int, bool);
585584
template Tensor<float, CPU> sum(const TensorView<float, CPU>&, int, bool);
586-
template Tensor<bfloat16, CPU> max(const TensorView<bfloat16, CPU>&, int, bool);
585+
// template Tensor<bfloat16, CPU> max(const TensorView<bfloat16, CPU>&, int, bool);
587586
template Tensor<float, CPU> max(const TensorView<float, CPU>&, int, bool);
588587
template Tensor<bfloat16, CPU> masked_fill(const TensorView<bfloat16, CPU>&,
589588
const TensorView<int, CPU>&, bfloat16);

src/tensor/cuda/kernels/add.cu

Lines changed: 24 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,21 +1,13 @@
11
#include "add.cuh"
2+
#include "utils.cuh"
23
#include <cstddef>
34
#include <cuda_bf16.hpp>
45

56
namespace tensor::kernels {
67

78
using namespace dtype;
89

9-
template<typename DeviceT>
10-
__global__ void add_kernel(DeviceT* out, DeviceT* tensor_a, DeviceT* tensor_b, size_t n) {
11-
size_t idx = (blockIdx.x * blockDim.x) + threadIdx.x;
12-
13-
if (idx < n) {
14-
out[idx] = tensor_a[idx] + tensor_b[idx];
15-
}
16-
}
17-
18-
__global__ void add_kernel_bf16(Cuda<bfloat16>* out, Cuda<bfloat16>* tensor_a, Cuda<bfloat16>* tensor_b, size_t n) {
10+
__global__ void add_bfloat16_kernel(Cuda<bfloat16>* out, Cuda<bfloat16>* tensor_a, Cuda<bfloat16>* tensor_b, size_t n) {
1911
// we load 8 bf16 values at a time = 128 bits
2012
auto base = (blockIdx.x * blockDim.x) + threadIdx.x;
2113
auto idx = base * 8;
@@ -41,7 +33,27 @@ __global__ void add_kernel_bf16(Cuda<bfloat16>* out, Cuda<bfloat16>* tensor_a, C
4133
}
4234
}
4335

44-
template __global__ void add_kernel<Cuda<float>>(Cuda<float>*, Cuda<float>*, Cuda<float>*, size_t);
45-
template __global__ void add_kernel<Cuda<int>>(Cuda<int>*, Cuda<int>*, Cuda<int>*, size_t);
36+
Tensor<bfloat16, CUDA> add_bfloat16(const TensorView<bfloat16, CUDA>& tensor_a, const TensorView<bfloat16, CUDA>& tensor_b) {
37+
assert(tensor_a.is_contiguous() && tensor_b.is_contiguous() && "the two tensors should be contiguous");
38+
assert(tensor_a.shape == tensor_b.shape && "the two tensors should be the same shape");
39+
40+
size_t n_elements = tensor_a.data_size;
41+
TensorStorage<std::remove_const_t<bfloat16>, CUDA> storage(n_elements);
42+
43+
Tensor<std::remove_const_t<bfloat16>, CUDA> out{tensor_a.shape, std::move(storage)};
44+
45+
int block_size = 512;
46+
// each thread handles 8 elements
47+
int grid_size = cuda::get_grid_size(n_elements / 8, block_size);
48+
49+
// Convert to device-native types for kernel call
50+
auto* out_d = reinterpret_cast<Cuda<bfloat16>*>(out.data()); // NOLINT
51+
auto* a_d = reinterpret_cast<Cuda<bfloat16>*>(tensor_a.data); // NOLINT
52+
auto* b_d = reinterpret_cast<Cuda<bfloat16>*>(tensor_b.data); // NOLINT
53+
54+
add_bfloat16_kernel<<<grid_size, block_size>>>(out_d, a_d, b_d, n_elements);
55+
56+
return out;
57+
}
4658

4759
} // namespace tensor::kernels

src/tensor/cuda/kernels/add.cuh

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2,18 +2,15 @@
22

33
#include <cuda_runtime.h>
44
#include <tensor/device_type.hpp>
5+
#include <tensor/tensor.hpp>
56
#include <cstddef>
67

78
namespace tensor::kernels {
89

910
using namespace dtype;
1011

11-
template<typename DeviceT>
12-
__global__ void add_kernel(DeviceT* out, DeviceT* tensor_a, DeviceT* tensor_b, size_t n);
12+
__global__ void add_bfloat16_kernel(Cuda<bfloat16>* out, Cuda<bfloat16>* tensor_a, Cuda<bfloat16>* tensor_b, size_t n);
1313

14-
15-
extern template __global__ void add_kernel<Cuda<float>>(Cuda<float>*, Cuda<float>*, Cuda<float>*, size_t);
16-
extern template __global__ void add_kernel<Cuda<int>>(Cuda<int>*, Cuda<int>*, Cuda<int>*, size_t);
17-
__global__ void add_kernel_bf16(Cuda<bfloat16>* out, Cuda<bfloat16>* tensor_a, Cuda<bfloat16>* tensor_b, size_t n);
14+
Tensor<bfloat16, CUDA> add_bfloat16(const TensorView<bfloat16, CUDA>& tensor_a, const TensorView<bfloat16, CUDA>& tensor_b);
1815

1916
} // namespace tensor::kernels

src/tensor/cuda/kernels/sub.cu

Lines changed: 43 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,34 +1,58 @@
1-
#include "sub.cuh"
21
#include <cstddef>
2+
#include "sub.cuh"
3+
#include "utils.cuh"
34

45
namespace tensor::kernels {
56

67
using namespace dtype;
78

8-
template<typename DeviceT>
9-
__global__ void sub_kernel(DeviceT* out, DeviceT* tensor_a, DeviceT* tensor_b, size_t n) {
10-
size_t idx = (blockIdx.x * blockDim.x) + threadIdx.x;
9+
__global__ void sub_float_kernel(Cuda<float>* out, Cuda<float>* tensor_a, Cuda<float>* tensor_b, size_t n) {
10+
// we load 4 fp32 values at a time = 128 bits
11+
auto base = (blockIdx.x * blockDim.x) + threadIdx.x;
12+
auto idx = base * 4;
1113

12-
if (idx < n) {
13-
out[idx] = tensor_a[idx] - tensor_b[idx];
14-
}
15-
}
14+
if (idx + 3 < n) {
15+
// load 2 doubles = 4 floats = 128 bits
16+
double2 a_vec = reinterpret_cast<double2*>(tensor_a)[base]; // NOLINT
17+
double2 b_vec = reinterpret_cast<double2*>(tensor_b)[base]; // NOLINT
1618

17-
template __global__ void sub_kernel<Cuda<float>>(Cuda<float>*, Cuda<float>*, Cuda<float>*, size_t);
18-
template __global__ void sub_kernel<Cuda<int>>(Cuda<int>*, Cuda<int>*, Cuda<int>*, size_t);
19-
template __global__ void sub_kernel<Cuda<bfloat16>>(Cuda<bfloat16>*, Cuda<bfloat16>*, Cuda<bfloat16>*, size_t);
19+
// reinterpret as a pair of floats
20+
float* a2 = reinterpret_cast<float*>(&a_vec); // NOLINT
21+
float* b2 = reinterpret_cast<float*>(&b_vec); // NOLINT
2022

21-
template<typename DeviceT>
22-
__global__ void sub_scalar_kernel(DeviceT* out, DeviceT* tensor_a, DeviceT scalar, size_t n) {
23-
size_t idx = (blockIdx.x * blockDim.x) + threadIdx.x;
23+
double2 out_vec;
24+
float* out2 = reinterpret_cast<float*>(&out_vec); // NOLINT
2425

25-
if (idx < n) {
26-
out[idx] = tensor_a[idx] - scalar;
26+
out2[0] = a2[0] - b2[0];
27+
out2[1] = a2[1] - b2[1];
28+
out2[2] = a2[2] - b2[2];
29+
out2[3] = a2[3] - b2[3];
30+
31+
reinterpret_cast<double2*>(out)[base] = out_vec; // NOLINT
2732
}
2833
}
2934

30-
template __global__ void sub_scalar_kernel<Cuda<float>>(Cuda<float>*, Cuda<float>*, Cuda<float>, size_t);
31-
template __global__ void sub_scalar_kernel<Cuda<int>>(Cuda<int>*, Cuda<int>*, Cuda<int>, size_t);
32-
template __global__ void sub_scalar_kernel<Cuda<bfloat16>>(Cuda<bfloat16>*, Cuda<bfloat16>*, Cuda<bfloat16>, size_t);
35+
36+
Tensor<float, CUDA> sub_float(const TensorView<float, CUDA>& tensor_a, const TensorView<float, CUDA>& tensor_b) {
37+
assert(tensor_a.is_contiguous() && tensor_b.is_contiguous() && "the two tensors should be contiguous");
38+
assert(tensor_a.shape == tensor_b.shape && "the two tensors should be the same shape");
39+
40+
size_t n_elements = tensor_a.data_size;
41+
TensorStorage<std::remove_const_t<float>, CUDA> storage(n_elements);
42+
43+
Tensor<std::remove_const_t<float>, CUDA> out{tensor_a.shape, std::move(storage)};
44+
45+
int block_size = 512;
46+
// each thread handles 4 elements
47+
int grid_size = cuda::get_grid_size(n_elements / 4, block_size);
48+
49+
auto* out_d = reinterpret_cast<Cuda<float>*>(out.data()); // NOLINT
50+
auto* a_d = reinterpret_cast<Cuda<float>*>(tensor_a.data); // NOLINT
51+
auto* b_d = reinterpret_cast<Cuda<float>*>(tensor_b.data); // NOLINT
52+
53+
sub_float_kernel<<<grid_size, block_size>>>(out_d, a_d, b_d, n_elements);
54+
55+
return out;
56+
}
3357

3458
} // namespace tensor::kernels

src/tensor/cuda/kernels/sub.cuh

Lines changed: 3 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -2,24 +2,15 @@
22

33
#include <cuda_runtime.h>
44
#include <tensor/device_type.hpp>
5+
#include <tensor/tensor.hpp>
56
#include <cstddef>
67

78
namespace tensor::kernels {
89

910
using namespace dtype;
1011

11-
template<typename DeviceT>
12-
__global__ void sub_kernel(DeviceT* out, DeviceT* tensor_a, DeviceT* tensor_b, size_t n);
12+
__global__ void sub_float_kernel(Cuda<float>* out, Cuda<float>* tensor_a, Cuda<float>* tensor_b, size_t n);
1313

14-
extern template __global__ void sub_kernel<Cuda<float>>(Cuda<float>*, Cuda<float>*, Cuda<float>*, size_t);
15-
extern template __global__ void sub_kernel<Cuda<int>>(Cuda<int>*, Cuda<int>*, Cuda<int>*, size_t);
16-
extern template __global__ void sub_kernel<Cuda<bfloat16>>(Cuda<bfloat16>*, Cuda<bfloat16>*, Cuda<bfloat16>*, size_t);
17-
18-
template<typename DeviceT>
19-
__global__ void sub_scalar_kernel(DeviceT* out, DeviceT* tensor_a, DeviceT scalar, size_t n);
20-
21-
extern template __global__ void sub_scalar_kernel<Cuda<float>>(Cuda<float>*, Cuda<float>*, Cuda<float>, size_t);
22-
extern template __global__ void sub_scalar_kernel<Cuda<int>>(Cuda<int>*, Cuda<int>*, Cuda<int>, size_t);
23-
extern template __global__ void sub_scalar_kernel<Cuda<bfloat16>>(Cuda<bfloat16>*, Cuda<bfloat16>*, Cuda<bfloat16>, size_t);
14+
Tensor<float, CUDA> sub_float(const TensorView<float, CUDA>& tensor_a, const TensorView<float, CUDA>& tensor_b);
2415

2516
} // namespace tensor::kernels

src/tensor/cuda/loader.cu

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
#include <tensor/tensor.hpp>
33

44
#include "../common/utils.h"
5-
#include "utils.cuh"
5+
#include "kernels/utils.cuh"
66

77
namespace tensor {
88

0 commit comments

Comments (0)