Skip to content

Commit b263936

Browse files
committed
Vectorized vector subtraction in fp32
1 parent e904536 commit b263936

14 files changed

Lines changed: 126 additions & 249 deletions

File tree

Makefile

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,12 +19,12 @@ release:
1919
.PHONY: prepare_profile
2020
prepare_profile:
2121
@cmake --preset ninja-nvcc -DCMAKE_BUILD_TYPE=Release && cmake --build build --parallel --target test_tensor_cuda
22-
@echo 'sudo ncu ctest --kernel-name "add_kernel" --test-dir build -R "^TensorCUDATest.AddBF16"'
22+
@echo 'sudo ncu --kernel-name "add_kernel" ctest --test-dir build -R "^TensorCUDATest.AddBF16"'
2323

2424
.PHONY: profile
2525
profile:
2626
@cmake --build build --parallel --target test_tensor_cuda
27-
@echo 'sudo ncu ctest --kernel-name "add_kernel" --test-dir build -R "^TensorCUDATest.AddBF16"'
27+
@echo 'sudo ncu --kernel-name "add_kernel" ctest --test-dir build -R "^TensorCUDATest.AddBF16"'
2828

2929
.PHONY: app
3030
app:

include/tensor/storage.hpp

Lines changed: 14 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -18,8 +18,8 @@ template <typename T, typename D> class TensorStorage;
1818
// Mutable CPU storage - owns or borrows mutable data
1919
template <typename T> class TensorStorage<T, CPU> {
2020
private:
21-
std::shared_ptr<T[]> data_;
22-
size_t size_ = 0;
21+
std::shared_ptr<T[]> data_; // NOLINT
22+
unsigned int size_ = 0;
2323

2424
public:
2525
using pointer = T*;
@@ -38,7 +38,7 @@ template <typename T> class TensorStorage<T, CPU> {
3838
// Non-owning storage - borrows external mutable memory
3939
static TensorStorage borrow(T* ptr, size_t size) {
4040
TensorStorage storage;
41-
storage.data_ = std::shared_ptr<T[]>(ptr, [](T*) {}); // no-op deleter
41+
storage.data_ = std::shared_ptr<T[]>(ptr, [](T*) {}); // no-op deleter // NOLINT
4242
storage.size_ = size;
4343
return storage;
4444
}
@@ -54,7 +54,7 @@ template <typename T> class TensorStorage<T, CPU> {
5454
}
5555

5656
void resize(size_t size) {
57-
data_ = std::shared_ptr<T[]>(new T[size]);
57+
data_ = std::shared_ptr<T[]>(new T[size]); // NOLINT
5858
size_ = size;
5959
}
6060
void fill(T value) {
@@ -72,7 +72,7 @@ template <typename T> class TensorStorage<T, CPU> {
7272
// Const CPU storage - borrows read-only data (e.g., mmap)
7373
template <typename T> class TensorStorage<const T, CPU> {
7474
private:
75-
std::shared_ptr<const T[]> data_;
75+
std::shared_ptr<const T[]> data_; // NOLINT
7676
size_t size_ = 0;
7777

7878
public:
@@ -84,7 +84,7 @@ template <typename T> class TensorStorage<const T, CPU> {
8484
// Non-owning storage - borrows external read-only memory (e.g., mmap)
8585
static TensorStorage borrow(const T* ptr, size_t size) {
8686
TensorStorage storage;
87-
storage.data_ = std::shared_ptr<const T[]>(ptr, [](const T*) {}); // no-op deleter
87+
storage.data_ = std::shared_ptr<const T[]>(ptr, [](const T*) {}); // no-op deleter // NOLINT
8888
storage.size_ = size;
8989
return storage;
9090
}
@@ -106,14 +106,14 @@ template <typename T> class TensorStorage<const T, CPU> {
106106
template <typename T> class TensorStorage<T, CUDA> {
107107
private:
108108
T* data_ = nullptr;
109-
unsigned int size_ = 0;
109+
size_t size_ = 0;
110110

111111
public:
112112
using pointer = T*;
113113
using const_pointer = const T*;
114114

115115
TensorStorage() = default;
116-
explicit TensorStorage(int size);
116+
explicit TensorStorage(size_t size);
117117
~TensorStorage();
118118

119119
// no copy, move only
@@ -122,7 +122,7 @@ template <typename T> class TensorStorage<T, CUDA> {
122122
TensorStorage(TensorStorage&& other) noexcept;
123123
TensorStorage& operator=(TensorStorage&& other) noexcept;
124124

125-
[[nodiscard]] int size() const {
125+
[[nodiscard]] size_t size() const {
126126
return size_;
127127
}
128128
pointer data() {
@@ -132,7 +132,7 @@ template <typename T> class TensorStorage<T, CUDA> {
132132
return data_;
133133
}
134134

135-
void resize(int size);
135+
void resize(size_t size);
136136
void fill(T value);
137137
};
138138

@@ -141,14 +141,14 @@ template <typename T> class TensorStorage<T, CUDA> {
141141
template <typename T> class TensorStorage<const T, CUDA> {
142142
private:
143143
T* data_ = nullptr;
144-
int size_ = 0;
144+
size_t size_ = 0;
145145

146146
public:
147147
using pointer = const T*;
148148
using const_pointer = const T*;
149149

150150
TensorStorage() = default;
151-
explicit TensorStorage(int size);
151+
explicit TensorStorage(size_t size);
152152
~TensorStorage();
153153

154154
// no copy, move only
@@ -157,7 +157,7 @@ template <typename T> class TensorStorage<const T, CUDA> {
157157
TensorStorage(TensorStorage&& other) noexcept;
158158
TensorStorage& operator=(TensorStorage&& other) noexcept;
159159

160-
[[nodiscard]] int size() const {
160+
[[nodiscard]] size_t size() const {
161161
return size_;
162162
}
163163
const_pointer data() const {
@@ -169,7 +169,7 @@ template <typename T> class TensorStorage<const T, CUDA> {
169169
return data_;
170170
}
171171

172-
void resize(int size);
172+
void resize(size_t size);
173173
};
174174
#endif
175175

src/llama/rope.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ precompute_rope_values(size_t head_dim, float theta_base, size_t context_length)
3535
// For each frequency, compute wavelength and apply scaling
3636
for (size_t i = 0; i < inv_freq_.size(); ++i) {
3737
float inv_f = inv_freq_.span()[i];
38-
float wavelen = 2.0 * M_PI / inv_f;
38+
float wavelen = M_PI * 2.0 / inv_f;
3939

4040
if (wavelen < high_freq_wavelen) {
4141
// High frequency: no scaling
@@ -47,7 +47,7 @@ precompute_rope_values(size_t head_dim, float theta_base, size_t context_length)
4747
// Medium frequency: smooth interpolation
4848
float smooth =
4949
(old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor);
50-
float scaled_inv_freq = (1.0 - smooth) * (inv_f / factor) + smooth * inv_f;
50+
float scaled_inv_freq = ((1.0 - smooth) * (inv_f / factor)) + (smooth * inv_f);
5151
inv_freq_.span()[i] = scaled_inv_freq;
5252
}
5353
}

src/tensor/cpu/ops.cpp

Lines changed: 15 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -74,11 +74,12 @@ Tensor<std::remove_const_t<T>, D> add(const TensorView<T, D>& tensor_a,
7474
[](T val_a, T val_b) { return val_a + val_b; });
7575
}
7676

77-
template Tensor<bfloat16, CPU> add(const TensorView<const bfloat16, CPU>&,
78-
const TensorView<const bfloat16, CPU>&);
79-
template Tensor<float, CPU> add(const TensorView<const float, CPU>&,
80-
const TensorView<const float, CPU>&);
81-
template Tensor<int, CPU> add(const TensorView<const int, CPU>&, const TensorView<const int, CPU>&);
77+
// template Tensor<bfloat16, CPU> add(const TensorView<const bfloat16, CPU>&,
78+
// const TensorView<const bfloat16, CPU>&);
79+
// template Tensor<float, CPU> add(const TensorView<const float, CPU>&,
80+
// const TensorView<const float, CPU>&);
81+
// template Tensor<int, CPU> add(const TensorView<const int, CPU>&, const TensorView<const int,
82+
// CPU>&);
8283

8384
template <typename T, typename D>
8485
Tensor<std::remove_const_t<T>, D> sub(const TensorView<T, D>& tensor_a,
@@ -566,24 +567,22 @@ template void replace_from_(Tensor<float, CPU>& destination, const TensorView<fl
566567
// Explicit instantiations for non-const T
567568
template Tensor<bfloat16, CPU> add(const TensorView<bfloat16, CPU>&,
568569
const TensorView<bfloat16, CPU>&);
569-
template Tensor<int, CPU> add(const TensorView<int, CPU>&, const TensorView<int, CPU>&);
570-
template Tensor<float, CPU> add(const TensorView<float, CPU>&, const TensorView<float, CPU>&);
571-
template Tensor<bfloat16, CPU> sub(const TensorView<bfloat16, CPU>&,
572-
const TensorView<bfloat16, CPU>&);
570+
// template Tensor<bfloat16, CPU> sub(const TensorView<bfloat16, CPU>&,
571+
// const TensorView<bfloat16, CPU>&);
573572
template Tensor<float, CPU> sub(const TensorView<float, CPU>&, const TensorView<float, CPU>&);
574-
template Tensor<bfloat16, CPU> div(const TensorView<bfloat16, CPU>&,
575-
const TensorView<bfloat16, CPU>&);
573+
// template Tensor<bfloat16, CPU> div(const TensorView<bfloat16, CPU>&,
574+
// const TensorView<bfloat16, CPU>&);
576575
template Tensor<float, CPU> div(const TensorView<float, CPU>&, const TensorView<float, CPU>&);
577-
template Tensor<bfloat16, CPU> div(const TensorView<bfloat16, CPU>&, bfloat16);
576+
// template Tensor<bfloat16, CPU> div(const TensorView<bfloat16, CPU>&, bfloat16);
578577
template Tensor<float, CPU> div(const TensorView<float, CPU>&, float);
579578
template Tensor<bfloat16, CPU> mul(const TensorView<bfloat16, CPU>&, bfloat16);
580579
template Tensor<bfloat16, CPU> mul(const TensorView<bfloat16, CPU>&,
581580
const TensorView<bfloat16, CPU>&);
582-
template Tensor<float, CPU> mul(const TensorView<float, CPU>&, float);
583-
template Tensor<float, CPU> mul(const TensorView<float, CPU>&, const TensorView<float, CPU>&);
584-
template Tensor<bfloat16, CPU> sum(const TensorView<bfloat16, CPU>&, int, bool);
581+
// template Tensor<float, CPU> mul(const TensorView<float, CPU>&, float);
582+
// template Tensor<float, CPU> mul(const TensorView<float, CPU>&, const TensorView<float, CPU>&);
583+
// template Tensor<bfloat16, CPU> sum(const TensorView<bfloat16, CPU>&, int, bool);
585584
template Tensor<float, CPU> sum(const TensorView<float, CPU>&, int, bool);
586-
template Tensor<bfloat16, CPU> max(const TensorView<bfloat16, CPU>&, int, bool);
585+
// template Tensor<bfloat16, CPU> max(const TensorView<bfloat16, CPU>&, int, bool);
587586
template Tensor<float, CPU> max(const TensorView<float, CPU>&, int, bool);
588587
template Tensor<bfloat16, CPU> masked_fill(const TensorView<bfloat16, CPU>&,
589588
const TensorView<int, CPU>&, bfloat16);

src/tensor/cuda/kernels/add.cu

Lines changed: 24 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,21 +1,13 @@
11
#include "add.cuh"
2+
#include "utils.cuh"
23
#include <cstddef>
34
#include <cuda_bf16.hpp>
45

56
namespace tensor::kernels {
67

78
using namespace dtype;
89

9-
template<typename DeviceT>
10-
__global__ void add_kernel(DeviceT* out, DeviceT* tensor_a, DeviceT* tensor_b, size_t n) {
11-
size_t idx = (blockIdx.x * blockDim.x) + threadIdx.x;
12-
13-
if (idx < n) {
14-
out[idx] = tensor_a[idx] + tensor_b[idx];
15-
}
16-
}
17-
18-
__global__ void add_kernel_bf16(Cuda<bfloat16>* out, Cuda<bfloat16>* tensor_a, Cuda<bfloat16>* tensor_b, size_t n) {
10+
__global__ void add_bfloat16_kernel(Cuda<bfloat16>* out, Cuda<bfloat16>* tensor_a, Cuda<bfloat16>* tensor_b, size_t n) {
1911
// we load 8 bf16 values at a time = 128 bits
2012
auto base = (blockIdx.x * blockDim.x) + threadIdx.x;
2113
auto idx = base * 8;
@@ -41,7 +33,27 @@ __global__ void add_kernel_bf16(Cuda<bfloat16>* out, Cuda<bfloat16>* tensor_a, C
4133
}
4234
}
4335

44-
template __global__ void add_kernel<Cuda<float>>(Cuda<float>*, Cuda<float>*, Cuda<float>*, size_t);
45-
template __global__ void add_kernel<Cuda<int>>(Cuda<int>*, Cuda<int>*, Cuda<int>*, size_t);
36+
Tensor<bfloat16, CUDA> add_bfloat16(const TensorView<bfloat16, CUDA>& tensor_a, const TensorView<bfloat16, CUDA>& tensor_b) {
37+
assert(tensor_a.is_contiguous() && tensor_b.is_contiguous() && "the two tensors should be contiguous");
38+
assert(tensor_a.shape == tensor_b.shape && "the two tensors should be the same shape");
39+
40+
size_t n_elements = tensor_a.data_size;
41+
TensorStorage<std::remove_const_t<bfloat16>, CUDA> storage(n_elements);
42+
43+
Tensor<std::remove_const_t<bfloat16>, CUDA> out{tensor_a.shape, std::move(storage)};
44+
45+
int block_size = 512;
46+
// each thread handles 8 elements
47+
int grid_size = cuda::get_grid_size(n_elements / 8, block_size);
48+
49+
// Convert to device-native types for kernel call
50+
auto* out_d = reinterpret_cast<Cuda<bfloat16>*>(out.data()); // NOLINT
51+
auto* a_d = reinterpret_cast<Cuda<bfloat16>*>(tensor_a.data); // NOLINT
52+
auto* b_d = reinterpret_cast<Cuda<bfloat16>*>(tensor_b.data); // NOLINT
53+
54+
add_bfloat16_kernel<<<grid_size, block_size>>>(out_d, a_d, b_d, n_elements);
55+
56+
return out;
57+
}
4658

4759
} // namespace tensor::kernels

src/tensor/cuda/kernels/add.cuh

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2,18 +2,15 @@
22

33
#include <cuda_runtime.h>
44
#include <tensor/device_type.hpp>
5+
#include <tensor/tensor.hpp>
56
#include <cstddef>
67

78
namespace tensor::kernels {
89

910
using namespace dtype;
1011

11-
template<typename DeviceT>
12-
__global__ void add_kernel(DeviceT* out, DeviceT* tensor_a, DeviceT* tensor_b, size_t n);
12+
__global__ void add_bfloat16_kernel(Cuda<bfloat16>* out, Cuda<bfloat16>* tensor_a, Cuda<bfloat16>* tensor_b, size_t n);
1313

14-
15-
extern template __global__ void add_kernel<Cuda<float>>(Cuda<float>*, Cuda<float>*, Cuda<float>*, size_t);
16-
extern template __global__ void add_kernel<Cuda<int>>(Cuda<int>*, Cuda<int>*, Cuda<int>*, size_t);
17-
__global__ void add_kernel_bf16(Cuda<bfloat16>* out, Cuda<bfloat16>* tensor_a, Cuda<bfloat16>* tensor_b, size_t n);
14+
Tensor<bfloat16, CUDA> add_bfloat16(const TensorView<bfloat16, CUDA>& tensor_a, const TensorView<bfloat16, CUDA>& tensor_b);
1815

1916
} // namespace tensor::kernels

src/tensor/cuda/kernels/sub.cu

Lines changed: 43 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,34 +1,58 @@
1-
#include "sub.cuh"
21
#include <cstddef>
2+
#include "sub.cuh"
3+
#include "utils.cuh"
34

45
namespace tensor::kernels {
56

67
using namespace dtype;
78

8-
template<typename DeviceT>
9-
__global__ void sub_kernel(DeviceT* out, DeviceT* tensor_a, DeviceT* tensor_b, size_t n) {
10-
size_t idx = (blockIdx.x * blockDim.x) + threadIdx.x;
9+
__global__ void sub_float_kernel(Cuda<float>* out, Cuda<float>* tensor_a, Cuda<float>* tensor_b, size_t n) {
10+
// we load 4 fp32 values at a time = 128 bits
11+
auto base = (blockIdx.x * blockDim.x) + threadIdx.x;
12+
auto idx = base * 4;
1113

12-
if (idx < n) {
13-
out[idx] = tensor_a[idx] - tensor_b[idx];
14-
}
15-
}
14+
if (idx + 3 < n) {
15+
// load 2 doubles = 4 floats = 128 bits
16+
double2 a_vec = reinterpret_cast<double2*>(tensor_a)[base]; // NOLINT
17+
double2 b_vec = reinterpret_cast<double2*>(tensor_b)[base]; // NOLINT
1618

17-
template __global__ void sub_kernel<Cuda<float>>(Cuda<float>*, Cuda<float>*, Cuda<float>*, size_t);
18-
template __global__ void sub_kernel<Cuda<int>>(Cuda<int>*, Cuda<int>*, Cuda<int>*, size_t);
19-
template __global__ void sub_kernel<Cuda<bfloat16>>(Cuda<bfloat16>*, Cuda<bfloat16>*, Cuda<bfloat16>*, size_t);
19+
// reinterpret as a pair of floats
20+
float* a2 = reinterpret_cast<float*>(&a_vec); // NOLINT
21+
float* b2 = reinterpret_cast<float*>(&b_vec); // NOLINT
2022

21-
template<typename DeviceT>
22-
__global__ void sub_scalar_kernel(DeviceT* out, DeviceT* tensor_a, DeviceT scalar, size_t n) {
23-
size_t idx = (blockIdx.x * blockDim.x) + threadIdx.x;
23+
double2 out_vec;
24+
float* out2 = reinterpret_cast<float*>(&out_vec); // NOLINT
2425

25-
if (idx < n) {
26-
out[idx] = tensor_a[idx] - scalar;
26+
out2[0] = a2[0] - b2[0];
27+
out2[1] = a2[1] - b2[1];
28+
out2[2] = a2[2] - b2[2];
29+
out2[3] = a2[3] - b2[3];
30+
31+
reinterpret_cast<double2*>(out)[base] = out_vec; // NOLINT
2732
}
2833
}
2934

30-
template __global__ void sub_scalar_kernel<Cuda<float>>(Cuda<float>*, Cuda<float>*, Cuda<float>, size_t);
31-
template __global__ void sub_scalar_kernel<Cuda<int>>(Cuda<int>*, Cuda<int>*, Cuda<int>, size_t);
32-
template __global__ void sub_scalar_kernel<Cuda<bfloat16>>(Cuda<bfloat16>*, Cuda<bfloat16>*, Cuda<bfloat16>, size_t);
35+
36+
Tensor<float, CUDA> sub_float(const TensorView<float, CUDA>& tensor_a, const TensorView<float, CUDA>& tensor_b) {
37+
assert(tensor_a.is_contiguous() && tensor_b.is_contiguous() && "the two tensors should be contiguous");
38+
assert(tensor_a.shape == tensor_b.shape && "the two tensors should be the same shape");
39+
40+
size_t n_elements = tensor_a.data_size;
41+
TensorStorage<std::remove_const_t<float>, CUDA> storage(n_elements);
42+
43+
Tensor<std::remove_const_t<float>, CUDA> out{tensor_a.shape, std::move(storage)};
44+
45+
int block_size = 512;
46+
// each thread handles 4 elements
47+
int grid_size = cuda::get_grid_size(n_elements / 4, block_size);
48+
49+
auto* out_d = reinterpret_cast<Cuda<float>*>(out.data()); // NOLINT
50+
auto* a_d = reinterpret_cast<Cuda<float>*>(tensor_a.data); // NOLINT
51+
auto* b_d = reinterpret_cast<Cuda<float>*>(tensor_b.data); // NOLINT
52+
53+
sub_float_kernel<<<grid_size, block_size>>>(out_d, a_d, b_d, n_elements);
54+
55+
return out;
56+
}
3357

3458
} // namespace tensor::kernels

src/tensor/cuda/kernels/sub.cuh

Lines changed: 3 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -2,24 +2,15 @@
22

33
#include <cuda_runtime.h>
44
#include <tensor/device_type.hpp>
5+
#include <tensor/tensor.hpp>
56
#include <cstddef>
67

78
namespace tensor::kernels {
89

910
using namespace dtype;
1011

11-
template<typename DeviceT>
12-
__global__ void sub_kernel(DeviceT* out, DeviceT* tensor_a, DeviceT* tensor_b, size_t n);
12+
__global__ void sub_float_kernel(Cuda<float>* out, Cuda<float>* tensor_a, Cuda<float>* tensor_b, size_t n);
1313

14-
extern template __global__ void sub_kernel<Cuda<float>>(Cuda<float>*, Cuda<float>*, Cuda<float>*, size_t);
15-
extern template __global__ void sub_kernel<Cuda<int>>(Cuda<int>*, Cuda<int>*, Cuda<int>*, size_t);
16-
extern template __global__ void sub_kernel<Cuda<bfloat16>>(Cuda<bfloat16>*, Cuda<bfloat16>*, Cuda<bfloat16>*, size_t);
17-
18-
template<typename DeviceT>
19-
__global__ void sub_scalar_kernel(DeviceT* out, DeviceT* tensor_a, DeviceT scalar, size_t n);
20-
21-
extern template __global__ void sub_scalar_kernel<Cuda<float>>(Cuda<float>*, Cuda<float>*, Cuda<float>, size_t);
22-
extern template __global__ void sub_scalar_kernel<Cuda<int>>(Cuda<int>*, Cuda<int>*, Cuda<int>, size_t);
23-
extern template __global__ void sub_scalar_kernel<Cuda<bfloat16>>(Cuda<bfloat16>*, Cuda<bfloat16>*, Cuda<bfloat16>, size_t);
14+
Tensor<float, CUDA> sub_float(const TensorView<float, CUDA>& tensor_a, const TensorView<float, CUDA>& tensor_b);
2415

2516
} // namespace tensor::kernels

src/tensor/cuda/loader.cu

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
#include <tensor/tensor.hpp>
33

44
#include "../common/utils.h"
5-
#include "utils.cuh"
5+
#include "kernels/utils.cuh"
66

77
namespace tensor {
88

0 commit comments

Comments (0)