#include "max.cuh"
#include "utils.cuh"
#include <cassert>
#include <cstddef>
#include <cuda_bf16.h>
#include <limits>

namespace tensor::kernels {

using namespace dtype;

const int blockThreads = 256;
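
// One block computes one max-reduction: block b reduces the reduce_size elements
// of `input` that belong to output element b (they are spaced reduce_stride apart
// in memory) and writes the result to out[b]. num_reductions is the total number
// of reductions, i.e. the expected grid size.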
__global__ void max_float_kernel(Cuda<float>* out, Cuda<float>* input, size_t num_reductions, size_t reduce_size, size_t reduce_stride) {
  __shared__ Cuda<float> shmem[blockThreads]; // NOLINT
  size_t tid = threadIdx.x;

  size_t reduction_idx = blockIdx.x; // which reduction are we doing?
  if (reduction_idx >= num_reductions) { return; } // uniform per block, so safe before __syncthreads

  // decompose into outer and inner indices
  size_t outer_idx = reduction_idx / reduce_stride;
  size_t inner_idx = reduction_idx % reduce_stride;

  // base pointer for this reduction
  size_t base = (outer_idx * reduce_size * reduce_stride) + inner_idx;
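  // For a contiguous tensor viewed as (outer, reduce, inner), element (o, r, i)
  // sits at o * reduce_size * reduce_stride + r * reduce_stride + i, so the r-th
  // element of this reduction is at base + r * reduce_stride.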

  // reduce with a block-stride loop to handle reduce_size > blockThreads
  float thread_max = -std::numeric_limits<float>::infinity();
  for (size_t element = tid; element < reduce_size; element += blockDim.x) {
    thread_max = max(thread_max, input[base + (element * reduce_stride)]);
  }
  // now we only have to reduce 'blockThreads' elements, which is easy within a block

  // load partial maxs onto shmem
  shmem[tid] = thread_max;
  __syncthreads();

  // reduce in shared memory
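  // Tree reduction: each iteration halves the number of active threads, stopping
  // once 64 partial maxima remain so the final warp can finish with shuffles
  // instead of further __syncthreads.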
  for (int stride = blockDim.x / 2; stride > 32; stride >>= 1) { // NOLINT
    if (tid < stride) { shmem[tid] = max(shmem[tid], shmem[tid + stride]); }
    __syncthreads();
  }

  // warp shuffle for the final warp-level reduction
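  // The first warp folds the remaining 64 shared-memory values down to 32
  // registers, then __shfl_down_sync halves the distance (16, 8, 4, 2, 1)
  // until lane 0 holds the block-wide maximum.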
  if (tid < 32) {
    float val = max(shmem[tid], shmem[tid + 32]);
    for (int offset = 16; offset > 0; offset >>= 1) {
      val = max(val, __shfl_down_sync(0xffffffff, val, offset));
    }

    if (tid == 0) {
      out[reduction_idx] = val;
    }
  }
}
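
// Max-reduce a contiguous CUDA tensor over dimension `dim`.
// The shape is split into (outer, reduce, inner) around `dim`: e.g. shape
// (2, 3, 4) with dim = 1 gives outer_size = 2, reduce_size = 3, inner_size = 4,
// an output shape of (2, 4) (or (2, 1, 4) with keepdim), and
// outer_size * inner_size = 8 independent reductions.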
Tensor<float, CUDA> max_float(const TensorView<float, CUDA>& input, int dim, bool keepdim) {
  assert(input.is_contiguous() && "the tensor should be contiguous");

  auto shape = input.shape;

  if (dim < 0) {
    dim = shape.size() + dim;
  }

  assert(dim >= 0 && static_cast<size_t>(dim) < shape.size());

  size_t outer_size = 1;  // product of dims before `dim`
  size_t inner_size = 1;  // product of dims after `dim`; also the distance between elements to reduce
  size_t reduce_size = 1; // how many elements each reduction needs to reduce over

  bool found_dim = false;

  // Output shape
  Shape out_shape;
  for (size_t i = 0; i < shape.size(); ++i) {
    if (i == static_cast<size_t>(dim)) {
      if (keepdim) {
        out_shape.push_back(1);
      }
      reduce_size = shape[dim];
      found_dim = true;
    } else {
      if (!found_dim) {
        outer_size *= shape[i];
      } else {
        inner_size *= shape[i];
      }

      out_shape.push_back(shape[i]);
    }
  }

  if (out_shape.empty()) {
    out_shape.push_back(1);
  }

  auto n_elements = outer_size * inner_size;

  auto input_strides = get_all_strides(shape);

  TensorStorage<float, CUDA> storage(n_elements);
  Tensor<float, CUDA> out{out_shape, std::move(storage)};

  int block_size = blockThreads;

  // Convert to device-native types for kernel call
  auto* out_d = reinterpret_cast<Cuda<float>*>(out.data()); // NOLINT
  auto* input_d = reinterpret_cast<Cuda<float>*>(input.data); // NOLINT
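  // Launch one block per reduction: n_elements blocks of blockThreads threads,
  // each reducing reduce_size elements spaced inner_size apart.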
  max_float_kernel<<<n_elements, block_size>>>(out_d, input_d, n_elements, reduce_size, inner_size);

  return out;
}

} // namespace tensor::kernels