Commit 42df051

Llama generates tokens on CUDA!
1 parent bc784df commit 42df051

49 files changed: 1,221 additions and 410 deletions

Note: this is a large commit, so only a subset of the changed files is shown below.

CMakeLists.txt

Lines changed: 20 additions & 0 deletions

@@ -7,6 +7,26 @@ project(
   LANGUAGES CXX
 )
 
+# Add cmake module path for our custom modules
+list(APPEND CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}/cmake")
+
+# Backend detection
+if(APPLE)
+  set(BACKEND_METAL ON)
+  set(BACKEND_CUDA OFF)
+else()
+  set(BACKEND_CUDA ON)
+  set(BACKEND_METAL OFF)
+endif()
+
+# Propagate backend flags as compile definitions
+if(BACKEND_CUDA)
+  add_compile_definitions(BACKEND_CUDA)
+endif()
+if(BACKEND_METAL)
+  add_compile_definitions(BACKEND_METAL)
+endif()
+
 set(FETCHCONTENT_BASE_DIR "${CMAKE_SOURCE_DIR}/.cmake/fetchcontent")
 set(FETCHCONTENT_UPDATES_DISCONNECTED ON)
 
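
These BACKEND_CUDA / BACKEND_METAL compile definitions are what the C++ sources key off (see include/tensor/device.hpp below). A minimal, self-contained sketch of that consumption pattern — a hypothetical illustration, not code from this commit:

#include <cstdio>

// Hypothetical consumer of the BACKEND_* compile definitions added above.
// The CMake logic defines exactly one of them per platform.
const char* backend_name() {
#if defined(BACKEND_CUDA)
  return "CUDA";
#elif defined(BACKEND_METAL)
  return "Metal";
#else
  return "CPU only";
#endif
}

int main() {
  std::printf("Configured backend: %s\n", backend_name());
  return 0;
}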

Makefile

Lines changed: 17 additions & 0 deletions

@@ -36,6 +36,12 @@ inspect:
 	@cmake --build build --target inspect
 	@./build/apps/inspect
 
+.PHONY: tensor
+tensor:
+	@cmake --build build --target tensor
+	@ctest --test-dir build -R "^TensorCPU" --output-on-failure
+	@ctest --test-dir build -R "^TensorCUDA" --output-on-failure
+
 .PHONY: tensor_cpu
 tensor_cpu:
 	@cmake --build build --target test_tensor_cpu
@@ -46,6 +52,12 @@ tensor_cuda:
 	@cmake --build build --target test_tensor_cuda
 	@ctest --test-dir build -R "^TensorCUDA" --output-on-failure
 
+.PHONY: nn
+nn:
+	@cmake --build build --target nn
+	@ctest --test-dir build -R "^NNCPU" --output-on-failure
+	@ctest --test-dir build -R "^NNCUDA" --output-on-failure
+
 .PHONY: nn_cpu
 nn_cpu:
 	@cmake --build build --target test_nn_cpu
@@ -61,6 +73,11 @@ llama:
 	@cmake --build build --target test_llama
 	@ctest --test-dir build -R "^Llama" --output-on-failure
 
+.PHONY: llama_cuda
+llama_cuda:
+	@cmake --build build --target test_llama
+	@ctest --test-dir build -R "^LlamaCUDA" --output-on-failure
+
 .PHONY: forward
 forward:
 	@cmake --build build --target test_forward
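
In short: make tensor and make nn each build one target and run both its CPU and CUDA test suites, while make llama_cuda rebuilds test_llama but filters ctest to the ^LlamaCUDA tests only.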

apps/forward.cpp

Lines changed: 40 additions & 19 deletions

@@ -1,4 +1,5 @@
 #include <fmt/format.h>
+#include <cstring>
 
 #include <forward/sampler.hpp>
 #include <forward/tokenizer.hpp>
@@ -9,25 +10,17 @@
 using namespace llama;
 using namespace tensor;
 
-int main(int argc, char* argv[]) {
-  const auto* path = "./tests/model";
-  if (argc > 1) {
-    path = argv[1];
-  }
-
+template <Device D>
+void run_inference(const char* path, tokenizer::Tokenizer& tok) {
   size_t max_tokens = 128;
   size_t kv_cache_size = max_tokens;
 
-  tokenizer::Tokenizer tok("./tests/model/tokenizer.json");
-
-  sampler::GreedySampler<bfloat16, CPU> sampler{sampler::GreedyConfig{}, tok};
-
-  Model<bfloat16, CPU> mod("./tests/model/config.json", max_tokens, kv_cache_size);
+  sampler::GreedySampler<bfloat16, D> sampler{sampler::GreedyConfig{}, tok};
 
-  // loader::inspect_safetensors("./tests/model/model.safetensors");
+  Model<bfloat16, D> mod(fmt::format("{}/config.json", path), max_tokens, kv_cache_size);
 
   fmt::println("Loading weights...");
-  Loader<bfloat16, CPU> loader{"./tests/model/model.safetensors"};
+  Loader<bfloat16, D> loader{fmt::format("{}/model.safetensors", path)};
   mod.load_weights(loader);
 
   fmt::println("Weights loaded! Performing inference...");
@@ -36,16 +29,44 @@ int main(int argc, char* argv[]) {
 
   fmt::println("Prompt: {}", prompt);
 
-  auto gen_and_tok_s = sampler.generate(mod, prompt, 12);
+  auto [out, stats] = sampler.generate(mod, prompt, 12);
 
-  auto out = std::get<0>(gen_and_tok_s);
-  auto tok_s = std::get<1>(gen_and_tok_s);
+  auto colored_out = fmt::format(fmt::fg(fmt::color::aqua), "{}", out);
 
-  out = fmt::format(fmt::fg(fmt::color::aqua), "{}", out);
+  fmt::println("{}{}", prompt, colored_out);
 
-  fmt::println("{}{}", prompt, out);
+  fmt::println("");
+  fmt::println("TTFT: {:.2f} ms", stats.ttft_ms);
+  fmt::println("Avg ITL: {:.2f} ms", stats.avg_itl_ms);
+  fmt::println("Tokens / sec: {:.2f}", stats.tokens_per_sec);
+}
 
-  fmt::println("Tokens / sec: {}", tok_s);
+int main(int argc, char* argv[]) {
+  const char* path = "./tests/model";
+  bool use_cuda = false;
+
+  for (int i = 1; i < argc; ++i) {
+    if (std::strcmp(argv[i], "--cuda") == 0) {
+      use_cuda = true;
+    } else {
+      path = argv[i];
+    }
+  }
+
+  tokenizer::Tokenizer tok(fmt::format("{}/tokenizer.json", path));
+
+  if (use_cuda) {
+#ifdef BACKEND_CUDA
+    fmt::println("Using CUDA backend");
+    run_inference<CUDA>(path, tok);
+#else
+    fmt::println("Error: CUDA backend not available. Rebuild with CUDA support.");
+    return 1;
+#endif
+  } else {
+    fmt::println("Using CPU backend");
+    run_inference<CPU>(path, tok);
+  }
 
   return 0;
 }
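
Given that argument loop, the binary can presumably be invoked as ./build/apps/forward --cuda <model_dir>: --cuda selects the CUDA backend (when compiled in) and any other argument overrides the default model directory ./tests/model. Note how a runtime flag becomes a compile-time choice by instantiating run_inference<CUDA> or run_inference<CPU>.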

cmake/CUDAConfig.cmake

Lines changed: 88 additions & 0 deletions

@@ -0,0 +1,88 @@
+# Common CUDA configuration for all CUDA targets
+#
+# Usage:
+#   include(CUDAConfig)
+#   configure_cuda_target(my_cuda_target)
+
+# Target CUDA architectures
+#  70 = V100
+#  75 = RTX 20xx, T4
+#  80 = A100
+#  86 = RTX 30xx
+#  89 = RTX 40xx
+#  90 = H100
+# 100 = B200
+# 120 = RTX 5090
+
+function(configure_cuda_target TARGET_NAME)
+  set_target_properties(${TARGET_NAME} PROPERTIES
+    CUDA_SEPARABLE_COMPILATION ON
+    CUDA_RESOLVE_DEVICE_SYMBOLS ON
+  )
+
+  # Note: CUDA_ARCHITECTURES doesn't support generator expressions.
+  # Use CMAKE_BUILD_TYPE to control this at configure time.
+  if(CMAKE_BUILD_TYPE STREQUAL "Debug")
+    set_property(TARGET ${TARGET_NAME} PROPERTY CUDA_ARCHITECTURES 120)
+  else()
+    # RelWithDebInfo and Release: include PTX for future architectures
+    set_property(TARGET ${TARGET_NAME} PROPERTY CUDA_ARCHITECTURES 120-real 120-virtual)
+  endif()
+
+  if(CMAKE_CUDA_COMPILER_ID STREQUAL "NVIDIA")
+    # CUDA compile options for nvcc
+    target_compile_options(${TARGET_NAME} PRIVATE
+      $<$<COMPILE_LANGUAGE:CUDA>:
+        # Debug builds: full debug info
+        $<$<CONFIG:Debug>:
+          -G # Generate device debug info (disables optimizations)
+          -g # Generate host debug info
+        >
+
+        # RelWithDebInfo: debug info + optimizations
+        $<$<CONFIG:RelWithDebInfo>:
+          -G              # Device debug info
+          -g              # Host debug info
+          --use_fast_math # Fast math even in debug
+        >
+
+        # Release: maximum optimization
+        $<$<CONFIG:Release>:
+          -lineinfo
+          --use_fast_math
+        >
+
+        # Common flags for all builds
+        --expt-relaxed-constexpr
+        -Xcompiler=-fPIC
+      >
+    )
+  elseif(CMAKE_CUDA_COMPILER_ID STREQUAL "Clang")
+    # Clang CUDA flags (for clangd compatibility).
+    # Use the cuda-merged package, which has complete headers, not just nvcc.
+    # Also specify resource-dir for NixOS, where clangd uses a different resource directory.
+    set(CLANG_CUDA_FLAGS -fPIC)
+
+    # Include cuda_compat.h if it exists in the source directory
+    if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/cuda_compat.h")
+      list(APPEND CLANG_CUDA_FLAGS -include ${CMAKE_CURRENT_SOURCE_DIR}/cuda_compat.h)
+    endif()
+
+    if(DEFINED ENV{CUDA_PATH})
+      list(APPEND CLANG_CUDA_FLAGS --cuda-path=$ENV{CUDA_PATH})
+    endif()
+
+    if(DEFINED CLANG_RESOURCE_DIR AND NOT CLANG_RESOURCE_DIR STREQUAL "")
+      list(APPEND CLANG_CUDA_FLAGS -resource-dir=${CLANG_RESOURCE_DIR})
+    endif()
+
+    # The NixOS clang wrapper injects GCC C++ includes, which conflict with libc++.
+    # Use -nostdinc++ to disable auto-injection, then explicitly add libc++ headers.
+    # _ALLOW_UNSUPPORTED_LIBCPP bypasses CUDA's "libc++ not supported on x86" error.
+    if(DEFINED LIBCXX_INCLUDE AND NOT LIBCXX_INCLUDE STREQUAL "")
+      list(APPEND CLANG_CUDA_FLAGS -nostdinc++ -cxx-isystem${LIBCXX_INCLUDE} -D_ALLOW_UNSUPPORTED_LIBCPP)
+    endif()
+
+    target_compile_options(${TARGET_NAME} PRIVATE ${CLANG_CUDA_FLAGS})
+  endif()
+endfunction()
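
A note on the architecture list: in CMake's CUDA_ARCHITECTURES syntax, 120-real emits SASS machine code for compute capability 12.0 only, while 120-virtual embeds PTX that the driver can JIT-compile for newer GPUs. Listing both trades a slightly larger binary for forward compatibility, which is why the release configurations use the pair.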

include/forward/sampler.hpp

Lines changed: 8 additions & 2 deletions

@@ -6,6 +6,12 @@
 
 namespace sampler {
 
+struct GenerationStats {
+  float tokens_per_sec; // Overall throughput
+  float ttft_ms;        // Time to first token (ms)
+  float avg_itl_ms;     // Average inter-token latency (ms)
+};
+
 struct GreedyConfig {};
 
 template <typename T> struct is_config : std::false_type {};
@@ -25,8 +31,8 @@ template <tensor::DType T, tensor::Device D, Config C> struct Sampler {
   explicit Sampler(C config, tokenizer::Tokenizer& tokenizer);
   virtual ~Sampler() = default;
 
-  std::tuple<std::string, float> generate(llama::Model<T, D>& model, std::string_view prompt,
-                                          size_t max_num_tokens);
+  std::tuple<std::string, GenerationStats> generate(llama::Model<T, D>& model, std::string_view prompt,
+                                                    size_t max_num_tokens);
 };
 
 template <tensor::DType T, tensor::Device D> struct GreedySampler : Sampler<T, D, GreedyConfig> {
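
The diff shows where GenerationStats is declared and printed but not how it is filled in. Below is a minimal sketch of how these three metrics are conventionally measured with std::chrono — an assumption about the implementation, with next_token() as a stand-in for one forward-plus-sampling step:

#include <chrono>
#include <cstddef>
#include <cstdio>

// Mirrors the struct above so the example stands alone.
struct GenerationStats {
  float tokens_per_sec; // Overall throughput
  float ttft_ms;        // Time to first token (ms)
  float avg_itl_ms;     // Average inter-token latency (ms)
};

int next_token() { return 0; } // placeholder for model forward + argmax

GenerationStats timed_generate(std::size_t max_num_tokens) {
  using clock = std::chrono::steady_clock;
  GenerationStats stats{};
  const auto start = clock::now();
  auto prev = start;
  double itl_sum_ms = 0.0;
  for (std::size_t i = 0; i < max_num_tokens; ++i) {
    (void)next_token();
    const auto now = clock::now();
    const std::chrono::duration<double, std::milli> step = now - prev;
    if (i == 0) {
      stats.ttft_ms = static_cast<float>(step.count()); // prefill + first token
    } else {
      itl_sum_ms += step.count(); // gap between consecutive tokens
    }
    prev = now;
  }
  const std::chrono::duration<double> total = prev - start;
  if (max_num_tokens > 1)
    stats.avg_itl_ms = static_cast<float>(itl_sum_ms / (max_num_tokens - 1));
  if (total.count() > 0)
    stats.tokens_per_sec = static_cast<float>(max_num_tokens / total.count());
  return stats;
}

int main() {
  const auto s = timed_generate(12);
  std::printf("TTFT %.2f ms, avg ITL %.2f ms, %.2f tok/s\n", s.ttft_ms, s.avg_itl_ms, s.tokens_per_sec);
  return 0;
}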

include/llama/rope.hpp

Lines changed: 11 additions & 5 deletions

@@ -5,9 +5,15 @@
 
 namespace llama {
 
+using namespace tensor;
+
+template <typename D>
+void apply_rope_scaling_(Tensor<float, D>& inv_freq, float factor, float low_freq_factor,
+                         float high_freq_factor, float old_context_len);
+
 template <typename T, typename D> class RoPE {
 private:
-  std::tuple<tensor::Tensor<float, D>, tensor::Tensor<float, D>> cos_sin; // float32
+  std::tuple<Tensor<float, D>, Tensor<float, D>> cos_sin; // float32
 
 public:
   explicit RoPE(const llama::ModelConfig& config);
@@ -17,10 +23,10 @@ template <typename T, typename D> class RoPE {
   RoPE(const RoPE&) = delete;
   RoPE& operator=(const RoPE&) = delete;
 
-  tensor::TensorView<const float, D> cos() const;
-  tensor::TensorView<const float, D> sin() const;
+  TensorView<const float, D> cos() const;
+  TensorView<const float, D> sin() const;
 
-  tensor::Tensor<std::remove_const_t<T>, D> forward(tensor::TensorView<T, D> inputs,
-                                                    size_t position_offset = 0) const;
+  Tensor<std::remove_const_t<T>, D> forward(TensorView<T, D> inputs,
+                                            size_t position_offset = 0) const;
 };
 } // namespace llama
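
The apply_rope_scaling_ signature (factor, low/high frequency factors, original context length) matches Llama 3-style RoPE frequency scaling. As a reference for what such a function conventionally computes — a CPU sketch over a plain vector, assuming the standard Llama 3 formula rather than quoting this commit's implementation:

#include <vector>

// Llama 3-style scaling of the RoPE inverse-frequency table. The commit's
// version operates on Tensor<float, D> in place; this stand-in uses std::vector.
void apply_rope_scaling_ref(std::vector<float>& inv_freq, float factor,
                            float low_freq_factor, float high_freq_factor,
                            float old_context_len) {
  const float two_pi = 2.0f * 3.14159265f;
  const float low_freq_wavelen = old_context_len / low_freq_factor;
  const float high_freq_wavelen = old_context_len / high_freq_factor;
  for (float& f : inv_freq) {
    const float wavelen = two_pi / f;
    if (wavelen > low_freq_wavelen) {
      f /= factor; // long wavelengths (low frequencies): fully rescaled
    } else if (wavelen > high_freq_wavelen) {
      // middle band: smooth interpolation between scaled and unscaled
      const float smooth = (old_context_len / wavelen - low_freq_factor) /
                           (high_freq_factor - low_freq_factor);
      f = (1.0f - smooth) * (f / factor) + smooth * f;
    }
    // short wavelengths (high frequencies) are left untouched
  }
}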

include/tensor/device.hpp

Lines changed: 1 addition & 1 deletion

@@ -14,7 +14,7 @@ template <> struct device_name<CPU> {
   static constexpr const char* value = "CPU";
 };
 
-#ifdef TENSOR_HAS_CUDA
+#ifdef BACKEND_CUDA
 struct CUDA {};
 template <> struct is_device<CUDA> : std::true_type {};
 template <> struct device_name<CUDA> {

include/tensor/device_type.hpp

Lines changed: 3 additions & 3 deletions

@@ -3,7 +3,7 @@
 #include <tensor/device.hpp>
 #include <tensor/dtype.hpp>
 
-#ifdef TENSOR_HAS_CUDA
+#ifdef BACKEND_CUDA
 #include <cuda_bf16.h>
 #endif
 
@@ -24,13 +24,13 @@ struct device_type {
 template <typename T, typename D>
 using device_type_t = typename device_type<T, D>::type;
 
-#ifdef TENSOR_HAS_CUDA
+#ifdef BACKEND_CUDA
 // Short alias for CUDA device types: Cuda<float> -> float, Cuda<bfloat16> -> __nv_bfloat16
 template <typename T>
 using Cuda = device_type_t<T, CUDA>;
 #endif
 
-#ifdef TENSOR_HAS_CUDA
+#ifdef BACKEND_CUDA
 // CUDA specialization: bfloat16 -> __nv_bfloat16
 template <>
 struct device_type<bfloat16, CUDA> {
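
The effect of the trait is easiest to see with static_asserts. A self-contained re-sketch of the pattern with stand-in types (NvBf16 plays the role of __nv_bfloat16; this is a hypothetical example, not the repo's headers):

#include <type_traits>

// Re-sketch of the device_type trait: identity by default, with a
// per-(dtype, device) override for types that differ on the GPU.
struct CPU {};
struct CUDA {};
struct bfloat16 { unsigned short bits; };
struct NvBf16   { unsigned short bits; }; // stand-in for __nv_bfloat16

template <typename T, typename D> struct device_type { using type = T; };
template <> struct device_type<bfloat16, CUDA> { using type = NvBf16; };

template <typename T, typename D>
using device_type_t = typename device_type<T, D>::type;

static_assert(std::is_same_v<device_type_t<float, CUDA>, float>);
static_assert(std::is_same_v<device_type_t<bfloat16, CPU>, bfloat16>);
static_assert(std::is_same_v<device_type_t<bfloat16, CUDA>, NvBf16>);

int main() { return 0; } // compiles iff the mapping behaves as documented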

include/tensor/storage.hpp

Lines changed: 1 addition & 1 deletion

@@ -101,7 +101,7 @@ template <typename T> class TensorStorage<const T, CPU> {
   }
 };
 
-#ifdef TENSOR_HAS_CUDA
+#ifdef BACKEND_CUDA
 // Mutable CUDA storage - owns device memory
 template <typename T> class TensorStorage<T, CUDA> {
 private:
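
The diff cuts off at the private section, so the class body is not shown. A minimal RAII sketch of what a device-memory-owning storage class typically looks like — an assumption, with hypothetical member names:

#include <cuda_runtime.h>
#include <cstddef>
#include <new>

// Hypothetical shape of CUDA-owning storage: cudaMalloc on construction,
// cudaFree on destruction, move-only so exactly one object owns the buffer.
template <typename T> class CudaStorageSketch {
  T* data_ = nullptr;
  std::size_t size_ = 0;

public:
  explicit CudaStorageSketch(std::size_t n) : size_(n) {
    if (cudaMalloc(&data_, n * sizeof(T)) != cudaSuccess) throw std::bad_alloc();
  }
  ~CudaStorageSketch() { cudaFree(data_); }

  CudaStorageSketch(const CudaStorageSketch&) = delete;
  CudaStorageSketch& operator=(const CudaStorageSketch&) = delete;
  CudaStorageSketch(CudaStorageSketch&& o) noexcept : data_(o.data_), size_(o.size_) {
    o.data_ = nullptr;
    o.size_ = 0;
  }

  T* data() { return data_; }
  const T* data() const { return data_; }
  std::size_t size() const { return size_; }
};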

include/tensor/tensor.hpp

Lines changed: 3 additions & 3 deletions

@@ -15,7 +15,7 @@
 #include <utility>
 #include <vector>
 
-#ifdef TENSOR_HAS_CUDA
+#ifdef BACKEND_CUDA
 #include <cuda_runtime.h>
 #endif
 
@@ -134,7 +134,7 @@ template <DType T, Device D> struct TensorView {
     return std::span<const T>(data, data_size);
   }
 
-#ifdef TENSOR_HAS_CUDA
+#ifdef BACKEND_CUDA
   T operator[](int idx) const
     requires std::same_as<D, device::CUDA>
   {
@@ -465,7 +465,7 @@ template <DType T, Device D> class Tensor {
     storage_.fill(value);
   }
 
-#ifdef TENSOR_HAS_CUDA
+#ifdef BACKEND_CUDA
   // Device transfer methods
 
   Tensor<std::remove_const_t<T>, CUDA> cuda() const
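
The body of cuda() is not shown in the diff. Device transfers of this shape conventionally reduce to a cudaMemcpy between host and device buffers; a hedged sketch with hypothetical free functions (to_cuda / to_cpu are illustrative names, not the repo's API):

#include <cuda_runtime.h>
#include <cstddef>
#include <stdexcept>
#include <vector>

// Illustrative host<->device transfers, the likely core of a .cuda()/.cpu()
// round-trip. Error handling via exceptions is an assumption.
static void check(cudaError_t e) {
  if (e != cudaSuccess) throw std::runtime_error(cudaGetErrorString(e));
}

template <typename T> T* to_cuda(const std::vector<T>& host) { // host -> device
  T* dev = nullptr;
  check(cudaMalloc(&dev, host.size() * sizeof(T)));
  check(cudaMemcpy(dev, host.data(), host.size() * sizeof(T), cudaMemcpyHostToDevice));
  return dev;
}

template <typename T> std::vector<T> to_cpu(const T* dev, std::size_t n) { // device -> host
  std::vector<T> host(n);
  check(cudaMemcpy(host.data(), dev, n * sizeof(T), cudaMemcpyDeviceToHost));
  return host;
}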
