Commit 42df051

Llama generates tokens on CUDA!
1 parent bc784df commit 42df051

49 files changed: 1,221 additions and 410 deletions

Note: this is a large commit, so only a subset of the changed files is shown below.

CMakeLists.txt

Lines changed: 20 additions & 0 deletions

@@ -7,6 +7,26 @@ project(
   LANGUAGES CXX
 )
 
+# Add cmake module path for our custom modules
+list(APPEND CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}/cmake")
+
+# Backend detection
+if(APPLE)
+  set(BACKEND_METAL ON)
+  set(BACKEND_CUDA OFF)
+else()
+  set(BACKEND_CUDA ON)
+  set(BACKEND_METAL OFF)
+endif()
+
+# Propagate backend flags as compile definitions
+if(BACKEND_CUDA)
+  add_compile_definitions(BACKEND_CUDA)
+endif()
+if(BACKEND_METAL)
+  add_compile_definitions(BACKEND_METAL)
+endif()
+
 set(FETCHCONTENT_BASE_DIR "${CMAKE_SOURCE_DIR}/.cmake/fetchcontent")
 set(FETCHCONTENT_UPDATES_DISCONNECTED ON)
 
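
These BACKEND_CUDA / BACKEND_METAL compile definitions are what the C++ sources key off (see include/tensor/device.hpp below). A minimal, self-contained sketch of that consumption pattern — a hypothetical illustration, not code from this commit:

#include <cstdio>

// Hypothetical consumer of the BACKEND_* compile definitions added above.
// The CMake logic defines exactly one of them per platform.
const char* backend_name() {
#if defined(BACKEND_CUDA)
  return "CUDA";
#elif defined(BACKEND_METAL)
  return "Metal";
#else
  return "CPU only";
#endif
}

int main() {
  std::printf("Configured backend: %s\n", backend_name());
  return 0;
}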

Makefile

Lines changed: 17 additions & 0 deletions

@@ -36,6 +36,12 @@ inspect:
 	@cmake --build build --target inspect
 	@./build/apps/inspect
 
+.PHONY: tensor
+tensor:
+	@cmake --build build --target tensor
+	@ctest --test-dir build -R "^TensorCPU" --output-on-failure
+	@ctest --test-dir build -R "^TensorCUDA" --output-on-failure
+
 .PHONY: tensor_cpu
 tensor_cpu:
 	@cmake --build build --target test_tensor_cpu
@@ -46,6 +52,12 @@ tensor_cuda:
 	@cmake --build build --target test_tensor_cuda
 	@ctest --test-dir build -R "^TensorCUDA" --output-on-failure
 
+.PHONY: nn
+nn:
+	@cmake --build build --target nn
+	@ctest --test-dir build -R "^NNCPU" --output-on-failure
+	@ctest --test-dir build -R "^NNCUDA" --output-on-failure
+
 .PHONY: nn_cpu
 nn_cpu:
 	@cmake --build build --target test_nn_cpu
@@ -61,6 +73,11 @@ llama:
 	@cmake --build build --target test_llama
 	@ctest --test-dir build -R "^Llama" --output-on-failure
 
+.PHONY: llama_cuda
+llama_cuda:
+	@cmake --build build --target test_llama
+	@ctest --test-dir build -R "^LlamaCUDA" --output-on-failure
+
 .PHONY: forward
 forward:
 	@cmake --build build --target test_forward
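
In short: make tensor and make nn each build one target and run both its CPU and CUDA test suites, while make llama_cuda rebuilds test_llama but filters ctest to the ^LlamaCUDA tests only.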

apps/forward.cpp

Lines changed: 40 additions & 19 deletions

@@ -1,4 +1,5 @@
 #include <fmt/format.h>
+#include <cstring>
 
 #include <forward/sampler.hpp>
 #include <forward/tokenizer.hpp>
@@ -9,25 +10,17 @@
 using namespace llama;
 using namespace tensor;
 
-int main(int argc, char* argv[]) {
-  const auto* path = "./tests/model";
-  if (argc > 1) {
-    path = argv[1];
-  }
-
+template <Device D>
+void run_inference(const char* path, tokenizer::Tokenizer& tok) {
   size_t max_tokens = 128;
   size_t kv_cache_size = max_tokens;
 
-  tokenizer::Tokenizer tok("./tests/model/tokenizer.json");
-
-  sampler::GreedySampler<bfloat16, CPU> sampler{sampler::GreedyConfig{}, tok};
-
-  Model<bfloat16, CPU> mod("./tests/model/config.json", max_tokens, kv_cache_size);
+  sampler::GreedySampler<bfloat16, D> sampler{sampler::GreedyConfig{}, tok};
 
-  // loader::inspect_safetensors("./tests/model/model.safetensors");
+  Model<bfloat16, D> mod(fmt::format("{}/config.json", path), max_tokens, kv_cache_size);
 
   fmt::println("Loading weights...");
-  Loader<bfloat16, CPU> loader{"./tests/model/model.safetensors"};
+  Loader<bfloat16, D> loader{fmt::format("{}/model.safetensors", path)};
   mod.load_weights(loader);
 
   fmt::println("Weights loaded! Performing inference...");
@@ -36,16 +29,44 @@ int main(int argc, char* argv[]) {
 
   fmt::println("Prompt: {}", prompt);
 
-  auto gen_and_tok_s = sampler.generate(mod, prompt, 12);
+  auto [out, stats] = sampler.generate(mod, prompt, 12);
 
-  auto out = std::get<0>(gen_and_tok_s);
-  auto tok_s = std::get<1>(gen_and_tok_s);
+  auto colored_out = fmt::format(fmt::fg(fmt::color::aqua), "{}", out);
 
-  out = fmt::format(fmt::fg(fmt::color::aqua), "{}", out);
+  fmt::println("{}{}", prompt, colored_out);
 
-  fmt::println("{}{}", prompt, out);
+  fmt::println("");
+  fmt::println("TTFT: {:.2f} ms", stats.ttft_ms);
+  fmt::println("Avg ITL: {:.2f} ms", stats.avg_itl_ms);
+  fmt::println("Tokens / sec: {:.2f}", stats.tokens_per_sec);
+}
 
-  fmt::println("Tokens / sec: {}", tok_s);
+int main(int argc, char* argv[]) {
+  const char* path = "./tests/model";
+  bool use_cuda = false;
+
+  for (int i = 1; i < argc; ++i) {
+    if (std::strcmp(argv[i], "--cuda") == 0) {
+      use_cuda = true;
+    } else {
+      path = argv[i];
+    }
+  }
+
+  tokenizer::Tokenizer tok(fmt::format("{}/tokenizer.json", path));
+
+  if (use_cuda) {
+#ifdef BACKEND_CUDA
+    fmt::println("Using CUDA backend");
+    run_inference<CUDA>(path, tok);
+#else
+    fmt::println("Error: CUDA backend not available. Rebuild with CUDA support.");
+    return 1;
+#endif
+  } else {
+    fmt::println("Using CPU backend");
+    run_inference<CPU>(path, tok);
+  }
 
   return 0;
 }
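
Given that argument loop, the binary can presumably be invoked as ./build/apps/forward --cuda <model_dir>: --cuda selects the CUDA backend (when compiled in) and any other argument overrides the default model directory ./tests/model. Note how a runtime flag becomes a compile-time choice by instantiating run_inference<CUDA> or run_inference<CPU>.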

cmake/CUDAConfig.cmake

Lines changed: 88 additions & 0 deletions

@@ -0,0 +1,88 @@
+# Common CUDA configuration for all CUDA targets
+#
+# Usage:
+#   include(CUDAConfig)
+#   configure_cuda_target(my_cuda_target)
+
+# Target CUDA architectures
+#  70 = V100
+#  75 = RTX 20xx, T4
+#  80 = A100
+#  86 = RTX 30xx
+#  89 = RTX 40xx
+#  90 = H100
+# 100 = B200
+# 120 = RTX 5090
+
+function(configure_cuda_target TARGET_NAME)
+  set_target_properties(${TARGET_NAME} PROPERTIES
+    CUDA_SEPARABLE_COMPILATION ON
+    CUDA_RESOLVE_DEVICE_SYMBOLS ON
+  )
+
+  # Note: CUDA_ARCHITECTURES doesn't support generator expressions.
+  # Use CMAKE_BUILD_TYPE to control this at configure time.
+  if(CMAKE_BUILD_TYPE STREQUAL "Debug")
+    set_property(TARGET ${TARGET_NAME} PROPERTY CUDA_ARCHITECTURES 120)
+  else()
+    # RelWithDebInfo and Release: include PTX for future architectures
+    set_property(TARGET ${TARGET_NAME} PROPERTY CUDA_ARCHITECTURES 120-real 120-virtual)
+  endif()
+
+  if(CMAKE_CUDA_COMPILER_ID STREQUAL "NVIDIA")
+    # CUDA compile options for nvcc
+    target_compile_options(${TARGET_NAME} PRIVATE
+      $<$<COMPILE_LANGUAGE:CUDA>:
+        # Debug builds: full debug info
+        $<$<CONFIG:Debug>:
+          -G # Generate device debug info (disables optimizations)
+          -g # Generate host debug info
+        >
+
+        # RelWithDebInfo: debug info + optimizations
+        $<$<CONFIG:RelWithDebInfo>:
+          -G              # Device debug info
+          -g              # Host debug info
+          --use_fast_math # Fast math even in debug
+        >
+
+        # Release: maximum optimization
+        $<$<CONFIG:Release>:
+          -lineinfo
+          --use_fast_math
+        >
+
+        # Common flags for all builds
+        --expt-relaxed-constexpr
+        -Xcompiler=-fPIC
+      >
+    )
+  elseif(CMAKE_CUDA_COMPILER_ID STREQUAL "Clang")
+    # Clang CUDA flags (for clangd compatibility).
+    # Use the cuda-merged package, which has complete headers, not just nvcc.
+    # Also specify resource-dir for NixOS, where clangd uses a different resource directory.
+    set(CLANG_CUDA_FLAGS -fPIC)
+
+    # Include cuda_compat.h if it exists in the source directory
+    if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/cuda_compat.h")
+      list(APPEND CLANG_CUDA_FLAGS -include ${CMAKE_CURRENT_SOURCE_DIR}/cuda_compat.h)
+    endif()
+
+    if(DEFINED ENV{CUDA_PATH})
+      list(APPEND CLANG_CUDA_FLAGS --cuda-path=$ENV{CUDA_PATH})
+    endif()
+
+    if(DEFINED CLANG_RESOURCE_DIR AND NOT CLANG_RESOURCE_DIR STREQUAL "")
+      list(APPEND CLANG_CUDA_FLAGS -resource-dir=${CLANG_RESOURCE_DIR})
+    endif()
+
+    # The NixOS clang wrapper injects GCC C++ includes, which conflict with libc++.
+    # Use -nostdinc++ to disable auto-injection, then explicitly add libc++ headers.
+    # _ALLOW_UNSUPPORTED_LIBCPP bypasses CUDA's "libc++ not supported on x86" error.
+    if(DEFINED LIBCXX_INCLUDE AND NOT LIBCXX_INCLUDE STREQUAL "")
+      list(APPEND CLANG_CUDA_FLAGS -nostdinc++ -cxx-isystem${LIBCXX_INCLUDE} -D_ALLOW_UNSUPPORTED_LIBCPP)
+    endif()
+
+    target_compile_options(${TARGET_NAME} PRIVATE ${CLANG_CUDA_FLAGS})
+  endif()
+endfunction()
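
A note on the architecture list: in CMake's CUDA_ARCHITECTURES syntax, 120-real emits SASS machine code for compute capability 12.0 only, while 120-virtual embeds PTX that the driver can JIT-compile for newer GPUs. Listing both trades a slightly larger binary for forward compatibility, which is why the release configurations use the pair.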

include/forward/sampler.hpp

Lines changed: 8 additions & 2 deletions

@@ -6,6 +6,12 @@
 
 namespace sampler {
 
+struct GenerationStats {
+  float tokens_per_sec; // Overall throughput
+  float ttft_ms;        // Time to first token (ms)
+  float avg_itl_ms;     // Average inter-token latency (ms)
+};
+
 struct GreedyConfig {};
 
 template <typename T> struct is_config : std::false_type {};
@@ -25,8 +31,8 @@ template <tensor::DType T, tensor::Device D, Config C> struct Sampler {
   explicit Sampler(C config, tokenizer::Tokenizer& tokenizer);
   virtual ~Sampler() = default;
 
-  std::tuple<std::string, float> generate(llama::Model<T, D>& model, std::string_view prompt,
-                                          size_t max_num_tokens);
+  std::tuple<std::string, GenerationStats> generate(llama::Model<T, D>& model, std::string_view prompt,
+                                                    size_t max_num_tokens);
 };
 
 template <tensor::DType T, tensor::Device D> struct GreedySampler : Sampler<T, D, GreedyConfig> {
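
The diff shows where GenerationStats is declared and printed but not how it is filled in. Below is a minimal sketch of how these three metrics are conventionally measured with std::chrono — an assumption about the implementation, with next_token() as a stand-in for one forward-plus-sampling step:

#include <chrono>
#include <cstddef>
#include <cstdio>

// Mirrors the struct above so the example stands alone.
struct GenerationStats {
  float tokens_per_sec; // Overall throughput
  float ttft_ms;        // Time to first token (ms)
  float avg_itl_ms;     // Average inter-token latency (ms)
};

int next_token() { return 0; } // placeholder for model forward + argmax

GenerationStats timed_generate(std::size_t max_num_tokens) {
  using clock = std::chrono::steady_clock;
  GenerationStats stats{};
  const auto start = clock::now();
  auto prev = start;
  double itl_sum_ms = 0.0;
  for (std::size_t i = 0; i < max_num_tokens; ++i) {
    (void)next_token();
    const auto now = clock::now();
    const std::chrono::duration<double, std::milli> step = now - prev;
    if (i == 0) {
      stats.ttft_ms = static_cast<float>(step.count()); // prefill + first token
    } else {
      itl_sum_ms += step.count(); // gap between consecutive tokens
    }
    prev = now;
  }
  const std::chrono::duration<double> total = prev - start;
  if (max_num_tokens > 1)
    stats.avg_itl_ms = static_cast<float>(itl_sum_ms / (max_num_tokens - 1));
  if (total.count() > 0)
    stats.tokens_per_sec = static_cast<float>(max_num_tokens / total.count());
  return stats;
}

int main() {
  const auto s = timed_generate(12);
  std::printf("TTFT %.2f ms, avg ITL %.2f ms, %.2f tok/s\n", s.ttft_ms, s.avg_itl_ms, s.tokens_per_sec);
  return 0;
}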

include/llama/rope.hpp

Lines changed: 11 additions & 5 deletions

@@ -5,9 +5,15 @@
 
 namespace llama {
 
+using namespace tensor;
+
+template <typename D>
+void apply_rope_scaling_(Tensor<float, D>& inv_freq, float factor, float low_freq_factor,
+                         float high_freq_factor, float old_context_len);
+
 template <typename T, typename D> class RoPE {
 private:
-  std::tuple<tensor::Tensor<float, D>, tensor::Tensor<float, D>> cos_sin; // float32
+  std::tuple<Tensor<float, D>, Tensor<float, D>> cos_sin; // float32
 
 public:
   explicit RoPE(const llama::ModelConfig& config);
@@ -17,10 +23,10 @@ template <typename T, typename D> class RoPE {
   RoPE(const RoPE&) = delete;
   RoPE& operator=(const RoPE&) = delete;
 
-  tensor::TensorView<const float, D> cos() const;
-  tensor::TensorView<const float, D> sin() const;
+  TensorView<const float, D> cos() const;
+  TensorView<const float, D> sin() const;
 
-  tensor::Tensor<std::remove_const_t<T>, D> forward(tensor::TensorView<T, D> inputs,
-                                                    size_t position_offset = 0) const;
+  Tensor<std::remove_const_t<T>, D> forward(TensorView<T, D> inputs,
+                                            size_t position_offset = 0) const;
 };
 } // namespace llama
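
The apply_rope_scaling_ signature (factor, low/high frequency factors, original context length) matches Llama 3-style RoPE frequency scaling. As a reference for what such a function conventionally computes — a CPU sketch over a plain vector, assuming the standard Llama 3 formula rather than quoting this commit's implementation:

#include <vector>

// Llama 3-style scaling of the RoPE inverse-frequency table. The commit's
// version operates on Tensor<float, D> in place; this stand-in uses std::vector.
void apply_rope_scaling_ref(std::vector<float>& inv_freq, float factor,
                            float low_freq_factor, float high_freq_factor,
                            float old_context_len) {
  const float two_pi = 2.0f * 3.14159265f;
  const float low_freq_wavelen = old_context_len / low_freq_factor;
  const float high_freq_wavelen = old_context_len / high_freq_factor;
  for (float& f : inv_freq) {
    const float wavelen = two_pi / f;
    if (wavelen > low_freq_wavelen) {
      f /= factor; // long wavelengths (low frequencies): fully rescaled
    } else if (wavelen > high_freq_wavelen) {
      // middle band: smooth interpolation between scaled and unscaled
      const float smooth = (old_context_len / wavelen - low_freq_factor) /
                           (high_freq_factor - low_freq_factor);
      f = (1.0f - smooth) * (f / factor) + smooth * f;
    }
    // short wavelengths (high frequencies) are left untouched
  }
}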

include/tensor/device.hpp

Lines changed: 1 addition & 1 deletion

@@ -14,7 +14,7 @@ template <> struct device_name<CPU> {
   static constexpr const char* value = "CPU";
 };
 
-#ifdef TENSOR_HAS_CUDA
+#ifdef BACKEND_CUDA
 struct CUDA {};
 template <> struct is_device<CUDA> : std::true_type {};
 template <> struct device_name<CUDA> {

include/tensor/device_type.hpp

Lines changed: 3 additions & 3 deletions

@@ -3,7 +3,7 @@
 #include <tensor/device.hpp>
 #include <tensor/dtype.hpp>
 
-#ifdef TENSOR_HAS_CUDA
+#ifdef BACKEND_CUDA
 #include <cuda_bf16.h>
 #endif
 
@@ -24,13 +24,13 @@ struct device_type {
 template <typename T, typename D>
 using device_type_t = typename device_type<T, D>::type;
 
-#ifdef TENSOR_HAS_CUDA
+#ifdef BACKEND_CUDA
 // Short alias for CUDA device types: Cuda<float> -> float, Cuda<bfloat16> -> __nv_bfloat16
 template <typename T>
 using Cuda = device_type_t<T, CUDA>;
 #endif
 
-#ifdef TENSOR_HAS_CUDA
+#ifdef BACKEND_CUDA
 // CUDA specialization: bfloat16 -> __nv_bfloat16
 template <>
 struct device_type<bfloat16, CUDA> {
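
The effect of the trait is easiest to see with static_asserts. A self-contained re-sketch of the pattern with stand-in types (NvBf16 plays the role of __nv_bfloat16; this is a hypothetical example, not the repo's headers):

#include <type_traits>

// Re-sketch of the device_type trait: identity by default, with a
// per-(dtype, device) override for types that differ on the GPU.
struct CPU {};
struct CUDA {};
struct bfloat16 { unsigned short bits; };
struct NvBf16   { unsigned short bits; }; // stand-in for __nv_bfloat16

template <typename T, typename D> struct device_type { using type = T; };
template <> struct device_type<bfloat16, CUDA> { using type = NvBf16; };

template <typename T, typename D>
using device_type_t = typename device_type<T, D>::type;

static_assert(std::is_same_v<device_type_t<float, CUDA>, float>);
static_assert(std::is_same_v<device_type_t<bfloat16, CPU>, bfloat16>);
static_assert(std::is_same_v<device_type_t<bfloat16, CUDA>, NvBf16>);

int main() { return 0; } // compiles iff the mapping behaves as documented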

include/tensor/storage.hpp

Lines changed: 1 addition & 1 deletion

@@ -101,7 +101,7 @@ template <typename T> class TensorStorage<const T, CPU> {
   }
 };
 
-#ifdef TENSOR_HAS_CUDA
+#ifdef BACKEND_CUDA
 // Mutable CUDA storage - owns device memory
 template <typename T> class TensorStorage<T, CUDA> {
 private:
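
The diff cuts off at the private section, so the class body is not shown. A minimal RAII sketch of what a device-memory-owning storage class typically looks like — an assumption, with hypothetical member names:

#include <cuda_runtime.h>
#include <cstddef>
#include <new>

// Hypothetical shape of CUDA-owning storage: cudaMalloc on construction,
// cudaFree on destruction, move-only so exactly one object owns the buffer.
template <typename T> class CudaStorageSketch {
  T* data_ = nullptr;
  std::size_t size_ = 0;

public:
  explicit CudaStorageSketch(std::size_t n) : size_(n) {
    if (cudaMalloc(&data_, n * sizeof(T)) != cudaSuccess) throw std::bad_alloc();
  }
  ~CudaStorageSketch() { cudaFree(data_); }

  CudaStorageSketch(const CudaStorageSketch&) = delete;
  CudaStorageSketch& operator=(const CudaStorageSketch&) = delete;
  CudaStorageSketch(CudaStorageSketch&& o) noexcept : data_(o.data_), size_(o.size_) {
    o.data_ = nullptr;
    o.size_ = 0;
  }

  T* data() { return data_; }
  const T* data() const { return data_; }
  std::size_t size() const { return size_; }
};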

include/tensor/tensor.hpp

Lines changed: 3 additions & 3 deletions

@@ -15,7 +15,7 @@
 #include <utility>
 #include <vector>
 
-#ifdef TENSOR_HAS_CUDA
+#ifdef BACKEND_CUDA
 #include <cuda_runtime.h>
 #endif
 
@@ -134,7 +134,7 @@ template <DType T, Device D> struct TensorView {
     return std::span<const T>(data, data_size);
   }
 
-#ifdef TENSOR_HAS_CUDA
+#ifdef BACKEND_CUDA
   T operator[](int idx) const
     requires std::same_as<D, device::CUDA>
   {
@@ -465,7 +465,7 @@ template <DType T, Device D> class Tensor {
     storage_.fill(value);
   }
 
-#ifdef TENSOR_HAS_CUDA
+#ifdef BACKEND_CUDA
   // Device transfer methods
 
   Tensor<std::remove_const_t<T>, CUDA> cuda() const
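
The body of cuda() is not shown in the diff. Device transfers of this shape conventionally reduce to a cudaMemcpy between host and device buffers; a hedged sketch with hypothetical free functions (to_cuda / to_cpu are illustrative names, not the repo's API):

#include <cuda_runtime.h>
#include <cstddef>
#include <stdexcept>
#include <vector>

// Illustrative host<->device transfers, the likely core of a .cuda()/.cpu()
// round-trip. Error handling via exceptions is an assumption.
static void check(cudaError_t e) {
  if (e != cudaSuccess) throw std::runtime_error(cudaGetErrorString(e));
}

template <typename T> T* to_cuda(const std::vector<T>& host) { // host -> device
  T* dev = nullptr;
  check(cudaMalloc(&dev, host.size() * sizeof(T)));
  check(cudaMemcpy(dev, host.data(), host.size() * sizeof(T), cudaMemcpyHostToDevice));
  return dev;
}

template <typename T> std::vector<T> to_cpu(const T* dev, std::size_t n) { // device -> host
  std::vector<T> host(n);
  check(cudaMemcpy(host.data(), dev, n * sizeof(T), cudaMemcpyDeviceToHost));
  return host;
}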
