Skip to content

Commit d971604

Browse files
committed
GH actions build
1 parent 42df051 commit d971604

11 files changed

Lines changed: 67 additions & 75 deletions

File tree

.github/workflows/ci.yml

Lines changed: 28 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ permissions:
1313

1414
jobs:
1515
build-and-test:
16-
name: ${{ matrix.os }}-${{ matrix.compiler }}-${{ matrix.build_type }}
16+
name: ${{ matrix.os }}-${{ matrix.compiler }}-${{ matrix.backend }}-${{ matrix.build_type }}
1717
runs-on: ${{ matrix.os }}
1818

1919
strategy:
@@ -22,10 +22,26 @@ jobs:
2222
os: [ubuntu-latest]
2323
compiler: [clang-20, gcc-14]
2424
build_type: [Release, Debug]
25+
backend: [cuda, cpu]
26+
exclude:
27+
# Only test CPU backend with one compiler in Debug to save CI time
28+
- backend: cpu
29+
compiler: clang-20
30+
build_type: Debug
2531

2632
steps:
2733
- uses: actions/checkout@v4
2834

35+
- name: Install CUDA Toolkit
36+
if: matrix.backend == 'cuda'
37+
uses: Jimver/cuda-toolkit@v0.2.21
38+
id: cuda-toolkit
39+
with:
40+
cuda: '12.8.0'
41+
method: 'network'
42+
sub-packages: '["nvcc", "cudart", "thrust"]'
43+
non-cuda-sub-packages: '["libcublas", "libcublas-dev"]'
44+
2945
- name: Set up compiler (Clang)
3046
if: matrix.compiler == 'clang-20'
3147
uses: egor-tensin/setup-clang@v1
@@ -102,12 +118,19 @@ jobs:
102118
uses: actions/cache@v4
103119
with:
104120
path: .cmake/fetchcontent
105-
key: ${{ runner.os }}-${{ matrix.compiler }}-cmake-${{ hashFiles('**/CMakeLists.txt') }}
121+
key: ${{ runner.os }}-${{ matrix.compiler }}-${{ matrix.backend }}-cmake-${{ hashFiles('**/CMakeLists.txt') }}
106122
restore-keys: |
107-
${{ runner.os }}-${{ matrix.compiler }}-cmake-
123+
${{ runner.os }}-${{ matrix.compiler }}-${{ matrix.backend }}-cmake-
108124
109125
- name: Configure CMake
110-
run: cmake -S . -B build -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
126+
run: |
127+
CMAKE_ARGS="-DCMAKE_BUILD_TYPE=${{ matrix.build_type }}"
128+
if [[ "${{ matrix.backend }}" == "cuda" ]]; then
129+
CMAKE_ARGS="$CMAKE_ARGS -DBACKEND_CUDA=ON -DSKIP_CUDA_TESTS=ON -DCMAKE_CUDA_ARCHITECTURES=120"
130+
else
131+
CMAKE_ARGS="$CMAKE_ARGS -DBACKEND_CUDA=OFF"
132+
fi
133+
cmake -S . -B build $CMAKE_ARGS
111134
112135
- name: Build
113136
run: cmake --build build --config ${{ matrix.build_type }} --parallel
@@ -120,48 +143,4 @@ jobs:
120143
if: always()
121144
with:
122145
files: build/test-results.xml
123-
check_name: Test Results (${{ matrix.os }}-${{ matrix.compiler }}-${{ matrix.build_type }})
124-
125-
# CUDA compilation check (no GPU required, just verifies code compiles)
126-
cuda-build:
127-
name: CUDA Build Check
128-
runs-on: ubuntu-latest
129-
130-
steps:
131-
- uses: actions/checkout@v4
132-
133-
- name: Install CUDA Toolkit
134-
uses: Jimver/cuda-toolkit@v0.2.21
135-
id: cuda-toolkit
136-
with:
137-
cuda: '12.8.0'
138-
method: 'network'
139-
sub-packages: '["nvcc", "cudart", "thrust"]'
140-
non-cuda-sub-packages: '["libcublas", "libcublas-dev"]'
141-
142-
- name: Install OpenMP
143-
run: |
144-
sudo apt-get update
145-
sudo apt-get install -y libgomp1
146-
147-
- name: Cache CMake dependencies
148-
uses: actions/cache@v4
149-
with:
150-
path: .cmake/fetchcontent
151-
key: ${{ runner.os }}-cuda-cmake-${{ hashFiles('**/CMakeLists.txt') }}
152-
restore-keys: |
153-
${{ runner.os }}-cuda-cmake-
154-
155-
- name: Configure CMake with CUDA
156-
run: |
157-
cmake -S . -B build \
158-
-DCMAKE_BUILD_TYPE=Release \
159-
-DTENSOR_BUILD_CUDA=ON \
160-
-DSKIP_CUDA_TESTS=ON \
161-
-DCMAKE_CUDA_ARCHITECTURES=89
162-
163-
- name: Build (including CUDA)
164-
run: cmake --build build --config Release --parallel
165-
166-
- name: Run tests (CUDA tests will be skipped)
167-
run: ctest --test-dir build --output-on-failure
146+
check_name: Test Results (${{ matrix.os }}-${{ matrix.compiler }}-${{ matrix.backend }}-${{ matrix.build_type }})

CMakeLists.txt

Lines changed: 16 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -10,21 +10,25 @@ project(
1010
# Add cmake module path for our custom modules
1111
list(APPEND CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}/cmake")
1212

13-
# Backend detection
13+
# Backend detection - can be overridden via -DBACKEND_CUDA=ON/OFF
1414
if(APPLE)
15-
set(BACKEND_METAL ON)
16-
set(BACKEND_CUDA OFF)
15+
option(BACKEND_METAL "Enable Metal backend" ON)
16+
option(BACKEND_CUDA "Enable CUDA backend" OFF)
1717
else()
18-
set(BACKEND_CUDA ON)
19-
set(BACKEND_METAL OFF)
18+
option(BACKEND_CUDA "Enable CUDA backend" ON)
19+
option(BACKEND_METAL "Enable Metal backend" OFF)
2020
endif()
2121

2222
# Propagate backend flags as compile definitions
2323
if(BACKEND_CUDA)
2424
add_compile_definitions(BACKEND_CUDA)
25+
message(STATUS "Backend: CUDA enabled")
26+
else()
27+
message(STATUS "Backend: CUDA disabled")
2528
endif()
2629
if(BACKEND_METAL)
2730
add_compile_definitions(BACKEND_METAL)
31+
message(STATUS "Backend: Metal enabled")
2832
endif()
2933

3034
set(FETCHCONTENT_BASE_DIR "${CMAKE_SOURCE_DIR}/.cmake/fetchcontent")
@@ -33,8 +37,13 @@ set(FETCHCONTENT_UPDATES_DISCONNECTED ON)
3337
# msgpack in tokenizers_cpp is doing weird stuff
3438
set(CMAKE_POLICY_VERSION_MINIMUM 3.5)
3539

36-
set(CMAKE_C_COMPILER_LAUNCHER ccache)
37-
set(CMAKE_CXX_COMPILER_LAUNCHER ccache)
40+
# Use ccache if available
41+
find_program(CCACHE_PROGRAM ccache)
42+
if(CCACHE_PROGRAM)
43+
set(CMAKE_C_COMPILER_LAUNCHER ${CCACHE_PROGRAM})
44+
set(CMAKE_CXX_COMPILER_LAUNCHER ${CCACHE_PROGRAM})
45+
message(STATUS "Using ccache: ${CCACHE_PROGRAM}")
46+
endif()
3847

3948
if(CMAKE_PROJECT_NAME STREQUAL PROJECT_NAME)
4049
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)

CMakePresets.json

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@
2727
"CMAKE_CUDA_ARCHITECTURES": "120",
2828
"CMAKE_CUDA_COMPILER_TOOLKIT_ROOT": "$env{CUDA_PATH}",
2929
"CMAKE_EXPORT_COMPILE_COMMANDS": "ON",
30-
"TENSOR_BUILD_CUDA": "ON",
30+
"BACKEND_CUDA": "ON",
3131
"CUDAToolkit_ROOT": "$env{CUDA_PATH}",
3232
"OpenMP_ROOT": "$env{OPENMP_ROOT}",
3333
"CLANG_RESOURCE_DIR": "$env{CLANG_RESOURCE_DIR}",
@@ -44,7 +44,7 @@
4444
"CMAKE_BUILD_TYPE": "Debug",
4545
"CMAKE_CXX_COMPILER": "$env{CLANGXX_PATH}",
4646
"CMAKE_EXPORT_COMPILE_COMMANDS": "ON",
47-
"TENSOR_BUILD_CUDA": "OFF",
47+
"BACKEND_CUDA": "OFF",
4848
"OpenMP_ROOT": "$env{OPENMP_ROOT}"
4949
}
5050
}

benchmarks/CMakeLists.txt

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -9,10 +9,4 @@ FetchContent_Declare(
99

1010
FetchContent_MakeAvailable(googlebenchmark)
1111

12-
if(APPLE)
13-
option(TENSOR_CUDA "Benchmark tensor_cuda library" OFF)
14-
else()
15-
option(TENSOR_CUDA "Benchmark tensor_cuda library" ON)
16-
endif()
17-
1812
add_subdirectory(tensor)

benchmarks/tensor/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
if(TENSOR_CUDA)
1+
if(BACKEND_CUDA)
22
add_executable(bm_tensor cpu/bm_ops.cpp cuda/bm_ops.cpp)
33
target_link_libraries(bm_tensor PRIVATE tensor_core tensor_cpu tensor_cuda benchmark::benchmark_main)
44
else()

src/forward/sampler.cpp

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -37,9 +37,12 @@ std::tuple<std::string, GenerationStats> Sampler<T, D, C>::generate(llama::Model
3737
Tensor<int, device::CPU> inputs_cpu({1, token_ids.size()}, std::vector<int>(token_ids));
3838

3939
Tensor<int, D> inputs = [&]() {
40+
#ifdef BACKEND_CUDA
4041
if constexpr (std::same_as<D, device::CUDA>) {
4142
return inputs_cpu.cuda();
42-
} else {
43+
} else
44+
#endif
45+
{
4346
return std::move(inputs_cpu);
4447
}
4548
}();
@@ -56,9 +59,12 @@ std::tuple<std::string, GenerationStats> Sampler<T, D, C>::generate(llama::Model
5659

5760
// Transfer sampled ids to CPU to read values
5861
Tensor<int, device::CPU> sampled_ids_cpu = [&]() {
62+
#ifdef BACKEND_CUDA
5963
if constexpr (std::same_as<D, device::CUDA>) {
6064
return sampled_ids.cpu();
61-
} else {
65+
} else
66+
#endif
67+
{
6268
return std::move(sampled_ids);
6369
}
6470
}();

tests/CMakeLists.txt

Lines changed: 2 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -15,14 +15,8 @@ configure_file(
1515
${CMAKE_CURRENT_BINARY_DIR}/common/test_config.h
1616
)
1717

18-
# compiled library code
19-
if(APPLE)
20-
option(BUILD_CUDA "Build CUDA tests" OFF)
21-
else()
22-
option(BUILD_CUDA "Build CUDA tests" ON)
23-
endif()
24-
25-
if(BUILD_CUDA)
18+
# Build CUDA tests if BACKEND_CUDA is enabled
19+
if(BACKEND_CUDA)
2620
enable_language(CUDA)
2721
find_package(CUDAToolkit REQUIRED)
2822
add_subdirectory(tensor/cuda)

tests/llama/test_grouped_query_attention.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ TEST(LlamaGQATest, Parity) {
3434
tensor_is_close<bfloat16>(output.view().span(), output_activations.span());
3535
}
3636

37+
#ifdef BACKEND_CUDA
3738
TEST(LlamaCUDAGQATest, Parity) {
3839
SKIP_IF_NO_GPU();
3940
Loader<bfloat16, CUDA> act_loader(TEST_ACTIVATIONS_PATH);
@@ -63,6 +64,7 @@ TEST(LlamaCUDAGQATest, Parity) {
6364
// Use slightly higher tolerance for CUDA due to bf16 precision and kernel ordering differences
6465
tensor_is_close<bfloat16>(output_cpu.view().span(), output_activations.span(), 2e-3f, 2e-3f);
6566
}
67+
#endif
6668

6769
TEST(LlamaGQATest, ParityWithKVCache) {
6870
Loader<bfloat16, CPU> act_loader(TEST_ACTIVATIONS_PATH);
@@ -104,6 +106,7 @@ TEST(LlamaGQATest, ParityWithKVCache) {
104106
EXPECT_EQ(gqa.get_cache_size(), 4);
105107
}
106108

109+
#ifdef BACKEND_CUDA
107110
TEST(LlamaCUDAGQATest, ParityWithKVCache) {
108111
SKIP_IF_NO_GPU();
109112
Loader<bfloat16, CUDA> act_loader(TEST_ACTIVATIONS_PATH);
@@ -148,3 +151,4 @@ TEST(LlamaCUDAGQATest, ParityWithKVCache) {
148151

149152
EXPECT_EQ(gqa.get_cache_size(), 4);
150153
}
154+
#endif

tests/llama/test_layer.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ TEST(LlamaLayerTest, Parity) {
3939
tensor_is_close<bfloat16>(output.view().span(), output_activations.span(), 1e-02);
4040
}
4141

42+
#ifdef BACKEND_CUDA
4243
TEST(LlamaCUDALayerTest, Parity) {
4344
SKIP_IF_NO_GPU();
4445
Loader<bfloat16, CUDA> act_loader(TEST_ACTIVATIONS_PATH);
@@ -72,3 +73,4 @@ TEST(LlamaCUDALayerTest, Parity) {
7273

7374
tensor_is_close<bfloat16>(output_cpu.view().span(), output_activations.span(), 1e-02);
7475
}
76+
#endif

tests/llama/test_mlp.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ TEST(LlamaMLPTest, Parity) {
3030
tensor_is_close<bfloat16>(output.view().span(), output_activations.span());
3131
}
3232

33+
#ifdef BACKEND_CUDA
3334
TEST(LlamaCUDAMLPTest, Parity) {
3435
SKIP_IF_NO_GPU();
3536
Loader<bfloat16, CUDA> act_loader(TEST_ACTIVATIONS_PATH);
@@ -53,3 +54,4 @@ TEST(LlamaCUDAMLPTest, Parity) {
5354
// Use slightly relaxed tolerance for CUDA (3x default) due to cuBLAS precision differences
5455
tensor_is_close<bfloat16>(output_cpu.view().span(), output_activations.span(), 3e-3f, 3e-3f);
5556
}
57+
#endif

0 commit comments

Comments (0)