-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathCMakeLists.txt
More file actions
191 lines (166 loc) · 8.13 KB
/
CMakeLists.txt
File metadata and controls
191 lines (166 loc) · 8.13 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
# CMake 3.24 is the oldest release this build description accepts; the call
# must precede project() so policy defaults are in place before languages are
# enabled.
cmake_minimum_required(VERSION 3.24)

# Declare the FastCuda project (version 0.2.0) and enable the C++ and CUDA
# toolchains.
project(
  FastCuda
  VERSION 0.2.0
  LANGUAGES
    CXX
    CUDA
)
# ── Language standards ──────────────────────────────────────────────
# Apply one dialect policy to both enabled languages: standard level 11,
# treated as a hard requirement, with compiler-specific extensions disabled.
# These CMAKE_<LANG>_* variables seed the matching properties of every target
# created later in this file.
foreach(fastcuda_lang IN ITEMS CXX CUDA)
  set(CMAKE_${fastcuda_lang}_STANDARD 11)
  set(CMAKE_${fastcuda_lang}_STANDARD_REQUIRED ON)
  set(CMAKE_${fastcuda_lang}_EXTENSIONS OFF)
endforeach()
# ── Platform guard ──────────────────────────────────────────────────
# Abort configuration unless the system name is exactly "Windows" or "Linux".
# CMAKE_SYSTEM_NAME is the system being built for; it equals the host system
# name unless a cross-compiling toolchain overrides it.
#
# The pattern is anchored on both ends because if(... MATCHES ...) succeeds on
# any substring hit: an unanchored "Windows|Linux" would also let through
# system names such as "WindowsStore", "WindowsPhone" or "WindowsCE".
if(NOT CMAKE_SYSTEM_NAME MATCHES "^(Windows|Linux)$")
  message(FATAL_ERROR "FastCuda supports only Windows and Linux hosts.")
endif()
# ── CUDA toolkit ────────────────────────────────────────────────────
# Locate the CUDA toolkit; REQUIRED turns a missing toolkit into a configure
# error.
find_package(CUDAToolkit REQUIRED)

# The range check below compares against the CUDA compiler version reported by
# CMake, so an empty value is rejected up front.
if(NOT CMAKE_CUDA_COMPILER_VERSION)
  message(FATAL_ERROR "Unable to determine CUDA compiler version.")
endif()

# Only two release series are accepted: [12.8, 12.9) and [13.0, 13.1).
set(fastcuda_cuda_version_ok FALSE)
if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "12.8"
   AND CMAKE_CUDA_COMPILER_VERSION VERSION_LESS "12.9")
  set(fastcuda_cuda_version_ok TRUE)
elseif(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "13.0"
       AND CMAKE_CUDA_COMPILER_VERSION VERSION_LESS "13.1")
  set(fastcuda_cuda_version_ok TRUE)
endif()

if(NOT fastcuda_cuda_version_ok)
  message(FATAL_ERROR
    "FastCuda currently supports CUDA 12.8.x and 13.0.x. "
    "Detected: ${CMAKE_CUDA_COMPILER_VERSION}")
endif()

# The flag is only a local scratch value; do not leave it in scope.
unset(fastcuda_cuda_version_ok)
# User-tunable cache entry listing the GPU architectures to compile for.
# The default is "89;120"; override with -DFASTCUDA_CUDA_ARCHITECTURES=...
set(
  FASTCUDA_CUDA_ARCHITECTURES
  "89;120"
  CACHE STRING
  "CUDA architectures to build for (RTX 4090=89, RTX 5060=120)."
)

# Forward the selection to CMake's own variable, which initialises the
# CUDA_ARCHITECTURES property of targets created after this point.
set(CMAKE_CUDA_ARCHITECTURES "${FASTCUDA_CUDA_ARCHITECTURES}")
# ── Options ─────────────────────────────────────────────────────────
# Feature toggles. The Python module is opt-in (OFF by default); the example
# executables and the benchmark executable are built by default (ON).
option(
  FASTCUDA_BUILD_PYTHON
  "Build pybind11 Python module"
  OFF
)
option(
  FASTCUDA_BUILD_EXAMPLES
  "Build example executables"
  ON
)
option(
  FASTCUDA_BUILD_BENCH
  "Build benchmark executable"
  ON
)
# ── Version header ──────────────────────────────────────────────────
# Produce version.hpp in the build tree from its template in the source tree.
# @ONLY limits substitution to @VAR@ placeholders, so any ${...} text in the
# template is copied through unchanged.
configure_file(
  "${CMAKE_CURRENT_SOURCE_DIR}/include/fastcuda/version.hpp.in"
  "${CMAKE_CURRENT_BINARY_DIR}/generated/fastcuda/version.hpp"
  @ONLY
)
# ── Source lists ────────────────────────────────────────────────────
# CUDA translation units that make up the library, collected one source
# directory at a time. Paths are relative to this CMakeLists.txt.
set(FASTCUDA_SOURCES "")

# src/gemm
list(APPEND FASTCUDA_SOURCES
  src/gemm/gemm_v1_naive.cu
  src/gemm/gemm_v2_shared.cu
  src/gemm/gemm_v3_register.cu
  src/gemm/gemm_v4_warp.cu
  src/gemm/gemm_v5_tf32.cu
  src/gemm/gemm_v6_hgemm.cu
  src/gemm/gemm_api.cu
)

# src/reduce
list(APPEND FASTCUDA_SOURCES
  src/reduce/reduce_v0_baseline.cu
  src/reduce/reduce_v1_no_divergence.cu
  src/reduce/reduce_v2_no_bank_conflict.cu
  src/reduce/reduce_v3_add_during_load.cu
  src/reduce/reduce_v4_unroll_last_warp.cu
  src/reduce/reduce_v5_completely_unroll.cu
  src/reduce/reduce_v6_multi_add.cu
  src/reduce/reduce_v7_shuffle.cu
  src/reduce/reduce_api.cu
)

# src/runtime
list(APPEND FASTCUDA_SOURCES
  src/runtime/runtime.cu
)
# Headers that form the public interface: the hand-written ones under
# include/fastcuda plus the version header generated into the build tree.
set(
  FASTCUDA_PUBLIC_HEADERS
    # Hand-written headers (relative to this CMakeLists.txt).
    include/fastcuda/export.h
    include/fastcuda/types.h
    include/fastcuda/gemm.h
    include/fastcuda/gemm.hpp
    include/fastcuda/reduce.h
    include/fastcuda/reduce.hpp
    include/fastcuda/runtime.h
    include/fastcuda/runtime.hpp
    include/fastcuda/fastcuda.h
    include/fastcuda/fastcuda.hpp
    # Generated header (absolute path in the build tree).
    "${CMAKE_CURRENT_BINARY_DIR}/generated/fastcuda/version.hpp"
)
# ── Shared library ──────────────────────────────────────────────────
# Shared build of the library, compiled from FASTCUDA_SOURCES.
add_library(fastcuda SHARED)
target_sources(fastcuda PRIVATE ${FASTCUDA_SOURCES})

# Consumers need at least C++11 and the CUDA runtime, hence PUBLIC.
target_compile_features(fastcuda PUBLIC cxx_std_11)
target_link_libraries(fastcuda PUBLIC CUDA::cudart)

# Defined only while compiling the library itself (PRIVATE).
# NOTE(review): presumably read by fastcuda/export.h to select symbol
# export - confirm against that header.
target_compile_definitions(fastcuda PRIVATE FASTCUDA_BUILD_SHARED)

# Public headers come from the source tree and from the generated directory
# while building, and from <prefix>/include once installed. src/ is visible
# only to the library's own sources.
target_include_directories(
  fastcuda
  PUBLIC
    $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
    $<BUILD_INTERFACE:${CMAKE_CURRENT_BINARY_DIR}/generated>
    $<INSTALL_INTERFACE:include>
  PRIVATE
    ${CMAKE_CURRENT_SOURCE_DIR}/src
)

# Enable relocatable device code and position-independent host code.
set_property(TARGET fastcuda PROPERTY CUDA_SEPARABLE_COMPILATION ON)
set_property(TARGET fastcuda PROPERTY POSITION_INDEPENDENT_CODE ON)
# ── Static library ──────────────────────────────────────────────────
# Static build of the library, compiled from the same FASTCUDA_SOURCES list.
add_library(
  fastcuda_static
  STATIC
    ${FASTCUDA_SOURCES}
)

# Relocatable device code and position-independent host code are enabled;
# the archive's base name is set explicitly to "fastcuda_static".
set_target_properties(
  fastcuda_static
  PROPERTIES
    OUTPUT_NAME fastcuda_static
    POSITION_INDEPENDENT_CODE ON
    CUDA_SEPARABLE_COMPILATION ON
)

# Usage requirements passed on to consumers: C++11 and the CUDA runtime.
target_compile_features(fastcuda_static PUBLIC cxx_std_11)
target_link_libraries(fastcuda_static PUBLIC CUDA::cudart)

# Public headers come from the source tree and the generated directory while
# building, and from <prefix>/include once installed. src/ is visible only to
# the library's own sources.
target_include_directories(
  fastcuda_static
  PUBLIC
    $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
    $<BUILD_INTERFACE:${CMAKE_CURRENT_BINARY_DIR}/generated>
    $<INSTALL_INTERFACE:include>
  PRIVATE
    ${CMAKE_CURRENT_SOURCE_DIR}/src
)
# ── Benchmark ───────────────────────────────────────────────────────
# Benchmark executable, built only when FASTCUDA_BUILD_BENCH is enabled.
if(FASTCUDA_BUILD_BENCH)
  add_executable(fastcuda_bench)
  target_sources(fastcuda_bench PRIVATE benchmarks/bench_main.cu)

  target_compile_features(fastcuda_bench PRIVATE cxx_std_11)
  set_property(TARGET fastcuda_bench PROPERTY CUDA_SEPARABLE_COMPILATION ON)

  # Links the shared fastcuda library and, additionally, cuBLAS.
  target_link_libraries(
    fastcuda_bench
    PRIVATE
      fastcuda
      CUDA::cublas
  )
endif()
# ── Examples ────────────────────────────────────────────────────────
# Example programs, built only when FASTCUDA_BUILD_EXAMPLES is enabled.
# Each name <x> below yields the target fastcuda_<x>_example, compiled from
# examples/<x>_example.cpp, linked against the shared fastcuda library and
# requiring at least C++11.
if(FASTCUDA_BUILD_EXAMPLES)
  foreach(fastcuda_example_name IN ITEMS gemm hgemm reduce)
    set(fastcuda_example_target "fastcuda_${fastcuda_example_name}_example")

    add_executable(
      ${fastcuda_example_target}
      examples/${fastcuda_example_name}_example.cpp
    )
    target_link_libraries(${fastcuda_example_target} PRIVATE fastcuda)
    target_compile_features(${fastcuda_example_target} PRIVATE cxx_std_11)
  endforeach()
  unset(fastcuda_example_target)
endif()
# ── Python bindings (pybind11) ──────────────────────────────────────
# Python extension module, built only when FASTCUDA_BUILD_PYTHON is enabled.
if(FASTCUDA_BUILD_PYTHON)
  # Set before pybind11 is located so that pybind11 uses CMake's FindPython
  # results for its Python discovery.
  set(PYBIND11_FINDPYTHON ON)

  # Both the interpreter and the development files are required, as is a
  # config-mode pybind11 package.
  find_package(Python REQUIRED COMPONENTS Interpreter Development)
  find_package(pybind11 CONFIG REQUIRED)

  # The module wraps the shared fastcuda library.
  pybind11_add_module(fastcuda_python python/fastcuda_python.cpp)
  target_compile_features(fastcuda_python PRIVATE cxx_std_11)
  target_link_libraries(fastcuda_python PRIVATE fastcuda)
endif()
# ── Install rules ───────────────────────────────────────────────────
# Both library flavours are always installed. The optional executables are
# added only when their build option is enabled, since their targets do not
# exist otherwise.
set(_INSTALL_TARGETS fastcuda fastcuda_static)

if(FASTCUDA_BUILD_BENCH)
  list(APPEND _INSTALL_TARGETS fastcuda_bench)
endif()

if(FASTCUDA_BUILD_EXAMPLES)
  foreach(fastcuda_example_name IN ITEMS gemm hgemm reduce)
    list(APPEND _INSTALL_TARGETS "fastcuda_${fastcuda_example_name}_example")
  endforeach()
endif()

# Static archives and shared libraries go to lib/, executables (and runtime
# DLLs on Windows) go to bin/, all relative to the install prefix.
install(
  TARGETS ${_INSTALL_TARGETS}
  ARCHIVE DESTINATION lib
  LIBRARY DESTINATION lib
  RUNTIME DESTINATION bin
)

# Every public header, including the generated version header, lands flat in
# include/fastcuda.
install(
  FILES ${FASTCUDA_PUBLIC_HEADERS}
  DESTINATION include/fastcuda
)
# ── Summary ─────────────────────────────────────────────────────────
# Print a configure-time summary: the project version, the system name, the
# CUDA compiler version, the selected CUDA architectures, and the state of the
# three FASTCUDA_BUILD_* options.
message(STATUS "FastCuda v${PROJECT_VERSION}")
message(STATUS " Host platform : ${CMAKE_SYSTEM_NAME}")
message(STATUS " CUDA compiler : ${CMAKE_CUDA_COMPILER_VERSION}")
message(STATUS " Architectures : ${CMAKE_CUDA_ARCHITECTURES}")
message(STATUS " Build examples: ${FASTCUDA_BUILD_EXAMPLES}")
message(STATUS " Build bench : ${FASTCUDA_BUILD_BENCH}")
message(STATUS " Build python : ${FASTCUDA_BUILD_PYTHON}")