-
Notifications
You must be signed in to change notification settings - Fork 633
Improve handling of the NVTE_CUDA_ARCHS #2665
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -43,8 +43,6 @@ set(NVTE_SPECIFIC_ARCHS) | |
| # Check for architecture 100 | ||
| list(FIND CMAKE_CUDA_ARCHITECTURES "100" arch_100_index) | ||
| if(NOT arch_100_index EQUAL -1) | ||
| list(REMOVE_ITEM CMAKE_CUDA_ARCHITECTURES "100") | ||
| list(APPEND NVTE_GENERIC_ARCHS "100") | ||
| list(APPEND NVTE_SPECIFIC_ARCHS "100a") | ||
| if(CUDAToolkit_VERSION VERSION_GREATER_EQUAL 12.9) | ||
| list(APPEND NVTE_SPECIFIC_ARCHS "103a") | ||
|
|
@@ -54,31 +52,57 @@ endif() | |
| # Check for architecture 101 (if we see this we are in toolkit <= 12.9) | ||
| list(FIND CMAKE_CUDA_ARCHITECTURES "101" arch_101_index) | ||
| if(NOT arch_101_index EQUAL -1) | ||
| list(REMOVE_ITEM CMAKE_CUDA_ARCHITECTURES "101") | ||
| list(APPEND NVTE_GENERIC_ARCHS "101") | ||
| list(APPEND NVTE_SPECIFIC_ARCHS "101a") | ||
| endif() | ||
|
|
||
| # Check for architecture 110 (if we see this we are in toolkit >= 13.0) | ||
| list(FIND CMAKE_CUDA_ARCHITECTURES "110" arch_110_index) | ||
| if(NOT arch_110_index EQUAL -1) | ||
| list(REMOVE_ITEM CMAKE_CUDA_ARCHITECTURES "110") | ||
| list(APPEND NVTE_GENERIC_ARCHS "110") | ||
| list(APPEND NVTE_SPECIFIC_ARCHS "110f") | ||
| if(CMAKE_VERSION VERSION_GREATER_EQUAL 4.0.2) | ||
| list(REMOVE_ITEM CMAKE_CUDA_ARCHITECTURES "110") | ||
| list(APPEND CMAKE_CUDA_ARCHITECTURES "110f") | ||
| else() | ||
| list(REMOVE_ITEM CMAKE_CUDA_ARCHITECTURES "110") | ||
| list(APPEND NVTE_GENERIC_ARCHS "110") | ||
| list(APPEND NVTE_SPECIFIC_ARCHS "110f") | ||
| endif() | ||
| endif() | ||
|
|
||
| # Check for architecture 120 | ||
| list(FIND CMAKE_CUDA_ARCHITECTURES "120" arch_120_index) | ||
| if(NOT arch_120_index EQUAL -1) | ||
| list(REMOVE_ITEM CMAKE_CUDA_ARCHITECTURES "120") | ||
| list(APPEND NVTE_GENERIC_ARCHS "120") | ||
| if(CUDAToolkit_VERSION VERSION_GREATER_EQUAL 12.9) | ||
| list(APPEND NVTE_SPECIFIC_ARCHS "120f") | ||
| if(CMAKE_VERSION VERSION_GREATER_EQUAL 4.0.2) | ||
| if(CUDAToolkit_VERSION VERSION_GREATER_EQUAL 12.9) | ||
| list(APPEND CMAKE_CUDA_ARCHITECTURES "120f") | ||
| else() | ||
| list(APPEND NVTE_GENERIC_ARCHS "120") | ||
| list(APPEND NVTE_SPECIFIC_ARCHS "120a") | ||
| endif() | ||
| else() | ||
| list(APPEND NVTE_SPECIFIC_ARCHS "120a") | ||
| if(CUDAToolkit_VERSION VERSION_GREATER_EQUAL 12.9) | ||
| list(APPEND NVTE_GENERIC_ARCHS "120") | ||
| list(APPEND NVTE_SPECIFIC_ARCHS "120f") | ||
| else() | ||
| list(APPEND NVTE_GENERIC_ARCHS "120") | ||
| list(APPEND NVTE_SPECIFIC_ARCHS "120a") | ||
| endif() | ||
| endif() | ||
| endif() | ||
|
|
||
| if(CMAKE_VERSION VERSION_LESS 4.0.2) | ||
| if(NOT CMAKE_CUDA_ARCHITECTURES) | ||
| message(WARNING | ||
| "CMAKE_CUDA_ARCHITECTURES is empty after replacing arch-specific targets. " | ||
| "Please upgrade to CMake 4.0.2+ for native 'f' architecture support. " | ||
| "Adding sm_75 target in addition to the specified target to avoid configuration " | ||
| "errors - this will result in longer build time, but does not affect correctness.") | ||
| set(CMAKE_CUDA_ARCHITECTURES 75) | ||
| endif() | ||
|
Comment on lines
+93
to
101
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Fallback wipes user archs On CMake < 4.0.2, if the arch list becomes empty you
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @greptileai If the list is empty, then setting is the same as appending. |
||
| endif() | ||
|
|
||
| set(NVTE_ARCH_SPECIFIC_TARGETS TRUE) | ||
|
|
||
| # cuDNN frontend API | ||
| set(CUDNN_FRONTEND_INCLUDE_DIR | ||
| "${CMAKE_CURRENT_SOURCE_DIR}/../../3rdparty/cudnn-frontend/include") | ||
|
|
@@ -193,7 +217,6 @@ foreach(cuda_source IN LISTS transformer_engine_cuda_sources) | |
| foreach(arch IN LISTS NVTE_GENERIC_ARCHS) | ||
| list(APPEND arch_compile_options "--generate-code=arch=compute_${arch},code=sm_${arch}") | ||
| endforeach() | ||
|
|
||
| if(arch_compile_options) | ||
| set_property( | ||
| SOURCE ${cuda_source} | ||
|
|
@@ -204,7 +227,6 @@ foreach(cuda_source IN LISTS transformer_engine_cuda_sources) | |
| endif() | ||
| endforeach() | ||
|
|
||
| # Set compile options for CUDA sources with specific architectures | ||
| foreach(cuda_source IN LISTS transformer_engine_cuda_arch_specific_sources) | ||
| set(arch_compile_options) | ||
| foreach(arch IN LISTS NVTE_SPECIFIC_ARCHS) | ||
|
|
@@ -221,6 +243,17 @@ foreach(cuda_source IN LISTS transformer_engine_cuda_arch_specific_sources) | |
| endif() | ||
| endforeach() | ||
|
|
||
| if(NVTE_ARCH_SPECIFIC_TARGETS) | ||
| foreach(cuda_source IN LISTS transformer_engine_cuda_arch_specific_sources) | ||
| set_property( | ||
| SOURCE ${cuda_source} | ||
| APPEND | ||
| PROPERTY | ||
| COMPILE_DEFINITIONS NVTE_HAS_ARCH_SPECIFIC_TARGETS=1 | ||
| ) | ||
| endforeach() | ||
| endif() | ||
|
|
||
| if (NVTE_WITH_CUBLASMP) | ||
| list(APPEND transformer_engine_SOURCES | ||
| comm_gemm/comm_gemm.cpp) | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Arch 100/101 not removed
When
"100"/"101"are present inCMAKE_CUDA_ARCHITECTURES, this block only appends100a/101atoNVTE_SPECIFIC_ARCHSbut never removes the base arch fromCMAKE_CUDA_ARCHITECTURESnor adds it toNVTE_GENERIC_ARCHS. As a result, the build will still compile all sources forsm_100/sm_101(viaCMAKE_CUDA_ARCHITECTURES) while also compiling arch-specific sources forsm_100a/sm_101a(via--generate-code), which defeats the “generic + specific” split and can cause unexpected extra build work / incorrect targeting. This differs from the110/120handling below, which removes the base arch and routes it into the explicit lists.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@greptileai That is because arches 110/120 are routed to the "f" (like 120f) variant where the entire family is covered, whereas in the case of 100, it routes to "a" variants (like 100a) - this is intentional behavior, but in order to support different Arches in the same family, we need to still compile for the regular 100 then.