Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
59 changes: 46 additions & 13 deletions transformer_engine/common/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -43,8 +43,6 @@ set(NVTE_SPECIFIC_ARCHS)
# Check for architecture 100
list(FIND CMAKE_CUDA_ARCHITECTURES "100" arch_100_index)
if(NOT arch_100_index EQUAL -1)
list(REMOVE_ITEM CMAKE_CUDA_ARCHITECTURES "100")
list(APPEND NVTE_GENERIC_ARCHS "100")
list(APPEND NVTE_SPECIFIC_ARCHS "100a")
if(CUDAToolkit_VERSION VERSION_GREATER_EQUAL 12.9)
list(APPEND NVTE_SPECIFIC_ARCHS "103a")
Expand All @@ -54,31 +52,57 @@ endif()
# Check for architecture 101 (if we see this we are in toolkit <= 12.9)
list(FIND CMAKE_CUDA_ARCHITECTURES "101" arch_101_index)
if(NOT arch_101_index EQUAL -1)
list(REMOVE_ITEM CMAKE_CUDA_ARCHITECTURES "101")
list(APPEND NVTE_GENERIC_ARCHS "101")
list(APPEND NVTE_SPECIFIC_ARCHS "101a")
endif()
Comment on lines 43 to 56
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Arch 100/101 not removed

When "100" / "101" are present in CMAKE_CUDA_ARCHITECTURES, this block only appends 100a/101a to NVTE_SPECIFIC_ARCHS but never removes the base arch from CMAKE_CUDA_ARCHITECTURES nor adds it to NVTE_GENERIC_ARCHS. As a result, the build will still compile all sources for sm_100 / sm_101 (via CMAKE_CUDA_ARCHITECTURES) while also compiling arch-specific sources for sm_100a / sm_101a (via --generate-code), which defeats the “generic + specific” split and can cause unexpected extra build work / incorrect targeting. This differs from the 110/120 handling below, which removes the base arch and routes it into the explicit lists.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@greptileai That is because arches 110/120 are routed to the "f" (like 120f) variant where the entire family is covered, whereas in the case of 100, it routes to "a" variants (like 100a) - this is intentional behavior, but in order to support different Arches in the same family, we need to still compile for the regular 100 then.


# Check for architecture 110 (if we see this we are in toolkit >= 13.0)
list(FIND CMAKE_CUDA_ARCHITECTURES "110" arch_110_index)
if(NOT arch_110_index EQUAL -1)
list(REMOVE_ITEM CMAKE_CUDA_ARCHITECTURES "110")
list(APPEND NVTE_GENERIC_ARCHS "110")
list(APPEND NVTE_SPECIFIC_ARCHS "110f")
if(CMAKE_VERSION VERSION_GREATER_EQUAL 4.0.2)
list(REMOVE_ITEM CMAKE_CUDA_ARCHITECTURES "110")
list(APPEND CMAKE_CUDA_ARCHITECTURES "110f")
else()
list(REMOVE_ITEM CMAKE_CUDA_ARCHITECTURES "110")
list(APPEND NVTE_GENERIC_ARCHS "110")
list(APPEND NVTE_SPECIFIC_ARCHS "110f")
endif()
endif()

# Check for architecture 120
list(FIND CMAKE_CUDA_ARCHITECTURES "120" arch_120_index)
if(NOT arch_120_index EQUAL -1)
list(REMOVE_ITEM CMAKE_CUDA_ARCHITECTURES "120")
list(APPEND NVTE_GENERIC_ARCHS "120")
if(CUDAToolkit_VERSION VERSION_GREATER_EQUAL 12.9)
list(APPEND NVTE_SPECIFIC_ARCHS "120f")
if(CMAKE_VERSION VERSION_GREATER_EQUAL 4.0.2)
if(CUDAToolkit_VERSION VERSION_GREATER_EQUAL 12.9)
list(APPEND CMAKE_CUDA_ARCHITECTURES "120f")
else()
list(APPEND NVTE_GENERIC_ARCHS "120")
list(APPEND NVTE_SPECIFIC_ARCHS "120a")
endif()
else()
list(APPEND NVTE_SPECIFIC_ARCHS "120a")
if(CUDAToolkit_VERSION VERSION_GREATER_EQUAL 12.9)
list(APPEND NVTE_GENERIC_ARCHS "120")
list(APPEND NVTE_SPECIFIC_ARCHS "120f")
else()
list(APPEND NVTE_GENERIC_ARCHS "120")
list(APPEND NVTE_SPECIFIC_ARCHS "120a")
endif()
endif()
endif()

if(CMAKE_VERSION VERSION_LESS 4.0.2)
if(NOT CMAKE_CUDA_ARCHITECTURES)
message(WARNING
"CMAKE_CUDA_ARCHITECTURES is empty after replacing arch-specific targets. "
"Please upgrade to CMake 4.0.2+ for native 'f' architecture support. "
"Adding sm_75 target in addition to the specified target to avoid configuration "
"errors - this will result in longer build time, but does not affect correctness.")
set(CMAKE_CUDA_ARCHITECTURES 75)
endif()
Comment on lines +93 to 101
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fallback wipes user archs

On CMake < 4.0.2, if the arch list becomes empty you set(CMAKE_CUDA_ARCHITECTURES 75) (dropping the user’s originally requested arch entirely). Given the warning text says “add sm_75 target in addition to the specified target”, this looks like a functional mismatch: the fallback should preserve the originally selected arch(es) and append 75, not replace everything with 75.

Copy link
Member Author

@ptrendx ptrendx Feb 10, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@greptileai If the list is empty, then setting is the same as appending.

endif()

set(NVTE_ARCH_SPECIFIC_TARGETS TRUE)

# cuDNN frontend API
set(CUDNN_FRONTEND_INCLUDE_DIR
"${CMAKE_CURRENT_SOURCE_DIR}/../../3rdparty/cudnn-frontend/include")
Expand Down Expand Up @@ -193,7 +217,6 @@ foreach(cuda_source IN LISTS transformer_engine_cuda_sources)
foreach(arch IN LISTS NVTE_GENERIC_ARCHS)
list(APPEND arch_compile_options "--generate-code=arch=compute_${arch},code=sm_${arch}")
endforeach()

if(arch_compile_options)
set_property(
SOURCE ${cuda_source}
Expand All @@ -204,7 +227,6 @@ foreach(cuda_source IN LISTS transformer_engine_cuda_sources)
endif()
endforeach()

# Set compile options for CUDA sources with specific architectures
foreach(cuda_source IN LISTS transformer_engine_cuda_arch_specific_sources)
set(arch_compile_options)
foreach(arch IN LISTS NVTE_SPECIFIC_ARCHS)
Expand All @@ -221,6 +243,17 @@ foreach(cuda_source IN LISTS transformer_engine_cuda_arch_specific_sources)
endif()
endforeach()

if(NVTE_ARCH_SPECIFIC_TARGETS)
foreach(cuda_source IN LISTS transformer_engine_cuda_arch_specific_sources)
set_property(
SOURCE ${cuda_source}
APPEND
PROPERTY
COMPILE_DEFINITIONS NVTE_HAS_ARCH_SPECIFIC_TARGETS=1
)
endforeach()
endif()

if (NVTE_WITH_CUBLASMP)
list(APPEND transformer_engine_SOURCES
comm_gemm/comm_gemm.cpp)
Expand Down
8 changes: 8 additions & 0 deletions transformer_engine/common/util/ptx.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -31,11 +31,15 @@ struct ArchSpecific {
template <int CurrentArch, int ArchSpecific, int FamilySpecific>
constexpr static bool compatible() {
if constexpr (CurrentArch == id) {
#if !defined(NVTE_HAS_ARCH_SPECIFIC_TARGETS)
static_assert(ArchSpecific == CurrentArch,
"Compiled for the generic architecture, while utilizing arch-specific "
"features. Please compile for smXXXa architecture instead of smXXX "
"architecture.");
return true;
#else
return ArchSpecific == CurrentArch;
#endif
} else {
return false;
}
Expand All @@ -49,11 +53,15 @@ struct FamilySpecific {
template <int CurrentArch, int ArchSpecific, int FamilySpecific>
constexpr static bool compatible() {
if constexpr ((CurrentArch / 100) == (id / 100)) {
#if !defined(NVTE_HAS_ARCH_SPECIFIC_TARGETS)
static_assert(FamilySpecific == CurrentArch,
"Compiled for the generic architecture, while utilizing family-specific "
"features. Please compile for smXXXf architecture instead of smXXX "
"architecture.");
return true;
#else
return FamilySpecific == CurrentArch;
#endif
} else {
return false;
}
Expand Down
Loading