From 8f9804a662bfeae40da4dd395f700c4b147f8270 Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Mon, 26 Jan 2026 15:57:49 -0800 Subject: [PATCH 1/4] Trial and error: build arch-specific CUDA targets natively on CMake 4.0.2+ and gate ptx.cuh static_asserts Signed-off-by: Przemek Tredak --- transformer_engine/common/CMakeLists.txt | 78 +++++++++++++++--------- transformer_engine/common/util/ptx.cuh | 4 ++ 2 files changed, 54 insertions(+), 28 deletions(-) diff --git a/transformer_engine/common/CMakeLists.txt b/transformer_engine/common/CMakeLists.txt index a83cbe3e30..7b74dd7098 100644 --- a/transformer_engine/common/CMakeLists.txt +++ b/transformer_engine/common/CMakeLists.txt @@ -36,15 +36,14 @@ if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES) endif() endif() -# Process CMAKE_CUDA_ARCHITECTURES to separate generic and specific architectures -set(NVTE_GENERIC_ARCHS) +# Process CMAKE_CUDA_ARCHITECTURES to identify specific architectures set(NVTE_SPECIFIC_ARCHS) +set(NVTE_CUDA_ARCHITECTURES ${CMAKE_CUDA_ARCHITECTURES}) # Check for architecture 100 -list(FIND CMAKE_CUDA_ARCHITECTURES "100" arch_100_index) +list(FIND NVTE_CUDA_ARCHITECTURES "100" arch_100_index) if(NOT arch_100_index EQUAL -1) - list(REMOVE_ITEM CMAKE_CUDA_ARCHITECTURES "100") - list(APPEND NVTE_GENERIC_ARCHS "100") + list(REMOVE_ITEM NVTE_CUDA_ARCHITECTURES "100") list(APPEND NVTE_SPECIFIC_ARCHS "100a") if(CUDAToolkit_VERSION VERSION_GREATER_EQUAL 12.9) list(APPEND NVTE_SPECIFIC_ARCHS "103a") @@ -52,26 +51,23 @@ if(NOT arch_100_index EQUAL -1) endif() # Check for architecture 101 (if we see this we are in toolkit <= 12.9) -list(FIND CMAKE_CUDA_ARCHITECTURES "101" arch_101_index) +list(FIND NVTE_CUDA_ARCHITECTURES "101" arch_101_index) if(NOT arch_101_index EQUAL -1) - list(REMOVE_ITEM CMAKE_CUDA_ARCHITECTURES "101") - list(APPEND NVTE_GENERIC_ARCHS "101") + list(REMOVE_ITEM NVTE_CUDA_ARCHITECTURES "101") list(APPEND NVTE_SPECIFIC_ARCHS "101a") endif() # Check for architecture 110 (if we see this we are in toolkit >= 13.0) -list(FIND CMAKE_CUDA_ARCHITECTURES "110" arch_110_index) +list(FIND 
NVTE_CUDA_ARCHITECTURES "110" arch_110_index) if(NOT arch_110_index EQUAL -1) - list(REMOVE_ITEM CMAKE_CUDA_ARCHITECTURES "110") - list(APPEND NVTE_GENERIC_ARCHS "110") + list(REMOVE_ITEM NVTE_CUDA_ARCHITECTURES "110") list(APPEND NVTE_SPECIFIC_ARCHS "110f") endif() # Check for architecture 120 -list(FIND CMAKE_CUDA_ARCHITECTURES "120" arch_120_index) +list(FIND NVTE_CUDA_ARCHITECTURES "120" arch_120_index) if(NOT arch_120_index EQUAL -1) - list(REMOVE_ITEM CMAKE_CUDA_ARCHITECTURES "120") - list(APPEND NVTE_GENERIC_ARCHS "120") + list(REMOVE_ITEM NVTE_CUDA_ARCHITECTURES "120") if(CUDAToolkit_VERSION VERSION_GREATER_EQUAL 12.9) list(APPEND NVTE_SPECIFIC_ARCHS "120f") else() @@ -79,6 +75,35 @@ if(NOT arch_120_index EQUAL -1) endif() endif() +if(CMAKE_VERSION VERSION_GREATER_EQUAL 4.0.2) + list(APPEND NVTE_CUDA_ARCHITECTURES ${NVTE_SPECIFIC_ARCHS}) + list(REMOVE_DUPLICATES NVTE_CUDA_ARCHITECTURES) + set(CMAKE_CUDA_ARCHITECTURES ${NVTE_CUDA_ARCHITECTURES}) + set(NVTE_SPECIFIC_ARCHS_TO_BUILD) +else() + set(NVTE_SPECIFIC_ARCHS_TO_BUILD ${NVTE_SPECIFIC_ARCHS}) + if(NVTE_CUDA_ARCHITECTURES) + set(CMAKE_CUDA_ARCHITECTURES ${NVTE_CUDA_ARCHITECTURES}) + else() + message(WARNING + "CMAKE_CUDA_ARCHITECTURES is empty after replacing arch-specific targets. " + "Please upgrade to CMake 4.0.2+ for native 'a'/'f' architecture support. 
" + "Falling back to sm_75 to avoid configuration errors.") + set(CMAKE_CUDA_ARCHITECTURES 75) + endif() +endif() + +# Detect whether any arch-specific targets are present +set(NVTE_ARCH_SPECIFIC_TARGETS FALSE) +if(NVTE_SPECIFIC_ARCHS_TO_BUILD) + set(NVTE_ARCH_SPECIFIC_TARGETS TRUE) +endif() +foreach(arch IN LISTS CMAKE_CUDA_ARCHITECTURES) + if(arch MATCHES "^[0-9]+[af]$") + set(NVTE_ARCH_SPECIFIC_TARGETS TRUE) + endif() +endforeach() + # cuDNN frontend API set(CUDNN_FRONTEND_INCLUDE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../../3rdparty/cudnn-frontend/include") @@ -187,10 +212,13 @@ list(APPEND transformer_engine_SOURCES ${transformer_engine_cuda_arch_specific_s ${transformer_engine_cuda_sources} ${transformer_engine_cpp_sources}) -# Set compile options for CUDA sources with generic architectures -foreach(cuda_source IN LISTS transformer_engine_cuda_sources) +# Set compile options for CUDA sources with specific architectures +set(transformer_engine_cuda_all_sources + ${transformer_engine_cuda_sources} + ${transformer_engine_cuda_arch_specific_sources}) +foreach(cuda_source IN LISTS transformer_engine_cuda_all_sources) set(arch_compile_options) - foreach(arch IN LISTS NVTE_GENERIC_ARCHS) + foreach(arch IN LISTS NVTE_SPECIFIC_ARCHS_TO_BUILD) list(APPEND arch_compile_options "--generate-code=arch=compute_${arch},code=sm_${arch}") endforeach() @@ -204,22 +232,16 @@ foreach(cuda_source IN LISTS transformer_engine_cuda_sources) endif() endforeach() -# Set compile options for CUDA sources with specific architectures -foreach(cuda_source IN LISTS transformer_engine_cuda_arch_specific_sources) - set(arch_compile_options) - foreach(arch IN LISTS NVTE_SPECIFIC_ARCHS) - list(APPEND arch_compile_options "--generate-code=arch=compute_${arch},code=sm_${arch}") - endforeach() - - if(arch_compile_options) +if(NVTE_ARCH_SPECIFIC_TARGETS) + foreach(cuda_source IN LISTS transformer_engine_cuda_arch_specific_sources) set_property( SOURCE ${cuda_source} APPEND PROPERTY - COMPILE_OPTIONS 
${arch_compile_options} + COMPILE_DEFINITIONS NVTE_HAS_ARCH_SPECIFIC_TARGETS=1 ) - endif() -endforeach() + endforeach() +endif() if (NVTE_WITH_CUBLASMP) list(APPEND transformer_engine_SOURCES diff --git a/transformer_engine/common/util/ptx.cuh b/transformer_engine/common/util/ptx.cuh index 9bcf6e2289..50a97fa8d5 100644 --- a/transformer_engine/common/util/ptx.cuh +++ b/transformer_engine/common/util/ptx.cuh @@ -31,10 +31,12 @@ struct ArchSpecific { template constexpr static bool compatible() { if constexpr (CurrentArch == id) { +#if !defined(NVTE_HAS_ARCH_SPECIFIC_TARGETS) static_assert(ArchSpecific == CurrentArch, "Compiled for the generic architecture, while utilizing arch-specific " "features. Please compile for smXXXa architecture instead of smXXX " "architecture."); +#endif return true; } else { return false; @@ -49,10 +51,12 @@ struct FamilySpecific { template constexpr static bool compatible() { if constexpr ((CurrentArch / 100) == (id / 100)) { +#if !defined(NVTE_HAS_ARCH_SPECIFIC_TARGETS) static_assert(FamilySpecific == CurrentArch, "Compiled for the generic architecture, while utilizing family-specific " "features. 
Please compile for smXXXf architecture instead of smXXX " "architecture."); +#endif return true; } else { return false; From 09e6ca882304f703a4b13015e4b1d81fedce9325 Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Mon, 26 Jan 2026 15:58:06 -0800 Subject: [PATCH 2/4] Should be ok: restore generic/specific arch split and return arch match from ptx.cuh compatible() Signed-off-by: Przemek Tredak --- transformer_engine/common/CMakeLists.txt | 87 +++++++++++++----------- transformer_engine/common/util/ptx.cuh | 8 ++- 2 files changed, 55 insertions(+), 40 deletions(-) diff --git a/transformer_engine/common/CMakeLists.txt b/transformer_engine/common/CMakeLists.txt index 7b74dd7098..71950fff7e 100644 --- a/transformer_engine/common/CMakeLists.txt +++ b/transformer_engine/common/CMakeLists.txt @@ -36,14 +36,13 @@ if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES) endif() endif() -# Process CMAKE_CUDA_ARCHITECTURES to identify specific architectures +-# Process CMAKE_CUDA_ARCHITECTURES to separate generic and specific architectures +set(NVTE_GENERIC_ARCHS) set(NVTE_SPECIFIC_ARCHS) -set(NVTE_CUDA_ARCHITECTURES ${CMAKE_CUDA_ARCHITECTURES}) # Check for architecture 100 -list(FIND NVTE_CUDA_ARCHITECTURES "100" arch_100_index) +list(FIND CMAKE_CUDA_ARCHITECTURES "100" arch_100_index) if(NOT arch_100_index EQUAL -1) - list(REMOVE_ITEM NVTE_CUDA_ARCHITECTURES "100") list(APPEND NVTE_SPECIFIC_ARCHS "100a") if(CUDAToolkit_VERSION VERSION_GREATER_EQUAL 12.9) list(APPEND NVTE_SPECIFIC_ARCHS "103a") @@ -51,39 +50,48 @@ if(NOT arch_100_index EQUAL -1) endif() # Check for architecture 101 (if we see this we are in toolkit <= 12.9) -list(FIND NVTE_CUDA_ARCHITECTURES "101" arch_101_index) +list(FIND CMAKE_CUDA_ARCHITECTURES "101" arch_101_index) if(NOT arch_101_index EQUAL -1) - list(REMOVE_ITEM NVTE_CUDA_ARCHITECTURES "101") list(APPEND NVTE_SPECIFIC_ARCHS "101a") endif() # Check for architecture 110 (if we see this we are in toolkit >= 13.0) -list(FIND NVTE_CUDA_ARCHITECTURES "110" arch_110_index) +list(FIND CMAKE_CUDA_ARCHITECTURES "110" arch_110_index) if(NOT arch_110_index 
EQUAL -1) - list(REMOVE_ITEM NVTE_CUDA_ARCHITECTURES "110") - list(APPEND NVTE_SPECIFIC_ARCHS "110f") + if(CMAKE_VERSION VERSION_GREATER_EQUAL 4.0.2) + list(REMOVE_ITEM CMAKE_CUDA_ARCHITECTURES "110") + list(APPEND CMAKE_CUDA_ARCHITECTURES "110f") + else() + list(REMOVE_ITEM CMAKE_CUDA_ARCHITECTURES "110") + list(APPEND NVTE_GENERIC_ARCHS "110") + list(APPEND NVTE_SPECIFIC_ARCHS "110f") + endif() endif() # Check for architecture 120 -list(FIND NVTE_CUDA_ARCHITECTURES "120" arch_120_index) +list(FIND CMAKE_CUDA_ARCHITECTURES "120" arch_120_index) if(NOT arch_120_index EQUAL -1) - list(REMOVE_ITEM NVTE_CUDA_ARCHITECTURES "120") - if(CUDAToolkit_VERSION VERSION_GREATER_EQUAL 12.9) - list(APPEND NVTE_SPECIFIC_ARCHS "120f") + list(REMOVE_ITEM CMAKE_CUDA_ARCHITECTURES "120") + if(CMAKE_VERSION VERSION_GREATER_EQUAL 4.0.2) + if(CUDAToolkit_VERSION VERSION_GREATER_EQUAL 12.9) + list(APPEND CMAKE_CUDA_ARCHITECTURES "120f") + else() + list(APPEND NVTE_GENERIC_ARCHS "120") + list(APPEND NVTE_SPECIFIC_ARCHS "120a") + endif() else() - list(APPEND NVTE_SPECIFIC_ARCHS "120a") + if(CUDAToolkit_VERSION VERSION_GREATER_EQUAL 12.9) + list(APPEND NVTE_GENERIC_ARCHS "120") + list(APPEND NVTE_SPECIFIC_ARCHS "120f") + else() + list(APPEND NVTE_GENERIC_ARCHS "120") + list(APPEND NVTE_SPECIFIC_ARCHS "120a") + endif() endif() endif() -if(CMAKE_VERSION VERSION_GREATER_EQUAL 4.0.2) - list(APPEND NVTE_CUDA_ARCHITECTURES ${NVTE_SPECIFIC_ARCHS}) - list(REMOVE_DUPLICATES NVTE_CUDA_ARCHITECTURES) - set(CMAKE_CUDA_ARCHITECTURES ${NVTE_CUDA_ARCHITECTURES}) - set(NVTE_SPECIFIC_ARCHS_TO_BUILD) -else() - set(NVTE_SPECIFIC_ARCHS_TO_BUILD ${NVTE_SPECIFIC_ARCHS}) - if(NVTE_CUDA_ARCHITECTURES) - set(CMAKE_CUDA_ARCHITECTURES ${NVTE_CUDA_ARCHITECTURES}) +if(CMAKE_VERSION VERSION_LESS 4.0.2) + if(CMAKE_CUDA_ARCHITECTURES) else() message(WARNING "CMAKE_CUDA_ARCHITECTURES is empty after replacing arch-specific targets. 
" @@ -93,16 +101,7 @@ else() endif() endif() -# Detect whether any arch-specific targets are present -set(NVTE_ARCH_SPECIFIC_TARGETS FALSE) -if(NVTE_SPECIFIC_ARCHS_TO_BUILD) - set(NVTE_ARCH_SPECIFIC_TARGETS TRUE) -endif() -foreach(arch IN LISTS CMAKE_CUDA_ARCHITECTURES) - if(arch MATCHES "^[0-9]+[af]$") - set(NVTE_ARCH_SPECIFIC_TARGETS TRUE) - endif() -endforeach() +set(NVTE_ARCH_SPECIFIC_TARGETS TRUE) # cuDNN frontend API set(CUDNN_FRONTEND_INCLUDE_DIR @@ -212,11 +211,23 @@ list(APPEND transformer_engine_SOURCES ${transformer_engine_cuda_arch_specific_s ${transformer_engine_cuda_sources} ${transformer_engine_cpp_sources}) -# Set compile options for CUDA sources with specific architectures -set(transformer_engine_cuda_all_sources - ${transformer_engine_cuda_sources} - ${transformer_engine_cuda_arch_specific_sources}) -foreach(cuda_source IN LISTS transformer_engine_cuda_all_sources) +# Set compile options for CUDA sources with generic architectures +foreach(cuda_source IN LISTS transformer_engine_cuda_sources) + set(arch_compile_options) + foreach(arch IN LISTS NVTE_GENERIC_ARCHS) + list(APPEND arch_compile_options "--generate-code=arch=compute_${arch},code=sm_${arch}") + endforeach() + if(arch_compile_options) + set_property( + SOURCE ${cuda_source} + APPEND + PROPERTY + COMPILE_OPTIONS ${arch_compile_options} + ) + endif() +endforeach() + +foreach(cuda_source IN LISTS transformer_engine_cuda_arch_specific_sources) set(arch_compile_options) foreach(arch IN LISTS NVTE_SPECIFIC_ARCHS_TO_BUILD) list(APPEND arch_compile_options "--generate-code=arch=compute_${arch},code=sm_${arch}") diff --git a/transformer_engine/common/util/ptx.cuh b/transformer_engine/common/util/ptx.cuh index 50a97fa8d5..840e0c6a0e 100644 --- a/transformer_engine/common/util/ptx.cuh +++ b/transformer_engine/common/util/ptx.cuh @@ -36,8 +36,10 @@ struct ArchSpecific { "Compiled for the generic architecture, while utilizing arch-specific " "features. 
Please compile for smXXXa architecture instead of smXXX " "architecture."); -#endif return true; +#else + return ArchSpecific == CurrentArch; +#endif } else { return false; } @@ -56,8 +58,10 @@ struct FamilySpecific { "Compiled for the generic architecture, while utilizing family-specific " "features. Please compile for smXXXf architecture instead of smXXX " "architecture."); -#endif return true; +#else + return FamilySpecific == CurrentArch; +#endif } else { return false; } From 5740300ccf2ed2d965b8be294b7430d56f591a7d Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Mon, 9 Feb 2026 14:26:26 -0800 Subject: [PATCH 3/4] Fix one issue and add better message to the fallback path Signed-off-by: Przemek Tredak --- transformer_engine/common/CMakeLists.txt | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/transformer_engine/common/CMakeLists.txt b/transformer_engine/common/CMakeLists.txt index 71950fff7e..55a1bafb14 100644 --- a/transformer_engine/common/CMakeLists.txt +++ b/transformer_engine/common/CMakeLists.txt @@ -36,7 +36,7 @@ if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES) endif() endif() --# Process CMAKE_CUDA_ARCHITECTURES to separate generic and specific architectures +# Process CMAKE_CUDA_ARCHITECTURES to separate generic and specific architectures set(NVTE_GENERIC_ARCHS) set(NVTE_SPECIFIC_ARCHS) @@ -95,8 +95,9 @@ if(CMAKE_VERSION VERSION_LESS 4.0.2) else() message(WARNING "CMAKE_CUDA_ARCHITECTURES is empty after replacing arch-specific targets. " - "Please upgrade to CMake 4.0.2+ for native 'a'/'f' architecture support. " - "Falling back to sm_75 to avoid configuration errors.") + "Please upgrade to CMake 4.0.2+ for native 'f' architecture support. 
" + "Adding sm_75 target in addition to the specified target to avoid configuration " + "errors - this will result in longer build time, but does not affect correctness.") set(CMAKE_CUDA_ARCHITECTURES 75) endif() endif() From 71041ae5a53b3af2c9bea1faa6b8ce67ff9fa5ad Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Mon, 9 Feb 2026 14:57:09 -0800 Subject: [PATCH 4/4] Fixes: simplify empty-architecture check, tidy whitespace in generic-arch loop, iterate NVTE_SPECIFIC_ARCHS for arch-specific sources Signed-off-by: Przemek Tredak --- transformer_engine/common/CMakeLists.txt | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/transformer_engine/common/CMakeLists.txt b/transformer_engine/common/CMakeLists.txt index 55a1bafb14..3c106b982e 100644 --- a/transformer_engine/common/CMakeLists.txt +++ b/transformer_engine/common/CMakeLists.txt @@ -91,8 +91,7 @@ if(NOT arch_120_index EQUAL -1) endif() if(CMAKE_VERSION VERSION_LESS 4.0.2) - if(CMAKE_CUDA_ARCHITECTURES) - else() + if(NOT CMAKE_CUDA_ARCHITECTURES) message(WARNING "CMAKE_CUDA_ARCHITECTURES is empty after replacing arch-specific targets. " "Please upgrade to CMake 4.0.2+ for native 'f' architecture support. " @@ -216,8 +215,8 @@ list(APPEND transformer_engine_SOURCES ${transformer_engine_cuda_arch_specific_s foreach(cuda_source IN LISTS transformer_engine_cuda_sources) set(arch_compile_options) foreach(arch IN LISTS NVTE_GENERIC_ARCHS) - list(APPEND arch_compile_options "--generate-code=arch=compute_${arch},code=sm_${arch}") - endforeach() + list(APPEND arch_compile_options "--generate-code=arch=compute_${arch},code=sm_${arch}") + endforeach() if(arch_compile_options) set_property( SOURCE ${cuda_source} @@ -230,7 +229,7 @@ endforeach() foreach(cuda_source IN LISTS transformer_engine_cuda_arch_specific_sources) set(arch_compile_options) - foreach(arch IN LISTS NVTE_SPECIFIC_ARCHS_TO_BUILD) + foreach(arch IN LISTS NVTE_SPECIFIC_ARCHS) list(APPEND arch_compile_options "--generate-code=arch=compute_${arch},code=sm_${arch}") endforeach()