From a8d8af9901254aafe0dc5c769041d2edc266211e Mon Sep 17 00:00:00 2001 From: Lukas Thomann Date: Wed, 25 Feb 2026 21:56:48 +0100 Subject: [PATCH 1/4] cleaned up the cmake stuff a little bit, added new features for vcpkg to cover the dependencies automatically (for the most part) --- cmake/HalideUtils.cmake | 99 ------------------- cmake/{ => find}/BuildBackwardCpp.cmake | 0 cmake/{ => find}/BuildCereal.cmake | 0 .../{ => find}/BuildFlashlightSequence.cmake | 0 cmake/{ => find}/BuildFlashlightText.cmake | 0 cmake/{ => find}/BuildGloo.cmake | 0 cmake/{ => find}/BuildGoogleTest.cmake | 0 cmake/{ => find}/BuildSndFile.cmake | 0 cmake/{ => find}/BuildStb.cmake | 0 cmake/{ => find}/Buildsox.cmake | 0 cmake/{ => find}/FindCBLAS.cmake | 0 cmake/{ => find}/FindCUDNN.cmake | 0 cmake/{ => find}/FindFFTW3.cmake | 0 cmake/{ => find}/FindFLAC.cmake | 0 cmake/{ => find}/FindFilesystem.cmake | 0 cmake/{ => find}/FindGLOG.cmake | 0 cmake/{ => find}/FindGMock.cmake | 0 cmake/{ => find}/FindMKL.cmake | 0 cmake/{ => find}/FindNCCL.cmake | 0 cmake/{ => find}/FindOgg.cmake | 0 cmake/{ => find}/FindSndFile.cmake | 0 cmake/{ => find}/FindVorbis.cmake | 0 cmake/{ => find}/Findgflags.cmake | 0 cmake/{ => find}/Findsox.cmake | 0 cmake/{ => utils}/InternalUtils.cmake | 32 +++--- cmake/{ => utils}/TestUtils.cmake | 0 cmake/{ => utils}/flashlightConfig.cmake.in | 0 cmake/utils/toolchain.cmake | 10 +- vcpkg.json | 35 ++++++- 29 files changed, 54 insertions(+), 122 deletions(-) delete mode 100644 cmake/HalideUtils.cmake rename cmake/{ => find}/BuildBackwardCpp.cmake (100%) rename cmake/{ => find}/BuildCereal.cmake (100%) rename cmake/{ => find}/BuildFlashlightSequence.cmake (100%) rename cmake/{ => find}/BuildFlashlightText.cmake (100%) rename cmake/{ => find}/BuildGloo.cmake (100%) rename cmake/{ => find}/BuildGoogleTest.cmake (100%) rename cmake/{ => find}/BuildSndFile.cmake (100%) rename cmake/{ => find}/BuildStb.cmake (100%) rename cmake/{ => find}/Buildsox.cmake (100%) rename cmake/{ => find}/FindCBLAS.cmake (100%) rename cmake/{ => find}/FindCUDNN.cmake (100%) rename cmake/{ => find}/FindFFTW3.cmake (100%) rename cmake/{ => find}/FindFLAC.cmake (100%) rename cmake/{ => find}/FindFilesystem.cmake (100%) rename cmake/{ => find}/FindGLOG.cmake (100%) rename cmake/{ => find}/FindGMock.cmake (100%) rename cmake/{ => find}/FindMKL.cmake (100%) rename cmake/{ => find}/FindNCCL.cmake (100%) rename cmake/{ => find}/FindOgg.cmake (100%) rename cmake/{ => find}/FindSndFile.cmake (100%) rename cmake/{ => find}/FindVorbis.cmake (100%) rename cmake/{ => find}/Findgflags.cmake (100%) rename cmake/{ => find}/Findsox.cmake (100%) rename cmake/{ => utils}/InternalUtils.cmake (95%) rename cmake/{ => utils}/TestUtils.cmake (100%) rename cmake/{ => utils}/flashlightConfig.cmake.in (100%) diff --git a/cmake/HalideUtils.cmake b/cmake/HalideUtils.cmake deleted file mode 100644 index d1b0831..0000000 --- a/cmake/HalideUtils.cmake +++ /dev/null @@ -1,99 +0,0 @@ -# Adds a Halide library. Compiles a Halide AOT-generator at compile time, -# then runs the generator to produce header and lib artifacts that -# can be linked to a passed target. -# -# SRC - the src file for the project -# NAME - the name of the resulting target -# LIBS - libraries to which the generated library will be linked -# PREPROC - preprocessor defs to pass to the new target -# LINK_TO - target to which to link the generated pipeline -function(fl_add_and_link_halide_lib) - set(options) - set(oneValueArgs SRC NAME LINK_TO) - set(multiValueArgs LIBS PREPROC) - cmake_parse_arguments(fl_add_and_link_halide_lib - "${options}" - "${oneValueArgs}" - "${multiValueArgs}" - ${ARGN}) - - # Generator binary - set(GENERATOR_TARGET ${fl_add_and_link_halide_lib_NAME}_generator) - # Generator output - set(GENERATED_TARGET generate_${fl_add_and_link_halide_lib_NAME}) - add_executable(${GENERATOR_TARGET} ${fl_add_and_link_halide_lib_SRC}) - target_link_libraries( - ${GENERATOR_TARGET} - PRIVATE - Halide::Halide - ${LIBS}) - target_compile_definitions( - ${GENERATOR_TARGET} - PRIVATE - ${PREPROC}) - - # Run the generator - # LLVM may leak memory during Halide compilation - if building with ASAN, - # the generator might fail. Disable leack checking when executing generators - set(GENERATED_LIB "${fl_add_and_link_halide_lib_NAME}${CMAKE_STATIC_LIBRARY_SUFFIX}") - set(GENERATED_HEADER "${fl_add_and_link_halide_lib_NAME}.h") - add_custom_command(OUTPUT ${GENERATED_HEADER} "${GENERATED_LIB}" - DEPENDS ${GENERATOR_TARGET} - COMMAND ${CMAKE_COMMAND} -E env "ASAN_OPTIONS=detect_leaks=0" $ - VERBATIM) - add_custom_target(${GENERATED_TARGET} - DEPENDS ${GENERATED_HEADER} "${GENERATED_LIB}") - add_dependencies(${GENERATED_TARGET} ${GENERATOR_TARGET}) - - set(LIB_PATH ${CMAKE_CURRENT_BINARY_DIR}/${GENERATED_LIB}) - message(STATUS "Will generate AOT Halide Pipeline ${fl_add_and_link_halide_lib_NAME}") - - # TODO: use an IMPORTED target? Might be cleaner - # add_library(${fl_add_and_link_halide_lib_NAME} STATIC IMPORTED) - # set_target_properties(${fl_add_and_link_halide_lib_NAME} PROPERTIES - # INTERFACE_INCLUDE_DIRECTORIES ${CMAKE_CURRENT_BINARY_DIR} - # IMPORTED_LOCATION "${GENERATED_LIB}" - # INTERFACE_LINK_LIBRARIES Halide::Halide) - # add_dependencies(${fl_add_and_link_halide_lib_NAME} ${GENERATED_TARGET}) - - # Link the generated Halide lib to the target - add_dependencies(${fl_add_and_link_halide_lib_LINK_TO} ${GENERATED_TARGET}) - # Ensure we can find generated headers - target_include_directories( - ${fl_add_and_link_halide_lib_LINK_TO} PUBLIC - $) - # For now, this linkeage is private, which means the Flashlight core needs - # to wrap Halide pipelines when exposing them to external binaries. - # Properly installing the Halide lib will facilitate public linkeage. - target_link_libraries(${fl_add_and_link_halide_lib_LINK_TO} PRIVATE ${LIB_PATH}) -endfunction(fl_add_and_link_halide_lib) - -# Adds a Halide library that is linked with Flashlight. -# -# If used from an included CMake list, we won't run into -# cmake_policy(SET CMP0079 NEW) issues. Halide pipelines -# compiled for tests should use fl_add_and_link_halide_lib -# instead since those are called via calls to `add_subdirectories`. -# CMake 3.13 resolves this. -function(fl_add_halide_lib) - set(options) - set(oneValueArgs SRC NAME) - set(multiValueArgs LIBS PREPROC) - cmake_parse_arguments(fl_add_halide_lib "${options}" "${oneValueArgs}" - "${multiValueArgs}" ${ARGN}) - - fl_add_and_link_halide_lib( - SRC ${fl_add_halide_lib_SRC} - NAME ${fl_add_halide_lib_NAME} - LIBS ${fl_add_halide_lib_LIBS} - PREPROC ${fl_add_halide_lib_PREPROC} - LINK_TO flashlight - ) - - # TODO: An IMPORTED target could help with this - # cmake_policy(SET CMP0079 NEW) - # target_link_libraries(flashlight PUBLIC ...) - # add_dependencies(flashlight ${GENERATED_TARGET}) - # Generated Halide libs get installed too - # set(INSTALLABLE_TARGETS ${INSTALLABLE_TARGETS} ${LIB_PATH} PARENT_SCOPE) -endfunction(fl_add_halide_lib) diff --git a/cmake/BuildBackwardCpp.cmake b/cmake/find/BuildBackwardCpp.cmake similarity index 100% rename from cmake/BuildBackwardCpp.cmake rename to cmake/find/BuildBackwardCpp.cmake diff --git a/cmake/BuildCereal.cmake b/cmake/find/BuildCereal.cmake similarity index 100% rename from cmake/BuildCereal.cmake rename to cmake/find/BuildCereal.cmake diff --git a/cmake/BuildFlashlightSequence.cmake b/cmake/find/BuildFlashlightSequence.cmake similarity index 100% rename from cmake/BuildFlashlightSequence.cmake rename to cmake/find/BuildFlashlightSequence.cmake diff --git a/cmake/BuildFlashlightText.cmake b/cmake/find/BuildFlashlightText.cmake similarity index 100% rename from cmake/BuildFlashlightText.cmake rename to cmake/find/BuildFlashlightText.cmake diff --git a/cmake/BuildGloo.cmake b/cmake/find/BuildGloo.cmake similarity index 100% rename from cmake/BuildGloo.cmake rename to cmake/find/BuildGloo.cmake diff --git a/cmake/BuildGoogleTest.cmake b/cmake/find/BuildGoogleTest.cmake similarity index 100% rename from cmake/BuildGoogleTest.cmake rename to cmake/find/BuildGoogleTest.cmake diff --git a/cmake/BuildSndFile.cmake b/cmake/find/BuildSndFile.cmake similarity index 100% rename from cmake/BuildSndFile.cmake rename to cmake/find/BuildSndFile.cmake diff --git a/cmake/BuildStb.cmake b/cmake/find/BuildStb.cmake similarity index 100% rename from cmake/BuildStb.cmake rename to cmake/find/BuildStb.cmake diff --git a/cmake/Buildsox.cmake b/cmake/find/Buildsox.cmake similarity index 100% rename from cmake/Buildsox.cmake rename to cmake/find/Buildsox.cmake diff --git a/cmake/FindCBLAS.cmake b/cmake/find/FindCBLAS.cmake similarity index 100% rename from cmake/FindCBLAS.cmake rename to cmake/find/FindCBLAS.cmake diff --git a/cmake/FindCUDNN.cmake b/cmake/find/FindCUDNN.cmake similarity index 100% rename from cmake/FindCUDNN.cmake rename to cmake/find/FindCUDNN.cmake diff --git a/cmake/FindFFTW3.cmake b/cmake/find/FindFFTW3.cmake similarity index 100% rename from cmake/FindFFTW3.cmake rename to cmake/find/FindFFTW3.cmake diff --git a/cmake/FindFLAC.cmake b/cmake/find/FindFLAC.cmake similarity index 100% rename from cmake/FindFLAC.cmake rename to cmake/find/FindFLAC.cmake diff --git a/cmake/FindFilesystem.cmake b/cmake/find/FindFilesystem.cmake similarity index 100% rename from cmake/FindFilesystem.cmake rename to cmake/find/FindFilesystem.cmake diff --git a/cmake/FindGLOG.cmake b/cmake/find/FindGLOG.cmake similarity index 100% rename from cmake/FindGLOG.cmake rename to cmake/find/FindGLOG.cmake diff --git a/cmake/FindGMock.cmake b/cmake/find/FindGMock.cmake similarity index 100% rename from cmake/FindGMock.cmake rename to cmake/find/FindGMock.cmake diff --git a/cmake/FindMKL.cmake b/cmake/find/FindMKL.cmake similarity index 100% rename from cmake/FindMKL.cmake rename to cmake/find/FindMKL.cmake diff --git a/cmake/FindNCCL.cmake b/cmake/find/FindNCCL.cmake similarity index 100% rename from cmake/FindNCCL.cmake rename to cmake/find/FindNCCL.cmake diff --git a/cmake/FindOgg.cmake b/cmake/find/FindOgg.cmake similarity index 100% rename from cmake/FindOgg.cmake rename to cmake/find/FindOgg.cmake diff --git a/cmake/FindSndFile.cmake b/cmake/find/FindSndFile.cmake similarity index 100% rename from cmake/FindSndFile.cmake rename to cmake/find/FindSndFile.cmake diff --git a/cmake/FindVorbis.cmake b/cmake/find/FindVorbis.cmake similarity index 100% rename from cmake/FindVorbis.cmake rename to cmake/find/FindVorbis.cmake diff --git a/cmake/Findgflags.cmake b/cmake/find/Findgflags.cmake similarity index 100% rename from cmake/Findgflags.cmake rename to cmake/find/Findgflags.cmake diff --git a/cmake/Findsox.cmake b/cmake/find/Findsox.cmake similarity index 100% rename from cmake/Findsox.cmake rename to cmake/find/Findsox.cmake diff --git a/cmake/InternalUtils.cmake b/cmake/utils/InternalUtils.cmake similarity index 95% rename from cmake/InternalUtils.cmake rename to cmake/utils/InternalUtils.cmake index 9f0075f..374f08d 100644 --- a/cmake/InternalUtils.cmake +++ b/cmake/utils/InternalUtils.cmake @@ -9,7 +9,8 @@ function(add_coverage_to_target) -O0 # TODO: reconcile this with CMake modes for something cleaner -g $<$:--coverage> - ) + ) + if(CMAKE_VERSION VERSION_GREATER_EQUAL 3.13) target_link_options(${add_coverage_to_target_TARGET} PUBLIC @@ -28,7 +29,8 @@ function(setup_install_targets) "${multiValueArgs}" ${ARGN}) list(LENGTH setup_install_targets_INSTALL_TARGETS TARGETS_LENGTH) - if (${TARGETS_LENGTH} EQUAL 0) + + if(${TARGETS_LENGTH} EQUAL 0) message(FATAL_ERROR "Flashlight setup_install_targets called with " "empty targets list.") endif() @@ -44,7 +46,7 @@ function(setup_install_targets) ARCHIVE DESTINATION ${FL_INSTALL_LIB_DIR} FRAMEWORK DESTINATION framework INCLUDES DESTINATION ${FL_INSTALL_INC_DIR} - ) + ) # Write and install targets file install( @@ -52,44 +54,43 @@ function(setup_install_targets) NAMESPACE flashlight:: DESTINATION ${FL_INSTALL_CMAKE_DIR} COMPONENT flashlight - ) + ) # Write config file (used by projects including fl, such as examples) include(CMakePackageConfigHelpers) set(INCLUDE_DIRS include) set(CMAKE_DIR ${FL_INSTALL_CMAKE_DIR}) configure_package_config_file( - ${PROJECT_SOURCE_DIR}/cmake/flashlightConfig.cmake.in + ${PROJECT_SOURCE_DIR}/cmake/utils/flashlightConfig.cmake.in cmake/install/${FL_CONFIG_CMAKE_BUILD_DIR}/flashlightConfig.cmake INSTALL_DESTINATION ${FL_INSTALL_CMAKE_DIR} PATH_VARS INCLUDE_DIRS CMAKE_DIR - ) + ) write_basic_package_version_file( cmake/install/${FL_CONFIG_CMAKE_BUILD_DIR}/flashlightConfigVersion.cmake COMPATIBILITY SameMajorVersion - ) + ) install(FILES ${PROJECT_BINARY_DIR}/cmake/install/flashlightConfig.cmake ${PROJECT_BINARY_DIR}/cmake/install/flashlightConfigVersion.cmake DESTINATION ${FL_INSTALL_CMAKE_DIR} COMPONENT flashlight - ) + ) set_target_properties(${setup_install_targets_INSTALL_TARGETS} PROPERTIES VERSION "${flashlight_VERSION}" SOVERSION "${flashlight_VERSION_MAJOR}") endfunction(setup_install_targets) function(setup_install_headers HEADER_DIR DEST_DIR) - # Move headers install( DIRECTORY ${HEADER_DIR} COMPONENT headers DESTINATION ${DEST_DIR} FILES_MATCHING # preserve directory structure - PATTERN "*.h" - PATTERN "*.hpp" + PATTERN "*.h" + PATTERN "*.hpp" PATTERN "*.cuh" # TODO: make this conditional, e.g. $ PATTERN "test*" EXCLUDE PATTERN "tests" EXCLUDE @@ -103,17 +104,17 @@ function(setup_install_headers HEADER_DIR DEST_DIR) PATTERN "experimental" EXCLUDE PATTERN "plugincompiler" EXCLUDE PATTERN ".git" EXCLUDE - ) + ) endfunction(setup_install_headers) function(setup_install_find_module CONFIG_PATH) # Only actually move module files if doing a standalone install; otherwise, # assume we're being installed by a package manager - if (FL_BUILD_STANDALONE) + if(FL_BUILD_STANDALONE) install( FILES ${CONFIG_PATH} DESTINATION ${FL_INSTALL_CMAKE_DIR} - ) + ) endif() endfunction() @@ -121,7 +122,7 @@ function(set_executable_output_directory EXEC_TARGET DIRECTORY) set_target_properties(${EXEC_TARGET} PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${DIRECTORY} - ) + ) endfunction() # Small utility function which wraps cmake_dependent options and throws an error if the user @@ -143,5 +144,6 @@ function(fl_dependent_option) message(FATAL_ERROR "${_dep} Required to build ${_option}") endif() endforeach() + cmake_dependent_option(${_option} ${_text} "${_val}" "${_deps}" ${_frce}) endfunction() diff --git a/cmake/TestUtils.cmake b/cmake/utils/TestUtils.cmake similarity index 100% rename from cmake/TestUtils.cmake rename to cmake/utils/TestUtils.cmake diff --git a/cmake/flashlightConfig.cmake.in b/cmake/utils/flashlightConfig.cmake.in similarity index 100% rename from cmake/flashlightConfig.cmake.in rename to cmake/utils/flashlightConfig.cmake.in diff --git a/cmake/utils/toolchain.cmake b/cmake/utils/toolchain.cmake index 57071c2..9d75007 100644 --- a/cmake/utils/toolchain.cmake +++ b/cmake/utils/toolchain.cmake @@ -8,12 +8,16 @@ message(STATUS "---- fm_cmake toolchain ----") #append cmake dir to module path set(FM_CMAKE_LIBRARY_DIR ${CMAKE_CURRENT_LIST_DIR}) -list(APPEND CMAKE_MODULE_PATH "${FM_CMAKE_LIBRARY_DIR}") -message(STATUS "appended cmake/utils/ to cmake module path") +set(FM_CMAKE_UTILITY_DIR "${FM_CMAKE_LIBRARY_DIR}") +list(APPEND CMAKE_MODULE_PATH "${FM_CMAKE_UTILITY_DIR}") +message(VERBOSE "appended utility dir to cmake module path (${FM_CMAKE_UTILITY_DIR})") list(APPEND CMAKE_MODULE_PATH "${FM_CMAKE_LIBRARY_DIR}/../") -message(STATUS "appended (${FM_CMAKE_LIBRARY_DIR}/../) cmake/ to cmake module path") +message(VERBOSE "appended (${FM_CMAKE_LIBRARY_DIR}/../) cmake/ to cmake module path") +set(FM_CMAKE_FIND_SCRIPT_DIR "${FM_CMAKE_LIBRARY_DIR}/../find/") +list(APPEND CMAKE_MODULE_PATH "${FM_CMAKE_FIND_SCRIPT_DIR}") +message(VERBOSE "appended find scripts to module path (${FM_CMAKE_FIND_SCRIPT_DIR})") include(fm_assertions) diff --git a/vcpkg.json b/vcpkg.json index 7dcf289..0463957 100644 --- a/vcpkg.json +++ b/vcpkg.json @@ -6,14 +6,39 @@ "gtest" ], "features": { - "cuda": { - "description": "Dependencies for gpu backend", + "distributed": { + "description": "Dependencies for distributed backend", + "dependencies": [ + "mpi" + ] + }, + "runtime": { + "description": "flashmini runtime helpers", + "dependencies": [ + "glog", + "gflags" + ] + }, + "vision": { + "description": "vision pkg", + "dependencies": [ + "opencv", + "stb" + ] + }, + "text": { + "description": "text pkg", "dependencies": [] }, - "cpu": { - "description": "Dependencies for cpu backend", + "speech": { + "description": "speech pkg", "dependencies": [ - "onednn" + "libsndfile", + "libogg", + "libvorbis", + "libflac", + "fftw3", + "openblas" ] } } From 7898f9a4caf5ec0daee67a4e44e5da77c56c027d Mon Sep 17 00:00:00 2001 From: Lukas Thomann Date: Wed, 25 Feb 2026 22:01:32 +0100 Subject: [PATCH 2/4] renamed "find" to "dependencies" --- cmake/{find => dependencies}/BuildBackwardCpp.cmake | 0 cmake/{find => dependencies}/BuildCereal.cmake | 0 .../BuildFlashlightSequence.cmake | 0 .../{find => dependencies}/BuildFlashlightText.cmake | 0 cmake/{find => dependencies}/BuildGloo.cmake | 0 cmake/{find => dependencies}/BuildGoogleTest.cmake | 0 cmake/{find => dependencies}/BuildSndFile.cmake | 0 cmake/{find => dependencies}/BuildStb.cmake | 0 cmake/{find => dependencies}/Buildsox.cmake | 0 cmake/{find => dependencies}/FindCBLAS.cmake | 0 cmake/{find => dependencies}/FindCUDNN.cmake | 0 cmake/{find => dependencies}/FindFFTW3.cmake | 0 cmake/{find => dependencies}/FindFLAC.cmake | 0 cmake/{find => dependencies}/FindFilesystem.cmake | 0 cmake/{find => dependencies}/FindGLOG.cmake | 0 cmake/{find => dependencies}/FindGMock.cmake | 0 cmake/{find => dependencies}/FindMKL.cmake | 0 cmake/{find => dependencies}/FindNCCL.cmake | 0 cmake/{find => dependencies}/FindOgg.cmake | 0 cmake/{find => dependencies}/FindSndFile.cmake | 0 cmake/{find => dependencies}/FindVorbis.cmake | 0 cmake/{find => dependencies}/Findgflags.cmake | 0 cmake/{find => dependencies}/Findsox.cmake | 0 cmake/utils/toolchain.cmake | 2 +- flashlight/fl/autograd/tensor/backend/cudnn/RNN.cpp | 12 ++++-------- 25 files changed, 5 insertions(+), 9 deletions(-) rename cmake/{find => dependencies}/BuildBackwardCpp.cmake (100%) rename cmake/{find => dependencies}/BuildCereal.cmake (100%) rename cmake/{find => dependencies}/BuildFlashlightSequence.cmake (100%) rename cmake/{find => dependencies}/BuildFlashlightText.cmake (100%) rename cmake/{find => dependencies}/BuildGloo.cmake (100%) rename cmake/{find => dependencies}/BuildGoogleTest.cmake (100%) rename cmake/{find => dependencies}/BuildSndFile.cmake (100%) rename cmake/{find => dependencies}/BuildStb.cmake (100%) rename cmake/{find => dependencies}/Buildsox.cmake (100%) rename cmake/{find => dependencies}/FindCBLAS.cmake (100%) rename cmake/{find => dependencies}/FindCUDNN.cmake (100%) rename cmake/{find => dependencies}/FindFFTW3.cmake (100%) rename cmake/{find => dependencies}/FindFLAC.cmake (100%) rename cmake/{find => dependencies}/FindFilesystem.cmake (100%) rename cmake/{find => dependencies}/FindGLOG.cmake (100%) rename cmake/{find => dependencies}/FindGMock.cmake (100%) rename cmake/{find => dependencies}/FindMKL.cmake (100%) rename cmake/{find => dependencies}/FindNCCL.cmake (100%) rename cmake/{find => dependencies}/FindOgg.cmake (100%) rename cmake/{find => dependencies}/FindSndFile.cmake (100%) rename cmake/{find => dependencies}/FindVorbis.cmake (100%) rename cmake/{find => dependencies}/Findgflags.cmake (100%) rename cmake/{find => dependencies}/Findsox.cmake (100%) diff --git a/cmake/find/BuildBackwardCpp.cmake b/cmake/dependencies/BuildBackwardCpp.cmake similarity index 100% rename from cmake/find/BuildBackwardCpp.cmake rename to cmake/dependencies/BuildBackwardCpp.cmake diff --git a/cmake/find/BuildCereal.cmake b/cmake/dependencies/BuildCereal.cmake similarity index 100% rename from cmake/find/BuildCereal.cmake rename to cmake/dependencies/BuildCereal.cmake diff --git a/cmake/find/BuildFlashlightSequence.cmake b/cmake/dependencies/BuildFlashlightSequence.cmake similarity index 100% rename from cmake/find/BuildFlashlightSequence.cmake rename to cmake/dependencies/BuildFlashlightSequence.cmake diff --git a/cmake/find/BuildFlashlightText.cmake b/cmake/dependencies/BuildFlashlightText.cmake similarity index 100% rename from cmake/find/BuildFlashlightText.cmake rename to cmake/dependencies/BuildFlashlightText.cmake diff --git a/cmake/find/BuildGloo.cmake b/cmake/dependencies/BuildGloo.cmake similarity index 100% rename from cmake/find/BuildGloo.cmake rename to cmake/dependencies/BuildGloo.cmake diff --git a/cmake/find/BuildGoogleTest.cmake b/cmake/dependencies/BuildGoogleTest.cmake similarity index 100% rename from cmake/find/BuildGoogleTest.cmake rename to cmake/dependencies/BuildGoogleTest.cmake diff --git a/cmake/find/BuildSndFile.cmake b/cmake/dependencies/BuildSndFile.cmake similarity index 100% rename from cmake/find/BuildSndFile.cmake rename to cmake/dependencies/BuildSndFile.cmake diff --git a/cmake/find/BuildStb.cmake b/cmake/dependencies/BuildStb.cmake similarity index 100% rename from cmake/find/BuildStb.cmake rename to cmake/dependencies/BuildStb.cmake diff --git a/cmake/find/Buildsox.cmake b/cmake/dependencies/Buildsox.cmake similarity index 100% rename from cmake/find/Buildsox.cmake rename to cmake/dependencies/Buildsox.cmake diff --git a/cmake/find/FindCBLAS.cmake b/cmake/dependencies/FindCBLAS.cmake similarity index 100% rename from cmake/find/FindCBLAS.cmake rename to cmake/dependencies/FindCBLAS.cmake diff --git a/cmake/find/FindCUDNN.cmake b/cmake/dependencies/FindCUDNN.cmake similarity index 100% rename from cmake/find/FindCUDNN.cmake rename to cmake/dependencies/FindCUDNN.cmake diff --git a/cmake/find/FindFFTW3.cmake b/cmake/dependencies/FindFFTW3.cmake similarity index 100% rename from cmake/find/FindFFTW3.cmake rename to cmake/dependencies/FindFFTW3.cmake diff --git a/cmake/find/FindFLAC.cmake b/cmake/dependencies/FindFLAC.cmake similarity index 100% rename from cmake/find/FindFLAC.cmake rename to cmake/dependencies/FindFLAC.cmake diff --git a/cmake/find/FindFilesystem.cmake b/cmake/dependencies/FindFilesystem.cmake similarity index 100% rename from cmake/find/FindFilesystem.cmake rename to cmake/dependencies/FindFilesystem.cmake diff --git a/cmake/find/FindGLOG.cmake b/cmake/dependencies/FindGLOG.cmake similarity index 100% rename from cmake/find/FindGLOG.cmake rename to cmake/dependencies/FindGLOG.cmake diff --git a/cmake/find/FindGMock.cmake b/cmake/dependencies/FindGMock.cmake similarity index 100% rename from cmake/find/FindGMock.cmake rename to cmake/dependencies/FindGMock.cmake diff --git a/cmake/find/FindMKL.cmake b/cmake/dependencies/FindMKL.cmake similarity index 100% rename from cmake/find/FindMKL.cmake rename to cmake/dependencies/FindMKL.cmake diff --git a/cmake/find/FindNCCL.cmake b/cmake/dependencies/FindNCCL.cmake similarity index 100% rename from cmake/find/FindNCCL.cmake rename to cmake/dependencies/FindNCCL.cmake diff --git a/cmake/find/FindOgg.cmake b/cmake/dependencies/FindOgg.cmake similarity index 100% rename from cmake/find/FindOgg.cmake rename to cmake/dependencies/FindOgg.cmake diff --git a/cmake/find/FindSndFile.cmake b/cmake/dependencies/FindSndFile.cmake similarity index 100% rename from cmake/find/FindSndFile.cmake rename to cmake/dependencies/FindSndFile.cmake diff --git a/cmake/find/FindVorbis.cmake b/cmake/dependencies/FindVorbis.cmake similarity index 100% rename from cmake/find/FindVorbis.cmake rename to cmake/dependencies/FindVorbis.cmake diff --git a/cmake/find/Findgflags.cmake b/cmake/dependencies/Findgflags.cmake similarity index 100% rename from cmake/find/Findgflags.cmake rename to cmake/dependencies/Findgflags.cmake diff --git a/cmake/find/Findsox.cmake b/cmake/dependencies/Findsox.cmake similarity index 100% rename from cmake/find/Findsox.cmake rename to cmake/dependencies/Findsox.cmake diff --git a/cmake/utils/toolchain.cmake b/cmake/utils/toolchain.cmake index 9d75007..47d0dd8 100644 --- a/cmake/utils/toolchain.cmake +++ b/cmake/utils/toolchain.cmake @@ -15,7 +15,7 @@ message(VERBOSE "appended utility dir to cmake module path (${FM_CMAKE_UTILITY_D list(APPEND CMAKE_MODULE_PATH "${FM_CMAKE_LIBRARY_DIR}/../") message(VERBOSE "appended (${FM_CMAKE_LIBRARY_DIR}/../) cmake/ to cmake module path") -set(FM_CMAKE_FIND_SCRIPT_DIR "${FM_CMAKE_LIBRARY_DIR}/../find/") +set(FM_CMAKE_FIND_SCRIPT_DIR "${FM_CMAKE_LIBRARY_DIR}/../dependencies/") list(APPEND CMAKE_MODULE_PATH "${FM_CMAKE_FIND_SCRIPT_DIR}") message(VERBOSE "appended find scripts to module path (${FM_CMAKE_FIND_SCRIPT_DIR})") diff --git a/flashlight/fl/autograd/tensor/backend/cudnn/RNN.cpp b/flashlight/fl/autograd/tensor/backend/cudnn/RNN.cpp index ec209fd..17b242b 100644 --- a/flashlight/fl/autograd/tensor/backend/cudnn/RNN.cpp +++ b/flashlight/fl/autograd/tensor/backend/cudnn/RNN.cpp @@ -167,15 +167,12 @@ std::tuple CudnnAutogradExtension::rnn( TensorDescriptor cyDesc(x.type(), hDims); - size_t workspaceSize = - getWorkspaceSize(handle, rnnDesc, seqLength, xDescs); - size_t reserveSize = - getReserveSize(handle, rnnDesc, seqLength, xDescs); + size_t workspaceSize = getWorkspaceSize(handle, rnnDesc, seqLength, xDescs); + size_t reserveSize = getReserveSize(handle, rnnDesc, seqLength, xDescs); Tensor workspace({static_cast(workspaceSize)}, fl::dtype::b8); // Space must be reused between forward and backward for cuDNN - payload->reserveSpace = - Tensor({static_cast(reserveSize)}, fl::dtype::b8); + payload->reserveSpace = Tensor({static_cast(reserveSize)}, fl::dtype::b8); { auto contiguousX = x.asContiguousTensor(); @@ -265,8 +262,7 @@ std::tuple CudnnAutogradExtension::rnnBackward( int outSize = hiddenSize * (bidirectional ? 2 : 1); DropoutDescriptor dropout(dropProb); - RNNDescriptor rnnDesc( - input.type(), hiddenSize, numLayers, mode, bidirectional, dropout); + RNNDescriptor rnnDesc(input.type(), hiddenSize, numLayers, mode, bidirectional, dropout); setCudnnRnnMathType(input, rnnDesc); TensorDescriptorArray yDesc(seqLength, y.type(), {1, 1, outSize, batchSize}); From 2c798275d277ed27ac2a3d9f9186d822f1817992 Mon Sep 17 00:00:00 2001 From: Lukas Thomann Date: Fri, 27 Feb 2026 20:08:36 +0100 Subject: [PATCH 3/4] accidentally deleted cpu and cuda features --- vcpkg.json | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/vcpkg.json b/vcpkg.json index 0463957..06083f2 100644 --- a/vcpkg.json +++ b/vcpkg.json @@ -6,6 +6,16 @@ "gtest" ], "features": { + "cuda": { + "description": "Dependencies for gpu backend", + "dependencies": [] + }, + "cpu": { + "description": "Dependencies for cpu backend", + "dependencies": [ + "onednn" + ] + }, "distributed": { "description": "Dependencies for distributed backend", "dependencies": [ @@ -42,4 +52,4 @@ ] } } -} +} \ No newline at end of file From 21a75af4c97e633ba3eb15c6cda6ec5dd3dbc5f8 Mon Sep 17 00:00:00 2001 From: Lukas Thomann Date: Fri, 27 Feb 2026 22:39:40 +0100 Subject: [PATCH 4/4] began to add first stuff --- Folder.DotSettings | 2 + cmake/dependencies/FindCUDNN.cmake | 4 + cmake/utils/flashlightConfig.cmake.in | 2 +- .../tensor/backend/cudnn/BatchNorm.cpp | 32 +- .../autograd/tensor/backend/cudnn/Conv2D.cpp | 74 ++--- .../backend/cudnn/CudnnAutogradExtension.cpp | 12 +- .../tensor/backend/cudnn/CudnnUtils.cpp | 288 +++++++++--------- .../tensor/backend/cudnn/CudnnUtils.h | 67 +++- .../autograd/tensor/backend/cudnn/Pool2D.cpp | 18 +- .../fl/autograd/tensor/backend/cudnn/RNN.cpp | 266 ++++++++-------- 10 files changed, 401 insertions(+), 364 deletions(-) create mode 100644 Folder.DotSettings diff --git a/Folder.DotSettings b/Folder.DotSettings new file mode 100644 index 0000000..069ccf5 --- /dev/null +++ b/Folder.DotSettings @@ -0,0 +1,2 @@ + + <NamingElement Priority="6" Title="Parameters"><Descriptor Static="Indeterminate" Constexpr="Indeterminate" Const="Indeterminate" Volatile="Indeterminate" Accessibility="NOT_APPLICABLE"><type Name="function parameter" /><type Name="lambda parameter" /></Descriptor><Policy Inspect="True" WarnAboutPrefixesAndSuffixes="False" Prefix="" Suffix="" Style="aaBb"><ExtraRule Prefix="_" Suffix="" Style="aaBb" /></Policy></NamingElement> \ No newline at end of file diff --git a/cmake/dependencies/FindCUDNN.cmake b/cmake/dependencies/FindCUDNN.cmake index a150310..fd77eea 100644 --- a/cmake/dependencies/FindCUDNN.cmake +++ b/cmake/dependencies/FindCUDNN.cmake @@ -76,4 +76,8 @@ if(CUDNN_FOUND) endif() endif() +if (CUDNN_FOUND AND CUDNN_VERSION VERSION_LESS "8.0") + message(FATAL_ERROR "Flashlight requires cuDNN >= 8.0, found ${CUDNN_VERSION}") +endif() + mark_as_advanced(CUDNN_ROOT CUDNN_INCLUDE_DIR CUDNN_LIBRARY CUDNN_VERSION) diff --git a/cmake/utils/flashlightConfig.cmake.in b/cmake/utils/flashlightConfig.cmake.in index 2bf2550..9d73423 100644 --- a/cmake/utils/flashlightConfig.cmake.in +++ b/cmake/utils/flashlightConfig.cmake.in @@ -49,7 +49,7 @@ if (@FL_BUILD_STANDALONE@) endif() if (@FL_USE_CUDA@) if (@FL_USE_CUDNN@) - find_dependency(CUDNN 7.1) + find_dependency(CUDNN 8) endif() if (@FL_BUILD_DISTRIBUTED@) find_dependency(NCCL) diff --git a/flashlight/fl/autograd/tensor/backend/cudnn/BatchNorm.cpp b/flashlight/fl/autograd/tensor/backend/cudnn/BatchNorm.cpp index 9f6b315..2538f88 100644 --- a/flashlight/fl/autograd/tensor/backend/cudnn/BatchNorm.cpp +++ b/flashlight/fl/autograd/tensor/backend/cudnn/BatchNorm.cpp @@ -48,15 +48,15 @@ namespace { if(minAxis == 0) { modeOut = CUDNN_BATCHNORM_PER_ACTIVATION; - inDescDimsOut = Shape( + inDescDimsOut = Shape{ { 1, 1, nfeatures, static_cast(input.elements() / nfeatures) } - ); - wtDescDimsOut = Shape({1, 1, nfeatures}); + }; + wtDescDimsOut = Shape{1, 1, nfeatures}; } else { modeOut = CUDNN_BATCHNORM_SPATIAL; #if CUDNN_VERSION >= 7003 @@ -67,15 +67,15 @@ namespace { int batchsz = 1; for(int i = maxAxis + 1; i < input.ndim(); ++i) batchsz *= input.dim(i); - inDescDimsOut = Shape( + inDescDimsOut = Shape{ { 1, static_cast(input.elements() / (nfeatures * batchsz)), nfeatures, batchsz, } - ); - wtDescDimsOut = Shape({1, 1, nfeatures}); + }; + wtDescDimsOut = Shape{1, 1, nfeatures}; } } @@ -101,7 +101,7 @@ Tensor CudnnAutogradExtension::batchnorm( ); FL_TENSOR_DTYPES_MATCH_CHECK(weight, bias, runningMean, runningVar); - auto output = Tensor(input.shape(), input.type()); + auto output = Tensor{input.shape(), input.type()}; cudnnBatchNormMode_t mode; Shape inDescDims, wtDescDims; @@ -122,8 +122,8 @@ Tensor CudnnAutogradExtension::batchnorm( fl::dtype scalarsType = input.type() == fl::dtype::f16 ? fl::dtype::f32 : input.type(); - auto inDesc = TensorDescriptor(input.type(), inDescDims); - auto wtDesc = TensorDescriptor(weightArray.type(), wtDescDims); + auto inDesc = TensorDescriptor{input.type(), inDescDims}; + auto wtDesc = TensorDescriptor{weightArray.type(), wtDescDims}; { DevicePtr inRaw(input); @@ -140,8 +140,8 @@ Tensor CudnnAutogradExtension::batchnorm( ); if(train) { - saveMean = Tensor({wtDescDims[2]}, scalarsType); - saveVar = Tensor({wtDescDims[2]}, scalarsType); + saveMean = Tensor{{wtDescDims[2]}, scalarsType}; + saveVar = Tensor{{wtDescDims[2]}, scalarsType}; DevicePtr saveMeanRaw(saveMean); DevicePtr saveVarRaw(saveVar); @@ -223,13 +223,13 @@ std::tuple CudnnAutogradExtension::batchnormBackward( const void* one1 = kOne(scalarsType); const void* zero0 = kZero(scalarsType); - auto iDesc = TensorDescriptor(input.type(), inDescDims); - auto wDesc = TensorDescriptor(wt.type(), wtDescDims); + auto iDesc = TensorDescriptor{input.type(), inDescDims}; + auto wDesc = TensorDescriptor{wt.type(), wtDescDims}; // CuDNN doesn't support calculating only the gradients // required for batchnorm - auto gradIn = Tensor(input.shape(), input.type()); - auto gradWt = Tensor(wt.shape(), wt.type()); - auto gradBs = Tensor(wt.shape(), wt.type()); + auto gradIn = Tensor{input.shape(), input.type()}; + auto gradWt = Tensor{wt.shape(), wt.type()}; + auto gradBs = Tensor{wt.shape(), wt.type()}; { DevicePtr iRaw(input); DevicePtr wRaw(wt); diff --git a/flashlight/fl/autograd/tensor/backend/cudnn/Conv2D.cpp b/flashlight/fl/autograd/tensor/backend/cudnn/Conv2D.cpp index bb89e61..b9f63c6 100644 --- a/flashlight/fl/autograd/tensor/backend/cudnn/Conv2D.cpp +++ b/flashlight/fl/autograd/tensor/backend/cudnn/Conv2D.cpp @@ -314,9 +314,9 @@ Tensor CudnnAutogradExtension::conv2d( auto hasBias = bias.elements() > 0; - auto inDesc = TensorDescriptor(input); - auto wtDesc = FilterDescriptor(weights); - auto convDesc = ConvDescriptor(input.type(), px, py, sx, sy, dx, dy, groups); + auto inDesc = TensorDescriptor{input}; + auto wtDesc = FilterDescriptor{weights}; + auto convDesc = ConvDescriptor{input.type(), px, py, sx, sy, dx, dy, groups}; if(input.type() == fl::dtype::f16) CUDNN_CHECK_ERR( cudnnSetConvolutionMathType( @@ -339,8 +339,8 @@ Tensor CudnnAutogradExtension::conv2d( odims.data() ) ); - auto output = Tensor({odims[3], odims[2], odims[1], odims[0]}, input.type()); - auto outDesc = TensorDescriptor(output); + auto output = Tensor{{odims[3], odims[2], odims[1], odims[0]}, input.type()}; + auto outDesc = TensorDescriptor{output}; auto handle = getCudnnHandle(); const auto& cudnnStream = getCudnnStream(); @@ -357,7 +357,7 @@ Tensor CudnnAutogradExtension::conv2d( try { wspace = - Tensor({static_cast(fwdAlgoBestPerf.memory)}, fl::dtype::b8); + Tensor{{static_cast(fwdAlgoBestPerf.memory)}, fl::dtype::b8}; } catch(const std::exception&) { fwdAlgoBestPerf.algo = kFwdDefaultAlgo; CUDNN_CHECK_ERR( @@ -372,7 +372,7 @@ Tensor CudnnAutogradExtension::conv2d( ) ); wspace = - Tensor({static_cast(fwdAlgoBestPerf.memory)}, fl::dtype::b8); + Tensor{{static_cast(fwdAlgoBestPerf.memory)}, fl::dtype::b8}; } { DevicePtr inPtr(input); @@ -405,7 +405,7 @@ Tensor CudnnAutogradExtension::conv2d( ); if(hasBias) { - auto bsDesc = TensorDescriptor(bias); + auto bsDesc = TensorDescriptor{bias}; DevicePtr bsPtr(bias); // ensure cudnn compute stream waits on stream of bias tensor relativeSync(cudnnStream, {bias}); @@ -453,10 +453,10 @@ Tensor CudnnAutogradExtension::conv2dBackwardData( // benchmarking suggests input or weight casting should occur, these // descriptors may not be used/new ones with the correct types will be // used instead. - auto iDesc = TensorDescriptor(input); - auto wDesc = FilterDescriptor(weight); - auto cDesc = ConvDescriptor(input.type(), px, py, sx, sy, dx, dy, groups); - auto oDesc = TensorDescriptor(gradOutput); + auto iDesc = TensorDescriptor{input}; + auto wDesc = FilterDescriptor{weight}; + auto cDesc = ConvDescriptor{input.type(), px, py, sx, sy, dx, dy, groups}; + auto oDesc = TensorDescriptor{gradOutput}; setDefaultMathType(cDesc, input); @@ -491,10 +491,10 @@ Tensor CudnnAutogradExtension::conv2dBackwardData( Tensor ws; try { - ws = Tensor( + ws = Tensor{ {static_cast(bwdDataAlgoBestPerf.memory)}, fl::dtype::b8 - ); + }; } catch(const std::exception&) { bwdDataAlgoBestPerf.algo = kBwdDataDefaultAlgo; CUDNN_CHECK_ERR( @@ -508,13 +508,13 @@ Tensor CudnnAutogradExtension::conv2dBackwardData( &bwdDataAlgoBestPerf.memory ) ); - ws = Tensor( + ws = Tensor{ {static_cast(bwdDataAlgoBestPerf.memory)}, fl::dtype::b8 - ); + }; } - auto gradInput = Tensor(inTensor.shape(), inTensor.type()); + auto gradInput = Tensor{inTensor.shape(), inTensor.type()}; { DevicePtr gradInputPtr(gradInput); DevicePtr gradResultPtr(gradOutputTensor); @@ -577,11 +577,11 @@ Tensor CudnnAutogradExtension::conv2dBackwardData( /* incrementCount = */ false ); - auto iDescF32 = TensorDescriptor(inTensorF32); - auto wDescF32 = FilterDescriptor(wtTensorF32); + auto iDescF32 = TensorDescriptor{inTensorF32}; + auto wDescF32 = FilterDescriptor{wtTensorF32}; auto cDescF32 = - ConvDescriptor(fl::dtype::f32, px, py, sx, sy, dx, dy, groups); - auto oDescF32 = TensorDescriptor(gradOutputTensorF32); + ConvDescriptor{fl::dtype::f32, px, py, sx, sy, dx, dy, groups}; + auto oDescF32 = TensorDescriptor{gradOutputTensorF32}; // core bwd data computation dataGradBenchmark->audit( [&dataGradOut, @@ -671,10 +671,10 @@ std::pair CudnnAutogradExtension::conv2dBackwardFilterBias( // benchmarking suggests input or weight casting should occur, these // descriptors may not be used/new ones with the correct types will be // used instead. - auto iDesc = TensorDescriptor(input); - auto wDesc = FilterDescriptor(weight); - auto cDesc = ConvDescriptor(input.type(), px, py, sx, sy, dx, dy, groups); - auto oDesc = TensorDescriptor(gradOutput); + auto iDesc = TensorDescriptor{input}; + auto wDesc = FilterDescriptor{weight}; + auto cDesc = ConvDescriptor{input.type(), px, py, sx, sy, dx, dy, groups}; + auto oDesc = TensorDescriptor{gradOutput}; setDefaultMathType(cDesc, input); @@ -708,10 +708,10 @@ std::pair CudnnAutogradExtension::conv2dBackwardFilterBias( Tensor ws; try { - ws = Tensor( + ws = Tensor{ {static_cast(bwdFilterAlgoBestPerf.memory)}, fl::dtype::b8 - ); + }; } catch(const std::exception&) { bwdFilterAlgoBestPerf.algo = kBwdFilterDefaultAlgo; CUDNN_CHECK_ERR( @@ -725,13 +725,13 @@ std::pair CudnnAutogradExtension::conv2dBackwardFilterBias( &bwdFilterAlgoBestPerf.memory ) ); - ws = Tensor( + ws = Tensor{ {static_cast(bwdFilterAlgoBestPerf.memory)}, fl::dtype::b8 - ); + }; } - auto gradWeight = Tensor(wtTensor.shape(), wtTensor.type()); + auto gradWeight = Tensor{wtTensor.shape(), wtTensor.type()}; { DevicePtr gradWeightPtr(gradWeight); DevicePtr gradResultPtr(gradOutputTensor); @@ -794,11 +794,11 @@ std::pair CudnnAutogradExtension::conv2dBackwardFilterBias( /* incrementCount = */ false ); - auto iDescF32 = TensorDescriptor(inTensorF32); - auto wDescF32 = FilterDescriptor(wtTensorF32); + auto iDescF32 = TensorDescriptor{inTensorF32}; + auto wDescF32 = FilterDescriptor{wtTensorF32}; auto cDescF32 = - ConvDescriptor(fl::dtype::f32, px, py, sx, sy, dx, dy, groups); - auto oDescF32 = TensorDescriptor(gradOutputTensorF32); + ConvDescriptor{fl::dtype::f32, px, py, sx, sy, dx, dy, groups}; + auto oDescF32 = TensorDescriptor{gradOutputTensorF32}; // core bwd data computation filterGradBenchmark->audit( [&filterGradOut, @@ -860,13 +860,13 @@ std::pair CudnnAutogradExtension::conv2dBackwardFilterBias( const Tensor& bsTensor, const Tensor& gradOutput, const TensorDescriptor& oDesc) -> Tensor { - auto gradBias = Tensor(bsTensor.shape(), bsTensor.type()); + auto gradBias = Tensor{bsTensor.shape(), bsTensor.type()}; { DevicePtr gradBiasPtr(gradBias); DevicePtr gradResultPtr(gradOutput); // ensure cudnn compute stream waits on gradient tensor streams relativeSync(cudnnStream, {gradOutput, gradBias}); - auto bDesc = TensorDescriptor(bsTensor); + auto bDesc = TensorDescriptor{bsTensor}; CUDNN_CHECK_ERR( cudnnConvolutionBackwardBias( hndl, @@ -911,7 +911,7 @@ std::pair CudnnAutogradExtension::conv2dBackwardFilterBias( }, /* incrementCount = */ false ); - auto oDescF32 = TensorDescriptor(gradOutputF32); + auto oDescF32 = TensorDescriptor{gradOutputF32}; // Perform bias gradient computation biasGradBenchmark->audit( [&biasGradOut, diff --git a/flashlight/fl/autograd/tensor/backend/cudnn/CudnnAutogradExtension.cpp b/flashlight/fl/autograd/tensor/backend/cudnn/CudnnAutogradExtension.cpp index 305a6cc..eaac140 100644 --- a/flashlight/fl/autograd/tensor/backend/cudnn/CudnnAutogradExtension.cpp +++ b/flashlight/fl/autograd/tensor/backend/cudnn/CudnnAutogradExtension.cpp @@ -16,13 +16,11 @@ namespace fl { std::shared_ptr CudnnAutogradExtension::createBenchmarkOptions() { return std::make_shared( std::make_shared>( - std::vector( - { - KernelMode::F32, - KernelMode::F32_ALLOW_CONVERSION, - KernelMode::F16 - } - ), + std::vector{ + KernelMode::F32, + KernelMode::F32_ALLOW_CONVERSION, + KernelMode::F16 + }, fl::kDynamicBenchmarkDefaultCount ) ); diff --git a/flashlight/fl/autograd/tensor/backend/cudnn/CudnnUtils.cpp b/flashlight/fl/autograd/tensor/backend/cudnn/CudnnUtils.cpp index 82cadcb..804bc6f 100644 --- a/flashlight/fl/autograd/tensor/backend/cudnn/CudnnUtils.cpp +++ b/flashlight/fl/autograd/tensor/backend/cudnn/CudnnUtils.cpp @@ -25,16 +25,16 @@ struct DeviceHandle { std::shared_ptr stream; explicit DeviceHandle(std::shared_ptr _stream) : cudnnHandle(nullptr), - stream(_stream) { + stream(_stream) { CUDNN_CHECK_ERR(cudnnCreate(&cudnnHandle)); CUDNN_CHECK_ERR(cudnnSetStream(cudnnHandle, stream->handle())); } ~DeviceHandle() { if(cudnnHandle) { -// See https://git.io/fNQnM - sometimes, at exit, the CUDA context -// (or something) is already destroyed by the time a handle gets destroyed -// because of an issue with the destruction order. + // See https://git.io/fNQnM - sometimes, at exit, the CUDA context + // (or something) is already destroyed by the time a handle gets destroyed + // because of an issue with the destruction order. #ifdef NO_CUDNN_DESTROY_HANDLE #else CUDNN_CHECK_ERR(cudnnDestroy(cudnnHandle)); @@ -43,16 +43,16 @@ struct DeviceHandle { } }; -const float kFloatZero = 0.0; -const float kFloatOne = 1.0; +constexpr float kFloatZero = 0.0; +constexpr float kFloatOne = 1.0; -const double kDoubleZero = 0.0; -const double kDoubleOne = 1.0; +constexpr double kDoubleZero = 0.0; +constexpr double kDoubleOne = 1.0; // TODO: move this to CudnnAutogradExtension if we make it a singleton std::unordered_map handles; -const DeviceHandle& getActiveDeviceHandle() { +DeviceHandle const& getActiveDeviceHandle() { auto& manager = fl::DeviceManager::getInstance(); auto& cudaDevice = manager.getActiveDevice(fl::DeviceType::CUDA).impl(); @@ -88,57 +88,42 @@ namespace fl { void cudnnCheckErr(cudnnStatus_t status) { if(status == CUDNN_STATUS_SUCCESS) return; - const char* err = cudnnGetErrorString(status); + char const* err = cudnnGetErrorString(status); switch(status) { - case CUDNN_STATUS_BAD_PARAM: - throw std::invalid_argument(err); - default: - throw std::runtime_error(err); + case CUDNN_STATUS_BAD_PARAM: throw std::invalid_argument(err); + default: throw std::runtime_error(err); } } -cudnnDataType_t cudnnMapToType(const fl::dtype& t) { +cudnnDataType_t cudnnMapToType(fl::dtype const& t) { switch(t) { - case fl::dtype::f16: - return CUDNN_DATA_HALF; - case fl::dtype::f32: - return CUDNN_DATA_FLOAT; - case fl::dtype::f64: - return CUDNN_DATA_DOUBLE; - default: - throw std::invalid_argument("unsupported data type for cuDNN"); + case fl::dtype::f16: return CUDNN_DATA_HALF; + case fl::dtype::f32: return CUDNN_DATA_FLOAT; + case fl::dtype::f64: return CUDNN_DATA_DOUBLE; + default: throw std::invalid_argument("unsupported data type for cuDNN"); } } -cudnnPoolingMode_t cudnnMapToPoolingMode(const PoolingMode mode) { +cudnnPoolingMode_t cudnnMapToPoolingMode(PoolingMode const mode) { switch(mode) { - case PoolingMode::MAX: - return CUDNN_POOLING_MAX; - case PoolingMode::AVG_INCLUDE_PADDING: - return CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING; - case PoolingMode::AVG_EXCLUDE_PADDING: - return CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING; - default: - throw std::invalid_argument("unsupported pooling mode for cuDNN"); + case PoolingMode::MAX: return CUDNN_POOLING_MAX; + case PoolingMode::AVG_INCLUDE_PADDING: return CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING; + case PoolingMode::AVG_EXCLUDE_PADDING: return CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING; + default: throw std::invalid_argument("unsupported pooling mode for cuDNN"); } } -cudnnRNNMode_t cudnnMapToRNNMode(const RnnMode mode) { +cudnnRNNMode_t cudnnMapToRNNMode(RnnMode const mode) { switch(mode) { - case RnnMode::RELU: - return CUDNN_RNN_RELU; - case RnnMode::TANH: - return CUDNN_RNN_TANH; - case RnnMode::LSTM: - return CUDNN_LSTM; - case RnnMode::GRU: - return CUDNN_GRU; - default: - throw std::invalid_argument("unsupported RNN mode for cuDNN"); + case RnnMode::RELU: return CUDNN_RNN_RELU; + case RnnMode::TANH: return CUDNN_RNN_TANH; + case RnnMode::LSTM: return CUDNN_LSTM; + case RnnMode::GRU: return CUDNN_GRU; + default: throw std::invalid_argument("unsupported RNN mode for cuDNN"); } } -TensorDescriptor::TensorDescriptor(const fl::dtype type, const Shape& flDims) { +TensorDescriptor::TensorDescriptor(fl::dtype const type, Shape const& flDims) { CUDNN_CHECK_ERR(cudnnCreateTensorDescriptor(&descriptor)); cudnnDataType_t cudnntype = cudnnMapToType(type); @@ -165,7 +150,7 @@ TensorDescriptor::TensorDescriptor(const fl::dtype type, const Shape& flDims) { ); } -TensorDescriptor::TensorDescriptor(const Tensor& input) { +TensorDescriptor::TensorDescriptor(Tensor const& input) { CUDNN_CHECK_ERR(cudnnCreateTensorDescriptor(&descriptor)); cudnnDataType_t cudnntype = cudnnMapToType(input.type()); @@ -194,21 +179,19 @@ TensorDescriptor::TensorDescriptor(const Tensor& input) { ); } -TensorDescriptor::~TensorDescriptor() { - CUDNN_CHECK_ERR(cudnnDestroyTensorDescriptor(descriptor)); -} +TensorDescriptor::~TensorDescriptor() { CUDNN_CHECK_ERR(cudnnDestroyTensorDescriptor(descriptor)); } TensorDescriptorArray::TensorDescriptorArray( int size, - const fl::dtype type, - const Shape& dims + fl::dtype const type, + Shape const& dims ) { - desc_vec.reserve(size); + _descVec.reserve(size); for(int i = 0; i < size; i++) { - desc_vec.emplace_back(type, dims); - desc_raw_vec.push_back(desc_vec.back().descriptor); + _descVec.emplace_back(type, dims); + _descRawVec.push_back(_descVec.back().descriptor); } - descriptors = desc_raw_vec.data(); + descriptors = _descRawVec.data(); } TensorDescriptorArray::~TensorDescriptorArray() = default; @@ -241,11 +224,9 @@ PoolingDescriptor::PoolingDescriptor( ); } -PoolingDescriptor::~PoolingDescriptor() { - CUDNN_CHECK_ERR(cudnnDestroyPoolingDescriptor(descriptor)); -} +PoolingDescriptor::~PoolingDescriptor() { CUDNN_CHECK_ERR(cudnnDestroyPoolingDescriptor(descriptor)); } -FilterDescriptor::FilterDescriptor(const Tensor& input) { +FilterDescriptor::FilterDescriptor(Tensor const& input) { CUDNN_CHECK_ERR(cudnnCreateFilterDescriptor(&descriptor)); cudnnDataType_t cudnntype = cudnnMapToType(input.type()); @@ -267,121 +248,146 @@ FilterDescriptor::FilterDescriptor(const Tensor& input) { ); } -FilterDescriptor::~FilterDescriptor() { - CUDNN_CHECK_ERR(cudnnDestroyFilterDescriptor(descriptor)); -} +FilterDescriptor::~FilterDescriptor() { CUDNN_CHECK_ERR(cudnnDestroyFilterDescriptor(descriptor)); } -DropoutDescriptor::DropoutDescriptor(float drop_prob) { +DropoutDescriptor::DropoutDescriptor(float dropProb) { CUDNN_CHECK_ERR(cudnnCreateDropoutDescriptor(&descriptor)); - auto cudnnHandle = getCudnnHandle(); - unsigned long long seed = 0; - size_t state_size; - CUDNN_CHECK_ERR(cudnnDropoutGetStatesSize(cudnnHandle, &state_size)); - auto& dropout_states = getDropoutStates(); - if(dropout_states.isEmpty()) { - dropout_states = - Tensor({static_cast(state_size)}, fl::dtype::b8); - DevicePtr statesraw(dropout_states); + + auto const cudnnHandle = getCudnnHandle(); + constexpr unsigned long long seed = 0; + size_t stateSize; + + CUDNN_CHECK_ERR(cudnnDropoutGetStatesSize(cudnnHandle, &stateSize)); + + auto& dropoutStates = getDropoutStates(); + + if(dropoutStates.isEmpty()) { + dropoutStates = + Tensor{{static_cast(stateSize)}, fl::dtype::b8}; + DevicePtr statesraw(dropoutStates); CUDNN_CHECK_ERR( cudnnSetDropoutDescriptor( descriptor, cudnnHandle, - drop_prob, + dropProb, statesraw.get(), - state_size, + stateSize, seed ) ); - } else { - DevicePtr statesraw(dropout_states); -// See https://git.io/fp9oo for an explanation. -#if CUDNN_VERSION >= 7000 + } + else { + DevicePtr statesraw(dropoutStates); CUDNN_CHECK_ERR( cudnnRestoreDropoutDescriptor( descriptor, cudnnHandle, - drop_prob, + dropProb, statesraw.get(), - state_size, + stateSize, seed ) ); -#else - auto dropout_struct = reinterpret_cast(descriptor); - dropout_struct->dropout = drop_prob; - dropout_struct->nstates = state_size; - dropout_struct->states = statesraw.get(); -#endif } } -DropoutDescriptor::~DropoutDescriptor() { - CUDNN_CHECK_ERR(cudnnDestroyDropoutDescriptor(descriptor)); -} +DropoutDescriptor::~DropoutDescriptor() { CUDNN_CHECK_ERR(cudnnDestroyDropoutDescriptor(descriptor)); } Tensor& DropoutDescriptor::getDropoutStates() { - thread_local Tensor dropout_states; - return dropout_states; + thread_local Tensor dropoutStates; + return dropoutStates; } RNNDescriptor::RNNDescriptor( fl::dtype type, - int hidden_size, - int num_layers, + int inputSize, + int hiddenSize, + int numLayers, RnnMode mode, bool bidirectional, DropoutDescriptor& dropout ) { - CUDNN_CHECK_ERR(cudnnCreateRNNDescriptor(&descriptor)); - - auto cudnnHandle = getCudnnHandle(); + CUDNN_CHECK_ERR(cudnnCreateRNNDescriptor(&_handle)); - cudnnRNNInputMode_t in_mode = CUDNN_LINEAR_INPUT; + constexpr auto inMode = CUDNN_LINEAR_INPUT; + constexpr auto algo = CUDNN_RNN_ALGO_STANDARD; - cudnnDirectionMode_t dir = - bidirectional ? CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL; + auto const dir = bidirectional ? CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL; - cudnnRNNMode_t cell = cudnnMapToRNNMode(mode); - cudnnRNNAlgo_t algo = CUDNN_RNN_ALGO_STANDARD; - cudnnDataType_t cudnntype = cudnnMapToType(type); + auto const cell = cudnnMapToRNNMode(mode); + auto const dataType = cudnnMapToType(type); -#if CUDNN_VERSION >= 7000 && CUDNN_VERSION < 8000 CUDNN_CHECK_ERR( - cudnnSetRNNDescriptor( - cudnnHandle, - descriptor, - hidden_size, - num_layers, - dropout.descriptor, - in_mode, - dir, - cell, + //https://docs.nvidia.com/deeplearning/cudnn/archives/cudnn-892/api/index.html#cudnnSetRNNDescriptor_v8 + cudnnSetRNNDescriptor_v8( + _handle, algo, - cudnntype + cell, + CUDNN_RNN_DOUBLE_BIAS, //TODO review; double is default for old cudnn + dir, + inMode, + dataType, + dataType, // math precision + mathType(type), + inputSize, + hiddenSize, + hiddenSize, //projection size (unused) + numLayers, + dropout.descriptor, + 0 ) ); -#else +} + +RNNDescriptor::~RNNDescriptor() { CUDNN_CHECK_ERR(cudnnDestroyRNNDescriptor(_handle)); } + +} + +namespace fl { + + +RNNDataDescriptor::RNNDataDescriptor(fl::dtype type, Shape const& dims) { + create(); + auto const inputSize = dims.ndim() > 0 ? static_cast(dims[0]) : 1; + auto const batchSize = dims.ndim() > 1 ? static_cast(dims[1]) : 1; + auto const maxSeqSize = dims.ndim() > 2 ? static_cast(dims[2]) : 1; + + std::vector seqSizes(batchSize, maxSeqSize); + + set( + type, + inputSize, + maxSeqSize, + seqSizes + ); +} + +RNNDataDescriptor::~RNNDataDescriptor() { CUDNN_CHECK_ERR(cudnnDestroyRNNDataDescriptor(_handle)); } +void RNNDataDescriptor::create() { CUDNN_CHECK_ERR(cudnnCreateRNNDataDescriptor(&_handle)); } +void RNNDataDescriptor::set( + fl::dtype type, + int inputSize, + int maxSeqSize, + std::span sequenceSizes +) const { CUDNN_CHECK_ERR( - cudnnSetRNNDescriptor_v6( - cudnnHandle, - descriptor, - hidden_size, - num_layers, - dropout.descriptor, - in_mode, - dir, - cell, - algo, - cudnntype + cudnnSetRNNDataDescriptor( + _handle, + cudnnMapToType(type), + CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_UNPACKED, + maxSeqSize, + sequenceSizes.size(), //batch size + inputSize, + sequenceSizes.data(), + nullptr //no padding ) ); -#endif } -RNNDescriptor::~RNNDescriptor() { - CUDNN_CHECK_ERR(cudnnDestroyRNNDescriptor(descriptor)); } +namespace fl { + ConvDescriptor::ConvDescriptor( fl::dtype type, int px, @@ -413,35 +419,27 @@ ConvDescriptor::ConvDescriptor( CUDNN_CHECK_ERR(cudnnSetConvolutionGroupCount(descriptor, groups)); } -ConvDescriptor::~ConvDescriptor() { - CUDNN_CHECK_ERR(cudnnDestroyConvolutionDescriptor(descriptor)); -} +ConvDescriptor::~ConvDescriptor() { CUDNN_CHECK_ERR(cudnnDestroyConvolutionDescriptor(descriptor)); } cudnnHandle_t getCudnnHandle() { return getActiveDeviceHandle().cudnnHandle; } -const CUDAStream& getCudnnStream() { return *getActiveDeviceHandle().stream; } +CUDAStream const& getCudnnStream() { return *getActiveDeviceHandle().stream; } -const void* kOne(const fl::dtype t) { +void const* kOne(fl::dtype const t) { switch(t) { case fl::dtype::f16: - case fl::dtype::f32: - return &kFloatOne; - case fl::dtype::f64: - return &kDoubleOne; - default: - throw std::invalid_argument("unsupported data type for cuDNN"); + case fl::dtype::f32: return &kFloatOne; + case fl::dtype::f64: return &kDoubleOne; + default: throw std::invalid_argument("unsupported data type for cuDNN"); } } -const void* kZero(const fl::dtype t) { +void const* kZero(fl::dtype const t) { switch(t) { case fl::dtype::f16: - case fl::dtype::f32: - return &kFloatZero; - case fl::dtype::f64: - return &kDoubleZero; - default: - throw std::invalid_argument("unsupported data type for cuDNN"); + case fl::dtype::f32: return &kFloatZero; + case fl::dtype::f64: return &kDoubleZero; + default: throw std::invalid_argument("unsupported data type for cuDNN"); } } diff --git a/flashlight/fl/autograd/tensor/backend/cudnn/CudnnUtils.h b/flashlight/fl/autograd/tensor/backend/cudnn/CudnnUtils.h index fca9969..ad83be3 100644 --- a/flashlight/fl/autograd/tensor/backend/cudnn/CudnnUtils.h +++ b/flashlight/fl/autograd/tensor/backend/cudnn/CudnnUtils.h @@ -13,13 +13,15 @@ #include "flashlight/fl/runtime/CUDAStream.h" #include "flashlight/fl/tensor/TensorBase.h" +#include + namespace fl { class TensorDescriptor { public: - explicit TensorDescriptor(const Tensor& a); + explicit TensorDescriptor(Tensor const& a); - TensorDescriptor(const fl::dtype type, const Shape& af_dims); + TensorDescriptor(fl::dtype const type, Shape const& afDims); cudnnTensorDescriptor_t descriptor; ~TensorDescriptor(); @@ -27,19 +29,19 @@ class TensorDescriptor { class TensorDescriptorArray { public: - TensorDescriptorArray(int size, const fl::dtype type, const Shape& dims); + TensorDescriptorArray(int size, fl::dtype const type, Shape const& dims); cudnnTensorDescriptor_t* descriptors; ~TensorDescriptorArray(); private: - std::vector desc_vec; - std::vector desc_raw_vec; + std::vector _descVec; + std::vector _descRawVec; }; class FilterDescriptor { public: - explicit FilterDescriptor(const Tensor& a); + explicit FilterDescriptor(Tensor const& input); cudnnFilterDescriptor_t descriptor; ~FilterDescriptor(); }; @@ -77,7 +79,7 @@ class PoolingDescriptor { class DropoutDescriptor { public: - explicit DropoutDescriptor(float drop_prob); + explicit DropoutDescriptor(float dropProb); cudnnDropoutDescriptor_t descriptor; ~DropoutDescriptor(); @@ -88,28 +90,65 @@ class RNNDescriptor { public: RNNDescriptor( fl::dtype type, - int hidden_size, - int num_layers, + int inputSize, + int hiddenSize, + int numLayers, RnnMode mode, bool bidirectional, DropoutDescriptor& dropout ); - cudnnRNNDescriptor_t descriptor; ~RNNDescriptor(); + +private: + cudnnRNNDescriptor_t _handle = nullptr; + + static constexpr auto mathType(fl::dtype type) { + return type == fl::dtype::f16 ? CUDNN_TENSOR_OP_MATH_ALLOW_CONVERSION : CUDNN_DEFAULT_MATH; + } + +public: + /** + * @return descriptor handle + */ + constexpr auto get() const { return _handle; } }; + +class RNNDataDescriptor { +public: + RNNDataDescriptor( + fl::dtype type, + Shape const& dims + ); + + ~RNNDataDescriptor(); + +private: + void create(); + void set(dtype type, int inputSize, int maxSeqSize, std::span sequenceSizes) const; + + cudnnRNNDataDescriptor_t _handle = nullptr; + +public: + /** + * @return descriptor handle + */ + constexpr auto get() const { return _handle; } +}; + + #define CUDNN_CHECK_ERR(expr) ::fl::cudnnCheckErr((expr)) void cudnnCheckErr(cudnnStatus_t status); -cudnnDataType_t cudnnMapToType(const fl::dtype& t); +cudnnDataType_t cudnnMapToType(fl::dtype const& t); -const void* kOne(const fl::dtype t); +void const* kOne(fl::dtype const t); -const void* kZero(const fl::dtype t); +void const* kZero(fl::dtype const t); // TODO: move this to CudnnAutogradExtension if we make it a singleton cudnnHandle_t getCudnnHandle(); -const CUDAStream& getCudnnStream(); +CUDAStream const& getCudnnStream(); } // namespace fl diff --git a/flashlight/fl/autograd/tensor/backend/cudnn/Pool2D.cpp b/flashlight/fl/autograd/tensor/backend/cudnn/Pool2D.cpp index 24b08c8..ef838bb 100644 --- a/flashlight/fl/autograd/tensor/backend/cudnn/Pool2D.cpp +++ b/flashlight/fl/autograd/tensor/backend/cudnn/Pool2D.cpp @@ -25,10 +25,10 @@ Tensor CudnnAutogradExtension::pool2d( const PoolingMode mode, std::shared_ptr ) { - auto inDesc = TensorDescriptor(input); + auto inDesc = TensorDescriptor{input}; // init pooling descriptor - auto poolDesc = PoolingDescriptor(wx, wy, sx, sy, px, py, mode); + auto poolDesc = PoolingDescriptor{wx, wy, sx, sy, px, py, mode}; // init output descriptor auto ix = input.dim(0); @@ -36,7 +36,7 @@ Tensor CudnnAutogradExtension::pool2d( auto ox = 1 + (ix + 2 * px - wx) / sx; auto oy = 1 + (iy + 2 * py - wy) / sy; - auto output = Tensor( + auto output = Tensor{ { ox, oy, @@ -44,8 +44,8 @@ Tensor CudnnAutogradExtension::pool2d( input.ndim() < 4 ? 1 : input.dim(3) }, input.type() - ); - auto outDesc = TensorDescriptor(output); + }; + auto outDesc = TensorDescriptor{output}; { DevicePtr inputraw(input); DevicePtr resultraw(output); @@ -90,11 +90,11 @@ Tensor CudnnAutogradExtension::pool2dBackward( const PoolingMode mode, std::shared_ptr ) { - auto i_desc = TensorDescriptor(input); - auto o_desc = TensorDescriptor(poolOutput); - auto p_desc = PoolingDescriptor(wx, wy, sx, sy, px, py, mode); + auto i_desc = TensorDescriptor{input}; + auto o_desc = TensorDescriptor{poolOutput}; + auto p_desc = PoolingDescriptor{wx, wy, sx, sy, px, py, mode}; - auto gradInput = Tensor(input.shape(), input.type()); + auto gradInput = Tensor{input.shape(), input.type()}; auto hndl = getCudnnHandle(); const auto& cudnnStream = getCudnnStream(); diff --git a/flashlight/fl/autograd/tensor/backend/cudnn/RNN.cpp b/flashlight/fl/autograd/tensor/backend/cudnn/RNN.cpp index 17b242b..568e4d5 100644 --- a/flashlight/fl/autograd/tensor/backend/cudnn/RNN.cpp +++ b/flashlight/fl/autograd/tensor/backend/cudnn/RNN.cpp @@ -15,58 +15,34 @@ namespace fl { namespace { - size_t getWorkspaceSize( - cudnnHandle_t handle, - const RNNDescriptor& rnnDesc, - const int seqLength, - const TensorDescriptorArray& xDescs - ) { - size_t workspaceSize; - CUDNN_CHECK_ERR( - cudnnGetRNNWorkspaceSize( - handle, - rnnDesc.descriptor, - seqLength, - xDescs.descriptors, - &workspaceSize - ) - ); - return workspaceSize; - } + struct temp_space_sizes { + size_t size; + size_t reserveSize; + }; - size_t getReserveSize( + temp_space_sizes rnnTempSpaceSizes( cudnnHandle_t handle, - const RNNDescriptor& rnnDesc, - const int seqLength, - const TensorDescriptorArray& xDescs + RNNDescriptor const& rnnDescriptor, + RNNDataDescriptor const& xDescriptor, + cudnnForwardMode_t mode ) { - size_t reserveSize; + temp_space_sizes sizes{}; + CUDNN_CHECK_ERR( - cudnnGetRNNTrainingReserveSize( + cudnnGetRNNTempSpaceSizes( handle, - rnnDesc.descriptor, - seqLength, - xDescs.descriptors, - &reserveSize + rnnDescriptor.get(), + mode, + xDescriptor.get(), + &sizes.size, + &sizes.reserveSize ) ); - return reserveSize; - } - void setCudnnRnnMathType(const Tensor& input, const RNNDescriptor& rnnDesc) { - if(input.type() == fl::dtype::f16) - CUDNN_CHECK_ERR( - cudnnSetRNNMatrixMathType( - rnnDesc.descriptor, - CUDNN_TENSOR_OP_MATH_ALLOW_CONVERSION - ) - ); - else - CUDNN_CHECK_ERR( - cudnnSetRNNMatrixMathType(rnnDesc.descriptor, CUDNN_DEFAULT_MATH) - ); + return sizes; } + struct CudnnRnnAutogradPayload : public detail::AutogradPayloadData { Tensor reserveSpace; }; @@ -74,15 +50,15 @@ namespace { } // namespace std::tuple CudnnAutogradExtension::rnn( - const Tensor& input, - const Tensor& hiddenStateIn, - const Tensor& cellStateIn, - const Tensor& weights, - const int hiddenSize, - const int numLayers, - const RnnMode mode, - const bool bidirectional, - const float dropProb, + Tensor const& input, + Tensor const& hiddenStateIn, + Tensor const& cellStateIn, + Tensor const& weights, + int const hiddenSize, + int const numLayers, + RnnMode const mode, + bool const bidirectional, + float const dropProb, std::shared_ptr autogradPayload ) { FL_TENSOR_DTYPES_MATCH_CHECK(input, hiddenStateIn, cellStateIn, weights); @@ -93,25 +69,33 @@ std::tuple CudnnAutogradExtension::rnn( autogradPayload->data = payload; Tensor x = input.asContiguousTensor(); + RNNDataDescriptor xDesc{x.type(), x.shape()}; + Tensor hiddenState = hiddenStateIn.asContiguousTensor(); Tensor cellState = cellStateIn.asContiguousTensor(); - DropoutDescriptor dropout(dropProb); - RNNDescriptor rnnDesc( - input.type(), hiddenSize, numLayers, mode, bidirectional, dropout); - setCudnnRnnMathType(input, rnnDesc); + DropoutDescriptor dropout{dropProb}; - auto dims = x.shape(); + auto const& dims = x.shape(); int inputSize = dims[0]; int batchSize = dims.ndim() < 2 ? 1 : dims[1]; int seqLength = dims.ndim() < 3 ? 1 : dims[2]; + + RNNDescriptor rnnDesc{ + input.type(), + inputSize, + hiddenSize, + numLayers, + mode, + bidirectional, + dropout + }; + + int totalLayers = numLayers * (bidirectional ? 2 : 1); int outSize = hiddenSize * (bidirectional ? 2 : 1); - TensorDescriptorArray xDescs( - seqLength, x.type(), {1, 1, inputSize, batchSize}); - if(!hiddenState.isEmpty()) { auto hxDims = hiddenState.shape(); int hxHiddenSize = hxDims[0]; @@ -119,31 +103,31 @@ std::tuple CudnnAutogradExtension::rnn( int hxTotalLayers = hiddenState.ndim() < 3 ? 1 : hxDims[2]; if( - !(hxHiddenSize == hiddenSize && hxBatchSize == batchSize - && hxTotalLayers == totalLayers) + hxHiddenSize != hiddenSize || hxBatchSize != batchSize + || hxTotalLayers != totalLayers ) throw std::invalid_argument("invalid hidden state dims for RNN"); } if( !cellState.isEmpty() - && !(mode == RnnMode::LSTM && cellState.dim(0) == hiddenSize - && cellState.dim(1) == batchSize && cellState.dim(2) == totalLayers) + && (mode != RnnMode::LSTM || cellState.dim(0) != hiddenSize + || cellState.dim(1) != batchSize || cellState.dim(2) != totalLayers) ) throw std::invalid_argument("invalid cell state dims for RNN"); Shape hDims = {1, hiddenSize, batchSize, totalLayers}; - TensorDescriptor hxDesc(x.type(), hDims); - TensorDescriptor cxDesc(x.type(), hDims); + TensorDescriptor hxDesc{x.type(), hDims}; + TensorDescriptor cxDesc{x.type(), hDims}; auto handle = getCudnnHandle(); - const auto& cudnnStream = getCudnnStream(); + auto const& cudnnStream = getCudnnStream(); size_t paramSize; CUDNN_CHECK_ERR( cudnnGetRNNParamsSize( handle, - rnnDesc.descriptor, + rnnDesc._handle, xDescs.descriptors[0], ¶mSize, cudnnMapToType(weights.type()) @@ -155,24 +139,24 @@ std::tuple CudnnAutogradExtension::rnn( ); FilterDescriptor wDesc(weights); - Tensor y({outSize, batchSize, seqLength}, input.type()); - TensorDescriptorArray yDesc(seqLength, y.type(), {1, 1, outSize, batchSize}); + Tensor y{{outSize, batchSize, seqLength}, input.type()}; + TensorDescriptorArray yDesc{seqLength, y.type(), {1, 1, outSize, batchSize}}; - Tensor hy({hiddenSize, batchSize, totalLayers}, x.type()); - TensorDescriptor hyDesc(x.type(), hDims); + Tensor hy{{hiddenSize, batchSize, totalLayers}, x.type()}; + TensorDescriptor hyDesc{x.type(), hDims}; - Tensor cy; + Tensor cy{}; if(mode == RnnMode::LSTM) - cy = Tensor(hy.shape(), x.type()); + cy = Tensor{hy.shape(), x.type()}; - TensorDescriptor cyDesc(x.type(), hDims); + TensorDescriptor cyDesc{x.type(), hDims}; - size_t workspaceSize = getWorkspaceSize(handle, rnnDesc, seqLength, xDescs); - size_t reserveSize = getReserveSize(handle, rnnDesc, seqLength, xDescs); + constexpr auto forwardMode = CUDNN_FWD_MODE_TRAINING; + auto [workspaceSize, reserveSize] = rnnTempSpaceSizes(handle, rnnDesc, xDesc, forwardMode); Tensor workspace({static_cast(workspaceSize)}, fl::dtype::b8); // Space must be reused between forward and backward for cuDNN - payload->reserveSpace = Tensor({static_cast(reserveSize)}, fl::dtype::b8); + payload->reserveSpace = Tensor{{static_cast(reserveSize)}, fl::dtype::b8}; { auto contiguousX = x.asContiguousTensor(); @@ -190,34 +174,40 @@ std::tuple CudnnAutogradExtension::rnn( relativeSync( cudnnStream, { - contiguousX, hiddenState, cellState, contiguousWeights, y, hy, cy, - workspace, payload->reserveSpace, + contiguousX, + hiddenState, + cellState, + contiguousWeights, + y, + hy, + cy, + workspace, + payload->reserveSpace, } ); CUDNN_CHECK_ERR( - cudnnRNNForwardTraining( + cudnnRNNForward( handle, - rnnDesc.descriptor, + rnnDesc.get(), seqLength, - xDescs.descriptors, + xDesc.get(), xRaw.get(), - hxDesc.descriptor, - hxRaw.get(), - cxDesc.descriptor, - cxRaw.get(), - wDesc.descriptor, - wRaw.get(), - yDesc.descriptors, - yRaw.get(), - hyDesc.descriptor, - hyRaw.get(), - cyDesc.descriptor, - cyRaw.get(), - workspaceRaw.get(), + yDesc.get(), + yRaw, + hxDesc, + hxRaw, + hyRaw, + cxDesc, + cxRaw, + cyRaw, + weightspace and its size????, + //TEMP continue here + workspaceSize, - reserveSpaceRaw.get(), - reserveSize + workspaceRaw + reserveSize, + reserveSpaceRaw.get() ) ); } @@ -228,17 +218,17 @@ std::tuple CudnnAutogradExtension::rnn( } std::tuple CudnnAutogradExtension::rnnBackward( - const Tensor& input, - const Tensor& hiddenState, - const Tensor& cellState, - const Tensor& weights, - const std::shared_ptr gradData, - const Tensor& output, - const int numLayers, - const int hiddenSize, - const RnnMode mode, - const bool bidirectional, - const float dropProb, + Tensor const& input, + Tensor const& hiddenState, + Tensor const& cellState, + Tensor const& weights, + std::shared_ptr const gradData, + Tensor const& output, + int const numLayers, + int const hiddenSize, + RnnMode const mode, + bool const bidirectional, + float const dropProb, std::shared_ptr autogradPayload ) { if(!autogradPayload) @@ -249,7 +239,7 @@ std::tuple CudnnAutogradExtension::rnnBackward( std::static_pointer_cast(autogradPayload->data); auto handle = getCudnnHandle(); - const auto& cudnnStream = getCudnnStream(); + auto const& cudnnStream = getCudnnStream(); auto x = input.asContiguousTensor(); auto& y = output; @@ -261,29 +251,32 @@ std::tuple CudnnAutogradExtension::rnnBackward( int totalLayers = numLayers * (bidirectional ? 2 : 1); int outSize = hiddenSize * (bidirectional ? 2 : 1); - DropoutDescriptor dropout(dropProb); - RNNDescriptor rnnDesc(input.type(), hiddenSize, numLayers, mode, bidirectional, dropout); + DropoutDescriptor dropout{dropProb}; + RNNDescriptor rnnDesc{input.type(), hiddenSize, numLayers, mode, bidirectional, dropout}; setCudnnRnnMathType(input, rnnDesc); - TensorDescriptorArray yDesc(seqLength, y.type(), {1, 1, outSize, batchSize}); - TensorDescriptorArray dyDesc(seqLength, y.type(), {1, 1, outSize, batchSize}); + TensorDescriptorArray yDesc{seqLength, y.type(), {1, 1, outSize, batchSize}}; + TensorDescriptorArray dyDesc{seqLength, y.type(), {1, 1, outSize, batchSize}}; Shape hDims = {1, hiddenSize, batchSize, totalLayers}; - TensorDescriptor dhyDesc(x.type(), hDims); - TensorDescriptor dcyDesc(x.type(), hDims); - TensorDescriptor hxDesc(x.type(), hDims); - TensorDescriptor cxDesc(x.type(), hDims); + TensorDescriptor dhyDesc{x.type(), hDims}; + TensorDescriptor dcyDesc{x.type(), hDims}; + TensorDescriptor hxDesc{x.type(), hDims}; + TensorDescriptor cxDesc{x.type(), hDims}; - Tensor dhx(hiddenState.shape(), hiddenState.type()); - Tensor dcx(cellState.shape(), cellState.type()); - TensorDescriptor dhxDesc(x.type(), hDims); - TensorDescriptor dcxDesc(x.type(), hDims); + Tensor dhx{hiddenState.shape(), hiddenState.type()}; + Tensor dcx{cellState.shape(), cellState.type()}; + TensorDescriptor dhxDesc{x.type(), hDims}; + TensorDescriptor dcxDesc{x.type(), hDims}; FilterDescriptor wDesc(weights); - Tensor dx(input.shape(), input.type()); - TensorDescriptorArray dxDescs( - seqLength, dx.type(), {1, 1, inputSize, batchSize}); + Tensor dx{input.shape(), input.type()}; + TensorDescriptorArray dxDescs{ + seqLength, + dx.type(), + {1, 1, inputSize, batchSize} + }; size_t workspaceSize = getWorkspaceSize(handle, rnnDesc, seqLength, dxDescs); @@ -325,7 +318,7 @@ std::tuple CudnnAutogradExtension::rnnBackward( CUDNN_CHECK_ERR( cudnnRNNBackwardData( handle, - rnnDesc.descriptor, + rnnDesc._handle, seqLength, yDesc.descriptors, yRaw.get(), @@ -357,17 +350,20 @@ std::tuple CudnnAutogradExtension::rnnBackward( if(input.type() == fl::dtype::f16) CUDNN_CHECK_ERR( - cudnnSetRNNMatrixMathType( - rnnDesc.descriptor, - CUDNN_TENSOR_OP_MATH_ALLOW_CONVERSION - ) - ); + cudnnSetRNNMatrixMathType( + rnnDesc._handle, + CUDNN_TENSOR_OP_MATH_ALLOW_CONVERSION + ) + ); else CUDNN_CHECK_ERR( - cudnnSetRNNMatrixMathType(rnnDesc.descriptor, CUDNN_DEFAULT_MATH) - ); - TensorDescriptorArray xDescs( - seqLength, x.type(), {1, 1, inputSize, batchSize}); + cudnnSetRNNMatrixMathType(rnnDesc._handle, CUDNN_DEFAULT_MATH) + ); + TensorDescriptorArray xDescs{ + seqLength, + x.type(), + {1, 1, inputSize, batchSize} + }; Tensor dw = fl::full(weights.shape(), 0, weights.type()); FilterDescriptor dwDesc(dw); @@ -382,7 +378,7 @@ std::tuple CudnnAutogradExtension::rnnBackward( CUDNN_CHECK_ERR( cudnnRNNBackwardWeights( handle, - rnnDesc.descriptor, + rnnDesc._handle, seqLength, xDescs.descriptors, xRaw.get(),