From a8d8af9901254aafe0dc5c769041d2edc266211e Mon Sep 17 00:00:00 2001
From: Lukas Thomann <lukas.thomann@outlook.com>
Date: Wed, 25 Feb 2026 21:56:48 +0100
Subject: [PATCH 1/4] cleaned up the cmake stuff a little bit, added new
 features for vcpkg to cover the dependencies automatically (for the most
 part)

---
 cmake/HalideUtils.cmake                       | 99 -------------------
 cmake/{ => find}/BuildBackwardCpp.cmake       |  0
 cmake/{ => find}/BuildCereal.cmake            |  0
 .../{ => find}/BuildFlashlightSequence.cmake  |  0
 cmake/{ => find}/BuildFlashlightText.cmake    |  0
 cmake/{ => find}/BuildGloo.cmake              |  0
 cmake/{ => find}/BuildGoogleTest.cmake        |  0
 cmake/{ => find}/BuildSndFile.cmake           |  0
 cmake/{ => find}/BuildStb.cmake               |  0
 cmake/{ => find}/Buildsox.cmake               |  0
 cmake/{ => find}/FindCBLAS.cmake              |  0
 cmake/{ => find}/FindCUDNN.cmake              |  0
 cmake/{ => find}/FindFFTW3.cmake              |  0
 cmake/{ => find}/FindFLAC.cmake               |  0
 cmake/{ => find}/FindFilesystem.cmake         |  0
 cmake/{ => find}/FindGLOG.cmake               |  0
 cmake/{ => find}/FindGMock.cmake              |  0
 cmake/{ => find}/FindMKL.cmake                |  0
 cmake/{ => find}/FindNCCL.cmake               |  0
 cmake/{ => find}/FindOgg.cmake                |  0
 cmake/{ => find}/FindSndFile.cmake            |  0
 cmake/{ => find}/FindVorbis.cmake             |  0
 cmake/{ => find}/Findgflags.cmake             |  0
 cmake/{ => find}/Findsox.cmake                |  0
 cmake/{ => utils}/InternalUtils.cmake         | 32 +++---
 cmake/{ => utils}/TestUtils.cmake             |  0
 cmake/{ => utils}/flashlightConfig.cmake.in   |  0
 cmake/utils/toolchain.cmake                   | 10 +-
 vcpkg.json                                    | 35 ++++++-
 29 files changed, 54 insertions(+), 122 deletions(-)
 delete mode 100644 cmake/HalideUtils.cmake
 rename cmake/{ => find}/BuildBackwardCpp.cmake (100%)
 rename cmake/{ => find}/BuildCereal.cmake (100%)
 rename cmake/{ => find}/BuildFlashlightSequence.cmake (100%)
 rename cmake/{ => find}/BuildFlashlightText.cmake (100%)
 rename cmake/{ => find}/BuildGloo.cmake (100%)
 rename cmake/{ => find}/BuildGoogleTest.cmake (100%)
 rename cmake/{ => find}/BuildSndFile.cmake (100%)
 rename cmake/{ => find}/BuildStb.cmake (100%)
 rename cmake/{ => find}/Buildsox.cmake (100%)
 rename cmake/{ => find}/FindCBLAS.cmake (100%)
 rename cmake/{ => find}/FindCUDNN.cmake (100%)
 rename cmake/{ => find}/FindFFTW3.cmake (100%)
 rename cmake/{ => find}/FindFLAC.cmake (100%)
 rename cmake/{ => find}/FindFilesystem.cmake (100%)
 rename cmake/{ => find}/FindGLOG.cmake (100%)
 rename cmake/{ => find}/FindGMock.cmake (100%)
 rename cmake/{ => find}/FindMKL.cmake (100%)
 rename cmake/{ => find}/FindNCCL.cmake (100%)
 rename cmake/{ => find}/FindOgg.cmake (100%)
 rename cmake/{ => find}/FindSndFile.cmake (100%)
 rename cmake/{ => find}/FindVorbis.cmake (100%)
 rename cmake/{ => find}/Findgflags.cmake (100%)
 rename cmake/{ => find}/Findsox.cmake (100%)
 rename cmake/{ => utils}/InternalUtils.cmake (95%)
 rename cmake/{ => utils}/TestUtils.cmake (100%)
 rename cmake/{ => utils}/flashlightConfig.cmake.in (100%)

diff --git a/cmake/HalideUtils.cmake b/cmake/HalideUtils.cmake
deleted file mode 100644
index d1b0831..0000000
--- a/cmake/HalideUtils.cmake
+++ /dev/null
@@ -1,99 +0,0 @@
-# Adds a Halide library. Compiles a Halide AOT-generator at compile time,
-# then runs the generator to produce header and lib artifacts that
-# can be linked to a passed target.
-#
-# SRC - the src file for the project
-# NAME - the name of the resulting target
-# LIBS - libraries to which the generated library will be linked
-# PREPROC - preprocessor defs to pass to the new target
-# LINK_TO - target to which to link the generated pipeline
-function(fl_add_and_link_halide_lib)
-  set(options)
-  set(oneValueArgs SRC NAME LINK_TO)
-  set(multiValueArgs LIBS PREPROC)
-  cmake_parse_arguments(fl_add_and_link_halide_lib
-    "${options}"
-    "${oneValueArgs}"
-    "${multiValueArgs}"
-    ${ARGN})
-
-  # Generator binary
-  set(GENERATOR_TARGET ${fl_add_and_link_halide_lib_NAME}_generator)
-  # Generator output
-  set(GENERATED_TARGET generate_${fl_add_and_link_halide_lib_NAME})
-  add_executable(${GENERATOR_TARGET} ${fl_add_and_link_halide_lib_SRC})
-  target_link_libraries(
-    ${GENERATOR_TARGET}
-    PRIVATE
-    Halide::Halide
-    ${LIBS})
-  target_compile_definitions(
-    ${GENERATOR_TARGET}
-    PRIVATE
-    ${PREPROC})
-    
-  # Run the generator
-  # LLVM may leak memory during Halide compilation - if building with ASAN,
-  # the generator might fail. Disable leack checking when executing generators
-  set(GENERATED_LIB "${fl_add_and_link_halide_lib_NAME}${CMAKE_STATIC_LIBRARY_SUFFIX}")
-  set(GENERATED_HEADER "${fl_add_and_link_halide_lib_NAME}.h")
-  add_custom_command(OUTPUT ${GENERATED_HEADER} "${GENERATED_LIB}"
-    DEPENDS ${GENERATOR_TARGET}
-    COMMAND ${CMAKE_COMMAND} -E env "ASAN_OPTIONS=detect_leaks=0" $<TARGET_FILE:${GENERATOR_TARGET}>
-    VERBATIM)
-  add_custom_target(${GENERATED_TARGET}
-    DEPENDS ${GENERATED_HEADER} "${GENERATED_LIB}")
-  add_dependencies(${GENERATED_TARGET} ${GENERATOR_TARGET})
-  
-  set(LIB_PATH ${CMAKE_CURRENT_BINARY_DIR}/${GENERATED_LIB})
-  message(STATUS "Will generate AOT Halide Pipeline ${fl_add_and_link_halide_lib_NAME}")
-
-  # TODO: use an IMPORTED target? Might be cleaner
-  # add_library(${fl_add_and_link_halide_lib_NAME} STATIC IMPORTED)
-  # set_target_properties(${fl_add_and_link_halide_lib_NAME} PROPERTIES
-  #   INTERFACE_INCLUDE_DIRECTORIES ${CMAKE_CURRENT_BINARY_DIR}
-  #   IMPORTED_LOCATION "${GENERATED_LIB}"
-  #   INTERFACE_LINK_LIBRARIES Halide::Halide)
-  # add_dependencies(${fl_add_and_link_halide_lib_NAME} ${GENERATED_TARGET})
-
-  # Link the generated Halide lib to the target
-  add_dependencies(${fl_add_and_link_halide_lib_LINK_TO} ${GENERATED_TARGET})
-  # Ensure we can find generated headers
-  target_include_directories(
-    ${fl_add_and_link_halide_lib_LINK_TO} PUBLIC
-    $<BUILD_INTERFACE:${CMAKE_CURRENT_BINARY_DIR}>)
-  # For now, this linkeage is private, which means the Flashlight core needs
-  # to wrap Halide pipelines when exposing them to external binaries.
-  # Properly installing the Halide lib will facilitate public linkeage.
-  target_link_libraries(${fl_add_and_link_halide_lib_LINK_TO} PRIVATE ${LIB_PATH})
-endfunction(fl_add_and_link_halide_lib)
-
-# Adds a Halide library that is linked with Flashlight.
-#
-# If used from an included CMake list, we won't run into
-# cmake_policy(SET CMP0079 NEW) issues. Halide pipelines
-# compiled for tests should use fl_add_and_link_halide_lib
-# instead since those are called via calls to `add_subdirectories`.
-# CMake 3.13 resolves this.
-function(fl_add_halide_lib)
-    set(options)
-  set(oneValueArgs SRC NAME)
-  set(multiValueArgs LIBS PREPROC)
-  cmake_parse_arguments(fl_add_halide_lib "${options}" "${oneValueArgs}"
-    "${multiValueArgs}" ${ARGN})
-
-  fl_add_and_link_halide_lib(
-    SRC ${fl_add_halide_lib_SRC}
-    NAME ${fl_add_halide_lib_NAME}
-    LIBS ${fl_add_halide_lib_LIBS}
-    PREPROC ${fl_add_halide_lib_PREPROC}
-    LINK_TO flashlight
-    )
-
-  # TODO: An IMPORTED target could help with this
-  # cmake_policy(SET CMP0079 NEW)
-  # target_link_libraries(flashlight PUBLIC ...)
-  # add_dependencies(flashlight ${GENERATED_TARGET})
-  # Generated Halide libs get installed too
-  # set(INSTALLABLE_TARGETS ${INSTALLABLE_TARGETS} ${LIB_PATH} PARENT_SCOPE)  
-endfunction(fl_add_halide_lib)
diff --git a/cmake/BuildBackwardCpp.cmake b/cmake/find/BuildBackwardCpp.cmake
similarity index 100%
rename from cmake/BuildBackwardCpp.cmake
rename to cmake/find/BuildBackwardCpp.cmake
diff --git a/cmake/BuildCereal.cmake b/cmake/find/BuildCereal.cmake
similarity index 100%
rename from cmake/BuildCereal.cmake
rename to cmake/find/BuildCereal.cmake
diff --git a/cmake/BuildFlashlightSequence.cmake b/cmake/find/BuildFlashlightSequence.cmake
similarity index 100%
rename from cmake/BuildFlashlightSequence.cmake
rename to cmake/find/BuildFlashlightSequence.cmake
diff --git a/cmake/BuildFlashlightText.cmake b/cmake/find/BuildFlashlightText.cmake
similarity index 100%
rename from cmake/BuildFlashlightText.cmake
rename to cmake/find/BuildFlashlightText.cmake
diff --git a/cmake/BuildGloo.cmake b/cmake/find/BuildGloo.cmake
similarity index 100%
rename from cmake/BuildGloo.cmake
rename to cmake/find/BuildGloo.cmake
diff --git a/cmake/BuildGoogleTest.cmake b/cmake/find/BuildGoogleTest.cmake
similarity index 100%
rename from cmake/BuildGoogleTest.cmake
rename to cmake/find/BuildGoogleTest.cmake
diff --git a/cmake/BuildSndFile.cmake b/cmake/find/BuildSndFile.cmake
similarity index 100%
rename from cmake/BuildSndFile.cmake
rename to cmake/find/BuildSndFile.cmake
diff --git a/cmake/BuildStb.cmake b/cmake/find/BuildStb.cmake
similarity index 100%
rename from cmake/BuildStb.cmake
rename to cmake/find/BuildStb.cmake
diff --git a/cmake/Buildsox.cmake b/cmake/find/Buildsox.cmake
similarity index 100%
rename from cmake/Buildsox.cmake
rename to cmake/find/Buildsox.cmake
diff --git a/cmake/FindCBLAS.cmake b/cmake/find/FindCBLAS.cmake
similarity index 100%
rename from cmake/FindCBLAS.cmake
rename to cmake/find/FindCBLAS.cmake
diff --git a/cmake/FindCUDNN.cmake b/cmake/find/FindCUDNN.cmake
similarity index 100%
rename from cmake/FindCUDNN.cmake
rename to cmake/find/FindCUDNN.cmake
diff --git a/cmake/FindFFTW3.cmake b/cmake/find/FindFFTW3.cmake
similarity index 100%
rename from cmake/FindFFTW3.cmake
rename to cmake/find/FindFFTW3.cmake
diff --git a/cmake/FindFLAC.cmake b/cmake/find/FindFLAC.cmake
similarity index 100%
rename from cmake/FindFLAC.cmake
rename to cmake/find/FindFLAC.cmake
diff --git a/cmake/FindFilesystem.cmake b/cmake/find/FindFilesystem.cmake
similarity index 100%
rename from cmake/FindFilesystem.cmake
rename to cmake/find/FindFilesystem.cmake
diff --git a/cmake/FindGLOG.cmake b/cmake/find/FindGLOG.cmake
similarity index 100%
rename from cmake/FindGLOG.cmake
rename to cmake/find/FindGLOG.cmake
diff --git a/cmake/FindGMock.cmake b/cmake/find/FindGMock.cmake
similarity index 100%
rename from cmake/FindGMock.cmake
rename to cmake/find/FindGMock.cmake
diff --git a/cmake/FindMKL.cmake b/cmake/find/FindMKL.cmake
similarity index 100%
rename from cmake/FindMKL.cmake
rename to cmake/find/FindMKL.cmake
diff --git a/cmake/FindNCCL.cmake b/cmake/find/FindNCCL.cmake
similarity index 100%
rename from cmake/FindNCCL.cmake
rename to cmake/find/FindNCCL.cmake
diff --git a/cmake/FindOgg.cmake b/cmake/find/FindOgg.cmake
similarity index 100%
rename from cmake/FindOgg.cmake
rename to cmake/find/FindOgg.cmake
diff --git a/cmake/FindSndFile.cmake b/cmake/find/FindSndFile.cmake
similarity index 100%
rename from cmake/FindSndFile.cmake
rename to cmake/find/FindSndFile.cmake
diff --git a/cmake/FindVorbis.cmake b/cmake/find/FindVorbis.cmake
similarity index 100%
rename from cmake/FindVorbis.cmake
rename to cmake/find/FindVorbis.cmake
diff --git a/cmake/Findgflags.cmake b/cmake/find/Findgflags.cmake
similarity index 100%
rename from cmake/Findgflags.cmake
rename to cmake/find/Findgflags.cmake
diff --git a/cmake/Findsox.cmake b/cmake/find/Findsox.cmake
similarity index 100%
rename from cmake/Findsox.cmake
rename to cmake/find/Findsox.cmake
diff --git a/cmake/InternalUtils.cmake b/cmake/utils/InternalUtils.cmake
similarity index 95%
rename from cmake/InternalUtils.cmake
rename to cmake/utils/InternalUtils.cmake
index 9f0075f..374f08d 100644
--- a/cmake/InternalUtils.cmake
+++ b/cmake/utils/InternalUtils.cmake
@@ -9,7 +9,8 @@ function(add_coverage_to_target)
       -O0 # TODO: reconcile this with CMake modes for something cleaner
       -g
       $<$<COMPILE_LANGUAGE:CXX>:--coverage>
-      )
+    )
+
     if(CMAKE_VERSION VERSION_GREATER_EQUAL 3.13)
       target_link_options(${add_coverage_to_target_TARGET}
         PUBLIC
@@ -28,7 +29,8 @@ function(setup_install_targets)
     "${multiValueArgs}" ${ARGN})
 
   list(LENGTH setup_install_targets_INSTALL_TARGETS TARGETS_LENGTH)
-  if (${TARGETS_LENGTH} EQUAL 0)
+
+  if(${TARGETS_LENGTH} EQUAL 0)
     message(FATAL_ERROR "Flashlight setup_install_targets called with "
       "empty targets list.")
   endif()
@@ -44,7 +46,7 @@ function(setup_install_targets)
     ARCHIVE DESTINATION ${FL_INSTALL_LIB_DIR}
     FRAMEWORK DESTINATION framework
     INCLUDES DESTINATION ${FL_INSTALL_INC_DIR}
-    )
+  )
 
   # Write and install targets file
   install(
@@ -52,44 +54,43 @@ function(setup_install_targets)
     NAMESPACE flashlight::
     DESTINATION ${FL_INSTALL_CMAKE_DIR}
     COMPONENT flashlight
-    )
+  )
 
   # Write config file (used by projects including fl, such as examples)
   include(CMakePackageConfigHelpers)
   set(INCLUDE_DIRS include)
   set(CMAKE_DIR ${FL_INSTALL_CMAKE_DIR})
   configure_package_config_file(
-    ${PROJECT_SOURCE_DIR}/cmake/flashlightConfig.cmake.in
+    ${PROJECT_SOURCE_DIR}/cmake/utils/flashlightConfig.cmake.in
     cmake/install/${FL_CONFIG_CMAKE_BUILD_DIR}/flashlightConfig.cmake
     INSTALL_DESTINATION
     ${FL_INSTALL_CMAKE_DIR}
     PATH_VARS INCLUDE_DIRS CMAKE_DIR
-    )
+  )
   write_basic_package_version_file(
     cmake/install/${FL_CONFIG_CMAKE_BUILD_DIR}/flashlightConfigVersion.cmake
     COMPATIBILITY SameMajorVersion
-    )
+  )
   install(FILES
     ${PROJECT_BINARY_DIR}/cmake/install/flashlightConfig.cmake
     ${PROJECT_BINARY_DIR}/cmake/install/flashlightConfigVersion.cmake
     DESTINATION ${FL_INSTALL_CMAKE_DIR}
     COMPONENT flashlight
-    )
+  )
   set_target_properties(${setup_install_targets_INSTALL_TARGETS} PROPERTIES
     VERSION "${flashlight_VERSION}"
     SOVERSION "${flashlight_VERSION_MAJOR}")
 endfunction(setup_install_targets)
 
 function(setup_install_headers HEADER_DIR DEST_DIR)
-
   # Move headers
   install(
     DIRECTORY ${HEADER_DIR}
     COMPONENT headers
     DESTINATION ${DEST_DIR}
     FILES_MATCHING # preserve directory structure
-    PATTERN  "*.h"
-    PATTERN  "*.hpp"
+    PATTERN "*.h"
+    PATTERN "*.hpp"
     PATTERN "*.cuh" # TODO: make this conditional, e.g. $<IF:FLASHLIGHT_USE_CUDA,"*.cuh","a^">
     PATTERN "test*" EXCLUDE
     PATTERN "tests" EXCLUDE
@@ -103,17 +104,17 @@ function(setup_install_headers HEADER_DIR DEST_DIR)
     PATTERN "experimental" EXCLUDE
     PATTERN "plugincompiler" EXCLUDE
     PATTERN ".git" EXCLUDE
-    )
+  )
 endfunction(setup_install_headers)
 
 function(setup_install_find_module CONFIG_PATH)
   # Only actually move module files if doing a standalone install; otherwise,
   # assume we're being installed by a package manager
-  if (FL_BUILD_STANDALONE)
+  if(FL_BUILD_STANDALONE)
     install(
       FILES ${CONFIG_PATH}
       DESTINATION ${FL_INSTALL_CMAKE_DIR}
-      )
+    )
   endif()
 endfunction()
 
@@ -121,7 +122,7 @@ function(set_executable_output_directory EXEC_TARGET DIRECTORY)
   set_target_properties(${EXEC_TARGET} PROPERTIES
     RUNTIME_OUTPUT_DIRECTORY
     ${DIRECTORY}
-    )
+  )
 endfunction()
 
 # Small utility function which wraps cmake_dependent options and throws an error if the user
@@ -143,5 +144,6 @@ function(fl_dependent_option)
       message(FATAL_ERROR "${_dep} Required to build ${_option}")
     endif()
   endforeach()
+
   cmake_dependent_option(${_option} ${_text} "${_val}" "${_deps}" ${_frce})
 endfunction()
diff --git a/cmake/TestUtils.cmake b/cmake/utils/TestUtils.cmake
similarity index 100%
rename from cmake/TestUtils.cmake
rename to cmake/utils/TestUtils.cmake
diff --git a/cmake/flashlightConfig.cmake.in b/cmake/utils/flashlightConfig.cmake.in
similarity index 100%
rename from cmake/flashlightConfig.cmake.in
rename to cmake/utils/flashlightConfig.cmake.in
diff --git a/cmake/utils/toolchain.cmake b/cmake/utils/toolchain.cmake
index 57071c2..9d75007 100644
--- a/cmake/utils/toolchain.cmake
+++ b/cmake/utils/toolchain.cmake
@@ -8,12 +8,16 @@ message(STATUS "---- fm_cmake toolchain ----")
 #append cmake dir to module path
 set(FM_CMAKE_LIBRARY_DIR ${CMAKE_CURRENT_LIST_DIR})
 
-list(APPEND CMAKE_MODULE_PATH "${FM_CMAKE_LIBRARY_DIR}")
-message(STATUS "appended cmake/utils/ to cmake module path")
+set(FM_CMAKE_UTILITY_DIR "${FM_CMAKE_LIBRARY_DIR}")
+list(APPEND CMAKE_MODULE_PATH "${FM_CMAKE_UTILITY_DIR}")
+message(VERBOSE "appended utility dir to cmake module path (${FM_CMAKE_UTILITY_DIR})")
 
 list(APPEND CMAKE_MODULE_PATH "${FM_CMAKE_LIBRARY_DIR}/../")
-message(STATUS "appended (${FM_CMAKE_LIBRARY_DIR}/../) cmake/ to cmake module path")
+message(VERBOSE "appended (${FM_CMAKE_LIBRARY_DIR}/../) cmake/ to cmake module path")
 
+set(FM_CMAKE_FIND_SCRIPT_DIR "${FM_CMAKE_LIBRARY_DIR}/../find/")
+list(APPEND CMAKE_MODULE_PATH "${FM_CMAKE_FIND_SCRIPT_DIR}")
+message(VERBOSE "appended find scripts to module path (${FM_CMAKE_FIND_SCRIPT_DIR})")
 
 
 include(fm_assertions)
diff --git a/vcpkg.json b/vcpkg.json
index 7dcf289..0463957 100644
--- a/vcpkg.json
+++ b/vcpkg.json
@@ -6,14 +6,39 @@
     "gtest"
   ],
   "features": {
-    "cuda": {
-      "description": "Dependencies for gpu backend",
+    "distributed": {
+      "description": "Dependencies for distributed backend",
+      "dependencies": [
+        "mpi"
+      ]
+    },
+    "runtime": {
+      "description": "flashmini runtime helpers",
+      "dependencies": [
+        "glog",
+        "gflags"
+      ]
+    },
+    "vision": {
+      "description": "vision pkg",
+      "dependencies": [
+        "opencv",
+        "stb"
+      ]
+    },
+    "text": {
+      "description": "text pkg",
       "dependencies": []
     },
-    "cpu": {
-      "description": "Dependencies for cpu backend",
+    "speech": {
+      "description": "speech pkg",
       "dependencies": [
-        "onednn"
+        "libsndfile",
+        "libogg",
+        "libvorbis",
+        "libflac",
+        "fftw3",
+        "openblas"
       ]
     }
   }

From 7898f9a4caf5ec0daee67a4e44e5da77c56c027d Mon Sep 17 00:00:00 2001
From: Lukas Thomann <lukas.thomann@outlook.com>
Date: Wed, 25 Feb 2026 22:01:32 +0100
Subject: [PATCH 2/4] renamed "find" to "dependencies"

---
 cmake/{find => dependencies}/BuildBackwardCpp.cmake  |  0
 cmake/{find => dependencies}/BuildCereal.cmake       |  0
 .../BuildFlashlightSequence.cmake                    |  0
 .../{find => dependencies}/BuildFlashlightText.cmake |  0
 cmake/{find => dependencies}/BuildGloo.cmake         |  0
 cmake/{find => dependencies}/BuildGoogleTest.cmake   |  0
 cmake/{find => dependencies}/BuildSndFile.cmake      |  0
 cmake/{find => dependencies}/BuildStb.cmake          |  0
 cmake/{find => dependencies}/Buildsox.cmake          |  0
 cmake/{find => dependencies}/FindCBLAS.cmake         |  0
 cmake/{find => dependencies}/FindCUDNN.cmake         |  0
 cmake/{find => dependencies}/FindFFTW3.cmake         |  0
 cmake/{find => dependencies}/FindFLAC.cmake          |  0
 cmake/{find => dependencies}/FindFilesystem.cmake    |  0
 cmake/{find => dependencies}/FindGLOG.cmake          |  0
 cmake/{find => dependencies}/FindGMock.cmake         |  0
 cmake/{find => dependencies}/FindMKL.cmake           |  0
 cmake/{find => dependencies}/FindNCCL.cmake          |  0
 cmake/{find => dependencies}/FindOgg.cmake           |  0
 cmake/{find => dependencies}/FindSndFile.cmake       |  0
 cmake/{find => dependencies}/FindVorbis.cmake        |  0
 cmake/{find => dependencies}/Findgflags.cmake        |  0
 cmake/{find => dependencies}/Findsox.cmake           |  0
 cmake/utils/toolchain.cmake                          |  2 +-
 flashlight/fl/autograd/tensor/backend/cudnn/RNN.cpp  | 12 ++++--------
 25 files changed, 5 insertions(+), 9 deletions(-)
 rename cmake/{find => dependencies}/BuildBackwardCpp.cmake (100%)
 rename cmake/{find => dependencies}/BuildCereal.cmake (100%)
 rename cmake/{find => dependencies}/BuildFlashlightSequence.cmake (100%)
 rename cmake/{find => dependencies}/BuildFlashlightText.cmake (100%)
 rename cmake/{find => dependencies}/BuildGloo.cmake (100%)
 rename cmake/{find => dependencies}/BuildGoogleTest.cmake (100%)
 rename cmake/{find => dependencies}/BuildSndFile.cmake (100%)
 rename cmake/{find => dependencies}/BuildStb.cmake (100%)
 rename cmake/{find => dependencies}/Buildsox.cmake (100%)
 rename cmake/{find => dependencies}/FindCBLAS.cmake (100%)
 rename cmake/{find => dependencies}/FindCUDNN.cmake (100%)
 rename cmake/{find => dependencies}/FindFFTW3.cmake (100%)
 rename cmake/{find => dependencies}/FindFLAC.cmake (100%)
 rename cmake/{find => dependencies}/FindFilesystem.cmake (100%)
 rename cmake/{find => dependencies}/FindGLOG.cmake (100%)
 rename cmake/{find => dependencies}/FindGMock.cmake (100%)
 rename cmake/{find => dependencies}/FindMKL.cmake (100%)
 rename cmake/{find => dependencies}/FindNCCL.cmake (100%)
 rename cmake/{find => dependencies}/FindOgg.cmake (100%)
 rename cmake/{find => dependencies}/FindSndFile.cmake (100%)
 rename cmake/{find => dependencies}/FindVorbis.cmake (100%)
 rename cmake/{find => dependencies}/Findgflags.cmake (100%)
 rename cmake/{find => dependencies}/Findsox.cmake (100%)

diff --git a/cmake/find/BuildBackwardCpp.cmake b/cmake/dependencies/BuildBackwardCpp.cmake
similarity index 100%
rename from cmake/find/BuildBackwardCpp.cmake
rename to cmake/dependencies/BuildBackwardCpp.cmake
diff --git a/cmake/find/BuildCereal.cmake b/cmake/dependencies/BuildCereal.cmake
similarity index 100%
rename from cmake/find/BuildCereal.cmake
rename to cmake/dependencies/BuildCereal.cmake
diff --git a/cmake/find/BuildFlashlightSequence.cmake b/cmake/dependencies/BuildFlashlightSequence.cmake
similarity index 100%
rename from cmake/find/BuildFlashlightSequence.cmake
rename to cmake/dependencies/BuildFlashlightSequence.cmake
diff --git a/cmake/find/BuildFlashlightText.cmake b/cmake/dependencies/BuildFlashlightText.cmake
similarity index 100%
rename from cmake/find/BuildFlashlightText.cmake
rename to cmake/dependencies/BuildFlashlightText.cmake
diff --git a/cmake/find/BuildGloo.cmake b/cmake/dependencies/BuildGloo.cmake
similarity index 100%
rename from cmake/find/BuildGloo.cmake
rename to cmake/dependencies/BuildGloo.cmake
diff --git a/cmake/find/BuildGoogleTest.cmake b/cmake/dependencies/BuildGoogleTest.cmake
similarity index 100%
rename from cmake/find/BuildGoogleTest.cmake
rename to cmake/dependencies/BuildGoogleTest.cmake
diff --git a/cmake/find/BuildSndFile.cmake b/cmake/dependencies/BuildSndFile.cmake
similarity index 100%
rename from cmake/find/BuildSndFile.cmake
rename to cmake/dependencies/BuildSndFile.cmake
diff --git a/cmake/find/BuildStb.cmake b/cmake/dependencies/BuildStb.cmake
similarity index 100%
rename from cmake/find/BuildStb.cmake
rename to cmake/dependencies/BuildStb.cmake
diff --git a/cmake/find/Buildsox.cmake b/cmake/dependencies/Buildsox.cmake
similarity index 100%
rename from cmake/find/Buildsox.cmake
rename to cmake/dependencies/Buildsox.cmake
diff --git a/cmake/find/FindCBLAS.cmake b/cmake/dependencies/FindCBLAS.cmake
similarity index 100%
rename from cmake/find/FindCBLAS.cmake
rename to cmake/dependencies/FindCBLAS.cmake
diff --git a/cmake/find/FindCUDNN.cmake b/cmake/dependencies/FindCUDNN.cmake
similarity index 100%
rename from cmake/find/FindCUDNN.cmake
rename to cmake/dependencies/FindCUDNN.cmake
diff --git a/cmake/find/FindFFTW3.cmake b/cmake/dependencies/FindFFTW3.cmake
similarity index 100%
rename from cmake/find/FindFFTW3.cmake
rename to cmake/dependencies/FindFFTW3.cmake
diff --git a/cmake/find/FindFLAC.cmake b/cmake/dependencies/FindFLAC.cmake
similarity index 100%
rename from cmake/find/FindFLAC.cmake
rename to cmake/dependencies/FindFLAC.cmake
diff --git a/cmake/find/FindFilesystem.cmake b/cmake/dependencies/FindFilesystem.cmake
similarity index 100%
rename from cmake/find/FindFilesystem.cmake
rename to cmake/dependencies/FindFilesystem.cmake
diff --git a/cmake/find/FindGLOG.cmake b/cmake/dependencies/FindGLOG.cmake
similarity index 100%
rename from cmake/find/FindGLOG.cmake
rename to cmake/dependencies/FindGLOG.cmake
diff --git a/cmake/find/FindGMock.cmake b/cmake/dependencies/FindGMock.cmake
similarity index 100%
rename from cmake/find/FindGMock.cmake
rename to cmake/dependencies/FindGMock.cmake
diff --git a/cmake/find/FindMKL.cmake b/cmake/dependencies/FindMKL.cmake
similarity index 100%
rename from cmake/find/FindMKL.cmake
rename to cmake/dependencies/FindMKL.cmake
diff --git a/cmake/find/FindNCCL.cmake b/cmake/dependencies/FindNCCL.cmake
similarity index 100%
rename from cmake/find/FindNCCL.cmake
rename to cmake/dependencies/FindNCCL.cmake
diff --git a/cmake/find/FindOgg.cmake b/cmake/dependencies/FindOgg.cmake
similarity index 100%
rename from cmake/find/FindOgg.cmake
rename to cmake/dependencies/FindOgg.cmake
diff --git a/cmake/find/FindSndFile.cmake b/cmake/dependencies/FindSndFile.cmake
similarity index 100%
rename from cmake/find/FindSndFile.cmake
rename to cmake/dependencies/FindSndFile.cmake
diff --git a/cmake/find/FindVorbis.cmake b/cmake/dependencies/FindVorbis.cmake
similarity index 100%
rename from cmake/find/FindVorbis.cmake
rename to cmake/dependencies/FindVorbis.cmake
diff --git a/cmake/find/Findgflags.cmake b/cmake/dependencies/Findgflags.cmake
similarity index 100%
rename from cmake/find/Findgflags.cmake
rename to cmake/dependencies/Findgflags.cmake
diff --git a/cmake/find/Findsox.cmake b/cmake/dependencies/Findsox.cmake
similarity index 100%
rename from cmake/find/Findsox.cmake
rename to cmake/dependencies/Findsox.cmake
diff --git a/cmake/utils/toolchain.cmake b/cmake/utils/toolchain.cmake
index 9d75007..47d0dd8 100644
--- a/cmake/utils/toolchain.cmake
+++ b/cmake/utils/toolchain.cmake
@@ -15,7 +15,7 @@ message(VERBOSE "appended utility dir to cmake module path (${FM_CMAKE_UTILITY_D
 list(APPEND CMAKE_MODULE_PATH "${FM_CMAKE_LIBRARY_DIR}/../")
 message(VERBOSE "appended (${FM_CMAKE_LIBRARY_DIR}/../) cmake/ to cmake module path")
 
-set(FM_CMAKE_FIND_SCRIPT_DIR "${FM_CMAKE_LIBRARY_DIR}/../find/")
+set(FM_CMAKE_FIND_SCRIPT_DIR "${FM_CMAKE_LIBRARY_DIR}/../dependencies/")
 list(APPEND CMAKE_MODULE_PATH "${FM_CMAKE_FIND_SCRIPT_DIR}")
 message(VERBOSE "appended find scripts to module path (${FM_CMAKE_FIND_SCRIPT_DIR})")
 
diff --git a/flashlight/fl/autograd/tensor/backend/cudnn/RNN.cpp b/flashlight/fl/autograd/tensor/backend/cudnn/RNN.cpp
index ec209fd..17b242b 100644
--- a/flashlight/fl/autograd/tensor/backend/cudnn/RNN.cpp
+++ b/flashlight/fl/autograd/tensor/backend/cudnn/RNN.cpp
@@ -167,15 +167,12 @@ std::tuple<Tensor, Tensor, Tensor> CudnnAutogradExtension::rnn(
 
     TensorDescriptor cyDesc(x.type(), hDims);
 
-    size_t workspaceSize =
-        getWorkspaceSize(handle, rnnDesc, seqLength, xDescs);
-    size_t reserveSize =
-        getReserveSize(handle, rnnDesc, seqLength, xDescs);
+    size_t workspaceSize = getWorkspaceSize(handle, rnnDesc, seqLength, xDescs);
+    size_t reserveSize = getReserveSize(handle, rnnDesc, seqLength, xDescs);
 
     Tensor workspace({static_cast<long long>(workspaceSize)}, fl::dtype::b8);
     // Space must be reused between forward and backward for cuDNN
-    payload->reserveSpace =
-        Tensor({static_cast<long long>(reserveSize)}, fl::dtype::b8);
+    payload->reserveSpace = Tensor({static_cast<long long>(reserveSize)}, fl::dtype::b8);
 
     {
         auto contiguousX = x.asContiguousTensor();
@@ -265,8 +262,7 @@ std::tuple<Tensor, Tensor, Tensor, Tensor> CudnnAutogradExtension::rnnBackward(
     int outSize = hiddenSize * (bidirectional ? 2 : 1);
 
     DropoutDescriptor dropout(dropProb);
-    RNNDescriptor rnnDesc(
-        input.type(), hiddenSize, numLayers, mode, bidirectional, dropout);
+    RNNDescriptor rnnDesc(input.type(), hiddenSize, numLayers, mode, bidirectional, dropout);
     setCudnnRnnMathType(input, rnnDesc);
 
     TensorDescriptorArray yDesc(seqLength, y.type(), {1, 1, outSize, batchSize});

From 2c798275d277ed27ac2a3d9f9186d822f1817992 Mon Sep 17 00:00:00 2001
From: Lukas Thomann <lukas.thomann@outlook.com>
Date: Fri, 27 Feb 2026 20:08:36 +0100
Subject: [PATCH 3/4] accidentally deleted cpu and cuda features

---
 vcpkg.json | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/vcpkg.json b/vcpkg.json
index 0463957..06083f2 100644
--- a/vcpkg.json
+++ b/vcpkg.json
@@ -6,6 +6,16 @@
     "gtest"
   ],
   "features": {
+    "cuda": {
+      "description": "Dependencies for gpu backend",
+      "dependencies": []
+    },
+    "cpu": {
+      "description": "Dependencies for cpu backend",
+      "dependencies": [
+        "onednn"
+      ]
+    },
     "distributed": {
       "description": "Dependencies for distributed backend",
       "dependencies": [
@@ -42,4 +52,4 @@
       ]
     }
   }
-}
+}
\ No newline at end of file

From 21a75af4c97e633ba3eb15c6cda6ec5dd3dbc5f8 Mon Sep 17 00:00:00 2001
From: Lukas Thomann <lukas.thomann@outlook.com>
Date: Fri, 27 Feb 2026 22:39:40 +0100
Subject: [PATCH 4/4] began to add first stuff

---
 Folder.DotSettings                            |   2 +
 cmake/dependencies/FindCUDNN.cmake            |   4 +
 cmake/utils/flashlightConfig.cmake.in         |   2 +-
 .../tensor/backend/cudnn/BatchNorm.cpp        |  32 +-
 .../autograd/tensor/backend/cudnn/Conv2D.cpp  |  74 ++---
 .../backend/cudnn/CudnnAutogradExtension.cpp  |  12 +-
 .../tensor/backend/cudnn/CudnnUtils.cpp       | 288 +++++++++---------
 .../tensor/backend/cudnn/CudnnUtils.h         |  67 +++-
 .../autograd/tensor/backend/cudnn/Pool2D.cpp  |  18 +-
 .../fl/autograd/tensor/backend/cudnn/RNN.cpp  | 266 ++++++++--------
 10 files changed, 401 insertions(+), 364 deletions(-)
 create mode 100644 Folder.DotSettings

diff --git a/Folder.DotSettings b/Folder.DotSettings
new file mode 100644
index 0000000..069ccf5
--- /dev/null
+++ b/Folder.DotSettings
@@ -0,0 +1,2 @@
+﻿<wpf:ResourceDictionary xml:space="preserve" xmlns:x="http://schemas.microsoft.com/winfx/2006/xaml" xmlns:s="clr-namespace:System;assembly=mscorlib" xmlns:ss="urn:shemas-jetbrains-com:settings-storage-xaml" xmlns:wpf="http://schemas.microsoft.com/winfx/2006/xaml/presentation">
+	<s:String x:Key="/Default/CodeStyle/Naming/CppNamingOptions/Rules/=0B233A6C23E887458E5DB7357199AE90/@EntryIndexedValue">&lt;NamingElement Priority="6" Title="Parameters"&gt;&lt;Descriptor Static="Indeterminate" Constexpr="Indeterminate" Const="Indeterminate" Volatile="Indeterminate" Accessibility="NOT_APPLICABLE"&gt;&lt;type Name="function parameter" /&gt;&lt;type Name="lambda parameter" /&gt;&lt;/Descriptor&gt;&lt;Policy Inspect="True" WarnAboutPrefixesAndSuffixes="False" Prefix="" Suffix="" Style="aaBb"&gt;&lt;ExtraRule Prefix="_" Suffix="" Style="aaBb" /&gt;&lt;/Policy&gt;&lt;/NamingElement&gt;</s:String></wpf:ResourceDictionary>
\ No newline at end of file
diff --git a/cmake/dependencies/FindCUDNN.cmake b/cmake/dependencies/FindCUDNN.cmake
index a150310..fd77eea 100644
--- a/cmake/dependencies/FindCUDNN.cmake
+++ b/cmake/dependencies/FindCUDNN.cmake
@@ -76,4 +76,8 @@ if(CUDNN_FOUND)
   endif()
 endif()
 
+if (CUDNN_FOUND AND CUDNN_VERSION VERSION_LESS "8.0")
+  message(FATAL_ERROR "Flashlight requires cuDNN >= 8.0, found ${CUDNN_VERSION}")
+endif()
+
 mark_as_advanced(CUDNN_ROOT CUDNN_INCLUDE_DIR CUDNN_LIBRARY CUDNN_VERSION)
diff --git a/cmake/utils/flashlightConfig.cmake.in b/cmake/utils/flashlightConfig.cmake.in
index 2bf2550..9d73423 100644
--- a/cmake/utils/flashlightConfig.cmake.in
+++ b/cmake/utils/flashlightConfig.cmake.in
@@ -49,7 +49,7 @@ if (@FL_BUILD_STANDALONE@)
   endif()
   if (@FL_USE_CUDA@)
     if (@FL_USE_CUDNN@)
-      find_dependency(CUDNN 7.1)
+      find_dependency(CUDNN 8)
     endif()
     if (@FL_BUILD_DISTRIBUTED@)
       find_dependency(NCCL)
diff --git a/flashlight/fl/autograd/tensor/backend/cudnn/BatchNorm.cpp b/flashlight/fl/autograd/tensor/backend/cudnn/BatchNorm.cpp
index 9f6b315..2538f88 100644
--- a/flashlight/fl/autograd/tensor/backend/cudnn/BatchNorm.cpp
+++ b/flashlight/fl/autograd/tensor/backend/cudnn/BatchNorm.cpp
@@ -48,15 +48,15 @@ namespace {
 
         if(minAxis == 0) {
             modeOut = CUDNN_BATCHNORM_PER_ACTIVATION;
-            inDescDimsOut = Shape(
+            inDescDimsOut = Shape{
                 {
                     1,
                     1,
                     nfeatures,
                     static_cast<long long>(input.elements() / nfeatures)
                 }
-            );
-            wtDescDimsOut = Shape({1, 1, nfeatures});
+            };
+            wtDescDimsOut = Shape{1, 1, nfeatures};
         } else {
             modeOut = CUDNN_BATCHNORM_SPATIAL;
 #if CUDNN_VERSION >= 7003
@@ -67,15 +67,15 @@ namespace {
             int batchsz = 1;
             for(int i = maxAxis + 1; i < input.ndim(); ++i)
                 batchsz *= input.dim(i);
-            inDescDimsOut = Shape(
+            inDescDimsOut = Shape{
                 {
                     1,
                     static_cast<long long>(input.elements() / (nfeatures * batchsz)),
                     nfeatures,
                     batchsz,
                 }
-            );
-            wtDescDimsOut = Shape({1, 1, nfeatures});
+            };
+            wtDescDimsOut = Shape{1, 1, nfeatures};
         }
     }
 
@@ -101,7 +101,7 @@ Tensor CudnnAutogradExtension::batchnorm(
         );
     FL_TENSOR_DTYPES_MATCH_CHECK(weight, bias, runningMean, runningVar);
 
-    auto output = Tensor(input.shape(), input.type());
+    auto output = Tensor{input.shape(), input.type()};
 
     cudnnBatchNormMode_t mode;
     Shape inDescDims, wtDescDims;
@@ -122,8 +122,8 @@ Tensor CudnnAutogradExtension::batchnorm(
     fl::dtype scalarsType =
         input.type() == fl::dtype::f16 ? fl::dtype::f32 : input.type();
 
-    auto inDesc = TensorDescriptor(input.type(), inDescDims);
-    auto wtDesc = TensorDescriptor(weightArray.type(), wtDescDims);
+    auto inDesc = TensorDescriptor{input.type(), inDescDims};
+    auto wtDesc = TensorDescriptor{weightArray.type(), wtDescDims};
 
     {
         DevicePtr inRaw(input);
@@ -140,8 +140,8 @@ Tensor CudnnAutogradExtension::batchnorm(
         );
 
         if(train) {
-            saveMean = Tensor({wtDescDims[2]}, scalarsType);
-            saveVar = Tensor({wtDescDims[2]}, scalarsType);
+            saveMean = Tensor{{wtDescDims[2]}, scalarsType};
+            saveVar = Tensor{{wtDescDims[2]}, scalarsType};
 
             DevicePtr saveMeanRaw(saveMean);
             DevicePtr saveVarRaw(saveVar);
@@ -223,13 +223,13 @@ std::tuple<Tensor, Tensor, Tensor> CudnnAutogradExtension::batchnormBackward(
     const void* one1 = kOne(scalarsType);
     const void* zero0 = kZero(scalarsType);
 
-    auto iDesc = TensorDescriptor(input.type(), inDescDims);
-    auto wDesc = TensorDescriptor(wt.type(), wtDescDims);
+    auto iDesc = TensorDescriptor{input.type(), inDescDims};
+    auto wDesc = TensorDescriptor{wt.type(), wtDescDims};
     // CuDNN doesn't support calculating only the gradients
     // required for batchnorm
-    auto gradIn = Tensor(input.shape(), input.type());
-    auto gradWt = Tensor(wt.shape(), wt.type());
-    auto gradBs = Tensor(wt.shape(), wt.type());
+    auto gradIn = Tensor{input.shape(), input.type()};
+    auto gradWt = Tensor{wt.shape(), wt.type()};
+    auto gradBs = Tensor{wt.shape(), wt.type()};
     {
         DevicePtr iRaw(input);
         DevicePtr wRaw(wt);
diff --git a/flashlight/fl/autograd/tensor/backend/cudnn/Conv2D.cpp b/flashlight/fl/autograd/tensor/backend/cudnn/Conv2D.cpp
index bb89e61..b9f63c6 100644
--- a/flashlight/fl/autograd/tensor/backend/cudnn/Conv2D.cpp
+++ b/flashlight/fl/autograd/tensor/backend/cudnn/Conv2D.cpp
@@ -314,9 +314,9 @@ Tensor CudnnAutogradExtension::conv2d(
 
     auto hasBias = bias.elements() > 0;
 
-    auto inDesc = TensorDescriptor(input);
-    auto wtDesc = FilterDescriptor(weights);
-    auto convDesc = ConvDescriptor(input.type(), px, py, sx, sy, dx, dy, groups);
+    auto inDesc = TensorDescriptor{input};
+    auto wtDesc = FilterDescriptor{weights};
+    auto convDesc = ConvDescriptor{input.type(), px, py, sx, sy, dx, dy, groups};
     if(input.type() == fl::dtype::f16)
         CUDNN_CHECK_ERR(
             cudnnSetConvolutionMathType(
@@ -339,8 +339,8 @@ Tensor CudnnAutogradExtension::conv2d(
             odims.data()
         )
     );
-    auto output = Tensor({odims[3], odims[2], odims[1], odims[0]}, input.type());
-    auto outDesc = TensorDescriptor(output);
+    auto output = Tensor{{odims[3], odims[2], odims[1], odims[0]}, input.type()};
+    auto outDesc = TensorDescriptor{output};
 
     auto handle = getCudnnHandle();
     const auto& cudnnStream = getCudnnStream();
@@ -357,7 +357,7 @@ Tensor CudnnAutogradExtension::conv2d(
 
     try {
         wspace =
-            Tensor({static_cast<long long>(fwdAlgoBestPerf.memory)}, fl::dtype::b8);
+            Tensor{{static_cast<long long>(fwdAlgoBestPerf.memory)}, fl::dtype::b8};
     } catch(const std::exception&) {
         fwdAlgoBestPerf.algo = kFwdDefaultAlgo;
         CUDNN_CHECK_ERR(
@@ -372,7 +372,7 @@ Tensor CudnnAutogradExtension::conv2d(
             )
         );
         wspace =
-            Tensor({static_cast<long long>(fwdAlgoBestPerf.memory)}, fl::dtype::b8);
+            Tensor{{static_cast<long long>(fwdAlgoBestPerf.memory)}, fl::dtype::b8};
     }
     {
         DevicePtr inPtr(input);
@@ -405,7 +405,7 @@ Tensor CudnnAutogradExtension::conv2d(
         );
 
         if(hasBias) {
-            auto bsDesc = TensorDescriptor(bias);
+            auto bsDesc = TensorDescriptor{bias};
             DevicePtr bsPtr(bias);
             // ensure cudnn compute stream waits on stream of bias tensor
             relativeSync(cudnnStream, {bias});
@@ -453,10 +453,10 @@ Tensor CudnnAutogradExtension::conv2dBackwardData(
     // benchmarking suggests input or weight casting should occur, these
     // descriptors may not be used/new ones with the correct types will be
     // used instead.
-    auto iDesc = TensorDescriptor(input);
-    auto wDesc = FilterDescriptor(weight);
-    auto cDesc = ConvDescriptor(input.type(), px, py, sx, sy, dx, dy, groups);
-    auto oDesc = TensorDescriptor(gradOutput);
+    auto iDesc = TensorDescriptor{input};
+    auto wDesc = FilterDescriptor{weight};
+    auto cDesc = ConvDescriptor{input.type(), px, py, sx, sy, dx, dy, groups};
+    auto oDesc = TensorDescriptor{gradOutput};
 
     setDefaultMathType(cDesc, input);
 
@@ -491,10 +491,10 @@ Tensor CudnnAutogradExtension::conv2dBackwardData(
 
             Tensor ws;
             try {
-                ws = Tensor(
+                ws = Tensor{
                     {static_cast<long long>(bwdDataAlgoBestPerf.memory)},
                     fl::dtype::b8
-                );
+                };
             } catch(const std::exception&) {
                 bwdDataAlgoBestPerf.algo = kBwdDataDefaultAlgo;
                 CUDNN_CHECK_ERR(
@@ -508,13 +508,13 @@ Tensor CudnnAutogradExtension::conv2dBackwardData(
                         &bwdDataAlgoBestPerf.memory
                     )
                 );
-                ws = Tensor(
+                ws = Tensor{
                     {static_cast<long long>(bwdDataAlgoBestPerf.memory)},
                     fl::dtype::b8
-                );
+                };
             }
 
-            auto gradInput = Tensor(inTensor.shape(), inTensor.type());
+            auto gradInput = Tensor{inTensor.shape(), inTensor.type()};
             {
                 DevicePtr gradInputPtr(gradInput);
                 DevicePtr gradResultPtr(gradOutputTensor);
@@ -577,11 +577,11 @@ Tensor CudnnAutogradExtension::conv2dBackwardData(
                 /* incrementCount = */ false
             );
 
-            auto iDescF32 = TensorDescriptor(inTensorF32);
-            auto wDescF32 = FilterDescriptor(wtTensorF32);
+            auto iDescF32 = TensorDescriptor{inTensorF32};
+            auto wDescF32 = FilterDescriptor{wtTensorF32};
             auto cDescF32 =
-                ConvDescriptor(fl::dtype::f32, px, py, sx, sy, dx, dy, groups);
-            auto oDescF32 = TensorDescriptor(gradOutputTensorF32);
+                ConvDescriptor{fl::dtype::f32, px, py, sx, sy, dx, dy, groups};
+            auto oDescF32 = TensorDescriptor{gradOutputTensorF32};
             // core bwd data computation
             dataGradBenchmark->audit(
                 [&dataGradOut,
@@ -671,10 +671,10 @@ std::pair<Tensor, Tensor> CudnnAutogradExtension::conv2dBackwardFilterBias(
     // benchmarking suggests input or weight casting should occur, these
     // descriptors may not be used/new ones with the correct types will be
     // used instead.
-    auto iDesc = TensorDescriptor(input);
-    auto wDesc = FilterDescriptor(weight);
-    auto cDesc = ConvDescriptor(input.type(), px, py, sx, sy, dx, dy, groups);
-    auto oDesc = TensorDescriptor(gradOutput);
+    auto iDesc = TensorDescriptor{input};
+    auto wDesc = FilterDescriptor{weight};
+    auto cDesc = ConvDescriptor{input.type(), px, py, sx, sy, dx, dy, groups};
+    auto oDesc = TensorDescriptor{gradOutput};
 
     setDefaultMathType(cDesc, input);
 
@@ -708,10 +708,10 @@ std::pair<Tensor, Tensor> CudnnAutogradExtension::conv2dBackwardFilterBias(
 
             Tensor ws;
             try {
-                ws = Tensor(
+                ws = Tensor{
                     {static_cast<long long>(bwdFilterAlgoBestPerf.memory)},
                     fl::dtype::b8
-                );
+                };
             } catch(const std::exception&) {
                 bwdFilterAlgoBestPerf.algo = kBwdFilterDefaultAlgo;
                 CUDNN_CHECK_ERR(
@@ -725,13 +725,13 @@ std::pair<Tensor, Tensor> CudnnAutogradExtension::conv2dBackwardFilterBias(
                         &bwdFilterAlgoBestPerf.memory
                     )
                 );
-                ws = Tensor(
+                ws = Tensor{
                     {static_cast<long long>(bwdFilterAlgoBestPerf.memory)},
                     fl::dtype::b8
-                );
+                };
             }
 
-            auto gradWeight = Tensor(wtTensor.shape(), wtTensor.type());
+            auto gradWeight = Tensor{wtTensor.shape(), wtTensor.type()};
             {
                 DevicePtr gradWeightPtr(gradWeight);
                 DevicePtr gradResultPtr(gradOutputTensor);
@@ -794,11 +794,11 @@ std::pair<Tensor, Tensor> CudnnAutogradExtension::conv2dBackwardFilterBias(
                 /* incrementCount = */ false
             );
 
-            auto iDescF32 = TensorDescriptor(inTensorF32);
-            auto wDescF32 = FilterDescriptor(wtTensorF32);
+            auto iDescF32 = TensorDescriptor{inTensorF32};
+            auto wDescF32 = FilterDescriptor{wtTensorF32};
             auto cDescF32 =
-                ConvDescriptor(fl::dtype::f32, px, py, sx, sy, dx, dy, groups);
-            auto oDescF32 = TensorDescriptor(gradOutputTensorF32);
+                ConvDescriptor{fl::dtype::f32, px, py, sx, sy, dx, dy, groups};
+            auto oDescF32 = TensorDescriptor{gradOutputTensorF32};
             // core bwd data computation
             filterGradBenchmark->audit(
                 [&filterGradOut,
@@ -860,13 +860,13 @@ std::pair<Tensor, Tensor> CudnnAutogradExtension::conv2dBackwardFilterBias(
         const Tensor& bsTensor,
         const Tensor& gradOutput,
         const TensorDescriptor& oDesc) -> Tensor {
-            auto gradBias = Tensor(bsTensor.shape(), bsTensor.type());
+            auto gradBias = Tensor{bsTensor.shape(), bsTensor.type()};
             {
                 DevicePtr gradBiasPtr(gradBias);
                 DevicePtr gradResultPtr(gradOutput);
                 // ensure cudnn compute stream waits on gradient tensor streams
                 relativeSync(cudnnStream, {gradOutput, gradBias});
-                auto bDesc = TensorDescriptor(bsTensor);
+                auto bDesc = TensorDescriptor{bsTensor};
                 CUDNN_CHECK_ERR(
                     cudnnConvolutionBackwardBias(
                         hndl,
@@ -911,7 +911,7 @@ std::pair<Tensor, Tensor> CudnnAutogradExtension::conv2dBackwardFilterBias(
                     },
                     /* incrementCount = */ false
                 );
-                auto oDescF32 = TensorDescriptor(gradOutputF32);
+                auto oDescF32 = TensorDescriptor{gradOutputF32};
                 // Perform bias gradient computation
                 biasGradBenchmark->audit(
                     [&biasGradOut,
diff --git a/flashlight/fl/autograd/tensor/backend/cudnn/CudnnAutogradExtension.cpp b/flashlight/fl/autograd/tensor/backend/cudnn/CudnnAutogradExtension.cpp
index 305a6cc..eaac140 100644
--- a/flashlight/fl/autograd/tensor/backend/cudnn/CudnnAutogradExtension.cpp
+++ b/flashlight/fl/autograd/tensor/backend/cudnn/CudnnAutogradExtension.cpp
@@ -16,13 +16,11 @@ namespace fl {
 std::shared_ptr<fl::DynamicBenchmark> CudnnAutogradExtension::createBenchmarkOptions() {
     return std::make_shared<fl::DynamicBenchmark>(
         std::make_shared<fl::DynamicBenchmarkOptions<KernelMode>>(
-            std::vector<KernelMode>(
-                {
-                    KernelMode::F32,
-                    KernelMode::F32_ALLOW_CONVERSION,
-                    KernelMode::F16
-                }
-            ),
+            std::vector<KernelMode>{
+                KernelMode::F32,
+                KernelMode::F32_ALLOW_CONVERSION,
+                KernelMode::F16
+            },
             fl::kDynamicBenchmarkDefaultCount
         )
     );
diff --git a/flashlight/fl/autograd/tensor/backend/cudnn/CudnnUtils.cpp b/flashlight/fl/autograd/tensor/backend/cudnn/CudnnUtils.cpp
index 82cadcb..804bc6f 100644
--- a/flashlight/fl/autograd/tensor/backend/cudnn/CudnnUtils.cpp
+++ b/flashlight/fl/autograd/tensor/backend/cudnn/CudnnUtils.cpp
@@ -25,16 +25,16 @@ struct DeviceHandle {
     std::shared_ptr<fl::CUDAStream> stream;
 
     explicit DeviceHandle(std::shared_ptr<fl::CUDAStream> _stream) : cudnnHandle(nullptr),
-                                                                     stream(_stream) {
+        stream(_stream) {
         CUDNN_CHECK_ERR(cudnnCreate(&cudnnHandle));
         CUDNN_CHECK_ERR(cudnnSetStream(cudnnHandle, stream->handle()));
     }
 
     ~DeviceHandle() {
         if(cudnnHandle) {
-// See https://git.io/fNQnM - sometimes, at exit, the CUDA context
-// (or something) is already destroyed by the time a handle gets destroyed
-// because of an issue with the destruction order.
+            // See https://git.io/fNQnM - sometimes, at exit, the CUDA context
+            // (or something) is already destroyed by the time a handle gets destroyed
+            // because of an issue with the destruction order.
 #ifdef NO_CUDNN_DESTROY_HANDLE
 #else
             CUDNN_CHECK_ERR(cudnnDestroy(cudnnHandle));
@@ -43,16 +43,16 @@ struct DeviceHandle {
     }
 };
 
-const float kFloatZero = 0.0;
-const float kFloatOne = 1.0;
+constexpr float kFloatZero = 0.0;
+constexpr float kFloatOne = 1.0;
 
-const double kDoubleZero = 0.0;
-const double kDoubleOne = 1.0;
+constexpr double kDoubleZero = 0.0;
+constexpr double kDoubleOne = 1.0;
 
 // TODO: move this to CudnnAutogradExtension if we make it a singleton
 std::unordered_map<int, DeviceHandle> handles;
 
-const DeviceHandle& getActiveDeviceHandle() {
+DeviceHandle const& getActiveDeviceHandle() {
     auto& manager = fl::DeviceManager::getInstance();
     auto& cudaDevice =
         manager.getActiveDevice(fl::DeviceType::CUDA).impl<fl::CUDADevice>();
@@ -88,57 +88,42 @@ namespace fl {
 void cudnnCheckErr(cudnnStatus_t status) {
     if(status == CUDNN_STATUS_SUCCESS)
         return;
-    const char* err = cudnnGetErrorString(status);
+    char const* err = cudnnGetErrorString(status);
     switch(status) {
-        case CUDNN_STATUS_BAD_PARAM:
-            throw std::invalid_argument(err);
-        default:
-            throw std::runtime_error(err);
+        case CUDNN_STATUS_BAD_PARAM: throw std::invalid_argument(err);
+        default: throw std::runtime_error(err);
     }
 }
 
-cudnnDataType_t cudnnMapToType(const fl::dtype& t) {
+cudnnDataType_t cudnnMapToType(fl::dtype const& t) {
     switch(t) {
-        case fl::dtype::f16:
-            return CUDNN_DATA_HALF;
-        case fl::dtype::f32:
-            return CUDNN_DATA_FLOAT;
-        case fl::dtype::f64:
-            return CUDNN_DATA_DOUBLE;
-        default:
-            throw std::invalid_argument("unsupported data type for cuDNN");
+        case fl::dtype::f16: return CUDNN_DATA_HALF;
+        case fl::dtype::f32: return CUDNN_DATA_FLOAT;
+        case fl::dtype::f64: return CUDNN_DATA_DOUBLE;
+        default: throw std::invalid_argument("unsupported data type for cuDNN");
     }
 }
 
-cudnnPoolingMode_t cudnnMapToPoolingMode(const PoolingMode mode) {
+cudnnPoolingMode_t cudnnMapToPoolingMode(PoolingMode const mode) {
     switch(mode) {
-        case PoolingMode::MAX:
-            return CUDNN_POOLING_MAX;
-        case PoolingMode::AVG_INCLUDE_PADDING:
-            return CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING;
-        case PoolingMode::AVG_EXCLUDE_PADDING:
-            return CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING;
-        default:
-            throw std::invalid_argument("unsupported pooling mode for cuDNN");
+        case PoolingMode::MAX: return CUDNN_POOLING_MAX;
+        case PoolingMode::AVG_INCLUDE_PADDING: return CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING;
+        case PoolingMode::AVG_EXCLUDE_PADDING: return CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING;
+        default: throw std::invalid_argument("unsupported pooling mode for cuDNN");
     }
 }
 
-cudnnRNNMode_t cudnnMapToRNNMode(const RnnMode mode) {
+cudnnRNNMode_t cudnnMapToRNNMode(RnnMode const mode) {
     switch(mode) {
-        case RnnMode::RELU:
-            return CUDNN_RNN_RELU;
-        case RnnMode::TANH:
-            return CUDNN_RNN_TANH;
-        case RnnMode::LSTM:
-            return CUDNN_LSTM;
-        case RnnMode::GRU:
-            return CUDNN_GRU;
-        default:
-            throw std::invalid_argument("unsupported RNN mode for cuDNN");
+        case RnnMode::RELU: return CUDNN_RNN_RELU;
+        case RnnMode::TANH: return CUDNN_RNN_TANH;
+        case RnnMode::LSTM: return CUDNN_LSTM;
+        case RnnMode::GRU: return CUDNN_GRU;
+        default: throw std::invalid_argument("unsupported RNN mode for cuDNN");
     }
 }
 
-TensorDescriptor::TensorDescriptor(const fl::dtype type, const Shape& flDims) {
+TensorDescriptor::TensorDescriptor(fl::dtype const type, Shape const& flDims) {
     CUDNN_CHECK_ERR(cudnnCreateTensorDescriptor(&descriptor));
     cudnnDataType_t cudnntype = cudnnMapToType(type);
 
@@ -165,7 +150,7 @@ TensorDescriptor::TensorDescriptor(const fl::dtype type, const Shape& flDims) {
     );
 }
 
-TensorDescriptor::TensorDescriptor(const Tensor& input) {
+TensorDescriptor::TensorDescriptor(Tensor const& input) {
     CUDNN_CHECK_ERR(cudnnCreateTensorDescriptor(&descriptor));
     cudnnDataType_t cudnntype = cudnnMapToType(input.type());
 
@@ -194,21 +179,19 @@ TensorDescriptor::TensorDescriptor(const Tensor& input) {
     );
 }
 
-TensorDescriptor::~TensorDescriptor() {
-    CUDNN_CHECK_ERR(cudnnDestroyTensorDescriptor(descriptor));
-}
+TensorDescriptor::~TensorDescriptor() { CUDNN_CHECK_ERR(cudnnDestroyTensorDescriptor(descriptor)); }
 
 TensorDescriptorArray::TensorDescriptorArray(
     int size,
-    const fl::dtype type,
-    const Shape& dims
+    fl::dtype const type,
+    Shape const& dims
 ) {
-    desc_vec.reserve(size);
+    _descVec.reserve(size);
     for(int i = 0; i < size; i++) {
-        desc_vec.emplace_back(type, dims);
-        desc_raw_vec.push_back(desc_vec.back().descriptor);
+        _descVec.emplace_back(type, dims);
+        _descRawVec.push_back(_descVec.back().descriptor);
     }
-    descriptors = desc_raw_vec.data();
+    descriptors = _descRawVec.data();
 }
 
 TensorDescriptorArray::~TensorDescriptorArray() = default;
@@ -241,11 +224,9 @@ PoolingDescriptor::PoolingDescriptor(
     );
 }
 
-PoolingDescriptor::~PoolingDescriptor() {
-    CUDNN_CHECK_ERR(cudnnDestroyPoolingDescriptor(descriptor));
-}
+PoolingDescriptor::~PoolingDescriptor() { CUDNN_CHECK_ERR(cudnnDestroyPoolingDescriptor(descriptor)); }
 
-FilterDescriptor::FilterDescriptor(const Tensor& input) {
+FilterDescriptor::FilterDescriptor(Tensor const& input) {
     CUDNN_CHECK_ERR(cudnnCreateFilterDescriptor(&descriptor));
     cudnnDataType_t cudnntype = cudnnMapToType(input.type());
 
@@ -267,121 +248,146 @@ FilterDescriptor::FilterDescriptor(const Tensor& input) {
     );
 }
 
-FilterDescriptor::~FilterDescriptor() {
-    CUDNN_CHECK_ERR(cudnnDestroyFilterDescriptor(descriptor));
-}
+FilterDescriptor::~FilterDescriptor() { CUDNN_CHECK_ERR(cudnnDestroyFilterDescriptor(descriptor)); }
 
-DropoutDescriptor::DropoutDescriptor(float drop_prob) {
+DropoutDescriptor::DropoutDescriptor(float dropProb) {
     CUDNN_CHECK_ERR(cudnnCreateDropoutDescriptor(&descriptor));
-    auto cudnnHandle = getCudnnHandle();
-    unsigned long long seed = 0;
-    size_t state_size;
-    CUDNN_CHECK_ERR(cudnnDropoutGetStatesSize(cudnnHandle, &state_size));
-    auto& dropout_states = getDropoutStates();
-    if(dropout_states.isEmpty()) {
-        dropout_states =
-            Tensor({static_cast<long long>(state_size)}, fl::dtype::b8);
-        DevicePtr statesraw(dropout_states);
+
+    auto const cudnnHandle = getCudnnHandle();
+    constexpr unsigned long long seed = 0;
+    size_t stateSize;
+
+    CUDNN_CHECK_ERR(cudnnDropoutGetStatesSize(cudnnHandle, &stateSize));
+
+    auto& dropoutStates = getDropoutStates();
+
+    if(dropoutStates.isEmpty()) {
+        dropoutStates =
+            Tensor{{static_cast<long long>(stateSize)}, fl::dtype::b8};
+        DevicePtr statesraw(dropoutStates);
         CUDNN_CHECK_ERR(
             cudnnSetDropoutDescriptor(
                 descriptor,
                 cudnnHandle,
-                drop_prob,
+                dropProb,
                 statesraw.get(),
-                state_size,
+                stateSize,
                 seed
             )
         );
-    } else {
-        DevicePtr statesraw(dropout_states);
-// See https://git.io/fp9oo for an explanation.
-#if CUDNN_VERSION >= 7000
+    }
+    else {
+        DevicePtr statesraw(dropoutStates);
         CUDNN_CHECK_ERR(
             cudnnRestoreDropoutDescriptor(
                 descriptor,
                 cudnnHandle,
-                drop_prob,
+                dropProb,
                 statesraw.get(),
-                state_size,
+                stateSize,
                 seed
             )
         );
-#else
-        auto dropout_struct = reinterpret_cast<CudnnDropoutStruct*>(descriptor);
-        dropout_struct->dropout = drop_prob;
-        dropout_struct->nstates = state_size;
-        dropout_struct->states = statesraw.get();
-#endif
     }
 }
 
-DropoutDescriptor::~DropoutDescriptor() {
-    CUDNN_CHECK_ERR(cudnnDestroyDropoutDescriptor(descriptor));
-}
+DropoutDescriptor::~DropoutDescriptor() { CUDNN_CHECK_ERR(cudnnDestroyDropoutDescriptor(descriptor)); }
 
 Tensor& DropoutDescriptor::getDropoutStates() {
-    thread_local Tensor dropout_states;
-    return dropout_states;
+    thread_local Tensor dropoutStates;
+    return dropoutStates;
 }
 
 RNNDescriptor::RNNDescriptor(
     fl::dtype type,
-    int hidden_size,
-    int num_layers,
+    int inputSize,
+    int hiddenSize,
+    int numLayers,
     RnnMode mode,
     bool bidirectional,
     DropoutDescriptor& dropout
 ) {
-    CUDNN_CHECK_ERR(cudnnCreateRNNDescriptor(&descriptor));
-
-    auto cudnnHandle = getCudnnHandle();
+    CUDNN_CHECK_ERR(cudnnCreateRNNDescriptor(&_handle));
 
-    cudnnRNNInputMode_t in_mode = CUDNN_LINEAR_INPUT;
+    constexpr auto inMode = CUDNN_LINEAR_INPUT;
+    constexpr auto algo = CUDNN_RNN_ALGO_STANDARD;
 
-    cudnnDirectionMode_t dir =
-        bidirectional ? CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL;
+    auto const dir = bidirectional ? CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL;
 
-    cudnnRNNMode_t cell = cudnnMapToRNNMode(mode);
-    cudnnRNNAlgo_t algo = CUDNN_RNN_ALGO_STANDARD;
-    cudnnDataType_t cudnntype = cudnnMapToType(type);
+    auto const cell = cudnnMapToRNNMode(mode);
+    auto const dataType = cudnnMapToType(type);
 
-#if CUDNN_VERSION >= 7000 && CUDNN_VERSION < 8000
     CUDNN_CHECK_ERR(
-        cudnnSetRNNDescriptor(
-            cudnnHandle,
-            descriptor,
-            hidden_size,
-            num_layers,
-            dropout.descriptor,
-            in_mode,
-            dir,
-            cell,
+        //https://docs.nvidia.com/deeplearning/cudnn/archives/cudnn-892/api/index.html#cudnnSetRNNDescriptor_v8
+        cudnnSetRNNDescriptor_v8(
+            _handle,
             algo,
-            cudnntype
+            cell,
+            CUDNN_RNN_DOUBLE_BIAS, //TODO review; double is default for old cudnn
+            dir,
+            inMode,
+            dataType,
+            dataType, // math precision
+            mathType(type),
+            inputSize,
+            hiddenSize,
+            hiddenSize, //projection size (unused)
+            numLayers,
+            dropout.descriptor,
+            0
         )
     );
-#else
+}
+
+RNNDescriptor::~RNNDescriptor() { CUDNN_CHECK_ERR(cudnnDestroyRNNDescriptor(_handle)); }
+
+}
+
+namespace fl {
+
+
+RNNDataDescriptor::RNNDataDescriptor(fl::dtype type, Shape const& dims) {
+    create();
+    auto const inputSize = dims.ndim() > 0 ? static_cast<int>(dims[0]) : 1;
+    auto const batchSize = dims.ndim() > 1 ? static_cast<int>(dims[1]) : 1;
+    auto const maxSeqSize = dims.ndim() > 2 ? static_cast<int>(dims[2]) : 1;
+
+    std::vector seqSizes(batchSize, maxSeqSize);
+
+    set(
+        type,
+        inputSize,
+        maxSeqSize,
+        seqSizes
+    );
+}
+
+RNNDataDescriptor::~RNNDataDescriptor() { CUDNN_CHECK_ERR(cudnnDestroyRNNDataDescriptor(_handle)); }
+void RNNDataDescriptor::create() { CUDNN_CHECK_ERR(cudnnCreateRNNDataDescriptor(&_handle)); }
+void RNNDataDescriptor::set(
+    fl::dtype type,
+    int inputSize,
+    int maxSeqSize,
+    std::span<int const> sequenceSizes
+) const {
     CUDNN_CHECK_ERR(
-        cudnnSetRNNDescriptor_v6(
-            cudnnHandle,
-            descriptor,
-            hidden_size,
-            num_layers,
-            dropout.descriptor,
-            in_mode,
-            dir,
-            cell,
-            algo,
-            cudnntype
+        cudnnSetRNNDataDescriptor(
+            _handle,
+            cudnnMapToType(type),
+            CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_UNPACKED,
+            maxSeqSize,
+            sequenceSizes.size(), //batch size
+            inputSize,
+            sequenceSizes.data(),
+            nullptr //no padding
         )
     );
-#endif
 }
 
-RNNDescriptor::~RNNDescriptor() {
-    CUDNN_CHECK_ERR(cudnnDestroyRNNDescriptor(descriptor));
 }
 
+namespace fl {
+
 ConvDescriptor::ConvDescriptor(
     fl::dtype type,
     int px,
@@ -413,35 +419,27 @@ ConvDescriptor::ConvDescriptor(
     CUDNN_CHECK_ERR(cudnnSetConvolutionGroupCount(descriptor, groups));
 }
 
-ConvDescriptor::~ConvDescriptor() {
-    CUDNN_CHECK_ERR(cudnnDestroyConvolutionDescriptor(descriptor));
-}
+ConvDescriptor::~ConvDescriptor() { CUDNN_CHECK_ERR(cudnnDestroyConvolutionDescriptor(descriptor)); }
 
 cudnnHandle_t getCudnnHandle() { return getActiveDeviceHandle().cudnnHandle; }
 
-const CUDAStream& getCudnnStream() { return *getActiveDeviceHandle().stream; }
+CUDAStream const& getCudnnStream() { return *getActiveDeviceHandle().stream; }
 
-const void* kOne(const fl::dtype t) {
+void const* kOne(fl::dtype const t) {
     switch(t) {
         case fl::dtype::f16:
-        case fl::dtype::f32:
-            return &kFloatOne;
-        case fl::dtype::f64:
-            return &kDoubleOne;
-        default:
-            throw std::invalid_argument("unsupported data type for cuDNN");
+        case fl::dtype::f32: return &kFloatOne;
+        case fl::dtype::f64: return &kDoubleOne;
+        default: throw std::invalid_argument("unsupported data type for cuDNN");
     }
 }
 
-const void* kZero(const fl::dtype t) {
+void const* kZero(fl::dtype const t) {
     switch(t) {
         case fl::dtype::f16:
-        case fl::dtype::f32:
-            return &kFloatZero;
-        case fl::dtype::f64:
-            return &kDoubleZero;
-        default:
-            throw std::invalid_argument("unsupported data type for cuDNN");
+        case fl::dtype::f32: return &kFloatZero;
+        case fl::dtype::f64: return &kDoubleZero;
+        default: throw std::invalid_argument("unsupported data type for cuDNN");
     }
 }
 
diff --git a/flashlight/fl/autograd/tensor/backend/cudnn/CudnnUtils.h b/flashlight/fl/autograd/tensor/backend/cudnn/CudnnUtils.h
index fca9969..ad83be3 100644
--- a/flashlight/fl/autograd/tensor/backend/cudnn/CudnnUtils.h
+++ b/flashlight/fl/autograd/tensor/backend/cudnn/CudnnUtils.h
@@ -13,13 +13,15 @@
 #include "flashlight/fl/runtime/CUDAStream.h"
 #include "flashlight/fl/tensor/TensorBase.h"
 
+#include <span>
+
 namespace fl {
 
 class TensorDescriptor {
 public:
-    explicit TensorDescriptor(const Tensor& a);
+    explicit TensorDescriptor(Tensor const& a);
 
-    TensorDescriptor(const fl::dtype type, const Shape& af_dims);
+    TensorDescriptor(fl::dtype const type, Shape const& afDims);
 
     cudnnTensorDescriptor_t descriptor;
     ~TensorDescriptor();
@@ -27,19 +29,19 @@ class TensorDescriptor {
 
 class TensorDescriptorArray {
 public:
-    TensorDescriptorArray(int size, const fl::dtype type, const Shape& dims);
+    TensorDescriptorArray(int size, fl::dtype const type, Shape const& dims);
 
     cudnnTensorDescriptor_t* descriptors;
     ~TensorDescriptorArray();
 
 private:
-    std::vector<TensorDescriptor> desc_vec;
-    std::vector<cudnnTensorDescriptor_t> desc_raw_vec;
+    std::vector<TensorDescriptor> _descVec;
+    std::vector<cudnnTensorDescriptor_t> _descRawVec;
 };
 
 class FilterDescriptor {
 public:
-    explicit FilterDescriptor(const Tensor& a);
+    explicit FilterDescriptor(Tensor const& input);
     cudnnFilterDescriptor_t descriptor;
     ~FilterDescriptor();
 };
@@ -77,7 +79,7 @@ class PoolingDescriptor {
 
 class DropoutDescriptor {
 public:
-    explicit DropoutDescriptor(float drop_prob);
+    explicit DropoutDescriptor(float dropProb);
     cudnnDropoutDescriptor_t descriptor;
     ~DropoutDescriptor();
 
@@ -88,28 +90,65 @@ class RNNDescriptor {
 public:
     RNNDescriptor(
         fl::dtype type,
-        int hidden_size,
-        int num_layers,
+        int inputSize,
+        int hiddenSize,
+        int numLayers,
         RnnMode mode,
         bool bidirectional,
         DropoutDescriptor& dropout
     );
-    cudnnRNNDescriptor_t descriptor;
     ~RNNDescriptor();
+
+private:
+    cudnnRNNDescriptor_t _handle = nullptr;
+
+    static constexpr auto mathType(fl::dtype type) {
+        return type == fl::dtype::f16 ? CUDNN_TENSOR_OP_MATH_ALLOW_CONVERSION : CUDNN_DEFAULT_MATH;
+    }
+
+public:
+    /**
+     * @return descriptor handle
+     */
+    constexpr auto get() const { return _handle; }
 };
 
+
+class RNNDataDescriptor {
+public:
+    RNNDataDescriptor(
+        fl::dtype type,
+        Shape const& dims
+    );
+
+    ~RNNDataDescriptor();
+
+private:
+    void create();
+    void set(dtype type, int inputSize, int maxSeqSize, std::span<int const> sequenceSizes) const;
+
+    cudnnRNNDataDescriptor_t _handle = nullptr;
+
+public:
+    /**
+     * @return descriptor handle
+     */
+    constexpr auto get() const { return _handle; }
+};
+
+
 #define CUDNN_CHECK_ERR(expr) ::fl::cudnnCheckErr((expr))
 
 void cudnnCheckErr(cudnnStatus_t status);
 
-cudnnDataType_t cudnnMapToType(const fl::dtype& t);
+cudnnDataType_t cudnnMapToType(fl::dtype const& t);
 
-const void* kOne(const fl::dtype t);
+void const* kOne(fl::dtype const t);
 
-const void* kZero(const fl::dtype t);
+void const* kZero(fl::dtype const t);
 
 // TODO: move this to CudnnAutogradExtension if we make it a singleton
 cudnnHandle_t getCudnnHandle();
-const CUDAStream& getCudnnStream();
+CUDAStream const& getCudnnStream();
 
 } // namespace fl
diff --git a/flashlight/fl/autograd/tensor/backend/cudnn/Pool2D.cpp b/flashlight/fl/autograd/tensor/backend/cudnn/Pool2D.cpp
index 24b08c8..ef838bb 100644
--- a/flashlight/fl/autograd/tensor/backend/cudnn/Pool2D.cpp
+++ b/flashlight/fl/autograd/tensor/backend/cudnn/Pool2D.cpp
@@ -25,10 +25,10 @@ Tensor CudnnAutogradExtension::pool2d(
     const PoolingMode mode,
     std::shared_ptr<detail::AutogradPayload>
 ) {
-    auto inDesc = TensorDescriptor(input);
+    auto inDesc = TensorDescriptor{input};
 
     // init pooling descriptor
-    auto poolDesc = PoolingDescriptor(wx, wy, sx, sy, px, py, mode);
+    auto poolDesc = PoolingDescriptor{wx, wy, sx, sy, px, py, mode};
 
     // init output descriptor
     auto ix = input.dim(0);
@@ -36,7 +36,7 @@ Tensor CudnnAutogradExtension::pool2d(
     auto ox = 1 + (ix + 2 * px - wx) / sx;
     auto oy = 1 + (iy + 2 * py - wy) / sy;
 
-    auto output = Tensor(
+    auto output = Tensor{
         {
             ox,
             oy,
@@ -44,8 +44,8 @@ Tensor CudnnAutogradExtension::pool2d(
             input.ndim() < 4 ? 1 : input.dim(3)
         },
         input.type()
-    );
-    auto outDesc = TensorDescriptor(output);
+    };
+    auto outDesc = TensorDescriptor{output};
     {
         DevicePtr inputraw(input);
         DevicePtr resultraw(output);
@@ -90,11 +90,11 @@ Tensor CudnnAutogradExtension::pool2dBackward(
     const PoolingMode mode,
     std::shared_ptr<detail::AutogradPayload>
 ) {
-    auto i_desc = TensorDescriptor(input);
-    auto o_desc = TensorDescriptor(poolOutput);
-    auto p_desc = PoolingDescriptor(wx, wy, sx, sy, px, py, mode);
+    auto i_desc = TensorDescriptor{input};
+    auto o_desc = TensorDescriptor{poolOutput};
+    auto p_desc = PoolingDescriptor{wx, wy, sx, sy, px, py, mode};
 
-    auto gradInput = Tensor(input.shape(), input.type());
+    auto gradInput = Tensor{input.shape(), input.type()};
 
     auto hndl = getCudnnHandle();
     const auto& cudnnStream = getCudnnStream();
diff --git a/flashlight/fl/autograd/tensor/backend/cudnn/RNN.cpp b/flashlight/fl/autograd/tensor/backend/cudnn/RNN.cpp
index 17b242b..568e4d5 100644
--- a/flashlight/fl/autograd/tensor/backend/cudnn/RNN.cpp
+++ b/flashlight/fl/autograd/tensor/backend/cudnn/RNN.cpp
@@ -15,58 +15,34 @@
 
 namespace fl {
 namespace {
-    size_t getWorkspaceSize(
-        cudnnHandle_t handle,
-        const RNNDescriptor& rnnDesc,
-        const int seqLength,
-        const TensorDescriptorArray& xDescs
-    ) {
-        size_t workspaceSize;
-        CUDNN_CHECK_ERR(
-            cudnnGetRNNWorkspaceSize(
-                handle,
-                rnnDesc.descriptor,
-                seqLength,
-                xDescs.descriptors,
-                &workspaceSize
-            )
-        );
-        return workspaceSize;
-    }
+    struct temp_space_sizes {
+        size_t size;
+        size_t reserveSize;
+    };
 
-    size_t getReserveSize(
+    temp_space_sizes rnnTempSpaceSizes(
         cudnnHandle_t handle,
-        const RNNDescriptor& rnnDesc,
-        const int seqLength,
-        const TensorDescriptorArray& xDescs
+        RNNDescriptor const& rnnDescriptor,
+        RNNDataDescriptor const& xDescriptor,
+        cudnnForwardMode_t mode
     ) {
-        size_t reserveSize;
+        temp_space_sizes sizes{};
+
         CUDNN_CHECK_ERR(
-            cudnnGetRNNTrainingReserveSize(
+            cudnnGetRNNTempSpaceSizes(
                 handle,
-                rnnDesc.descriptor,
-                seqLength,
-                xDescs.descriptors,
-                &reserveSize
+                rnnDescriptor.get(),
+                mode,
+                xDescriptor.get(),
+                &sizes.size,
+                &sizes.reserveSize
             )
         );
-        return reserveSize;
-    }
 
-    void setCudnnRnnMathType(const Tensor& input, const RNNDescriptor& rnnDesc) {
-        if(input.type() == fl::dtype::f16)
-            CUDNN_CHECK_ERR(
-                cudnnSetRNNMatrixMathType(
-                    rnnDesc.descriptor,
-                    CUDNN_TENSOR_OP_MATH_ALLOW_CONVERSION
-                )
-            );
-        else
-            CUDNN_CHECK_ERR(
-                cudnnSetRNNMatrixMathType(rnnDesc.descriptor, CUDNN_DEFAULT_MATH)
-            );
+        return sizes;
     }
 
+
     struct CudnnRnnAutogradPayload : public detail::AutogradPayloadData {
         Tensor reserveSpace;
     };
@@ -74,15 +50,15 @@ namespace {
 } // namespace
 
 std::tuple<Tensor, Tensor, Tensor> CudnnAutogradExtension::rnn(
-    const Tensor& input,
-    const Tensor& hiddenStateIn,
-    const Tensor& cellStateIn,
-    const Tensor& weights,
-    const int hiddenSize,
-    const int numLayers,
-    const RnnMode mode,
-    const bool bidirectional,
-    const float dropProb,
+    Tensor const& input,
+    Tensor const& hiddenStateIn,
+    Tensor const& cellStateIn,
+    Tensor const& weights,
+    int const hiddenSize,
+    int const numLayers,
+    RnnMode const mode,
+    bool const bidirectional,
+    float const dropProb,
     std::shared_ptr<detail::AutogradPayload> autogradPayload
 ) {
     FL_TENSOR_DTYPES_MATCH_CHECK(input, hiddenStateIn, cellStateIn, weights);
@@ -93,25 +69,33 @@ std::tuple<Tensor, Tensor, Tensor> CudnnAutogradExtension::rnn(
         autogradPayload->data = payload;
 
     Tensor x = input.asContiguousTensor();
+    RNNDataDescriptor xDesc{x.type(), x.shape()};
+
     Tensor hiddenState = hiddenStateIn.asContiguousTensor();
     Tensor cellState = cellStateIn.asContiguousTensor();
 
-    DropoutDescriptor dropout(dropProb);
-    RNNDescriptor rnnDesc(
-        input.type(), hiddenSize, numLayers, mode, bidirectional, dropout);
-    setCudnnRnnMathType(input, rnnDesc);
+    DropoutDescriptor dropout{dropProb};
 
-    auto dims = x.shape();
+    auto const& dims = x.shape();
     int inputSize = dims[0];
     int batchSize = dims.ndim() < 2 ? 1 : dims[1];
     int seqLength = dims.ndim() < 3 ? 1 : dims[2];
 
+
+    RNNDescriptor rnnDesc{
+        input.type(),
+        inputSize,
+        hiddenSize,
+        numLayers,
+        mode,
+        bidirectional,
+        dropout
+    };
+
+
     int totalLayers = numLayers * (bidirectional ? 2 : 1);
     int outSize = hiddenSize * (bidirectional ? 2 : 1);
 
-    TensorDescriptorArray xDescs(
-        seqLength, x.type(), {1, 1, inputSize, batchSize});
-
     if(!hiddenState.isEmpty()) {
         auto hxDims = hiddenState.shape();
         int hxHiddenSize = hxDims[0];
@@ -119,31 +103,31 @@ std::tuple<Tensor, Tensor, Tensor> CudnnAutogradExtension::rnn(
         int hxTotalLayers = hiddenState.ndim() < 3 ? 1 : hxDims[2];
 
         if(
-            !(hxHiddenSize == hiddenSize && hxBatchSize == batchSize
-            && hxTotalLayers == totalLayers)
+            hxHiddenSize != hiddenSize || hxBatchSize != batchSize
+            || hxTotalLayers != totalLayers
         )
             throw std::invalid_argument("invalid hidden state dims for RNN");
     }
 
     if(
         !cellState.isEmpty()
-        && !(mode == RnnMode::LSTM && cellState.dim(0) == hiddenSize
-        && cellState.dim(1) == batchSize && cellState.dim(2) == totalLayers)
+        && (mode != RnnMode::LSTM || cellState.dim(0) != hiddenSize
+            || cellState.dim(1) != batchSize || cellState.dim(2) != totalLayers)
     )
         throw std::invalid_argument("invalid cell state dims for RNN");
 
     Shape hDims = {1, hiddenSize, batchSize, totalLayers};
-    TensorDescriptor hxDesc(x.type(), hDims);
-    TensorDescriptor cxDesc(x.type(), hDims);
+    TensorDescriptor hxDesc{x.type(), hDims};
+    TensorDescriptor cxDesc{x.type(), hDims};
 
     auto handle = getCudnnHandle();
-    const auto& cudnnStream = getCudnnStream();
+    auto const& cudnnStream = getCudnnStream();
 
     size_t paramSize;
     CUDNN_CHECK_ERR(
         cudnnGetRNNParamsSize(
             handle,
-            rnnDesc.descriptor,
+            rnnDesc._handle,
             xDescs.descriptors[0],
             &paramSize,
             cudnnMapToType(weights.type())
@@ -155,24 +139,24 @@ std::tuple<Tensor, Tensor, Tensor> CudnnAutogradExtension::rnn(
         );
     FilterDescriptor wDesc(weights);
 
-    Tensor y({outSize, batchSize, seqLength}, input.type());
-    TensorDescriptorArray yDesc(seqLength, y.type(), {1, 1, outSize, batchSize});
+    Tensor y{{outSize, batchSize, seqLength}, input.type()};
+    TensorDescriptorArray yDesc{seqLength, y.type(), {1, 1, outSize, batchSize}};
 
-    Tensor hy({hiddenSize, batchSize, totalLayers}, x.type());
-    TensorDescriptor hyDesc(x.type(), hDims);
+    Tensor hy{{hiddenSize, batchSize, totalLayers}, x.type()};
+    TensorDescriptor hyDesc{x.type(), hDims};
 
-    Tensor cy;
+    Tensor cy{};
     if(mode == RnnMode::LSTM)
-        cy = Tensor(hy.shape(), x.type());
+        cy = Tensor{hy.shape(), x.type()};
 
-    TensorDescriptor cyDesc(x.type(), hDims);
+    TensorDescriptor cyDesc{x.type(), hDims};
 
-    size_t workspaceSize = getWorkspaceSize(handle, rnnDesc, seqLength, xDescs);
-    size_t reserveSize = getReserveSize(handle, rnnDesc, seqLength, xDescs);
+    constexpr auto forwardMode = CUDNN_FWD_MODE_TRAINING;
+    auto [workspaceSize, reserveSize] = rnnTempSpaceSizes(handle, rnnDesc, xDesc, forwardMode);
 
     Tensor workspace({static_cast<long long>(workspaceSize)}, fl::dtype::b8);
     // Space must be reused between forward and backward for cuDNN
-    payload->reserveSpace = Tensor({static_cast<long long>(reserveSize)}, fl::dtype::b8);
+    payload->reserveSpace = Tensor{{static_cast<long long>(reserveSize)}, fl::dtype::b8};
 
     {
         auto contiguousX = x.asContiguousTensor();
@@ -190,34 +174,40 @@ std::tuple<Tensor, Tensor, Tensor> CudnnAutogradExtension::rnn(
         relativeSync(
             cudnnStream,
             {
-                contiguousX, hiddenState, cellState, contiguousWeights, y, hy, cy,
-                workspace, payload->reserveSpace,
+                contiguousX,
+                hiddenState,
+                cellState,
+                contiguousWeights,
+                y,
+                hy,
+                cy,
+                workspace,
+                payload->reserveSpace,
             }
         );
 
         CUDNN_CHECK_ERR(
-            cudnnRNNForwardTraining(
+            cudnnRNNForward(
                 handle,
-                rnnDesc.descriptor,
+                rnnDesc.get(),
                 seqLength,
-                xDescs.descriptors,
+                xDesc.get(),
                 xRaw.get(),
-                hxDesc.descriptor,
-                hxRaw.get(),
-                cxDesc.descriptor,
-                cxRaw.get(),
-                wDesc.descriptor,
-                wRaw.get(),
-                yDesc.descriptors,
-                yRaw.get(),
-                hyDesc.descriptor,
-                hyRaw.get(),
-                cyDesc.descriptor,
-                cyRaw.get(),
-                workspaceRaw.get(),
+                yDesc.get(),
+                yRaw,
+                hxDesc,
+                hxRaw,
+                hyRaw,
+                cxDesc,
+                cxRaw,
+                cyRaw,
+                weightspace and its size????,
+                //TEMP continue here
+
                 workspaceSize,
-                reserveSpaceRaw.get(),
-                reserveSize
+                workspaceRaw
+                reserveSize,
+                reserveSpaceRaw.get()
             )
         );
     }
@@ -228,17 +218,17 @@ std::tuple<Tensor, Tensor, Tensor> CudnnAutogradExtension::rnn(
 }
 
 std::tuple<Tensor, Tensor, Tensor, Tensor> CudnnAutogradExtension::rnnBackward(
-    const Tensor& input,
-    const Tensor& hiddenState,
-    const Tensor& cellState,
-    const Tensor& weights,
-    const std::shared_ptr<detail::RNNGradData> gradData,
-    const Tensor& output,
-    const int numLayers,
-    const int hiddenSize,
-    const RnnMode mode,
-    const bool bidirectional,
-    const float dropProb,
+    Tensor const& input,
+    Tensor const& hiddenState,
+    Tensor const& cellState,
+    Tensor const& weights,
+    std::shared_ptr<detail::RNNGradData> const gradData,
+    Tensor const& output,
+    int const numLayers,
+    int const hiddenSize,
+    RnnMode const mode,
+    bool const bidirectional,
+    float const dropProb,
     std::shared_ptr<detail::AutogradPayload> autogradPayload
 ) {
     if(!autogradPayload)
@@ -249,7 +239,7 @@ std::tuple<Tensor, Tensor, Tensor, Tensor> CudnnAutogradExtension::rnnBackward(
         std::static_pointer_cast<CudnnRnnAutogradPayload>(autogradPayload->data);
 
     auto handle = getCudnnHandle();
-    const auto& cudnnStream = getCudnnStream();
+    auto const& cudnnStream = getCudnnStream();
 
     auto x = input.asContiguousTensor();
     auto& y = output;
@@ -261,29 +251,32 @@ std::tuple<Tensor, Tensor, Tensor, Tensor> CudnnAutogradExtension::rnnBackward(
     int totalLayers = numLayers * (bidirectional ? 2 : 1);
     int outSize = hiddenSize * (bidirectional ? 2 : 1);
 
-    DropoutDescriptor dropout(dropProb);
-    RNNDescriptor rnnDesc(input.type(), hiddenSize, numLayers, mode, bidirectional, dropout);
+    DropoutDescriptor dropout{dropProb};
+    RNNDescriptor rnnDesc{input.type(), hiddenSize, numLayers, mode, bidirectional, dropout};
     setCudnnRnnMathType(input, rnnDesc);
 
-    TensorDescriptorArray yDesc(seqLength, y.type(), {1, 1, outSize, batchSize});
-    TensorDescriptorArray dyDesc(seqLength, y.type(), {1, 1, outSize, batchSize});
+    TensorDescriptorArray yDesc{seqLength, y.type(), {1, 1, outSize, batchSize}};
+    TensorDescriptorArray dyDesc{seqLength, y.type(), {1, 1, outSize, batchSize}};
 
     Shape hDims = {1, hiddenSize, batchSize, totalLayers};
-    TensorDescriptor dhyDesc(x.type(), hDims);
-    TensorDescriptor dcyDesc(x.type(), hDims);
-    TensorDescriptor hxDesc(x.type(), hDims);
-    TensorDescriptor cxDesc(x.type(), hDims);
+    TensorDescriptor dhyDesc{x.type(), hDims};
+    TensorDescriptor dcyDesc{x.type(), hDims};
+    TensorDescriptor hxDesc{x.type(), hDims};
+    TensorDescriptor cxDesc{x.type(), hDims};
 
-    Tensor dhx(hiddenState.shape(), hiddenState.type());
-    Tensor dcx(cellState.shape(), cellState.type());
-    TensorDescriptor dhxDesc(x.type(), hDims);
-    TensorDescriptor dcxDesc(x.type(), hDims);
+    Tensor dhx{hiddenState.shape(), hiddenState.type()};
+    Tensor dcx{cellState.shape(), cellState.type()};
+    TensorDescriptor dhxDesc{x.type(), hDims};
+    TensorDescriptor dcxDesc{x.type(), hDims};
 
     FilterDescriptor wDesc(weights);
 
-    Tensor dx(input.shape(), input.type());
-    TensorDescriptorArray dxDescs(
-        seqLength, dx.type(), {1, 1, inputSize, batchSize});
+    Tensor dx{input.shape(), input.type()};
+    TensorDescriptorArray dxDescs{
+        seqLength,
+        dx.type(),
+        {1, 1, inputSize, batchSize}
+    };
 
     size_t workspaceSize =
         getWorkspaceSize(handle, rnnDesc, seqLength, dxDescs);
@@ -325,7 +318,7 @@ std::tuple<Tensor, Tensor, Tensor, Tensor> CudnnAutogradExtension::rnnBackward(
         CUDNN_CHECK_ERR(
             cudnnRNNBackwardData(
                 handle,
-                rnnDesc.descriptor,
+                rnnDesc._handle,
                 seqLength,
                 yDesc.descriptors,
                 yRaw.get(),
@@ -357,17 +350,20 @@ std::tuple<Tensor, Tensor, Tensor, Tensor> CudnnAutogradExtension::rnnBackward(
 
     if(input.type() == fl::dtype::f16)
         CUDNN_CHECK_ERR(
-            cudnnSetRNNMatrixMathType(
-                rnnDesc.descriptor,
-                CUDNN_TENSOR_OP_MATH_ALLOW_CONVERSION
-            )
-        );
+        cudnnSetRNNMatrixMathType(
+            rnnDesc._handle,
+            CUDNN_TENSOR_OP_MATH_ALLOW_CONVERSION
+        )
+    );
     else
         CUDNN_CHECK_ERR(
-            cudnnSetRNNMatrixMathType(rnnDesc.descriptor, CUDNN_DEFAULT_MATH)
-        );
-    TensorDescriptorArray xDescs(
-        seqLength, x.type(), {1, 1, inputSize, batchSize});
+        cudnnSetRNNMatrixMathType(rnnDesc._handle, CUDNN_DEFAULT_MATH)
+    );
+    TensorDescriptorArray xDescs{
+        seqLength,
+        x.type(),
+        {1, 1, inputSize, batchSize}
+    };
     Tensor dw = fl::full(weights.shape(), 0, weights.type());
 
     FilterDescriptor dwDesc(dw);
@@ -382,7 +378,7 @@ std::tuple<Tensor, Tensor, Tensor, Tensor> CudnnAutogradExtension::rnnBackward(
         CUDNN_CHECK_ERR(
             cudnnRNNBackwardWeights(
                 handle,
-                rnnDesc.descriptor,
+                rnnDesc._handle,
                 seqLength,
                 xDescs.descriptors,
                 xRaw.get(),