Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions Folder.DotSettings
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
<wpf:ResourceDictionary xml:space="preserve" xmlns:x="http://schemas.microsoft.com/winfx/2006/xaml" xmlns:s="clr-namespace:System;assembly=mscorlib" xmlns:ss="urn:shemas-jetbrains-com:settings-storage-xaml" xmlns:wpf="http://schemas.microsoft.com/winfx/2006/xaml/presentation">
<s:String x:Key="/Default/CodeStyle/Naming/CppNamingOptions/Rules/=0B233A6C23E887458E5DB7357199AE90/@EntryIndexedValue">&lt;NamingElement Priority="6" Title="Parameters"&gt;&lt;Descriptor Static="Indeterminate" Constexpr="Indeterminate" Const="Indeterminate" Volatile="Indeterminate" Accessibility="NOT_APPLICABLE"&gt;&lt;type Name="function parameter" /&gt;&lt;type Name="lambda parameter" /&gt;&lt;/Descriptor&gt;&lt;Policy Inspect="True" WarnAboutPrefixesAndSuffixes="False" Prefix="" Suffix="" Style="aaBb"&gt;&lt;ExtraRule Prefix="_" Suffix="" Style="aaBb" /&gt;&lt;/Policy&gt;&lt;/NamingElement&gt;</s:String></wpf:ResourceDictionary>
4 changes: 4 additions & 0 deletions cmake/dependencies/FindCUDNN.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -76,4 +76,8 @@ if(CUDNN_FOUND)
endif()
endif()

if (CUDNN_FOUND AND CUDNN_VERSION VERSION_LESS "8.0")
message(FATAL_ERROR "Flashlight requires cuDNN >= 8.0, found ${CUDNN_VERSION}")
endif()

mark_as_advanced(CUDNN_ROOT CUDNN_INCLUDE_DIR CUDNN_LIBRARY CUDNN_VERSION)
2 changes: 1 addition & 1 deletion cmake/utils/flashlightConfig.cmake.in
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ if (@FL_BUILD_STANDALONE@)
endif()
if (@FL_USE_CUDA@)
if (@FL_USE_CUDNN@)
find_dependency(CUDNN 7.1)
find_dependency(CUDNN 8)
endif()
if (@FL_BUILD_DISTRIBUTED@)
find_dependency(NCCL)
Expand Down
32 changes: 16 additions & 16 deletions flashlight/fl/autograd/tensor/backend/cudnn/BatchNorm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -48,15 +48,15 @@ namespace {

if(minAxis == 0) {
modeOut = CUDNN_BATCHNORM_PER_ACTIVATION;
inDescDimsOut = Shape(
inDescDimsOut = Shape{
{
1,
1,
nfeatures,
static_cast<long long>(input.elements() / nfeatures)
}
);
wtDescDimsOut = Shape({1, 1, nfeatures});
};
wtDescDimsOut = Shape{1, 1, nfeatures};
} else {
modeOut = CUDNN_BATCHNORM_SPATIAL;
#if CUDNN_VERSION >= 7003
Expand All @@ -67,15 +67,15 @@ namespace {
int batchsz = 1;
for(int i = maxAxis + 1; i < input.ndim(); ++i)
batchsz *= input.dim(i);
inDescDimsOut = Shape(
inDescDimsOut = Shape{
{
1,
static_cast<long long>(input.elements() / (nfeatures * batchsz)),
nfeatures,
batchsz,
}
);
wtDescDimsOut = Shape({1, 1, nfeatures});
};
wtDescDimsOut = Shape{1, 1, nfeatures};
}
}

Expand All @@ -101,7 +101,7 @@ Tensor CudnnAutogradExtension::batchnorm(
);
FL_TENSOR_DTYPES_MATCH_CHECK(weight, bias, runningMean, runningVar);

auto output = Tensor(input.shape(), input.type());
auto output = Tensor{input.shape(), input.type()};

cudnnBatchNormMode_t mode;
Shape inDescDims, wtDescDims;
Expand All @@ -122,8 +122,8 @@ Tensor CudnnAutogradExtension::batchnorm(
fl::dtype scalarsType =
input.type() == fl::dtype::f16 ? fl::dtype::f32 : input.type();

auto inDesc = TensorDescriptor(input.type(), inDescDims);
auto wtDesc = TensorDescriptor(weightArray.type(), wtDescDims);
auto inDesc = TensorDescriptor{input.type(), inDescDims};
auto wtDesc = TensorDescriptor{weightArray.type(), wtDescDims};

{
DevicePtr inRaw(input);
Expand All @@ -140,8 +140,8 @@ Tensor CudnnAutogradExtension::batchnorm(
);

if(train) {
saveMean = Tensor({wtDescDims[2]}, scalarsType);
saveVar = Tensor({wtDescDims[2]}, scalarsType);
saveMean = Tensor{{wtDescDims[2]}, scalarsType};
saveVar = Tensor{{wtDescDims[2]}, scalarsType};

DevicePtr saveMeanRaw(saveMean);
DevicePtr saveVarRaw(saveVar);
Expand Down Expand Up @@ -223,13 +223,13 @@ std::tuple<Tensor, Tensor, Tensor> CudnnAutogradExtension::batchnormBackward(
const void* one1 = kOne(scalarsType);
const void* zero0 = kZero(scalarsType);

auto iDesc = TensorDescriptor(input.type(), inDescDims);
auto wDesc = TensorDescriptor(wt.type(), wtDescDims);
auto iDesc = TensorDescriptor{input.type(), inDescDims};
auto wDesc = TensorDescriptor{wt.type(), wtDescDims};
// CuDNN doesn't support calculating only the gradients
// required for batchnorm
auto gradIn = Tensor(input.shape(), input.type());
auto gradWt = Tensor(wt.shape(), wt.type());
auto gradBs = Tensor(wt.shape(), wt.type());
auto gradIn = Tensor{input.shape(), input.type()};
auto gradWt = Tensor{wt.shape(), wt.type()};
auto gradBs = Tensor{wt.shape(), wt.type()};
{
DevicePtr iRaw(input);
DevicePtr wRaw(wt);
Expand Down
74 changes: 37 additions & 37 deletions flashlight/fl/autograd/tensor/backend/cudnn/Conv2D.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -314,9 +314,9 @@ Tensor CudnnAutogradExtension::conv2d(

auto hasBias = bias.elements() > 0;

auto inDesc = TensorDescriptor(input);
auto wtDesc = FilterDescriptor(weights);
auto convDesc = ConvDescriptor(input.type(), px, py, sx, sy, dx, dy, groups);
auto inDesc = TensorDescriptor{input};
auto wtDesc = FilterDescriptor{weights};
auto convDesc = ConvDescriptor{input.type(), px, py, sx, sy, dx, dy, groups};
if(input.type() == fl::dtype::f16)
CUDNN_CHECK_ERR(
cudnnSetConvolutionMathType(
Expand All @@ -339,8 +339,8 @@ Tensor CudnnAutogradExtension::conv2d(
odims.data()
)
);
auto output = Tensor({odims[3], odims[2], odims[1], odims[0]}, input.type());
auto outDesc = TensorDescriptor(output);
auto output = Tensor{{odims[3], odims[2], odims[1], odims[0]}, input.type()};
auto outDesc = TensorDescriptor{output};

auto handle = getCudnnHandle();
const auto& cudnnStream = getCudnnStream();
Expand All @@ -357,7 +357,7 @@ Tensor CudnnAutogradExtension::conv2d(

try {
wspace =
Tensor({static_cast<long long>(fwdAlgoBestPerf.memory)}, fl::dtype::b8);
Tensor{{static_cast<long long>(fwdAlgoBestPerf.memory)}, fl::dtype::b8};
} catch(const std::exception&) {
fwdAlgoBestPerf.algo = kFwdDefaultAlgo;
CUDNN_CHECK_ERR(
Expand All @@ -372,7 +372,7 @@ Tensor CudnnAutogradExtension::conv2d(
)
);
wspace =
Tensor({static_cast<long long>(fwdAlgoBestPerf.memory)}, fl::dtype::b8);
Tensor{{static_cast<long long>(fwdAlgoBestPerf.memory)}, fl::dtype::b8};
}
{
DevicePtr inPtr(input);
Expand Down Expand Up @@ -405,7 +405,7 @@ Tensor CudnnAutogradExtension::conv2d(
);

if(hasBias) {
auto bsDesc = TensorDescriptor(bias);
auto bsDesc = TensorDescriptor{bias};
DevicePtr bsPtr(bias);
// ensure cudnn compute stream waits on stream of bias tensor
relativeSync(cudnnStream, {bias});
Expand Down Expand Up @@ -453,10 +453,10 @@ Tensor CudnnAutogradExtension::conv2dBackwardData(
// benchmarking suggests input or weight casting should occur, these
// descriptors may not be used/new ones with the correct types will be
// used instead.
auto iDesc = TensorDescriptor(input);
auto wDesc = FilterDescriptor(weight);
auto cDesc = ConvDescriptor(input.type(), px, py, sx, sy, dx, dy, groups);
auto oDesc = TensorDescriptor(gradOutput);
auto iDesc = TensorDescriptor{input};
auto wDesc = FilterDescriptor{weight};
auto cDesc = ConvDescriptor{input.type(), px, py, sx, sy, dx, dy, groups};
auto oDesc = TensorDescriptor{gradOutput};

setDefaultMathType(cDesc, input);

Expand Down Expand Up @@ -491,10 +491,10 @@ Tensor CudnnAutogradExtension::conv2dBackwardData(

Tensor ws;
try {
ws = Tensor(
ws = Tensor{
{static_cast<long long>(bwdDataAlgoBestPerf.memory)},
fl::dtype::b8
);
};
} catch(const std::exception&) {
bwdDataAlgoBestPerf.algo = kBwdDataDefaultAlgo;
CUDNN_CHECK_ERR(
Expand All @@ -508,13 +508,13 @@ Tensor CudnnAutogradExtension::conv2dBackwardData(
&bwdDataAlgoBestPerf.memory
)
);
ws = Tensor(
ws = Tensor{
{static_cast<long long>(bwdDataAlgoBestPerf.memory)},
fl::dtype::b8
);
};
}

auto gradInput = Tensor(inTensor.shape(), inTensor.type());
auto gradInput = Tensor{inTensor.shape(), inTensor.type()};
{
DevicePtr gradInputPtr(gradInput);
DevicePtr gradResultPtr(gradOutputTensor);
Expand Down Expand Up @@ -577,11 +577,11 @@ Tensor CudnnAutogradExtension::conv2dBackwardData(
/* incrementCount = */ false
);

auto iDescF32 = TensorDescriptor(inTensorF32);
auto wDescF32 = FilterDescriptor(wtTensorF32);
auto iDescF32 = TensorDescriptor{inTensorF32};
auto wDescF32 = FilterDescriptor{wtTensorF32};
auto cDescF32 =
ConvDescriptor(fl::dtype::f32, px, py, sx, sy, dx, dy, groups);
auto oDescF32 = TensorDescriptor(gradOutputTensorF32);
ConvDescriptor{fl::dtype::f32, px, py, sx, sy, dx, dy, groups};
auto oDescF32 = TensorDescriptor{gradOutputTensorF32};
// core bwd data computation
dataGradBenchmark->audit(
[&dataGradOut,
Expand Down Expand Up @@ -671,10 +671,10 @@ std::pair<Tensor, Tensor> CudnnAutogradExtension::conv2dBackwardFilterBias(
// benchmarking suggests input or weight casting should occur, these
// descriptors may not be used/new ones with the correct types will be
// used instead.
auto iDesc = TensorDescriptor(input);
auto wDesc = FilterDescriptor(weight);
auto cDesc = ConvDescriptor(input.type(), px, py, sx, sy, dx, dy, groups);
auto oDesc = TensorDescriptor(gradOutput);
auto iDesc = TensorDescriptor{input};
auto wDesc = FilterDescriptor{weight};
auto cDesc = ConvDescriptor{input.type(), px, py, sx, sy, dx, dy, groups};
auto oDesc = TensorDescriptor{gradOutput};

setDefaultMathType(cDesc, input);

Expand Down Expand Up @@ -708,10 +708,10 @@ std::pair<Tensor, Tensor> CudnnAutogradExtension::conv2dBackwardFilterBias(

Tensor ws;
try {
ws = Tensor(
ws = Tensor{
{static_cast<long long>(bwdFilterAlgoBestPerf.memory)},
fl::dtype::b8
);
};
} catch(const std::exception&) {
bwdFilterAlgoBestPerf.algo = kBwdFilterDefaultAlgo;
CUDNN_CHECK_ERR(
Expand All @@ -725,13 +725,13 @@ std::pair<Tensor, Tensor> CudnnAutogradExtension::conv2dBackwardFilterBias(
&bwdFilterAlgoBestPerf.memory
)
);
ws = Tensor(
ws = Tensor{
{static_cast<long long>(bwdFilterAlgoBestPerf.memory)},
fl::dtype::b8
);
};
}

auto gradWeight = Tensor(wtTensor.shape(), wtTensor.type());
auto gradWeight = Tensor{wtTensor.shape(), wtTensor.type()};
{
DevicePtr gradWeightPtr(gradWeight);
DevicePtr gradResultPtr(gradOutputTensor);
Expand Down Expand Up @@ -794,11 +794,11 @@ std::pair<Tensor, Tensor> CudnnAutogradExtension::conv2dBackwardFilterBias(
/* incrementCount = */ false
);

auto iDescF32 = TensorDescriptor(inTensorF32);
auto wDescF32 = FilterDescriptor(wtTensorF32);
auto iDescF32 = TensorDescriptor{inTensorF32};
auto wDescF32 = FilterDescriptor{wtTensorF32};
auto cDescF32 =
ConvDescriptor(fl::dtype::f32, px, py, sx, sy, dx, dy, groups);
auto oDescF32 = TensorDescriptor(gradOutputTensorF32);
ConvDescriptor{fl::dtype::f32, px, py, sx, sy, dx, dy, groups};
auto oDescF32 = TensorDescriptor{gradOutputTensorF32};
// core bwd data computation
filterGradBenchmark->audit(
[&filterGradOut,
Expand Down Expand Up @@ -860,13 +860,13 @@ std::pair<Tensor, Tensor> CudnnAutogradExtension::conv2dBackwardFilterBias(
const Tensor& bsTensor,
const Tensor& gradOutput,
const TensorDescriptor& oDesc) -> Tensor {
auto gradBias = Tensor(bsTensor.shape(), bsTensor.type());
auto gradBias = Tensor{bsTensor.shape(), bsTensor.type()};
{
DevicePtr gradBiasPtr(gradBias);
DevicePtr gradResultPtr(gradOutput);
// ensure cudnn compute stream waits on gradient tensor streams
relativeSync(cudnnStream, {gradOutput, gradBias});
auto bDesc = TensorDescriptor(bsTensor);
auto bDesc = TensorDescriptor{bsTensor};
CUDNN_CHECK_ERR(
cudnnConvolutionBackwardBias(
hndl,
Expand Down Expand Up @@ -911,7 +911,7 @@ std::pair<Tensor, Tensor> CudnnAutogradExtension::conv2dBackwardFilterBias(
},
/* incrementCount = */ false
);
auto oDescF32 = TensorDescriptor(gradOutputF32);
auto oDescF32 = TensorDescriptor{gradOutputF32};
// Perform bias gradient computation
biasGradBenchmark->audit(
[&biasGradOut,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,13 +16,11 @@ namespace fl {
std::shared_ptr<fl::DynamicBenchmark> CudnnAutogradExtension::createBenchmarkOptions() {
return std::make_shared<fl::DynamicBenchmark>(
std::make_shared<fl::DynamicBenchmarkOptions<KernelMode>>(
std::vector<KernelMode>(
{
KernelMode::F32,
KernelMode::F32_ALLOW_CONVERSION,
KernelMode::F16
}
),
std::vector<KernelMode>{
KernelMode::F32,
KernelMode::F32_ALLOW_CONVERSION,
KernelMode::F16
},
fl::kDynamicBenchmarkDefaultCount
)
);
Expand Down
Loading
Loading