Skip to content

Commit 532329c

Browse files
committed
Adding remaining flavors for grouped conv fwd
As titled. Following variants are added: - grouped_conv2d_fwd_dynamic_op - grouped_conv3d_fwd_dynamic_op - grouped_conv3d_fwd_bilinear - grouped_conv3d_fwd_convscale - grouped_conv3d_fwd_convinvscale - grouped_conv3d_fwd_convscale_add - grouped_conv3d_fwd_convscale_relu - grouped_conv3d_fwd_scale - grouped_conv3d_fwd_combconvscale - grouped_conv3d_fwd_scaleadd_scaleadd_relu
1 parent 6dd37ab commit 532329c

208 files changed

Lines changed: 6858 additions & 8808 deletions

File tree

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

example/62_convnd_activ/convinvscale/CMakeLists.txt

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,4 +5,11 @@ if (NOT GPU_TARGETS MATCHES "gfx11")
55
add_custom_target(example_convnd_activ_xdl_convinvscale)
66
add_example_executable(example_convnd_fwd_xdl_convinvscale_fp8 convnd_fwd_xdl_convinvscale_fp8.cpp)
77
add_example_dependencies(example_convnd_activ_xdl_convinvscale example_convnd_fwd_xdl_convinvscale_fp8)
8-
endif()
8+
endif()
9+
10+
# WMMA
11+
if (GPU_TARGETS MATCHES "gfx12")
12+
add_custom_target(example_convnd_activ_wmma_convinvscale)
13+
add_example_executable(example_convnd_fwd_wmma_convinvscale_fp8 convnd_fwd_wmma_convinvscale_fp8.cpp)
14+
add_example_dependencies(example_convnd_activ_wmma_convinvscale example_convnd_fwd_wmma_convinvscale_fp8)
15+
endif()
Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,97 @@
1+
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
2+
// SPDX-License-Identifier: MIT
3+
4+
#include "convnd_fwd_convinvscale_common.hpp"
5+
6+
#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_wmma_cshuffle_v3.hpp"
7+
8+
using InDataType = ck::f8_t;
9+
using WeiDataType = ck::f8_t;
10+
using AccDataType = float;
11+
using CShuffleDataType = float;
12+
using DsDataType = ck::Tuple<>;
13+
using OutDataType = ck::f8_t;
14+
using AComputeDataType = ck::f8_t;
15+
using BComputeDataType = ck::f8_t;
16+
17+
template <ck::index_t... Is>
18+
using S = ck::Sequence<Is...>;
19+
20+
using InElementOp = PassThrough;
21+
using WeiElementOp = PassThrough;
22+
using OutElementOp = ConvInvscale;
23+
24+
static constexpr auto ConvSpec =
25+
ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
26+
27+
static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
28+
29+
template <ck::index_t NDimSpatial,
30+
typename InLayout,
31+
typename WeiLayout,
32+
typename DsLayout,
33+
typename OutLayout>
34+
using DeviceGroupedConvNDFwdInstance =
35+
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD_Wmma_CShuffle_V3<
36+
NDimSpatial, // NDimSpatial
37+
InLayout, // ALayout
38+
WeiLayout, // BLayout
39+
DsLayout, // DsLayout (empty tuple for ConvInvScale)
40+
OutLayout, // ELayout
41+
InDataType, // ADataType
42+
WeiDataType, // BDataType
43+
AccDataType, // AccDataType
44+
CShuffleDataType, // CShuffleDataType
45+
DsDataType, // DsDataType (empty tuple)
46+
OutDataType, // EDataType
47+
InElementOp, // AElementwiseOperation
48+
WeiElementOp, // BElementwiseOperation
49+
OutElementOp, // CDEElementwiseOperation
50+
ConvSpec, // ConvForwardSpecialization
51+
GemmSpec, // GemmSpecialization
52+
64, // BlockSize
53+
64, // MPerBlock
54+
64, // NPerBlock
55+
32, // KPerBlock
56+
8, // AK1
57+
8, // BK1
58+
16, // MPerWmma
59+
16, // NPerWmma
60+
4, // MRepeat
61+
2, // NRepeat
62+
S<4, 16, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
63+
S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder
64+
S<1, 0, 2>, // ABlockTransferSrcAccessOrder
65+
2, // ABlockTransferSrcVectorDim
66+
1, // ABlockTransferSrcScalarPerVector
67+
8, // ABlockTransferDstScalarPerVector_AK1
68+
1, // ABlockLdsExtraM
69+
S<4, 16, 1>, // BBlockTransferThreadClusterLengths_BK0_N_BK1
70+
S<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder
71+
S<1, 0, 2>, // BBlockTransferSrcAccessOrder
72+
2, // BBlockTransferSrcVectorDim
73+
1, // BBlockTransferSrcScalarPerVector
74+
8, // BBlockTransferDstScalarPerVector_BK1
75+
1, // BBlockLdsExtraN
76+
1, // CShuffleMRepeatPerShuffle
77+
1, // CShuffleNRepeatPerShuffle
78+
S<1, 16, 1, 4>, // CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
79+
1, // CDEBlockTransferScalarPerVector_NPerBlock
80+
ck::BlockGemmPipelineScheduler::Intrawave, // BlkGemmPipeSched
81+
ck::BlockGemmPipelineVersion::v1, // BlkGemmPipelineVer
82+
AComputeDataType, // AComputeDataType
83+
BComputeDataType, // BComputeDataType
84+
1>; // NumGroupsToMerge
85+
86+
#include "run_convnd_fwd_convinvscale_example.inc"
87+
88+
int main(int argc, char* argv[])
89+
{
90+
if(!ck::is_gfx12_supported())
91+
{
92+
std::cout << "This kernel support gfx12 only" << std::endl;
93+
94+
return 0;
95+
}
96+
return run_convnd_fwd_example(argc, argv) ? 0 : 1;
97+
}

example/62_convnd_activ/convscale/CMakeLists.txt

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,3 +15,19 @@ if (NOT GPU_TARGETS MATCHES "gfx11")
1515
add_example_executable(example_convnd_fwd_xdl_convscale_bf8_fp8 convnd_fwd_xdl_convscale_bf8_fp8.cpp)
1616
add_example_dependencies(example_convnd_activ_xdl_convscale example_convnd_fwd_xdl_convscale_bf8_fp8)
1717
endif()
18+
19+
# WMMA
20+
if (GPU_TARGETS MATCHES "gfx12")
21+
add_custom_target(example_convnd_activ_wmma_convscale)
22+
add_example_executable(example_convnd_fwd_wmma_convscale_fp8 convnd_fwd_wmma_convscale_fp8.cpp)
23+
add_example_dependencies(example_convnd_activ_wmma_convscale example_convnd_fwd_wmma_convscale_fp8)
24+
25+
add_example_executable(example_convnd_fwd_wmma_convscale_bf8 convnd_fwd_wmma_convscale_bf8.cpp)
26+
add_example_dependencies(example_convnd_activ_wmma_convscale example_convnd_fwd_wmma_convscale_bf8)
27+
28+
add_example_executable(example_convnd_fwd_wmma_convscale_fp8_bf8 convnd_fwd_wmma_convscale_fp8_bf8.cpp)
29+
add_example_dependencies(example_convnd_activ_wmma_convscale example_convnd_fwd_wmma_convscale_fp8_bf8)
30+
31+
add_example_executable(example_convnd_fwd_wmma_convscale_bf8_fp8 convnd_fwd_wmma_convscale_bf8_fp8.cpp)
32+
add_example_dependencies(example_convnd_activ_wmma_convscale example_convnd_fwd_wmma_convscale_bf8_fp8)
33+
endif()
Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,97 @@
1+
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
2+
// SPDX-License-Identifier: MIT
3+
4+
#include "convnd_fwd_convscale_common.hpp"
5+
6+
#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_wmma_cshuffle_v3.hpp"
7+
8+
using InDataType = ck::bf8_t;
9+
using WeiDataType = ck::bf8_t;
10+
using AccDataType = float;
11+
using CShuffleDataType = float;
12+
using DsDataType = ck::Tuple<>;
13+
using OutDataType = ck::f8_t;
14+
using AComputeDataType = InDataType;
15+
using BComputeDataType = AComputeDataType;
16+
17+
template <ck::index_t... Is>
18+
using S = ck::Sequence<Is...>;
19+
20+
using InElementOp = PassThrough;
21+
using WeiElementOp = PassThrough;
22+
using OutElementOp = ConvScale;
23+
24+
static constexpr auto ConvSpec =
25+
ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
26+
27+
static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
28+
29+
template <ck::index_t NDimSpatial,
30+
typename InLayout,
31+
typename WeiLayout,
32+
typename DsLayout,
33+
typename OutLayout>
34+
using DeviceGroupedConvNDFwdInstance =
35+
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD_Wmma_CShuffle_V3<
36+
NDimSpatial, // NDimSpatial
37+
InLayout, // ALayout
38+
WeiLayout, // BLayout
39+
DsLayout, // DsLayout (empty tuple for ConvScale)
40+
OutLayout, // ELayout
41+
InDataType, // ADataType
42+
WeiDataType, // BDataType
43+
AccDataType, // AccDataType
44+
CShuffleDataType, // CShuffleDataType
45+
DsDataType, // DsDataType (empty tuple)
46+
OutDataType, // EDataType
47+
InElementOp, // AElementwiseOperation
48+
WeiElementOp, // BElementwiseOperation
49+
OutElementOp, // CDEElementwiseOperation
50+
ConvSpec, // ConvForwardSpecialization
51+
GemmSpec, // GemmSpecialization
52+
64, // BlockSize
53+
64, // MPerBlock
54+
64, // NPerBlock
55+
32, // KPerBlock
56+
8, // AK1
57+
8, // BK1
58+
16, // MPerWmma
59+
16, // NPerWmma
60+
4, // MRepeat
61+
2, // NRepeat
62+
S<4, 16, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
63+
S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder
64+
S<1, 0, 2>, // ABlockTransferSrcAccessOrder
65+
2, // ABlockTransferSrcVectorDim
66+
1, // ABlockTransferSrcScalarPerVector
67+
8, // ABlockTransferDstScalarPerVector_AK1
68+
1, // ABlockLdsExtraM
69+
S<4, 16, 1>, // BBlockTransferThreadClusterLengths_BK0_N_BK1
70+
S<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder
71+
S<1, 0, 2>, // BBlockTransferSrcAccessOrder
72+
2, // BBlockTransferSrcVectorDim
73+
1, // BBlockTransferSrcScalarPerVector
74+
8, // BBlockTransferDstScalarPerVector_BK1
75+
1, // BBlockLdsExtraN
76+
1, // CShuffleMRepeatPerShuffle
77+
1, // CShuffleNRepeatPerShuffle
78+
S<1, 16, 1, 4>, // CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
79+
1, // CDEBlockTransferScalarPerVector_NPerBlock
80+
ck::BlockGemmPipelineScheduler::Intrawave, // BlkGemmPipeSched
81+
ck::BlockGemmPipelineVersion::v1, // BlkGemmPipelineVer
82+
AComputeDataType, // AComputeDataType
83+
BComputeDataType, // BComputeDataType
84+
1>; // NumGroupsToMerge
85+
86+
#include "run_convnd_fwd_convscale_example.inc"
87+
88+
int main(int argc, char* argv[])
89+
{
90+
if(!ck::is_gfx12_supported())
91+
{
92+
std::cout << "This kernel support gfx12 only" << std::endl;
93+
94+
return 0;
95+
}
96+
return run_convnd_fwd_example(argc, argv) ? 0 : 1;
97+
}
Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,97 @@
1+
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
2+
// SPDX-License-Identifier: MIT
3+
4+
#include "convnd_fwd_convscale_common.hpp"
5+
6+
#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_wmma_cshuffle_v3.hpp"
7+
8+
using InDataType = ck::bf8_t;
9+
using WeiDataType = ck::f8_t;
10+
using AccDataType = float;
11+
using CShuffleDataType = float;
12+
using DsDataType = ck::Tuple<>;
13+
using OutDataType = ck::f8_t;
14+
using AComputeDataType = ck::bf8_t;
15+
using BComputeDataType = ck::f8_t;
16+
17+
template <ck::index_t... Is>
18+
using S = ck::Sequence<Is...>;
19+
20+
using InElementOp = PassThrough;
21+
using WeiElementOp = PassThrough;
22+
using OutElementOp = ConvScale;
23+
24+
static constexpr auto ConvSpec =
25+
ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
26+
27+
static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
28+
29+
template <ck::index_t NDimSpatial,
30+
typename InLayout,
31+
typename WeiLayout,
32+
typename DsLayout,
33+
typename OutLayout>
34+
using DeviceGroupedConvNDFwdInstance =
35+
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD_Wmma_CShuffle_V3<
36+
NDimSpatial, // NDimSpatial
37+
InLayout, // ALayout
38+
WeiLayout, // BLayout
39+
DsLayout, // DsLayout (empty tuple for ConvScale)
40+
OutLayout, // ELayout
41+
InDataType, // ADataType
42+
WeiDataType, // BDataType
43+
AccDataType, // AccDataType
44+
CShuffleDataType, // CShuffleDataType
45+
DsDataType, // DsDataType (empty tuple)
46+
OutDataType, // EDataType
47+
InElementOp, // AElementwiseOperation
48+
WeiElementOp, // BElementwiseOperation
49+
OutElementOp, // CDEElementwiseOperation
50+
ConvSpec, // ConvForwardSpecialization
51+
GemmSpec, // GemmSpecialization
52+
64, // BlockSize
53+
64, // MPerBlock
54+
64, // NPerBlock
55+
32, // KPerBlock
56+
8, // AK1
57+
8, // BK1
58+
16, // MPerWmma
59+
16, // NPerWmma
60+
4, // MRepeat
61+
2, // NRepeat
62+
S<4, 16, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
63+
S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder
64+
S<1, 0, 2>, // ABlockTransferSrcAccessOrder
65+
2, // ABlockTransferSrcVectorDim
66+
1, // ABlockTransferSrcScalarPerVector
67+
8, // ABlockTransferDstScalarPerVector_AK1
68+
1, // ABlockLdsExtraM
69+
S<4, 16, 1>, // BBlockTransferThreadClusterLengths_BK0_N_BK1
70+
S<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder
71+
S<1, 0, 2>, // BBlockTransferSrcAccessOrder
72+
2, // BBlockTransferSrcVectorDim
73+
1, // BBlockTransferSrcScalarPerVector
74+
8, // BBlockTransferDstScalarPerVector_BK1
75+
1, // BBlockLdsExtraN
76+
1, // CShuffleMRepeatPerShuffle
77+
1, // CShuffleNRepeatPerShuffle
78+
S<1, 16, 1, 4>, // CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
79+
1, // CDEBlockTransferScalarPerVector_NPerBlock
80+
ck::BlockGemmPipelineScheduler::Intrawave, // BlkGemmPipeSched
81+
ck::BlockGemmPipelineVersion::v1, // BlkGemmPipelineVer
82+
AComputeDataType, // AComputeDataType
83+
BComputeDataType, // BComputeDataType
84+
1>; // NumGroupsToMerge
85+
86+
#include "run_convnd_fwd_convscale_example.inc"
87+
88+
int main(int argc, char* argv[])
89+
{
90+
if(!ck::is_gfx12_supported())
91+
{
92+
std::cout << "This kernel support gfx12 only" << std::endl;
93+
94+
return 0;
95+
}
96+
return run_convnd_fwd_example(argc, argv) ? 0 : 1;
97+
}

0 commit comments

Comments
 (0)