Skip to content

Commit 8588160

Browse files
metax666 and duqimeng authored
[Metax] Fix dynload (PaddlePaddle#2287)
* [Metax] Fix dynload
* [Metax] Fix dynload
* [Metax] Fix eigen error

Co-authored-by: duqimeng <77875733+duqimeng@users.noreply.github.com>
1 parent 34f4f8e commit 8588160

20 files changed

Lines changed: 2357 additions & 171 deletions

.github/workflows/_Metax-X86.yml

Lines changed: 4 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -17,11 +17,6 @@ on:
1717
default: 'true'
1818

1919

20-
defaults:
21-
run:
22-
shell: bash
23-
24-
2520
jobs:
2621

2722
check-bypass:
@@ -65,10 +60,10 @@ jobs:
6560
# !!!!! SKIP IF NO METAX CHANGE !!!!
6661
echo "=========== Checking PR Changes If METAX FULL CI Needed ==========="
6762
change_numbers=$(git --no-pager diff --name-only remotes/origin/${BRANCH} | wc -l)
68-
# change_backend=$(git --no-pager diff --name-only remotes/origin/${BRANCH} | grep "backends/"| wc -l)
69-
change_backend=$(git --no-pager diff --name-only remotes/origin/${BRANCH} | grep -c "backends/" || true)
70-
# change_metax_only=$(git --no-pager diff --name-only remotes/origin/${BRANCH} | grep "backends/metax_gpu"| wc -l)
71-
change_metax_only=$(git --no-pager diff --name-only remotes/origin/${BRANCH} | grep -c "backends/metax_gpu" || true)
63+
change_backend=$(git --no-pager diff --name-only remotes/origin/${BRANCH} | grep "backends/"| wc -l)
64+
# change_backend=$(git --no-pager diff --name-only remotes/origin/${BRANCH} | grep -c "backends/" || true)
65+
change_metax_only=$(git --no-pager diff --name-only remotes/origin/${BRANCH} | grep "backends/metax_gpu"| wc -l)
66+
# change_metax_only=$(git --no-pager diff --name-only remotes/origin/${BRANCH} | grep -c "backends/metax_gpu" || true)
7267
git --no-pager diff --name-only remotes/origin/${BRANCH}
7368
7469
if [ $change_numbers -ne $change_backend ]; then

backends/metax_gpu/CMakeLists.txt

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -788,6 +788,7 @@ target_link_libraries(${TARGET_NAME} ${MACA_PATH}/lib/libmcFlashAttn.so)
788788
target_link_libraries(${TARGET_NAME} ${MACA_PATH}/lib/libmcpti.so)
789789

790790
include_directories(BEFORE ${PADDLE_SOURCE_DIR})
791+
include_directories(BEFORE ${CMAKE_SOURCE_DIR}/headers)
791792

792793
target_compile_definitions(
793794
${TARGET_NAME}
@@ -826,8 +827,12 @@ add_custom_command(
826827
POST_BUILD
827828
COMMAND ${CMAKE_COMMAND} -E remove -f ${CMAKE_CURRENT_BINARY_DIR}/python/
828829
COMMAND ${CMAKE_COMMAND} -E make_directory ${CMAKE_CURRENT_BINARY_DIR}/python/
830+
COMMAND ${CMAKE_COMMAND} -E make_directory
831+
${CMAKE_CURRENT_BINARY_DIR}/python/include/
829832
COMMAND ${CMAKE_COMMAND} -E make_directory
830833
${CMAKE_CURRENT_BINARY_DIR}/python/paddle_custom_device/
834+
COMMAND ${CMAKE_COMMAND} -E copy_directory ${CMAKE_SOURCE_DIR}/headers
835+
${CMAKE_CURRENT_BINARY_DIR}/python/include/
831836
COMMAND
832837
${CMAKE_COMMAND} -E copy_if_different
833838
${CMAKE_CURRENT_BINARY_DIR}/lib${TARGET_NAME}.so

backends/metax_gpu/change_patch.sh

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,8 +17,8 @@
1717

1818
rm -r ../../Paddle/third_party/eigen3
1919
cd patch
20-
unzip mcEigen_3.4.0_paddle_final.zip
21-
mv mcEigen_3.4.0_paddle_final eigen3
20+
unzip Eigen_3.4.0_paddle.zip
21+
mv Eigen_3.4.0_paddle eigen3
2222
cd ..
2323
cp -r patch/eigen3/ ../../Paddle/third_party/eigen3
2424
rm -r patch/eigen3

backends/metax_gpu/compile.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ fi
3131
echo "make_maca"
3232
cd build
3333
cmake_maca .. -DCMAKE_BUILD_TYPE=Release -DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DPython3_EXECUTABLE=$(which python3) -DWITH_GPU=ON
34-
make_maca -j18
34+
make_maca -j18 VERBOSE=1
3535

3636

3737
echo "install whl"
Lines changed: 148 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,148 @@
1+
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
2+
3+
Licensed under the Apache License, Version 2.0 (the "License");
4+
you may not use this file except in compliance with the License.
5+
You may obtain a copy of the License at
6+
7+
http://www.apache.org/licenses/LICENSE-2.0
8+
9+
Unless required by applicable law or agreed to in writing, software
10+
distributed under the License is distributed on an "AS IS" BASIS,
11+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
See the License for the specific language governing permissions and
13+
limitations under the License. */
14+
15+
#pragma once
16+
17+
#include <cublasXt.h>
18+
#include <cublas_v2.h>
19+
#include <cuda.h>
20+
#if CUDA_VERSION >= 12030 && defined(__linux__)
21+
#include <cublas_api.h>
22+
#endif
23+
24+
#include <mutex> // NOLINT
25+
#include <type_traits>
26+
27+
#include "paddle/phi/backends/dynload/dynamic_loader.h"
28+
#include "paddle/phi/common/port.h"
29+
30+
namespace phi {
31+
namespace dynload {
32+
33+
extern std::once_flag cublas_dso_flag;
34+
extern void* cublas_dso_handle;
35+
36+
/**
37+
* The following macro definition can generate structs
38+
* (for each function) to dynamic load cublas routine
39+
* via operator overloading.
40+
*
41+
* note: default dynamic linked libs
42+
*/
43+
#define DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(__name) \
44+
struct DynLoad__##__name { \
45+
template <typename... Args> \
46+
inline auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
47+
using cublas_func = \
48+
decltype(::__name(std::declval<Args>()...)) (*)(Args...); \
49+
std::call_once(cublas_dso_flag, []() { \
50+
cublas_dso_handle = phi::dynload::GetCublasDsoHandle(); \
51+
}); \
52+
std::string replaced_name = #__name; \
53+
replaced_name = replaced_name.replace(0, 2, "mc"); \
54+
int index = replaced_name.find("_", 0); \
55+
if (index != -1) replaced_name = replaced_name.substr(0, index); \
56+
static void* p_##__name = \
57+
dlsym(cublas_dso_handle, replaced_name.c_str()); \
58+
return reinterpret_cast<cublas_func>(p_##__name)(args...); \
59+
} \
60+
}; \
61+
extern DynLoad__##__name __name
62+
63+
#define CUBLAS_BLAS_ROUTINE_EACH(__macro) \
64+
__macro(cublasSaxpy_v2); \
65+
__macro(cublasDaxpy_v2); \
66+
__macro(cublasCaxpy_v2); \
67+
__macro(cublasZaxpy_v2); \
68+
__macro(cublasSscal_v2); \
69+
__macro(cublasDscal_v2); \
70+
__macro(cublasScopy_v2); \
71+
__macro(cublasDcopy_v2); \
72+
__macro(cublasSgemv_v2); \
73+
__macro(cublasDgemv_v2); \
74+
__macro(cublasCgemv_v2); \
75+
__macro(cublasZgemv_v2); \
76+
__macro(cublasSgemm_v2); \
77+
__macro(cublasDgemm_v2); \
78+
__macro(cublasCgemm_v2); \
79+
__macro(cublasZgemm_v2); \
80+
__macro(cublasHgemm); \
81+
__macro(cublasSgemmEx); \
82+
__macro(cublasSgeam); \
83+
__macro(cublasDgeam); \
84+
__macro(cublasStrsm_v2); \
85+
__macro(cublasDtrsm_v2); \
86+
__macro(cublasCtrsm_v2); \
87+
__macro(cublasZtrsm_v2); \
88+
__macro(cublasCreate_v2); \
89+
__macro(cublasDestroy_v2); \
90+
__macro(cublasSetStream_v2); \
91+
__macro(cublasSetPointerMode_v2); \
92+
__macro(cublasGetPointerMode_v2); \
93+
__macro(cublasSgemmBatched); \
94+
__macro(cublasDgemmBatched); \
95+
__macro(cublasCgemmBatched); \
96+
__macro(cublasZgemmBatched); \
97+
__macro(cublasStrsmBatched); \
98+
__macro(cublasDtrsmBatched); \
99+
__macro(cublasCtrsmBatched); \
100+
__macro(cublasZtrsmBatched); \
101+
__macro(cublasSgetrfBatched); \
102+
__macro(cublasSgetriBatched); \
103+
__macro(cublasDgetrfBatched); \
104+
__macro(cublasDgetriBatched); \
105+
__macro(cublasCgetrfBatched); \
106+
__macro(cublasCgetriBatched); \
107+
__macro(cublasZgetrfBatched); \
108+
__macro(cublasZgetriBatched); \
109+
__macro(cublasSmatinvBatched); \
110+
__macro(cublasDmatinvBatched); \
111+
__macro(cublasCmatinvBatched); \
112+
__macro(cublasZmatinvBatched); \
113+
__macro(cublasSgetrsBatched); \
114+
__macro(cublasDgetrsBatched); \
115+
__macro(cublasSdot_v2); \
116+
__macro(cublasDdot_v2); \
117+
__macro(cublasCdotc_v2); \
118+
__macro(cublasZdotc_v2); \
119+
__macro(cublasCdotu_v2); \
120+
__macro(cublasZdotu_v2); \
121+
__macro(cublasDotEx); \
122+
__macro(cublasGemmEx); \
123+
__macro(cublasSgemmStridedBatched); \
124+
__macro(cublasDgemmStridedBatched); \
125+
__macro(cublasCgemmStridedBatched); \
126+
__macro(cublasZgemmStridedBatched); \
127+
__macro(cublasHgemmStridedBatched); \
128+
__macro(cublasSetMathMode); \
129+
__macro(cublasGetMathMode); \
130+
__macro(cublasCgeam); \
131+
__macro(cublasZgeam); \
132+
__macro(cublasGemmBatchedEx); \
133+
__macro(cublasGemmStridedBatchedEx);
134+
135+
CUBLAS_BLAS_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP)
136+
137+
#if CUDA_VERSION >= 12030 && defined(__linux__)
138+
#define CUBLAS_BLAS_ROUTINE_EACH_R5(__macro) \
139+
__macro(cublasGemmStridedBatchedEx_64); \
140+
__macro(cublasGemmEx_64); \
141+
__macro(cublasSgemmEx_64);
142+
143+
CUBLAS_BLAS_ROUTINE_EACH_R5(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP)
144+
#endif
145+
146+
#undef DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP
147+
} // namespace dynload
148+
} // namespace phi
Lines changed: 114 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,114 @@
1+
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
2+
Copyright (c) 2022 NVIDIA Authors. All Rights Reserved.
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
http://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License. */
15+
16+
#pragma once
17+
18+
#include <cublasLt.h>
19+
#include <cuda.h>
20+
21+
#include <mutex> // NOLINT
22+
#include <type_traits>
23+
24+
#include "paddle/phi/backends/dynload/dynamic_loader.h"
25+
#include "paddle/phi/common/port.h"
26+
27+
namespace phi {
28+
namespace dynload {
29+
30+
extern std::once_flag cublasLt_dso_flag;
31+
extern void* cublasLt_dso_handle;
32+
33+
/**
34+
* The following macro definition can generate structs
35+
* (for each function) to dynamic load cublasLt routine
36+
* via operator overloading.
37+
*
38+
* note: default dynamic linked libs
39+
*/
40+
#define DECLARE_DYNAMIC_LOAD_CUBLASLT_WRAP(__name) \
41+
struct DynLoad__##__name { \
42+
template <typename... Args> \
43+
inline auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
44+
using cublasLt_func = \
45+
decltype(::__name(std::declval<Args>()...)) (*)(Args...); \
46+
std::call_once(cublasLt_dso_flag, []() { \
47+
cublasLt_dso_handle = phi::dynload::GetCublasLtDsoHandle(); \
48+
}); \
49+
std::string replaced_name = #__name; \
50+
replaced_name = replaced_name.replace(0, 2, "mc"); \
51+
static void* p_##__name = \
52+
dlsym(cublasLt_dso_handle, replaced_name.c_str()); \
53+
return reinterpret_cast<cublasLt_func>(p_##__name)(args...); \
54+
} \
55+
}; \
56+
extern DynLoad__##__name __name
57+
// APIs available after CUDA 11.1
58+
#if CUDA_VERSION >= 11010 || defined(PADDLE_WITH_CUSTOM_DEVICE)
59+
#define CUBLASLT_BLAS_ROUTINE_EACH(__macro) \
60+
__macro(cublasLtCreate); \
61+
__macro(cublasLtDestroy); \
62+
__macro(cublasLtMatmul); \
63+
__macro(cublasLtMatmulDescCreate); \
64+
__macro(cublasLtMatmulDescDestroy); \
65+
__macro(cublasLtMatmulDescSetAttribute); \
66+
__macro(cublasLtMatmulDescGetAttribute); \
67+
__macro(cublasLtMatrixLayoutCreate); \
68+
__macro(cublasLtMatrixLayoutDestroy); \
69+
__macro(cublasLtMatrixLayoutSetAttribute); \
70+
__macro(cublasLtMatrixLayoutGetAttribute); \
71+
__macro(cublasLtMatmulPreferenceCreate); \
72+
__macro(cublasLtMatmulPreferenceDestroy); \
73+
__macro(cublasLtMatmulPreferenceSetAttribute); \
74+
__macro(cublasLtMatmulAlgoGetHeuristic); \
75+
__macro(cublasLtMatrixTransform); \
76+
__macro(cublasLtMatrixTransformDescCreate); \
77+
__macro(cublasLtMatrixTransformDescDestroy); \
78+
__macro(cublasLtMatrixTransformDescSetAttribute); \
79+
__macro(cublasLtMatmulAlgoInit); \
80+
__macro(cublasLtMatmulAlgoConfigSetAttribute); \
81+
__macro(cublasLtMatmulAlgoConfigGetAttribute); \
82+
__macro(cublasLtMatmulAlgoGetIds); \
83+
__macro(cublasLtMatmulAlgoCapGetAttribute); \
84+
__macro(cublasLtMatmulAlgoCheck);
85+
// __macro(cublasLtGetCudartVersion);
86+
#else
87+
#define CUBLASLT_BLAS_ROUTINE_EACH(__macro) \
88+
__macro(cublasLtCreate); \
89+
__macro(cublasLtDestroy); \
90+
__macro(cublasLtMatmul); \
91+
__macro(cublasLtMatmulDescCreate); \
92+
__macro(cublasLtMatmulDescDestroy); \
93+
__macro(cublasLtMatmulDescSetAttribute); \
94+
__macro(cublasLtMatmulDescGetAttribute); \
95+
__macro(cublasLtMatrixLayoutCreate); \
96+
__macro(cublasLtMatrixLayoutDestroy); \
97+
__macro(cublasLtMatrixLayoutSetAttribute); \
98+
__macro(cublasLtMatrixLayoutGetAttribute); \
99+
__macro(cublasLtMatmulPreferenceCreate); \
100+
__macro(cublasLtMatmulPreferenceDestroy); \
101+
__macro(cublasLtMatmulPreferenceSetAttribute); \
102+
__macro(cublasLtMatmulAlgoGetHeuristic); \
103+
__macro(cublasLtMatrixTransform); \
104+
__macro(cublasLtMatrixTransformDescCreate); \
105+
__macro(cublasLtMatrixTransformDescDestroy); \
106+
__macro(cublasLtMatrixTransformDescSetAttribute);
107+
#endif
108+
109+
CUBLASLT_BLAS_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUBLASLT_WRAP)
110+
// #endif
111+
112+
#undef DECLARE_DYNAMIC_LOAD_CUBLASLT_WRAP
113+
} // namespace dynload
114+
} // namespace phi

0 commit comments

Comments
 (0)