diff --git a/.gitignore b/.gitignore index 2effaff..4a5d4b7 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,7 @@ # Generated files build/ +build-*/ +cmake-build-*/ generated/ # Prerequisites diff --git a/CMakeLists.txt b/CMakeLists.txt index 570b7d7..8999e5c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -11,6 +11,7 @@ set(PYBIND11_ENABLE_EXTRAS ON) option(WITH_CPU "Enable CPU backend" OFF) option(WITH_NVIDIA "Enable CUDA backend" OFF) option(WITH_ILUVATAR "Enable Iluvatar GPU backend" OFF) +option(WITH_HYGON "Enable Hygon GPU backend" OFF) option(WITH_METAX "Enable MetaX backend" OFF) option(WITH_CAMBRICON "Enable Cambricon backend" OFF) option(WITH_MOORE "Enable Moore backend" OFF) @@ -18,6 +19,8 @@ option(WITH_MOORE "Enable Moore backend" OFF) option(AUTO_DETECT_DEVICES "Automatically detect available devices" OFF) option(GENERATE_PYTHON_BINDINGS "Generate Python bindings" OFF) +set(_DEFAULT_HYGON_DTK_ROOT "/opt/dtk") + if(AUTO_DETECT_DEVICES) message(STATUS "Auto-detecting available devices...") @@ -37,6 +40,13 @@ if(AUTO_DETECT_DEVICES) message(STATUS "Auto-detected Iluvatar environment.") endif() + if(DEFINED ENV{DTK_ROOT} OR + EXISTS "${_DEFAULT_HYGON_DTK_ROOT}/cuda/bin/nvcc" OR + EXISTS "${_DEFAULT_HYGON_DTK_ROOT}/cuda/cuda/bin/nvcc") + set(WITH_HYGON ON) + message(STATUS "Auto-detected Hygon environment.") + endif() + if(DEFINED ENV{MACA_PATH}) set(WITH_METAX ON) message(STATUS "Auto-detected MetaX environment from MACA_PATH") @@ -77,14 +87,14 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR}/src) # Only one CUDA-like GPU backend can be enabled at a time. set(_gpu_backend_count 0) -foreach(_gpu_backend WITH_NVIDIA WITH_ILUVATAR WITH_METAX WITH_MOORE) +foreach(_gpu_backend WITH_NVIDIA WITH_ILUVATAR WITH_HYGON WITH_METAX WITH_MOORE) if(${_gpu_backend}) math(EXPR _gpu_backend_count "${_gpu_backend_count} + 1") endif() endforeach() if(_gpu_backend_count GREATER 1) - message(FATAL_ERROR "`WITH_NVIDIA`, `WITH_ILUVATAR`, `WITH_METAX`, and `WITH_MOORE` are mutually exclusive. Build one GPU backend at a time.") + message(FATAL_ERROR "`WITH_NVIDIA`, `WITH_ILUVATAR`, `WITH_HYGON`, `WITH_METAX`, and `WITH_MOORE` are mutually exclusive. Build one GPU backend at a time.") endif() if(WITH_NVIDIA) @@ -111,6 +121,64 @@ if(WITH_ILUVATAR) find_package(CUDAToolkit REQUIRED) endif() +if(WITH_HYGON) + add_compile_definitions(WITH_HYGON=1) + set(DTK_ROOT $ENV{DTK_ROOT}) + if(NOT DTK_ROOT) + set(DTK_ROOT "${_DEFAULT_HYGON_DTK_ROOT}") + endif() + if(NOT EXISTS "${DTK_ROOT}") + message(FATAL_ERROR "`WITH_HYGON` is `ON` but `DTK_ROOT` (`${DTK_ROOT}`) does not exist.") + endif() + + set(_HYGON_ARCH_DEFAULT "gfx906") + if(DEFINED ENV{HYGON_ARCH} AND NOT "$ENV{HYGON_ARCH}" STREQUAL "") + set(_HYGON_ARCH_DEFAULT "$ENV{HYGON_ARCH}") + else() + find_program(HYGON_ROCMINFO_EXECUTABLE NAMES rocminfo HINTS "${DTK_ROOT}/bin") + if(HYGON_ROCMINFO_EXECUTABLE) + execute_process( + COMMAND ${HYGON_ROCMINFO_EXECUTABLE} + OUTPUT_VARIABLE _HYGON_ROCMINFO_OUTPUT + ERROR_QUIET + OUTPUT_STRIP_TRAILING_WHITESPACE + ) + string(REGEX MATCH "gfx[0-9]+" _HYGON_ARCH_AUTO "${_HYGON_ROCMINFO_OUTPUT}") + if(_HYGON_ARCH_AUTO) + set(_HYGON_ARCH_DEFAULT "${_HYGON_ARCH_AUTO}") + endif() + endif() + endif() + + set(HYGON_ARCH "${_HYGON_ARCH_DEFAULT}" CACHE STRING "Hygon GPU architecture") + set(HYGON_CUDA_ROOT "${DTK_ROOT}/cuda") + if(EXISTS "${DTK_ROOT}/cuda/cuda/bin/nvcc") + set(HYGON_CUDA_ROOT "${DTK_ROOT}/cuda/cuda") + endif() + + if(NOT EXISTS "${HYGON_CUDA_ROOT}/bin/nvcc") + message(FATAL_ERROR "`WITH_HYGON` is `ON` but `${HYGON_CUDA_ROOT}/bin/nvcc` was not found. Checked `${DTK_ROOT}/cuda/bin/nvcc` and `${DTK_ROOT}/cuda/cuda/bin/nvcc`.") + endif() + + set(CMAKE_CUDA_COMPILER "${HYGON_CUDA_ROOT}/bin/nvcc" CACHE FILEPATH "Hygon CUDA compiler (DTK nvcc)") + set(CUDAToolkit_ROOT "${HYGON_CUDA_ROOT}" CACHE PATH "Hygon CUDA toolkit root") + set(CMAKE_CUDA_ARCHITECTURES OFF CACHE STRING "Disable default CUDA arch flags for Hygon" FORCE) + set(CMAKE_CUDA_FLAGS "-std=c++17 -fPIC -arch=${HYGON_ARCH} -Wno-return-type -Wno-error=unused-private-field" CACHE STRING "Hygon CUDA flags") + set(CMAKE_CUDA_SEPARABLE_COMPILATION OFF CACHE BOOL "Disable RDC for Hygon") + + # DTK's nvcc wrapper may invoke `nvcc` by name during compiler checks. + set(ENV{PATH} "${HYGON_CUDA_ROOT}/bin:$ENV{PATH}") + + include_directories("${DTK_ROOT}/include") + include_directories("${HYGON_CUDA_ROOT}/include") + link_directories("${DTK_ROOT}/lib") + link_directories("${HYGON_CUDA_ROOT}/lib64") + + message(STATUS "Hygon: CUDA compiler ${CMAKE_CUDA_COMPILER}, arch ${HYGON_ARCH}, DTK root ${DTK_ROOT}") + enable_language(CUDA) + find_package(CUDAToolkit REQUIRED) +endif() + if(WITH_METAX) add_compile_definitions(WITH_METAX=1) @@ -179,7 +247,7 @@ if(WITH_CAMBRICON) endif() # If all other platforms are not enabled, CPU is enabled by default. -if(NOT WITH_NVIDIA AND NOT WITH_ILUVATAR AND NOT WITH_METAX AND NOT WITH_MOORE) +if(NOT WITH_NVIDIA AND NOT WITH_ILUVATAR AND NOT WITH_HYGON AND NOT WITH_METAX AND NOT WITH_MOORE) add_compile_definitions(WITH_CPU=1) endif() diff --git a/README.md b/README.md index 875a936..c61bc02 100644 --- a/README.md +++ b/README.md @@ -38,11 +38,17 @@ For the ``: |----------------------------------------|------------------------------------|:-: | `-DWITH_CPU=[ON\|OFF]` | Compile the CPU implementation | n | `-DWITH_NVIDIA=[ON\|OFF]` | Compile the NVIDIA implementation | n +| `-DWITH_ILUVATAR=[ON\|OFF]` | Compile the Iluvatar implementation| n +| `-DWITH_HYGON=[ON\|OFF]` | Compile the Hygon implementation | n | `-DWITH_METAX=[ON\|OFF]` | Compile the MetaX implementation | n | `-DGENERATE_PYTHON_BINDINGS=[ON\|OFF]` | Generate Python bindings | n *Note: If no accelerator options are provided, `WITH_CPU` is enabled by default.* +For Hygon builds, set `DTK_ROOT` to the DTK installation root if it is not +installed at `/opt/dtk`. You can override the default DCU arch with +`-DHYGON_ARCH=` when configuring CMake. + ## 🚀 Running Examples After a successful build, the executables are located in the `build/examples` directory. diff --git a/examples/gemm/gemm.cc b/examples/gemm/gemm.cc index 4664740..9e1593c 100644 --- a/examples/gemm/gemm.cc +++ b/examples/gemm/gemm.cc @@ -11,6 +11,9 @@ #if WITH_ILUVATAR #include "iluvatar/gemm/cublas.h" #endif +#if WITH_HYGON +#include "hygon/gemm/cublas.h" +#endif #if WITH_METAX #include "metax/gemm/mcblas.h" #endif diff --git a/examples/runtime_api.h b/examples/runtime_api.h index c5b7597..bc7398f 100644 --- a/examples/runtime_api.h +++ b/examples/runtime_api.h @@ -19,6 +19,15 @@ #define DEVICE_MEMCPY_HOST_TO_DEVICE cudaMemcpyHostToDevice #define DEVICE_MEMCPY_DEVICE_TO_HOST cudaMemcpyDeviceToHost #define DEFAULT_DEVICE_TYPE Device::Type::kIluvatar +#elif WITH_HYGON +#include +#define DEVICE_MALLOC cudaMalloc +#define DEVICE_FREE cudaFree +#define DEVICE_MEMCPY cudaMemcpy +#define DEVICE_MEMSET cudaMemset +#define DEVICE_MEMCPY_HOST_TO_DEVICE cudaMemcpyHostToDevice +#define DEVICE_MEMCPY_DEVICE_TO_HOST cudaMemcpyDeviceToHost +#define DEFAULT_DEVICE_TYPE Device::Type::kHygon #elif WITH_METAX #include #define DEVICE_MALLOC mcMalloc diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 3ca0715..11eefb6 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -77,6 +77,34 @@ if(WITH_ILUVATAR) list(APPEND DEVICE_LIST "iluvatar") endif() +if(WITH_HYGON) + set(HYGON_PATTERNS + "cuda/*.cc" + "cuda/*.cpp" + "cuda/*.cu" + "hygon/*.cc" + "hygon/*.cpp" + "hygon/*.cu" + ) + + file(GLOB_RECURSE HYGON_SOURCES CONFIGURE_DEPENDS ${HYGON_PATTERNS}) + + enable_language(CUDA) + + target_compile_definitions(infiniops PUBLIC WITH_HYGON=1) + target_sources(infiniops PRIVATE ${HYGON_SOURCES}) + + find_package(CUDAToolkit REQUIRED) + target_link_libraries(infiniops PUBLIC CUDA::cudart CUDA::cublas) + + set_target_properties(infiniops PROPERTIES + CUDA_STANDARD 17 + CUDA_STANDARD_REQUIRED ON + ) + + list(APPEND DEVICE_LIST "hygon") +endif() + if(WITH_METAX) set(METAX_PATTERNS "cuda/*.cc" @@ -154,7 +182,7 @@ if(GENERATE_PYTHON_BINDINGS) set(PYBIND11_SOURCES "${PROJECT_SOURCE_DIR}/generated/bindings/ops.cc") # TODO: There might be a better solution. - if(WITH_NVIDIA OR WITH_ILUVATAR) + if(WITH_NVIDIA OR WITH_ILUVATAR OR WITH_HYGON) set_source_files_properties(${PYBIND11_SOURCES} PROPERTIES LANGUAGE CUDA) endif() diff --git a/src/common/cast.h b/src/common/cast.h index a37fb94..9272aee 100644 --- a/src/common/cast.h +++ b/src/common/cast.h @@ -1,8 +1,8 @@ #ifndef INFINI_OPS_COMMON_CAST_H_ #define INFINI_OPS_COMMON_CAST_H_ -#if defined(WITH_NVIDIA) || defined(WITH_ILUVATAR) || defined(WITH_METAX) || \ - defined(WITH_MOORE) +#if defined(WITH_NVIDIA) || defined(WITH_ILUVATAR) || defined(WITH_HYGON) || \ + defined(WITH_METAX) || defined(WITH_MOORE) #include "common/cuda/cast.h" #else #include "common/cpu/cast.h" diff --git a/src/common/cuda/cast.h b/src/common/cuda/cast.h index 1f67a44..01a032a 100644 --- a/src/common/cuda/cast.h +++ b/src/common/cuda/cast.h @@ -1,9 +1,7 @@ #ifndef INFINI_OPS_COMMON_CUDA_CAST_H_ #define INFINI_OPS_COMMON_CUDA_CAST_H_ -#ifdef WITH_NVIDIA -#include -#elif defined(WITH_ILUVATAR) +#if defined(WITH_NVIDIA) || defined(WITH_ILUVATAR) || defined(WITH_HYGON) #include #elif defined(WITH_METAX) #include diff --git a/src/common/cuda/kernel_commons.h b/src/common/cuda/kernel_commons.h index e2ef107..8d84d14 100644 --- a/src/common/cuda/kernel_commons.h +++ b/src/common/cuda/kernel_commons.h @@ -1,15 +1,8 @@ #ifndef INFINI_OPS_COMMON_CUDA_KERNEL_COMMONS_H_ #define INFINI_OPS_COMMON_CUDA_KERNEL_COMMONS_H_ -#ifdef WITH_NVIDIA +#if defined(WITH_NVIDIA) || defined(WITH_ILUVATAR) || defined(WITH_HYGON) #include -#include -#include -using cuda_bfloat16 = nv_bfloat16; -using cuda_bfloat162 = nv_bfloat162; -#elif defined(WITH_ILUVATAR) -#include -#include #include using cuda_bfloat16 = nv_bfloat16; using cuda_bfloat162 = nv_bfloat162; @@ -39,7 +32,7 @@ constexpr int CUDA_BLOCK_SIZE_512 = 512; constexpr int CUDA_BLOCK_SIZE_1024 = 1024; constexpr int CUDA_BLOCK_SIZE_2048 = 2048; -#if defined(WITH_NVIDIA) || defined(WITH_ILUVATAR) +#if defined(WITH_NVIDIA) || defined(WITH_ILUVATAR) || defined(WITH_HYGON) // Cache `cudaDeviceProp` per device, initialized once at first access. class DevicePropertyCache { public: diff --git a/src/cuda/gemm/blas.h b/src/cuda/gemm/blas.h index 5a7cf2f..ff6c878 100644 --- a/src/cuda/gemm/blas.h +++ b/src/cuda/gemm/blas.h @@ -21,7 +21,11 @@ class Blas : public Gemm { // TODO: Check constraints. } - ~Blas() { Backend::blasDestroy(handle_); } + ~Blas() { + if (handle_ != nullptr) { + Backend::blasDestroy(handle_); + } + } Blas(const Tensor a, const Tensor b, std::optional alpha, std::optional beta, Tensor c) @@ -69,7 +73,6 @@ class Blas : public Gemm { return β } - private: auto GetOpA(int trans_a, int trans_b) const { if (swap_a_and_b_) { return (b_is_col_major_ == trans_b) ? Backend::BLAS_OP_T @@ -88,13 +91,14 @@ class Blas : public Gemm { : Backend::BLAS_OP_N; } - bool a_is_col_major_{false}; + bool swap_a_and_b_{false}; - bool b_is_col_major_{false}; + mutable typename Backend::blasHandle_t handle_{}; - bool swap_a_and_b_{false}; + private: + bool a_is_col_major_{false}; - typename Backend::blasHandle_t handle_; + bool b_is_col_major_{false}; }; } // namespace infini::ops diff --git a/src/data_type.h b/src/data_type.h index af2aec7..f9bed2d 100644 --- a/src/data_type.h +++ b/src/data_type.h @@ -5,10 +5,7 @@ #include #include -#ifdef WITH_NVIDIA -#include -#include -#elif defined(WITH_ILUVATAR) +#if defined(WITH_NVIDIA) || defined(WITH_ILUVATAR) || defined(WITH_HYGON) #include #include #elif defined(WITH_METAX) @@ -198,7 +195,7 @@ DEFINE_DATA_TYPE_MAPPING(kInt64, std::int64_t) DEFINE_DATA_TYPE_MAPPING(kFloat32, float) DEFINE_DATA_TYPE_MAPPING(kFloat64, double) -#if defined(WITH_NVIDIA) || defined(WITH_ILUVATAR) +#if defined(WITH_NVIDIA) || defined(WITH_ILUVATAR) || defined(WITH_HYGON) DEFINE_DATA_TYPE_MAPPING(kFloat16, half) DEFINE_DATA_TYPE_MAPPING(kBFloat16, __nv_bfloat16) #elif defined(WITH_METAX) diff --git a/src/hygon/add/kernel.h b/src/hygon/add/kernel.h new file mode 100644 index 0000000..a051b72 --- /dev/null +++ b/src/hygon/add/kernel.h @@ -0,0 +1,42 @@ +#ifndef INFINI_OPS_HYGON_ADD_KERNEL_H_ +#define INFINI_OPS_HYGON_ADD_KERNEL_H_ + +#include + +// clang-format off +#include +// clang-format on + +#include "cuda/add/kernel.h" + +namespace infini::ops { + +namespace add { + +struct HygonBackend { + using stream_t = cudaStream_t; + + static constexpr auto malloc = [](auto&&... args) { + return cudaMalloc(std::forward(args)...); + }; + + static constexpr auto memcpy = cudaMemcpy; + + static constexpr auto free = [](auto&&... args) { + return cudaFree(std::forward(args)...); + }; + + static constexpr auto memcpyH2D = cudaMemcpyHostToDevice; +}; + +} // namespace add + +template <> +class Operator : public CudaAdd { + public: + using CudaAdd::CudaAdd; +}; + +} // namespace infini::ops + +#endif diff --git a/src/hygon/gemm/cublas.h b/src/hygon/gemm/cublas.h new file mode 100644 index 0000000..a220f2a --- /dev/null +++ b/src/hygon/gemm/cublas.h @@ -0,0 +1,123 @@ +#ifndef INFINI_OPS_HYGON_GEMM_CUBLAS_H_ +#define INFINI_OPS_HYGON_GEMM_CUBLAS_H_ + +#include + +// clang-format off +#include "cublas_v2.h" +// clang-format on + +#include "cuda/gemm/blas.h" + +namespace infini::ops { + +namespace gemm { + +struct HygonBackend { + using blasHandle_t = cublasHandle_t; + + using stream_t = cudaStream_t; + + static constexpr auto BLAS_OP_N = CUBLAS_OP_N; + + static constexpr auto BLAS_OP_T = CUBLAS_OP_T; + + static constexpr auto R_16F = CUDA_R_16F; + + static constexpr auto R_16BF = CUDA_R_16BF; + + static constexpr auto R_32F = CUDA_R_32F; + + static constexpr auto BLAS_COMPUTE_32F = CUBLAS_COMPUTE_32F; + + // DTK exposes the TF32 enum for compatibility, but BW/GFX9-class Hygon + // devices do not provide a working TF32 GEMM fast path. + static constexpr auto BLAS_COMPUTE_32F_FAST_TF32 = CUBLAS_COMPUTE_32F; + + static constexpr auto BLAS_GEMM_DEFAULT = CUBLAS_GEMM_DEFAULT_TENSOR_OP; + + static constexpr auto blasCreate = cublasCreate; + + static constexpr auto blasSetStream = cublasSetStream; + + static constexpr auto blasDestroy = cublasDestroy; + + static constexpr auto blasGemmEx = [](auto&&... args) { + return cublasGemmEx(std::forward(args)...); + }; + + static constexpr auto blasGemmStridedBatchedEx = [](auto&&... args) { + return cublasGemmStridedBatchedEx(std::forward(args)...); + }; + + static auto GetDataType(DataType dtype) { + if (dtype == DataType::kFloat16) return R_16F; + if (dtype == DataType::kBFloat16) return R_16BF; + return R_32F; + } + + static auto GetComputeType(DataType dtype) { + if (dtype == DataType::kFloat16 || dtype == DataType::kBFloat16) + return BLAS_COMPUTE_32F; + return BLAS_COMPUTE_32F_FAST_TF32; + } +}; + +} // namespace gemm + +template <> +class Operator : public Blas { + public: + using Blas::Blas; + + void operator()(const Tensor a, const Tensor b, std::optional alpha, + std::optional beta, std::optional trans_a, + std::optional trans_b, Tensor c) const override { + if (this->handle_ == nullptr) { + gemm::HygonBackend::blasCreate(&this->handle_); + } + + gemm::HygonBackend::blasSetStream( + this->handle_, static_cast(this->stream_)); + + const auto& alpha_value{alpha.value_or(this->alpha_)}; + const auto& beta_value{beta.value_or(this->beta_)}; + + const auto& trans_a_value{trans_a.value_or(this->trans_a_)}; + const auto& trans_b_value{trans_b.value_or(this->trans_b_)}; + auto op_a{this->GetOpA(trans_a_value, trans_b_value)}; + auto op_b{this->GetOpB(trans_a_value, trans_b_value)}; + const void* alpha_ptr{this->GetAlphaPtr(alpha_value, c.dtype())}; + const void* beta_ptr{this->GetBetaPtr(beta_value, c.dtype())}; + + if (this->batch_count_ == 1) { + gemm::HygonBackend::blasGemmEx( + this->handle_, op_a, op_b, + this->swap_a_and_b_ ? this->n_ : this->m_, + this->swap_a_and_b_ ? this->m_ : this->n_, this->k_, alpha_ptr, + this->swap_a_and_b_ ? b.data() : a.data(), + gemm::HygonBackend::GetDataType(this->swap_a_and_b_ ? b.dtype() + : a.dtype()), + this->swap_a_and_b_ ? this->ldb_ : this->lda_, + this->swap_a_and_b_ ? a.data() : b.data(), + gemm::HygonBackend::GetDataType(this->swap_a_and_b_ ? a.dtype() + : b.dtype()), + this->swap_a_and_b_ ? this->lda_ : this->ldb_, beta_ptr, c.data(), + gemm::HygonBackend::GetDataType(c.dtype()), this->ldc_, + gemm::HygonBackend::GetComputeType(c.dtype()), + gemm::HygonBackend::BLAS_GEMM_DEFAULT); + gemm::HygonBackend::blasDestroy(this->handle_); + this->handle_ = nullptr; + return; + } + + Blas::operator()(a, b, alpha, beta, trans_a, trans_b, + c); + gemm::HygonBackend::blasDestroy(this->handle_); + this->handle_ = nullptr; + } +}; + +} // namespace infini::ops + +#endif