diff --git a/src/infiniop/devices/metax/metax_common.h b/src/infiniop/devices/metax/metax_common.h index 313d30e4c..a29d44791 100644 --- a/src/infiniop/devices/metax/metax_common.h +++ b/src/infiniop/devices/metax/metax_common.h @@ -6,9 +6,11 @@ #ifdef ENABLE_METAX_MC_API #include #include +#include #else #include #include +#include #endif #include #include diff --git a/src/infiniop/devices/metax/metax_ht2mc.h b/src/infiniop/devices/metax/metax_ht2mc.h index 447792b67..671bbb688 100644 --- a/src/infiniop/devices/metax/metax_ht2mc.h +++ b/src/infiniop/devices/metax/metax_ht2mc.h @@ -101,4 +101,6 @@ #define hcGraphDestroy mcGraphDestroy #define hcGraphExecDestroy mcGraphExecDestroy #define hcGraphLaunch mcGraphLaunch +#define hcMemsetAsync mcMemsetAsync +#define hcGetLastError mcGetLastError #endif diff --git a/src/infiniop/devices/metax/metax_kernel_common.h b/src/infiniop/devices/metax/metax_kernel_common.h index f58fe6c53..202e40395 100644 --- a/src/infiniop/devices/metax/metax_kernel_common.h +++ b/src/infiniop/devices/metax/metax_kernel_common.h @@ -1,8 +1,12 @@ #define INFINIOP_METAX_KERNEL __global__ void #ifdef ENABLE_METAX_MC_API +#include +#include #include #else +#include +#include #include #endif diff --git a/src/infiniop/ops/addcmul/cuda/kernel.cuh b/src/infiniop/ops/addcmul/cuda/kernel.cuh index 3ee56f0d1..2ec83c26c 100644 --- a/src/infiniop/ops/addcmul/cuda/kernel.cuh +++ b/src/infiniop/ops/addcmul/cuda/kernel.cuh @@ -1,8 +1,6 @@ #ifndef __ADDCMUL_CUDA_CUH__ #define __ADDCMUL_CUDA_CUH__ -#include -#include #include namespace op::addcmul::cuda { diff --git a/src/infiniop/ops/addcmul/metax/addcmul_metax.maca b/src/infiniop/ops/addcmul/metax/addcmul_metax.maca index b19ac9931..6d851217b 100644 --- a/src/infiniop/ops/addcmul/metax/addcmul_metax.maca +++ b/src/infiniop/ops/addcmul/metax/addcmul_metax.maca @@ -1,3 +1,4 @@ +#include "../../../devices/metax/metax_handle.h" #include "../../../elementwise/metax/elementwise_metax.h" #include "addcmul_metax.h" @@ -129,7 +130,7 @@ static inline infiniStatus_t launch_addcmul_kernel( auto *t1_ptr = reinterpret_cast(inputs.at(1)); auto *t2_ptr = reinterpret_cast(inputs.at(2)); - mcStream_t metax_stream = reinterpret_cast(stream); + hcStream_t metax_stream = reinterpret_cast(stream); constexpr uint32_t BLOCK_SIZE = 256; uint32_t grid = static_cast((output_size + BLOCK_SIZE - 1) / BLOCK_SIZE); @@ -146,7 +147,7 @@ static inline infiniStatus_t launch_addcmul_kernel( t2_ptr, desc->getValue()); - CHECK_METAX(mcGetLastError()); + CHECK_METAX(hcGetLastError()); return INFINI_STATUS_SUCCESS; } diff --git a/src/infiniop/ops/addr/cpu/addr_cpu.cc b/src/infiniop/ops/addr/cpu/addr_cpu.cc index a2c4526cf..f1069aef1 100644 --- a/src/infiniop/ops/addr/cpu/addr_cpu.cc +++ b/src/infiniop/ops/addr/cpu/addr_cpu.cc @@ -1,6 +1,6 @@ #include "addr_cpu.h" #include "../../../devices/cpu/common_cpu.h" -#include + namespace op::addr::cpu { Descriptor::~Descriptor() = default; diff --git a/src/infiniop/ops/argwhere/cpu/argwhere_cpu.cc b/src/infiniop/ops/argwhere/cpu/argwhere_cpu.cc index 95d4b700b..98782f958 100644 --- a/src/infiniop/ops/argwhere/cpu/argwhere_cpu.cc +++ b/src/infiniop/ops/argwhere/cpu/argwhere_cpu.cc @@ -37,27 +37,36 @@ infiniStatus_t calculateArgWhere( const void *x) { const Tdata *x_data = reinterpret_cast(x); - // int64_t *y_data = reinterpret_cast(y); - std::vector positions; - // #pragma omp parallel for + + std::vector positions; + const size_t ndim = info.shapes.size(); + for (size_t i = 0; i < info.num_elements; i++) { - size_t pos = 0, tem = i; - std::vector position(info.strides.size()); - for (size_t j = info.strides.size() - 1; j >= 0; j--) { - position[j] = tem % info.shapes[j]; - tem /= info.shapes[j]; - pos += position[j] * info.strides[j]; + size_t pos = 0; + size_t tmp = i; + + std::vector coord(ndim); + + // unravel index + for (size_t j = ndim; j-- > 0;) { + coord[j] = tmp % info.shapes[j]; + tmp /= info.shapes[j]; + pos += coord[j] * info.strides[j]; } - if (fabs(x_data[pos] - 0.0f) > 1e-5) { - for (auto p : position) { - positions.push_back(p); + + // PyTorch semantics: != 0 + if (x_data[pos] != Tdata(0)) { + for (size_t j = 0; j < ndim; j++) { + positions.push_back(coord[j]); } } } + *count = positions.size() / ndim; + *y = new int64_t[positions.size()]; memcpy(*y, positions.data(), positions.size() * sizeof(int64_t)); - *count = positions.size() / info.strides.size(); + return INFINI_STATUS_SUCCESS; } diff --git a/src/infiniop/ops/argwhere/moore/argwhere_moore.mu b/src/infiniop/ops/argwhere/moore/argwhere_moore.mu index 59c7b9b88..ac95d78e8 100644 --- a/src/infiniop/ops/argwhere/moore/argwhere_moore.mu +++ b/src/infiniop/ops/argwhere/moore/argwhere_moore.mu @@ -3,16 +3,6 @@ #include "argwhere_kernel.h" #include "argwhere_moore.h" #include "infinicore.h" -#include - -// template -// INFINIOP_MOORE_KERNEL parallel_block_argwhere(T *data, int64_t *results, size_t N, -// size_t M, const size_t *shapes, -// const ptrdiff_t *strides, size_t ndim, -// size_t *count) { -// parallel_block_argwhere_kernel<<<1, M / 2, M>>>( -// data, results, N, shapes, strides, ndim, count); -// } infiniStatus_t launchKernel(const void *data, int64_t *results, size_t N, size_t M, const size_t *shapes, @@ -90,12 +80,6 @@ infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, musaMemcpyAsync(*y, result, sizeof(int64_t) * (*count) * ndim, musaMemcpyDeviceToHost, moore_stream); - // cudaStreamSynchronize(cuda_stream); - // for (size_t i = 0; i < (*count) * ndim; i++) { - // spdlog::debug("(*y)[{}]:{}", i, static_cast(*y)[i]); - // } - // cudaFreeAsync(result, cuda_stream); - // cudaFreeAsync(count_cuda, cuda_stream); return INFINI_STATUS_SUCCESS; } diff --git a/src/infiniop/ops/atanh/cuda/kernel.cuh b/src/infiniop/ops/atanh/cuda/kernel.cuh index 3a75eaa26..f984c0686 100644 --- a/src/infiniop/ops/atanh/cuda/kernel.cuh +++ b/src/infiniop/ops/atanh/cuda/kernel.cuh @@ -1,9 +1,6 @@ #ifndef __ATANH_CUDA_H__ #define __ATANH_CUDA_H__ -#include -#include - namespace op::atanh::cuda { typedef struct AtanhOp { public: diff --git a/src/infiniop/ops/binary_cross_entropy_with_logits/metax/binary_cross_entropy_with_logits_metax.maca b/src/infiniop/ops/binary_cross_entropy_with_logits/metax/binary_cross_entropy_with_logits_metax.maca index 3bf8039a5..c14bb75bc 100644 --- a/src/infiniop/ops/binary_cross_entropy_with_logits/metax/binary_cross_entropy_with_logits_metax.maca +++ b/src/infiniop/ops/binary_cross_entropy_with_logits/metax/binary_cross_entropy_with_logits_metax.maca @@ -1,8 +1,9 @@ #include "../../../devices/metax/metax_common.h" #include "../../../devices/metax/metax_handle.h" #include "../../../devices/metax/metax_kernel_common.h" + #include "binary_cross_entropy_with_logits_metax.h" -#include + #include namespace op::bce_with_logits::metax { @@ -191,7 +192,7 @@ infiniStatus_t Descriptor::calculate( const void *pos_weight, void *stream) const { - mcStream_t custream = (mcStream_t)stream; + hcStream_t custream = (hcStream_t)stream; size_t n = _info.num_elements; // F16/BF16 + 归约需要 float workspace @@ -219,7 +220,7 @@ infiniStatus_t Descriptor::calculate( case INFINI_DTYPE_F32: { // 如果是规约操作,计算前需将输出位置清零 if (_reduction != INFINIOP_REDUCTION_NONE) { - mcMemsetAsync(out, 0, sizeof(float), custream); + hcMemsetAsync(out, 0, sizeof(float), custream); } bce_logits_kernel<<>>( @@ -255,7 +256,7 @@ infiniStatus_t Descriptor::calculate( out_raw = out; } else { workspace_f = static_cast(workspace); - mcMemsetAsync(workspace_f, 0, sizeof(float), custream); + hcMemsetAsync(workspace_f, 0, sizeof(float), custream); out_raw = workspace_f; } @@ -294,7 +295,7 @@ infiniStatus_t Descriptor::calculate( out_raw = out; } else { workspace_f = static_cast(workspace); - mcMemsetAsync(workspace_f, 0, sizeof(float), custream); + hcMemsetAsync(workspace_f, 0, sizeof(float), custream); out_raw = workspace_f; } @@ -324,8 +325,8 @@ infiniStatus_t Descriptor::calculate( return INFINI_STATUS_BAD_TENSOR_DTYPE; } - mcError_t err = mcGetLastError(); - if (err != mcSuccess) { + hcError_t err = hcGetLastError(); + if (err != hcSuccess) { return INFINI_STATUS_INTERNAL_ERROR; } return INFINI_STATUS_SUCCESS; diff --git a/src/infiniop/ops/cdist/metax/cdist_metax.maca b/src/infiniop/ops/cdist/metax/cdist_metax.maca index 280437943..b0ebb0d81 100644 --- a/src/infiniop/ops/cdist/metax/cdist_metax.maca +++ b/src/infiniop/ops/cdist/metax/cdist_metax.maca @@ -1,4 +1,6 @@ +#include "../../../devices/metax/metax_common.h" #include "../../../devices/metax/metax_handle.h" +#include "../../../devices/metax/metax_kernel_common.h" #include "cdist_metax.h" #include namespace op::cdist::metax { @@ -126,7 +128,7 @@ infiniStatus_t Descriptor::calculate( return INFINI_STATUS_BAD_TENSOR_DTYPE; } - mcStream_t custream = (mcStream_t)stream; + hcStream_t custream = (hcStream_t)stream; dim3 block(16, 16); dim3 grid( static_cast((_info.n + block.x - 1) / block.x), @@ -151,8 +153,8 @@ infiniStatus_t Descriptor::calculate( _info.y_matrix.col_stride, _p); - auto err = mcGetLastError(); - if (err != mcSuccess) { + auto err = hcGetLastError(); + if (err != hcSuccess) { return INFINI_STATUS_INTERNAL_ERROR; } diff --git a/src/infiniop/ops/equal/cuda/kernel.cuh b/src/infiniop/ops/equal/cuda/kernel.cuh index 11ad5981e..30b4ae4fb 100644 --- a/src/infiniop/ops/equal/cuda/kernel.cuh +++ b/src/infiniop/ops/equal/cuda/kernel.cuh @@ -1,13 +1,6 @@ #ifndef __EQUAL_CUDA_H__ #define __EQUAL_CUDA_H__ -#if defined(__MACACC__) -#include -#include -#else -#include -#include -#endif #include namespace op::equal::cuda { diff --git a/src/infiniop/ops/hardswish/cuda/kernel.cuh b/src/infiniop/ops/hardswish/cuda/kernel.cuh index 21b6a5f8d..25dfd55a0 100644 --- a/src/infiniop/ops/hardswish/cuda/kernel.cuh +++ b/src/infiniop/ops/hardswish/cuda/kernel.cuh @@ -2,13 +2,6 @@ #define __HARDSWISH_CUDA_H__ #include -#if defined(__MACACC__) -#include -#include -#else -#include -#include -#endif namespace op::hardswish::cuda { diff --git a/src/infiniop/ops/hardtanh/cuda/kernel.cuh b/src/infiniop/ops/hardtanh/cuda/kernel.cuh index 28987f82c..fa8c3d130 100644 --- a/src/infiniop/ops/hardtanh/cuda/kernel.cuh +++ b/src/infiniop/ops/hardtanh/cuda/kernel.cuh @@ -1,13 +1,6 @@ #ifndef __HARDTANH_CUDA_H__ #define __HARDTANH_CUDA_H__ -#if defined(__MACACC__) -#include -#include -#else -#include -#include -#endif #include namespace op::hardtanh::cuda { diff --git a/src/infiniop/ops/hypot/cuda/kernel.cuh b/src/infiniop/ops/hypot/cuda/kernel.cuh index 2af33955d..a251dbc7f 100644 --- a/src/infiniop/ops/hypot/cuda/kernel.cuh +++ b/src/infiniop/ops/hypot/cuda/kernel.cuh @@ -3,14 +3,6 @@ #include #include -#if ENABLE_METAX_API -#include -#include -using nv_bfloat162 = __maca_bfloat162; -#else -#include -#include -#endif namespace op::hypot::cuda { diff --git a/src/infiniop/ops/hypot/metax/hypot_metax.maca b/src/infiniop/ops/hypot/metax/hypot_metax.maca index 8eb403c64..3ce904fa3 100644 --- a/src/infiniop/ops/hypot/metax/hypot_metax.maca +++ b/src/infiniop/ops/hypot/metax/hypot_metax.maca @@ -61,7 +61,7 @@ infiniStatus_t Descriptor::calculate( case INFINI_DTYPE_F16: return _device_info->calculate<256, cuda::HypotOp, half>(_info, workspace, output, inputs, stream); case INFINI_DTYPE_BF16: - return _device_info->calculate<256, cuda::HypotOp, nv_bfloat162>(_info, workspace, output, inputs, stream); + return _device_info->calculate<256, cuda::HypotOp, cuda_bfloat162>(_info, workspace, output, inputs, stream); case INFINI_DTYPE_F32: return _device_info->calculate<256, cuda::HypotOp, float>(_info, workspace, output, inputs, stream); case INFINI_DTYPE_F64: diff --git a/src/infiniop/ops/index_add/cuda/kernel.cuh b/src/infiniop/ops/index_add/cuda/kernel.cuh index cb658e0d9..929af819d 100644 --- a/src/infiniop/ops/index_add/cuda/kernel.cuh +++ b/src/infiniop/ops/index_add/cuda/kernel.cuh @@ -1,16 +1,6 @@ #ifndef __INDEX_ADD_CUDA_H__ #define __INDEX_ADD_CUDA_H__ -#if ENABLE_METAX_API -#include -#include -#define __nv_bfloat16 __maca_bfloat16 -#define __nv_bfloat162 __maca_bfloat162 -#else -#include -#include -#include -#endif #include namespace op::index_add::cuda { @@ -40,7 +30,7 @@ __device__ __forceinline__ void atomic_add_custom(__half *address, __half val) { #endif } -__device__ __forceinline__ void atomic_add_custom(__nv_bfloat16 *address, __nv_bfloat16 val) { +__device__ __forceinline__ void atomic_add_custom(cuda_bfloat16 *address, cuda_bfloat16 val) { #if __CUDA_ARCH__ >= 800 atomicAdd(address, val); #else @@ -52,9 +42,9 @@ __device__ __forceinline__ void atomic_add_custom(__nv_bfloat16 *address, __nv_b do { assumed = old; unsigned short old_val_raw = (size_t)address & 2 ? (old >> 16) : (old & 0xffff); - __nv_bfloat16 old_val = *reinterpret_cast<__nv_bfloat16 *>(&old_val_raw); + cuda_bfloat16 old_val = *reinterpret_cast(&old_val_raw); - __nv_bfloat16 new_val = old_val + val; + cuda_bfloat16 new_val = old_val + val; unsigned short new_val_raw = *reinterpret_cast(&new_val); unsigned int new_int = (size_t)address & 2 ? (old & 0xffff) | (new_val_raw << 16) diff --git a/src/infiniop/ops/index_add/metax/index_add_metax.maca b/src/infiniop/ops/index_add/metax/index_add_metax.maca index d32a78737..609599332 100644 --- a/src/infiniop/ops/index_add/metax/index_add_metax.maca +++ b/src/infiniop/ops/index_add/metax/index_add_metax.maca @@ -1,14 +1,13 @@ #include "../../../devices/metax/metax_common.h" #include "../../../devices/metax/metax_handle.h" +#include "../../../devices/metax/metax_kernel_common.h" #include "../../../tensor.h" + #include "../cuda/kernel.cuh" #include "index_add_metax.h" #include -#include #include -#include -#include -#include + #include namespace op::index_add::metax { @@ -33,8 +32,8 @@ __device__ __forceinline__ void gpuAtomicAdd( template <> __device__ __forceinline__ void gpuAtomicAdd( - __maca_bfloat16 *address, - __maca_bfloat16 val) { + cuda_bfloat16 *address, + cuda_bfloat16 val) { unsigned int *addr = (unsigned int *)((char *)address - ((size_t)address & 2)); unsigned int old = *addr; @@ -47,7 +46,7 @@ __device__ __forceinline__ void gpuAtomicAdd( ? (assumed >> 16) : (assumed & 0xFFFF); - __maca_bfloat16 sum = (__maca_bfloat16)((float)*reinterpret_cast<__maca_bfloat16 *>(&old_val) + (float)val); + cuda_bfloat16 sum = (cuda_bfloat16)((float)*reinterpret_cast(&old_val) + (float)val); unsigned short res = *reinterpret_cast(&sum); @@ -266,7 +265,7 @@ infiniStatus_t Descriptor::calculate( break; case INFINI_DTYPE_BF16: - LAUNCH(__maca_bfloat16, int32_t); + LAUNCH(cuda_bfloat16, int32_t); break; case INFINI_DTYPE_F32: @@ -297,7 +296,7 @@ infiniStatus_t Descriptor::calculate( break; case INFINI_DTYPE_BF16: - LAUNCH(__maca_bfloat16, int64_t); + LAUNCH(cuda_bfloat16, int64_t); break; case INFINI_DTYPE_F32: diff --git a/src/infiniop/ops/index_add/nvidia/index_add_nvidia.cu b/src/infiniop/ops/index_add/nvidia/index_add_nvidia.cu index f79451066..d946b44d4 100644 --- a/src/infiniop/ops/index_add/nvidia/index_add_nvidia.cu +++ b/src/infiniop/ops/index_add/nvidia/index_add_nvidia.cu @@ -1,3 +1,5 @@ +#include "../../../devices/nvidia/nvidia_common.cuh" +#include "../../../devices/nvidia/nvidia_kernel_common.cuh" #include "../../../handle.h" // Iluvatar does not support atomic add yet @@ -8,9 +10,6 @@ #include "index_add_nvidia.cuh" #include -#include -#include - namespace op::index_add::nvidia { template diff --git a/src/infiniop/ops/index_copy/cuda/kernel.cuh b/src/infiniop/ops/index_copy/cuda/kernel.cuh index 373765cf6..3b7866564 100644 --- a/src/infiniop/ops/index_copy/cuda/kernel.cuh +++ b/src/infiniop/ops/index_copy/cuda/kernel.cuh @@ -1,16 +1,6 @@ #ifndef __INDEX_COPY_CUDA_H__ #define __INDEX_COPY_CUDA_H__ -// #include -#if defined(__MACA__) || defined(__MACACC__) -#include -#include -using nv_bfloat162 = __maca_bfloat162; -#else -#include -#include -#endif - #include namespace op::index_copy::cuda { diff --git a/src/infiniop/ops/index_copy/metax/index_copy_metax.maca b/src/infiniop/ops/index_copy/metax/index_copy_metax.maca index 750908198..9618cb8de 100644 --- a/src/infiniop/ops/index_copy/metax/index_copy_metax.maca +++ b/src/infiniop/ops/index_copy/metax/index_copy_metax.maca @@ -1,17 +1,10 @@ #include "../../../devices/metax/metax_common.h" #include "../../../devices/metax/metax_handle.h" +#include "../../../devices/metax/metax_kernel_common.h" #include "index_copy_metax.h" #include -#include #include -#include #include -#if defined(__MACA__) || defined(__MACACC__) -#include -#include -#endif -#include "../../../tensor.h" -#include "../cuda/kernel.cuh" #include "../../../tensor.h" #include "../cuda/kernel.cuh" @@ -149,8 +142,8 @@ infiniStatus_t Descriptor::calculate( LAUNCH(__half, int32_t); break; case INFINI_DTYPE_BF16: -#if defined(__MACA__) || defined(__MACACC__) - LAUNCH(__maca_bfloat16, int32_t); +#if defined(__MACA__) || defined(ENABLE_METAX_MC_API) + LAUNCH(cuda_bfloat16, int32_t); #endif break; case INFINI_DTYPE_F32: @@ -174,8 +167,8 @@ infiniStatus_t Descriptor::calculate( LAUNCH(__half, int64_t); break; case INFINI_DTYPE_BF16: -#if defined(__MACA__) || defined(__MACACC__) - LAUNCH(__maca_bfloat16, int64_t); +#if defined(__MACA__) || defined(ENABLE_METAX_MC_API) + LAUNCH(cuda_bfloat16, int64_t); #endif break; case INFINI_DTYPE_F32: diff --git a/src/infiniop/ops/index_copy/nvidia/index_copy_nvidia.cu b/src/infiniop/ops/index_copy/nvidia/index_copy_nvidia.cu index 6c0840b6c..5bf1208bb 100644 --- a/src/infiniop/ops/index_copy/nvidia/index_copy_nvidia.cu +++ b/src/infiniop/ops/index_copy/nvidia/index_copy_nvidia.cu @@ -1,12 +1,11 @@ +#include "../../../devices/nvidia/nvidia_common.cuh" +#include "../../../devices/nvidia/nvidia_kernel_common.cuh" #include "../../../handle.h" -#include "../cuda/kernel.cuh" // 假设这是通用 kernel 头文件路径,或者是 index_copy_cuda.h + +#include "../cuda/kernel.cuh" #include "index_copy_nvidia.cuh" #include -// 【关键】引入 CUDA 浮点类型定义 -#include -#include - namespace op::index_copy::nvidia { // ================================================================== @@ -131,7 +130,7 @@ infiniStatus_t Descriptor::calculate( auto idx_dtype = _info.idx_dtype(); // 宏:根据 T_STORAGE 类型实例化 launch_kernel -// T_STORAGE 将会是: float, double, int32_t, __half, __nv_bfloat16 +// T_STORAGE 将会是: float, double, int32_t, __half, cuda_bfloat16 #define LAUNCH_BY_SIZE(T_STORAGE) \ switch (idx_dtype) { \ case INFINI_DTYPE_I32: \ @@ -157,9 +156,9 @@ infiniStatus_t Descriptor::calculate( case INFINI_DTYPE_F16: LAUNCH_BY_SIZE(__half); break; - // 16-bit BFloat16 (bf16) -> 使用 __nv_bfloat16 + // 16-bit BFloat16 (bf16) -> 使用 cuda_bfloat16 case INFINI_DTYPE_BF16: - LAUNCH_BY_SIZE(__nv_bfloat16); + LAUNCH_BY_SIZE(cuda_bfloat16); break; // Integers case INFINI_DTYPE_I32: diff --git a/src/infiniop/ops/sigmoid/cuda/kernel.cuh b/src/infiniop/ops/sigmoid/cuda/kernel.cuh index 9c7978b21..505fdc38b 100644 --- a/src/infiniop/ops/sigmoid/cuda/kernel.cuh +++ b/src/infiniop/ops/sigmoid/cuda/kernel.cuh @@ -1,10 +1,6 @@ #ifndef __SIDMOID_CUDA_H__ #define __SIDMOID_CUDA_H__ -#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" -#include -#include - namespace op::sigmoid::cuda { typedef struct SigmoidOp { public: @@ -18,8 +14,8 @@ public: } else if constexpr (std::is_same_v) { half denominator = __hadd(__float2half(1.0f), hexp(__hneg(x))); return hrcp(denominator); - } else if constexpr (std::is_same_v) { - __nv_bfloat16 denominator = __float2bfloat16(__fadd_rn(1.0f, __expf(__bfloat162float(-x)))); + } else if constexpr (std::is_same_v) { + cuda_bfloat16 denominator = __float2bfloat16(__fadd_rn(1.0f, __expf(__bfloat162float(-x)))); return __float2bfloat16(1.0f) / denominator; } else if constexpr (std::is_same_v) { if (x >= 0.0f) { diff --git a/src/infiniop/ops/sigmoid/nvidia/sigmoid_nvidia.cu b/src/infiniop/ops/sigmoid/nvidia/sigmoid_nvidia.cu index 43f6df9e6..5cbd80db7 100644 --- a/src/infiniop/ops/sigmoid/nvidia/sigmoid_nvidia.cu +++ b/src/infiniop/ops/sigmoid/nvidia/sigmoid_nvidia.cu @@ -1,3 +1,5 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + #include "../cuda/kernel.cuh" #include "sigmoid_nvidia.cuh" @@ -43,7 +45,7 @@ infiniStatus_t Descriptor::calculate( case INFINI_DTYPE_F16: return _device_info->calculate<256, cuda::SigmoidOp, half>(_info, workspace, output, inputs, stream); case INFINI_DTYPE_BF16: - return _device_info->calculate<256, cuda::SigmoidOp, __nv_bfloat16>(_info, workspace, output, inputs, stream); + return _device_info->calculate<256, cuda::SigmoidOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); case INFINI_DTYPE_F32: return _device_info->calculate<256, cuda::SigmoidOp, float>(_info, workspace, output, inputs, stream); case INFINI_DTYPE_F64: diff --git a/src/infiniop/ops/smooth_l1_loss/cuda/kernel.cuh b/src/infiniop/ops/smooth_l1_loss/cuda/kernel.cuh index 214163156..cf75ba180 100644 --- a/src/infiniop/ops/smooth_l1_loss/cuda/kernel.cuh +++ b/src/infiniop/ops/smooth_l1_loss/cuda/kernel.cuh @@ -1,15 +1,6 @@ #ifndef __SMOOTH_L1_LOSS_CUDA_CUH__ #define __SMOOTH_L1_LOSS_CUDA_CUH__ -#if defined(__MACA__) || defined(__MACACC__) -#include -#include -#else -#include -#include -#include -#endif - #include namespace op::smooth_l1_loss::cuda { diff --git a/src/infiniop/ops/smooth_l1_loss/metax/smooth_l1_loss_metax.maca b/src/infiniop/ops/smooth_l1_loss/metax/smooth_l1_loss_metax.maca index 5221ddac3..cad808dc2 100644 --- a/src/infiniop/ops/smooth_l1_loss/metax/smooth_l1_loss_metax.maca +++ b/src/infiniop/ops/smooth_l1_loss/metax/smooth_l1_loss_metax.maca @@ -1,10 +1,9 @@ #include "../../../devices/metax/metax_common.h" #include "../../../devices/metax/metax_handle.h" +#include "../../../devices/metax/metax_kernel_common.h" #include "smooth_l1_loss_metax.h" #include -#include #include -#include #include #include "../../../tensor.h" @@ -105,7 +104,7 @@ void launch_kernel_impl( size_t n, float beta, int reduction, const TensorShape &in_s, const TensorShape &tg_s, const TensorShape &out_s, void *stream) { - auto mc_stream = reinterpret_cast(stream); + auto mc_stream = reinterpret_cast(stream); size_t grid = (n + 255) / 256; float scale = (reduction == 1) ? (1.0f / static_cast(n)) : 1.0f; @@ -192,12 +191,12 @@ infiniStatus_t Descriptor::create( infiniStatus_t Descriptor::calculate( void *workspace, size_t workspace_size, void *output, const void *input, const void *target, void *stream) const { - auto mc_stream = reinterpret_cast(stream); + auto mc_stream = reinterpret_cast(stream); if (_opaque->reduction != 0) { // Reduction 模式:清空 Workspace (float) 而不是 Output // 这样可以确保累加从 0.0f 开始 - mcMemsetAsync(workspace, 0, sizeof(float), mc_stream); + hcMemsetAsync(workspace, 0, sizeof(float), mc_stream); } size_t n = _opaque->total_elements; @@ -211,9 +210,7 @@ infiniStatus_t Descriptor::calculate( LAUNCH(__half); break; case INFINI_DTYPE_BF16: -#if defined(__MACA__) || defined(__MACACC__) - LAUNCH(__maca_bfloat16); -#endif + LAUNCH(cuda_bfloat16); break; case INFINI_DTYPE_F32: LAUNCH(float); diff --git a/src/infiniop/ops/smooth_l1_loss/nvidia/smooth_l1_loss_nvidia.cu b/src/infiniop/ops/smooth_l1_loss/nvidia/smooth_l1_loss_nvidia.cu index b3d224d64..28c1b06f9 100644 --- a/src/infiniop/ops/smooth_l1_loss/nvidia/smooth_l1_loss_nvidia.cu +++ b/src/infiniop/ops/smooth_l1_loss/nvidia/smooth_l1_loss_nvidia.cu @@ -1,5 +1,6 @@ #include "../../../devices/nvidia/nvidia_kernel_common.cuh" #include "../../../handle.h" + #include "../cuda/kernel.cuh" #include "smooth_l1_loss_nvidia.cuh" #include @@ -128,7 +129,7 @@ infiniStatus_t Descriptor::calculate( launch_kernel(output, input, target, workspace, numel, beta, reduction, stream); break; case INFINI_DTYPE_BF16: - launch_kernel(output, input, target, workspace, numel, beta, reduction, stream); + launch_kernel(output, input, target, workspace, numel, beta, reduction, stream); break; case INFINI_DTYPE_F32: launch_kernel(output, input, target, workspace, numel, beta, reduction, stream); diff --git a/src/infiniop/ops/take/cuda/kernel.cuh b/src/infiniop/ops/take/cuda/kernel.cuh index 6e1a50885..06a36ccab 100644 --- a/src/infiniop/ops/take/cuda/kernel.cuh +++ b/src/infiniop/ops/take/cuda/kernel.cuh @@ -1,7 +1,6 @@ #ifndef __TAKE_CUDA_H__ #define __TAKE_CUDA_H__ -// #include #include namespace op::take::cuda { diff --git a/src/infiniop/ops/take/metax/take_metax.maca b/src/infiniop/ops/take/metax/take_metax.maca index d16164378..9d60f6406 100644 --- a/src/infiniop/ops/take/metax/take_metax.maca +++ b/src/infiniop/ops/take/metax/take_metax.maca @@ -1,12 +1,9 @@ #include "../../../devices/metax/metax_common.h" #include "../../../devices/metax/metax_handle.h" +#include "../../../devices/metax/metax_kernel_common.h" #include "take_metax.h" #include -#include #include -#include -#include -#include #include #include "../../../tensor.h" @@ -193,7 +190,7 @@ infiniStatus_t Descriptor::calculate( break; case INFINI_DTYPE_BF16: - LAUNCH(__maca_bfloat16, int32_t); + LAUNCH(cuda_bfloat16, int32_t); break; case INFINI_DTYPE_F32: @@ -224,7 +221,7 @@ infiniStatus_t Descriptor::calculate( break; case INFINI_DTYPE_BF16: - LAUNCH(__maca_bfloat16, int64_t); + LAUNCH(cuda_bfloat16, int64_t); break; case INFINI_DTYPE_F32: diff --git a/src/infiniop/ops/take/nvidia/take_nvidia.cu b/src/infiniop/ops/take/nvidia/take_nvidia.cu index 840fc0f06..b465b9d46 100644 --- a/src/infiniop/ops/take/nvidia/take_nvidia.cu +++ b/src/infiniop/ops/take/nvidia/take_nvidia.cu @@ -1,4 +1,6 @@ +#include "../../../devices/nvidia/nvidia_kernel_common.cuh" #include "../../../handle.h" + #include "../cuda/kernel.cuh" #include "take_nvidia.cuh" #include diff --git a/test/infinicore/ops/topk.py b/test/infinicore/ops/topk.py index 50876b1b7..541645f13 100644 --- a/test/infinicore/ops/topk.py +++ b/test/infinicore/ops/topk.py @@ -37,8 +37,6 @@ def parse_test_cases(): for data in _TEST_CASES_DATA: shape, in_strides, k, dim, largest, sorted_ = data - out_supports_inplace = not is_broadcast(in_strides) - for dtype in _TENSOR_DTYPES: tol = _TOLERANCE_MAP.get(dtype, {"atol": 1e-5, "rtol": 1e-4}) @@ -55,14 +53,10 @@ def parse_test_cases(): comparison_target=None, tolerance=tol, description=f"topk - OUT_OF_PLACE", - output_count=2, + output_count=1, ) ) - # topk returns (values, indices) - in-place/out variant requires tuple of outputs - # The current test harness expects a single TensorSpec for `output_spec`, so - # we avoid creating an in-place test for topk here and only test out-of-place. - return test_cases @@ -76,11 +70,12 @@ def get_test_cases(self): return parse_test_cases() def torch_operator(self, *args, **kwargs): - return torch.topk(*args, **kwargs) + # only returns values, not indices + return torch.topk(*args, **kwargs)[0] def infinicore_operator(self, *args, **kwargs): - """InfiniCore implementation (operator not yet available).""" - return infinicore.topk(*args, **kwargs) + # only returns values, not indices + return infinicore.topk(*args, **kwargs)[0] def main():