issue/1105 - fix hpcc compilation

wooway777 · wooway777 · commit a76e142db34c · 2026-03-24T19:22:58.000+08:00
diff --git a/src/infiniop/ops/addcmul/cuda/kernel.cuh b/src/infiniop/ops/addcmul/cuda/kernel.cuh
@@ -1,8 +1,10 @@
 #ifndef __ADDCMUL_CUDA_CUH__
 #define __ADDCMUL_CUDA_CUH__
 
+#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ALI_API) || defined(ENABLE_ILUVATAR_API)
 #include <cuda_bf16.h>
 #include <cuda_fp16.h>
+#endif
 #include <type_traits>
 
 namespace op::addcmul::cuda {
diff --git a/src/infiniop/ops/addcmul/metax/addcmul_metax.maca b/src/infiniop/ops/addcmul/metax/addcmul_metax.maca
@@ -129,7 +129,7 @@ static inline infiniStatus_t launch_addcmul_kernel(
     auto *t1_ptr = reinterpret_cast<const T *>(inputs.at(1));
     auto *t2_ptr = reinterpret_cast<const T *>(inputs.at(2));
 
-    mcStream_t metax_stream = reinterpret_cast<mcStream_t>(stream);
+    hcStream_t metax_stream = reinterpret_cast<hcStream_t>(stream);
 
     constexpr uint32_t BLOCK_SIZE = 256;
     uint32_t grid = static_cast<uint32_t>((output_size + BLOCK_SIZE - 1) / BLOCK_SIZE);
@@ -146,7 +146,7 @@ static inline infiniStatus_t launch_addcmul_kernel(
         t2_ptr,
         desc->getValue());
 
-    CHECK_METAX(mcGetLastError());
+    CHECK_METAX(hcGetLastError());
     return INFINI_STATUS_SUCCESS;
 }
 
diff --git a/src/infiniop/ops/addr/cpu/addr_cpu.cc b/src/infiniop/ops/addr/cpu/addr_cpu.cc
@@ -1,6 +1,6 @@
 #include "addr_cpu.h"
 #include "../../../devices/cpu/common_cpu.h"
-#include <spdlog/spdlog.h>
+
 namespace op::addr::cpu {
 Descriptor::~Descriptor() = default;
 
diff --git a/src/infiniop/ops/argwhere/moore/argwhere_moore.mu b/src/infiniop/ops/argwhere/moore/argwhere_moore.mu
@@ -3,16 +3,6 @@
 #include "argwhere_kernel.h"
 #include "argwhere_moore.h"
 #include "infinicore.h"
-#include <spdlog/spdlog.h>
-
-// template <typename T>
-// INFINIOP_MOORE_KERNEL parallel_block_argwhere(T *data, int64_t *results, size_t N,
-//                             size_t M, const size_t *shapes,
-//                             const ptrdiff_t *strides, size_t ndim,
-//                             size_t *count) {
-//        parallel_block_argwhere_kernel<float><<<1, M / 2, M>>>(
-//         data, results, N, shapes, strides, ndim, count);
-// }
 
 infiniStatus_t launchKernel(const void *data, int64_t *results, size_t N,
                             size_t M, const size_t *shapes,
@@ -90,12 +80,6 @@ infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size,
     musaMemcpyAsync(*y, result, sizeof(int64_t) * (*count) * ndim,
                     musaMemcpyDeviceToHost, moore_stream);
 
-    // cudaStreamSynchronize(cuda_stream);
-    // for (size_t i = 0; i < (*count) * ndim; i++) {
-    //   spdlog::debug("(*y)[{}]:{}", i, static_cast<size_t *>(*y)[i]);
-    // }
-    // cudaFreeAsync(result, cuda_stream);
-    // cudaFreeAsync(count_cuda, cuda_stream);
     return INFINI_STATUS_SUCCESS;
 }
 
diff --git a/src/infiniop/ops/atanh/cuda/kernel.cuh b/src/infiniop/ops/atanh/cuda/kernel.cuh
@@ -1,8 +1,10 @@
 #ifndef __ATANH_CUDA_H__
 #define __ATANH_CUDA_H__
 
+#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ALI_API) || defined(ENABLE_ILUVATAR_API)
 #include <cuda_bf16.h>
 #include <cuda_fp16.h>
+#endif
 
 namespace op::atanh::cuda {
 typedef struct AtanhOp {
diff --git a/src/infiniop/ops/binary_cross_entropy_with_logits/metax/binary_cross_entropy_with_logits_metax.maca b/src/infiniop/ops/binary_cross_entropy_with_logits/metax/binary_cross_entropy_with_logits_metax.maca
@@ -2,7 +2,11 @@
 #include "../../../devices/metax/metax_handle.h"
 #include "../../../devices/metax/metax_kernel_common.h"
 #include "binary_cross_entropy_with_logits_metax.h"
+#if defined(ENABLE_METAX_MC_API)
 #include <mc_runtime.h>
+#else
+#include <hc_runtime.h>
+#endif
 #include <type_traits>
 
 namespace op::bce_with_logits::metax {
@@ -191,7 +195,7 @@ infiniStatus_t Descriptor::calculate(
     const void *pos_weight,
     void *stream) const {
 
-    mcStream_t custream = (mcStream_t)stream;
+    hcStream_t custream = (hcStream_t)stream;
     size_t n = _info.num_elements;
 
     // F16/BF16 + 归约需要 float workspace
@@ -219,7 +223,7 @@ infiniStatus_t Descriptor::calculate(
     case INFINI_DTYPE_F32: {
         // 如果是规约操作，计算前需将输出位置清零
         if (_reduction != INFINIOP_REDUCTION_NONE) {
-            mcMemsetAsync(out, 0, sizeof(float), custream);
+            hcMemsetAsync(out, 0, sizeof(float), custream);
         }
 
         bce_logits_kernel<float, float><<<grid, block, 0, custream>>>(
@@ -255,7 +259,7 @@ infiniStatus_t Descriptor::calculate(
             out_raw = out;
         } else {
             workspace_f = static_cast<float *>(workspace);
-            mcMemsetAsync(workspace_f, 0, sizeof(float), custream);
+            hcMemsetAsync(workspace_f, 0, sizeof(float), custream);
             out_raw = workspace_f;
         }
 
@@ -294,7 +298,7 @@ infiniStatus_t Descriptor::calculate(
             out_raw = out;
         } else {
             workspace_f = static_cast<float *>(workspace);
-            mcMemsetAsync(workspace_f, 0, sizeof(float), custream);
+            hcMemsetAsync(workspace_f, 0, sizeof(float), custream);
             out_raw = workspace_f;
         }
 
@@ -324,8 +328,8 @@ infiniStatus_t Descriptor::calculate(
         return INFINI_STATUS_BAD_TENSOR_DTYPE;
     }
 
-    mcError_t err = mcGetLastError();
-    if (err != mcSuccess) {
+    hcError_t err = hcGetLastError();
+    if (err != hcSuccess) {
         return INFINI_STATUS_INTERNAL_ERROR;
     }
     return INFINI_STATUS_SUCCESS;
diff --git a/src/infiniop/ops/cdist/metax/cdist_metax.maca b/src/infiniop/ops/cdist/metax/cdist_metax.maca
@@ -126,7 +126,7 @@ infiniStatus_t Descriptor::calculate(
         return INFINI_STATUS_BAD_TENSOR_DTYPE;
     }
 
-    mcStream_t custream = (mcStream_t)stream;
+    hcStream_t custream = (hcStream_t)stream;
     dim3 block(16, 16);
     dim3 grid(
         static_cast<unsigned int>((_info.n + block.x - 1) / block.x),
@@ -151,8 +151,8 @@ infiniStatus_t Descriptor::calculate(
         _info.y_matrix.col_stride,
         _p);
 
-    auto err = mcGetLastError();
-    if (err != mcSuccess) {
+    auto err = hcGetLastError();
+    if (err != hcSuccess) {
         return INFINI_STATUS_INTERNAL_ERROR;
     }
 
diff --git a/src/infiniop/ops/equal/cuda/kernel.cuh b/src/infiniop/ops/equal/cuda/kernel.cuh
@@ -1,10 +1,15 @@
 #ifndef __EQUAL_CUDA_H__
 #define __EQUAL_CUDA_H__
 
-#if defined(__MACACC__)
+#if ENABLE_METAX_API
+#if defined(ENABLE_METAX_MC_API)
 #include <maca_bfloat16.h>
 #include <maca_fp16.h>
 #else
+#include <hpcc_bfloat16.h>
+#include <hpcc_fp16.h>
+#endif
+#elif defined(ENABLE_NVIDIA_API) || defined(ENABLE_ALI_API) || defined(ENABLE_ILUVATAR_API)
 #include <cuda_bf16.h>
 #include <cuda_fp16.h>
 #endif
diff --git a/src/infiniop/ops/hardswish/cuda/kernel.cuh b/src/infiniop/ops/hardswish/cuda/kernel.cuh
@@ -2,10 +2,15 @@
 #define __HARDSWISH_CUDA_H__
 
 #include <cmath>
-#if defined(__MACACC__)
+#if ENABLE_METAX_API
+#if defined(ENABLE_METAX_MC_API)
 #include <maca_bfloat16.h>
 #include <maca_fp16.h>
 #else
+#include <hpcc_bfloat16.h>
+#include <hpcc_fp16.h>
+#endif
+#elif defined(ENABLE_NVIDIA_API) || defined(ENABLE_ALI_API) || defined(ENABLE_ILUVATAR_API)
 #include <cuda_bf16.h>
 #include <cuda_fp16.h>
 #endif
diff --git a/src/infiniop/ops/hardtanh/cuda/kernel.cuh b/src/infiniop/ops/hardtanh/cuda/kernel.cuh
@@ -1,10 +1,15 @@
 #ifndef __HARDTANH_CUDA_H__
 #define __HARDTANH_CUDA_H__
 
-#if defined(__MACACC__)
+#if ENABLE_METAX_API
+#if defined(ENABLE_METAX_MC_API)
 #include <maca_bfloat16.h>
 #include <maca_fp16.h>
 #else
+#include <hpcc_bfloat16.h>
+#include <hpcc_fp16.h>
+#endif
+#elif defined(ENABLE_NVIDIA_API) || defined(ENABLE_ALI_API) || defined(ENABLE_ILUVATAR_API)
 #include <cuda_bf16.h>
 #include <cuda_fp16.h>
 #endif
diff --git a/src/infiniop/ops/hypot/cuda/kernel.cuh b/src/infiniop/ops/hypot/cuda/kernel.cuh
@@ -4,10 +4,14 @@
 #include <cmath>
 #include <type_traits>
 #if ENABLE_METAX_API
+#if defined(ENABLE_METAX_MC_API)
 #include <maca_bfloat16.h>
 #include <maca_fp16.h>
-using nv_bfloat162 = __maca_bfloat162;
 #else
+#include <hpcc_bfloat16.h>
+#include <hpcc_fp16.h>
+#endif
+#elif defined(ENABLE_NVIDIA_API) || defined(ENABLE_ALI_API) || defined(ENABLE_ILUVATAR_API)
 #include <cuda_bf16.h>
 #include <cuda_fp16.h>
 #endif
diff --git a/src/infiniop/ops/hypot/metax/hypot_metax.maca b/src/infiniop/ops/hypot/metax/hypot_metax.maca
@@ -61,7 +61,7 @@ infiniStatus_t Descriptor::calculate(
     case INFINI_DTYPE_F16:
         return _device_info->calculate<256, cuda::HypotOp, half>(_info, workspace, output, inputs, stream);
     case INFINI_DTYPE_BF16:
-        return _device_info->calculate<256, cuda::HypotOp, nv_bfloat162>(_info, workspace, output, inputs, stream);
+        return _device_info->calculate<256, cuda::HypotOp, cuda_bfloat162>(_info, workspace, output, inputs, stream);
     case INFINI_DTYPE_F32:
         return _device_info->calculate<256, cuda::HypotOp, float>(_info, workspace, output, inputs, stream);
     case INFINI_DTYPE_F64:
diff --git a/src/infiniop/ops/index_add/cuda/kernel.cuh b/src/infiniop/ops/index_add/cuda/kernel.cuh
@@ -2,14 +2,18 @@
 #define __INDEX_ADD_CUDA_H__
 
 #if ENABLE_METAX_API
+#if defined(ENABLE_METAX_MC_API)
 #include <maca_bfloat16.h>
 #include <maca_fp16.h>
-#define __nv_bfloat16 __maca_bfloat16
-#define __nv_bfloat162 __maca_bfloat162
 #else
+#include <hpcc_bfloat16.h>
+#include <hpcc_fp16.h>
+#endif
+#endif
+#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ALI_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_HYGON_API)
+#include "../../../devices/nvidia/nvidia_kernel_common.cuh"
 #include <cuda_bf16.h>
 #include <cuda_fp16.h>
-#include <cuda_runtime.h>
 #endif
 #include <cstdint>
 
@@ -40,7 +44,7 @@ __device__ __forceinline__ void atomic_add_custom(__half *address, __half val) {
 #endif
 }
 
-__device__ __forceinline__ void atomic_add_custom(__nv_bfloat16 *address, __nv_bfloat16 val) {
+__device__ __forceinline__ void atomic_add_custom(cuda_bfloat16 *address, cuda_bfloat16 val) {
 #if __CUDA_ARCH__ >= 800
     atomicAdd(address, val);
 #else
@@ -52,9 +56,9 @@ __device__ __forceinline__ void atomic_add_custom(__nv_bfloat16 *address, __nv_b
     do {
         assumed = old;
         unsigned short old_val_raw = (size_t)address & 2 ? (old >> 16) : (old & 0xffff);
-        __nv_bfloat16 old_val = *reinterpret_cast<__nv_bfloat16 *>(&old_val_raw);
+        cuda_bfloat16 old_val = *reinterpret_cast<cuda_bfloat16 *>(&old_val_raw);
 
-        __nv_bfloat16 new_val = old_val + val;
+        cuda_bfloat16 new_val = old_val + val;
         unsigned short new_val_raw = *reinterpret_cast<unsigned short *>(&new_val);
 
         unsigned int new_int = (size_t)address & 2 ? (old & 0xffff) | (new_val_raw << 16)
diff --git a/src/infiniop/ops/index_add/metax/index_add_metax.maca b/src/infiniop/ops/index_add/metax/index_add_metax.maca
@@ -1,14 +1,22 @@
 #include "../../../devices/metax/metax_common.h"
 #include "../../../devices/metax/metax_handle.h"
+#include "../../../devices/metax/metax_kernel_common.h"
 #include "../../../tensor.h"
 #include "../cuda/kernel.cuh"
 #include "index_add_metax.h"
 #include <cmath>
-#include <common/mc_library_types.h>
 #include <cstdio>
+#if defined(ENABLE_METAX_MC_API)
+#include <common/mc_library_types.h>
 #include <maca_bfloat16.h>
 #include <maca_fp16.h>
 #include <mcr/mc_runtime.h>
+#else
+#include <common/hc_library_types.h>
+#include <hpcc_bfloat16.h>
+#include <hpcc_fp16.h>
+#include <hcr/hc_runtime.h>
+#endif
 #include <vector>
 
 namespace op::index_add::metax {
@@ -33,8 +41,8 @@ __device__ __forceinline__ void gpuAtomicAdd(
 
 template <>
 __device__ __forceinline__ void gpuAtomicAdd(
-    __maca_bfloat16 *address,
-    __maca_bfloat16 val) {
+    cuda_bfloat16 *address,
+    cuda_bfloat16 val) {
     unsigned int *addr = (unsigned int *)((char *)address - ((size_t)address & 2));
 
     unsigned int old = *addr;
@@ -47,7 +55,7 @@ __device__ __forceinline__ void gpuAtomicAdd(
                                    ? (assumed >> 16)
                                    : (assumed & 0xFFFF);
 
-        __maca_bfloat16 sum = (__maca_bfloat16)((float)*reinterpret_cast<__maca_bfloat16 *>(&old_val) + (float)val);
+        cuda_bfloat16 sum = (cuda_bfloat16)((float)*reinterpret_cast<cuda_bfloat16 *>(&old_val) + (float)val);
 
         unsigned short res = *reinterpret_cast<unsigned short *>(&sum);
 
@@ -266,7 +274,7 @@ infiniStatus_t Descriptor::calculate(
             break;
 
         case INFINI_DTYPE_BF16:
-            LAUNCH(__maca_bfloat16, int32_t);
+            LAUNCH(cuda_bfloat16, int32_t);
             break;
 
         case INFINI_DTYPE_F32:
@@ -297,7 +305,7 @@ infiniStatus_t Descriptor::calculate(
             break;
 
         case INFINI_DTYPE_BF16:
-            LAUNCH(__maca_bfloat16, int64_t);
+            LAUNCH(cuda_bfloat16, int64_t);
             break;
 
         case INFINI_DTYPE_F32:
diff --git a/src/infiniop/ops/index_add/nvidia/index_add_nvidia.cu b/src/infiniop/ops/index_add/nvidia/index_add_nvidia.cu
@@ -7,9 +7,10 @@
 
 #include "index_add_nvidia.cuh"
 #include <cstdint>
-
+#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ALI_API) || defined(ENABLE_ILUVATAR_API)
 #include <cuda_bf16.h>
 #include <cuda_fp16.h>
+#endif
 
 namespace op::index_add::nvidia {
 
diff --git a/src/infiniop/ops/index_copy/cuda/kernel.cuh b/src/infiniop/ops/index_copy/cuda/kernel.cuh
@@ -2,11 +2,16 @@
 #define __INDEX_COPY_CUDA_H__
 
 // #include <cuda_runtime.h>
-#if defined(__MACA__) || defined(__MACACC__)
+#if defined(ENABLE_METAX_API)
+#if defined(ENABLE_METAX_MC_API)
 #include <maca_bfloat16.h>
 #include <maca_fp16.h>
-using nv_bfloat162 = __maca_bfloat162;
 #else
+#include <hpcc_bfloat16.h>
+#include <hpcc_fp16.h>
+#endif
+#endif
+#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ALI_API) || defined(ENABLE_ILUVATAR_API)
 #include <cuda_bf16.h>
 #include <cuda_fp16.h>
 #endif
diff --git a/src/infiniop/ops/index_copy/metax/index_copy_metax.maca b/src/infiniop/ops/index_copy/metax/index_copy_metax.maca
diff --git a/src/infiniop/ops/index_copy/nvidia/index_copy_nvidia.cu b/src/infiniop/ops/index_copy/nvidia/index_copy_nvidia.cu
diff --git a/src/infiniop/ops/sigmoid/cuda/kernel.cuh b/src/infiniop/ops/sigmoid/cuda/kernel.cuh
diff --git a/src/infiniop/ops/smooth_l1_loss/cuda/kernel.cuh b/src/infiniop/ops/smooth_l1_loss/cuda/kernel.cuh
diff --git a/src/infiniop/ops/smooth_l1_loss/metax/smooth_l1_loss_metax.maca b/src/infiniop/ops/smooth_l1_loss/metax/smooth_l1_loss_metax.maca
diff --git a/src/infiniop/ops/take/metax/take_metax.maca b/src/infiniop/ops/take/metax/take_metax.maca