src/infiniop/devices/metax/metax_common.h (2 additions, 0 deletions)
@@ -6,9 +6,11 @@
 #ifdef ENABLE_METAX_MC_API
 #include <mcblas/mcblas.h>
 #include <mcdnn/mcdnn.h>
+#include <mcr/mc_runtime.h>
 #else
 #include <hcblas/hcblas.h>
 #include <hcdnn/hcdnn.h>
+#include <hcr/hc_runtime.h>
 #endif
 #include <functional>
 #include <memory>

src/infiniop/devices/metax/metax_ht2mc.h (2 additions, 0 deletions)
@@ -101,4 +101,6 @@
 #define hcGraphDestroy mcGraphDestroy
 #define hcGraphExecDestroy mcGraphExecDestroy
 #define hcGraphLaunch mcGraphLaunch
+#define hcMemsetAsync mcMemsetAsync
+#define hcGetLastError mcGetLastError
 #endif

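Note: metax_ht2mc.h maps the hc-prefixed HPCC runtime names onto their mc-prefixed MACA equivalents, so op sources can use the hc spelling unconditionally; the two new defines cover the memset and error-query calls that the hc-converted kernels below rely on. A minimal sketch of the pattern (illustrative only, error handling trimmed):

// Compiles against either runtime: under ENABLE_METAX_MC_API the
// preprocessor rewrites hcMemsetAsync -> mcMemsetAsync and
// hcGetLastError -> mcGetLastError; otherwise hc_runtime.h provides
// the hc symbols directly.
infiniStatus_t zeroAsync(void *ptr, size_t bytes, hcStream_t stream) {
    hcMemsetAsync(ptr, 0, bytes, stream);
    if (hcGetLastError() != hcSuccess) {
        return INFINI_STATUS_INTERNAL_ERROR;
    }
    return INFINI_STATUS_SUCCESS;
}
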
src/infiniop/devices/metax/metax_kernel_common.h (4 additions, 0 deletions)
@@ -1,8 +1,12 @@
 #define INFINIOP_METAX_KERNEL __global__ void

 #ifdef ENABLE_METAX_MC_API
+#include <maca_bfloat16.h>
 #include <maca_fp16.h>
+#include <maca_fp8.h>
 #else
+#include <hpcc_bfloat16.h>
 #include <hpcc_fp16.h>
+#include <hpcc_fp8.h>
 #endif

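Note: with the bf16/fp8 headers centralized here, individual op kernels no longer include them directly (see the removals in the kernel.cuh files below). A hypothetical kernel source then needs only this header; a sketch, assuming the maca/hpcc fp16 headers expose the usual cuda_fp16.h-style conversions:

#include "metax_kernel_common.h"

// Illustrative only: scale a half-precision buffer in place. 'half',
// __half2float and __float2half are assumed to come from maca_fp16.h /
// hpcc_fp16.h, mirroring the cuda_fp16.h API.
INFINIOP_METAX_KERNEL scale_half(half *x, float s, size_t n) {
    size_t i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        x[i] = __float2half(__half2float(x[i]) * s);
    }
}
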
src/infiniop/ops/addcmul/cuda/kernel.cuh (0 additions, 2 deletions)
@@ -1,8 +1,6 @@
 #ifndef __ADDCMUL_CUDA_CUH__
 #define __ADDCMUL_CUDA_CUH__

-#include <cuda_bf16.h>
-#include <cuda_fp16.h>
 #include <type_traits>

 namespace op::addcmul::cuda {

src/infiniop/ops/addcmul/metax/addcmul_metax.maca (3 additions, 2 deletions)
@@ -1,3 +1,4 @@
+#include "../../../devices/metax/metax_handle.h"
 #include "../../../elementwise/metax/elementwise_metax.h"

 #include "addcmul_metax.h"
@@ -129,7 +130,7 @@ static inline infiniStatus_t launch_addcmul_kernel(
     auto *t1_ptr = reinterpret_cast<const T *>(inputs.at(1));
     auto *t2_ptr = reinterpret_cast<const T *>(inputs.at(2));

-    mcStream_t metax_stream = reinterpret_cast<mcStream_t>(stream);
+    hcStream_t metax_stream = reinterpret_cast<hcStream_t>(stream);

     constexpr uint32_t BLOCK_SIZE = 256;
     uint32_t grid = static_cast<uint32_t>((output_size + BLOCK_SIZE - 1) / BLOCK_SIZE);
@@ -146,7 +147,7 @@
         t2_ptr,
         desc->getValue());

-    CHECK_METAX(mcGetLastError());
+    CHECK_METAX(hcGetLastError());
     return INFINI_STATUS_SUCCESS;
 }

src/infiniop/ops/addr/cpu/addr_cpu.cc (1 addition, 1 deletion)
@@ -1,6 +1,6 @@
 #include "addr_cpu.h"
 #include "../../../devices/cpu/common_cpu.h"
-#include <spdlog/spdlog.h>

 namespace op::addr::cpu {
 Descriptor::~Descriptor() = default;

src/infiniop/ops/argwhere/cpu/argwhere_cpu.cc (22 additions, 13 deletions)
@@ -37,27 +37,36 @@ infiniStatus_t calculateArgWhere(
     const void *x) {

     const Tdata *x_data = reinterpret_cast<const Tdata *>(x);
-    // int64_t *y_data = reinterpret_cast<int64_t *>(y);
-    std::vector<size_t> positions;
-    // #pragma omp parallel for
+
+    std::vector<int64_t> positions;
+    const size_t ndim = info.shapes.size();
+
     for (size_t i = 0; i < info.num_elements; i++) {
-        size_t pos = 0, tem = i;
-        std::vector<size_t> position(info.strides.size());
-        for (size_t j = info.strides.size() - 1; j >= 0; j--) {
-            position[j] = tem % info.shapes[j];
-            tem /= info.shapes[j];
-            pos += position[j] * info.strides[j];
+        size_t pos = 0;
+        size_t tmp = i;
+
+        std::vector<int64_t> coord(ndim);
+
+        // unravel index
+        for (size_t j = ndim; j-- > 0;) {
+            coord[j] = tmp % info.shapes[j];
+            tmp /= info.shapes[j];
+            pos += coord[j] * info.strides[j];
         }
-        if (fabs(x_data[pos] - 0.0f) > 1e-5) {
-            for (auto p : position) {
-                positions.push_back(p);
+
+        // PyTorch semantics: != 0
+        if (x_data[pos] != Tdata(0)) {
+            for (size_t j = 0; j < ndim; j++) {
+                positions.push_back(coord[j]);
             }
         }
     }
+
+    *count = positions.size() / ndim;
+
     *y = new int64_t[positions.size()];
     memcpy(*y, positions.data(), positions.size() * sizeof(int64_t));
-    *count = positions.size() / info.strides.size();

     return INFINI_STATUS_SUCCESS;
 }
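
Note: the rewrite fixes a latent infinite loop: in the old inner loop, `j >= 0` is always true for an unsigned `size_t` index, so the decrement wrapped around; `j-- > 0` is the standard reverse-iteration idiom. It also replaces the approximate `fabs(x) > 1e-5` test with an exact `!= 0`, matching torch.argwhere. A standalone sketch of the unravel-index arithmetic, with hypothetical shape and strides (not values from the repo):

#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <vector>

// Sketch: unravel logical index i into per-dimension coordinates and a
// strided element offset, mirroring the loop in calculateArgWhere.
int main() {
    std::vector<size_t> shapes = {2, 3, 4};           // illustrative shape
    std::vector<std::ptrdiff_t> strides = {12, 4, 1}; // contiguous strides
    size_t i = 17;                                    // logical index

    size_t ndim = shapes.size(), pos = 0, tmp = i;
    std::vector<int64_t> coord(ndim);
    for (size_t j = ndim; j-- > 0;) { // last dim first; no size_t underflow
        coord[j] = tmp % shapes[j];
        tmp /= shapes[j];
        pos += coord[j] * strides[j];
    }
    // i = 17 -> coord = (1, 1, 1), pos = 17 for these contiguous strides
    std::printf("coord=(%lld,%lld,%lld) pos=%zu\n",
                (long long)coord[0], (long long)coord[1],
                (long long)coord[2], pos);
    return 0;
}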

src/infiniop/ops/argwhere/moore/argwhere_moore.mu (0 additions, 16 deletions)
@@ -3,16 +3,6 @@
 #include "argwhere_kernel.h"
 #include "argwhere_moore.h"
 #include "infinicore.h"
-#include <spdlog/spdlog.h>
-
-// template <typename T>
-// INFINIOP_MOORE_KERNEL parallel_block_argwhere(T *data, int64_t *results, size_t N,
-//                                               size_t M, const size_t *shapes,
-//                                               const ptrdiff_t *strides, size_t ndim,
-//                                               size_t *count) {
-//     parallel_block_argwhere_kernel<float><<<1, M / 2, M>>>(
-//         data, results, N, shapes, strides, ndim, count);
-// }

 infiniStatus_t launchKernel(const void *data, int64_t *results, size_t N,
                             size_t M, const size_t *shapes,
@@ -90,12 +80,6 @@ infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size,
     musaMemcpyAsync(*y, result, sizeof(int64_t) * (*count) * ndim,
                     musaMemcpyDeviceToHost, moore_stream);

-    // cudaStreamSynchronize(cuda_stream);
-    // for (size_t i = 0; i < (*count) * ndim; i++) {
-    //     spdlog::debug("(*y)[{}]:{}", i, static_cast<size_t *>(*y)[i]);
-    // }
-    // cudaFreeAsync(result, cuda_stream);
-    // cudaFreeAsync(count_cuda, cuda_stream);
     return INFINI_STATUS_SUCCESS;
 }

src/infiniop/ops/atanh/cuda/kernel.cuh (0 additions, 3 deletions)
@@ -1,9 +1,6 @@
 #ifndef __ATANH_CUDA_H__
 #define __ATANH_CUDA_H__

-#include <cuda_bf16.h>
-#include <cuda_fp16.h>
-
 namespace op::atanh::cuda {
 typedef struct AtanhOp {
 public:

src/infiniop/ops/binary_cross_entropy_with_logits/metax/binary_cross_entropy_with_logits_metax.maca
@@ -1,8 +1,9 @@
 #include "../../../devices/metax/metax_common.h"
 #include "../../../devices/metax/metax_handle.h"
+#include "../../../devices/metax/metax_kernel_common.h"

 #include "binary_cross_entropy_with_logits_metax.h"
 #include <mc_runtime.h>

 #include <type_traits>

 namespace op::bce_with_logits::metax {
@@ -191,7 +192,7 @@ infiniStatus_t Descriptor::calculate(
     const void *pos_weight,
     void *stream) const {

-    mcStream_t custream = (mcStream_t)stream;
+    hcStream_t custream = (hcStream_t)stream;
     size_t n = _info.num_elements;

     // F16/BF16 + reduction needs a float workspace
@@ -219,7 +220,7 @@
     case INFINI_DTYPE_F32: {
         // For reductions, zero the output location before computing
         if (_reduction != INFINIOP_REDUCTION_NONE) {
-            mcMemsetAsync(out, 0, sizeof(float), custream);
+            hcMemsetAsync(out, 0, sizeof(float), custream);
         }

         bce_logits_kernel<float, float><<<grid, block, 0, custream>>>(
@@ -255,7 +256,7 @@
             out_raw = out;
         } else {
             workspace_f = static_cast<float *>(workspace);
-            mcMemsetAsync(workspace_f, 0, sizeof(float), custream);
+            hcMemsetAsync(workspace_f, 0, sizeof(float), custream);
             out_raw = workspace_f;
         }
@@ -294,7 +295,7 @@
             out_raw = out;
         } else {
             workspace_f = static_cast<float *>(workspace);
-            mcMemsetAsync(workspace_f, 0, sizeof(float), custream);
+            hcMemsetAsync(workspace_f, 0, sizeof(float), custream);
             out_raw = workspace_f;
         }
@@ -324,8 +325,8 @@
         return INFINI_STATUS_BAD_TENSOR_DTYPE;
     }

-    mcError_t err = mcGetLastError();
-    if (err != mcSuccess) {
+    hcError_t err = hcGetLastError();
+    if (err != hcSuccess) {
         return INFINI_STATUS_INTERNAL_ERROR;
     }
     return INFINI_STATUS_SUCCESS;
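
Note: in each reduction branch above, the single-float accumulator (the output itself, or a float workspace for F16/BF16) is zeroed with hcMemsetAsync on the same stream before the kernel launches, since the kernel accumulates partial sums into it. A minimal sketch of that zero-then-accumulate pattern in CUDA spellings (hypothetical kernel and launcher, not the repo's):

// *out must be zeroed first; otherwise stale device memory is summed in.
__global__ void reduce_sum_kernel(const float *x, float *out, size_t n) {
    size_t i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        atomicAdd(out, x[i]); // each thread folds its element into *out
    }
}

void launch_reduce_sum(const float *x, float *out, size_t n, cudaStream_t stream) {
    cudaMemsetAsync(out, 0, sizeof(float), stream);
    dim3 block(256), grid(static_cast<unsigned>((n + 255) / 256));
    reduce_sum_kernel<<<grid, block, 0, stream>>>(x, out, n);
}
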
src/infiniop/ops/cdist/metax/cdist_metax.maca (5 additions, 3 deletions)
@@ -1,4 +1,6 @@
+#include "../../../devices/metax/metax_common.h"
 #include "../../../devices/metax/metax_handle.h"
+#include "../../../devices/metax/metax_kernel_common.h"
 #include "cdist_metax.h"
 #include <iostream>
 namespace op::cdist::metax {
@@ -126,7 +128,7 @@ infiniStatus_t Descriptor::calculate(
         return INFINI_STATUS_BAD_TENSOR_DTYPE;
     }

-    mcStream_t custream = (mcStream_t)stream;
+    hcStream_t custream = (hcStream_t)stream;
     dim3 block(16, 16);
     dim3 grid(
         static_cast<unsigned int>((_info.n + block.x - 1) / block.x),
@@ -151,8 +153,8 @@
         _info.y_matrix.col_stride,
         _p);

-    auto err = mcGetLastError();
-    if (err != mcSuccess) {
+    auto err = hcGetLastError();
+    if (err != hcSuccess) {
         return INFINI_STATUS_INTERNAL_ERROR;
     }

src/infiniop/ops/equal/cuda/kernel.cuh (0 additions, 7 deletions)
@@ -1,13 +1,6 @@
 #ifndef __EQUAL_CUDA_H__
 #define __EQUAL_CUDA_H__

-#if defined(__MACACC__)
-#include <maca_bfloat16.h>
-#include <maca_fp16.h>
-#else
-#include <cuda_bf16.h>
-#include <cuda_fp16.h>
-#endif
 #include <type_traits>

 namespace op::equal::cuda {

src/infiniop/ops/hardswish/cuda/kernel.cuh (0 additions, 7 deletions)
@@ -2,13 +2,6 @@
 #define __HARDSWISH_CUDA_H__

 #include <cmath>
-#if defined(__MACACC__)
-#include <maca_bfloat16.h>
-#include <maca_fp16.h>
-#else
-#include <cuda_bf16.h>
-#include <cuda_fp16.h>
-#endif

 namespace op::hardswish::cuda {

src/infiniop/ops/hardtanh/cuda/kernel.cuh (0 additions, 7 deletions)
@@ -1,13 +1,6 @@
 #ifndef __HARDTANH_CUDA_H__
 #define __HARDTANH_CUDA_H__

-#if defined(__MACACC__)
-#include <maca_bfloat16.h>
-#include <maca_fp16.h>
-#else
-#include <cuda_bf16.h>
-#include <cuda_fp16.h>
-#endif
 #include <type_traits>

 namespace op::hardtanh::cuda {

src/infiniop/ops/hypot/cuda/kernel.cuh (0 additions, 8 deletions)
@@ -3,14 +3,6 @@

 #include <cmath>
 #include <type_traits>
-#if ENABLE_METAX_API
-#include <maca_bfloat16.h>
-#include <maca_fp16.h>
-using nv_bfloat162 = __maca_bfloat162;
-#else
-#include <cuda_bf16.h>
-#include <cuda_fp16.h>
-#endif

 namespace op::hypot::cuda {

src/infiniop/ops/hypot/metax/hypot_metax.maca (1 addition, 1 deletion)
@@ -61,7 +61,7 @@ infiniStatus_t Descriptor::calculate(
     case INFINI_DTYPE_F16:
         return _device_info->calculate<256, cuda::HypotOp, half>(_info, workspace, output, inputs, stream);
     case INFINI_DTYPE_BF16:
-        return _device_info->calculate<256, cuda::HypotOp, nv_bfloat162>(_info, workspace, output, inputs, stream);
+        return _device_info->calculate<256, cuda::HypotOp, cuda_bfloat162>(_info, workspace, output, inputs, stream);
     case INFINI_DTYPE_F32:
         return _device_info->calculate<256, cuda::HypotOp, float>(_info, workspace, output, inputs, stream);
     case INFINI_DTYPE_F64:

src/infiniop/ops/index_add/cuda/kernel.cuh (3 additions, 13 deletions)
@@ -1,16 +1,6 @@
 #ifndef __INDEX_ADD_CUDA_H__
 #define __INDEX_ADD_CUDA_H__

-#if ENABLE_METAX_API
-#include <maca_bfloat16.h>
-#include <maca_fp16.h>
-#define __nv_bfloat16 __maca_bfloat16
-#define __nv_bfloat162 __maca_bfloat162
-#else
-#include <cuda_bf16.h>
-#include <cuda_fp16.h>
-#include <cuda_runtime.h>
-#endif
 #include <cstdint>

 namespace op::index_add::cuda {
@@ -40,7 +30,7 @@ __device__ __forceinline__ void atomic_add_custom(__half *address, __half val) {
 #endif
 }

-__device__ __forceinline__ void atomic_add_custom(__nv_bfloat16 *address, __nv_bfloat16 val) {
+__device__ __forceinline__ void atomic_add_custom(cuda_bfloat16 *address, cuda_bfloat16 val) {
 #if __CUDA_ARCH__ >= 800
     atomicAdd(address, val);
 #else
@@ -52,9 +42,9 @@ __device__ __forceinline__ void atomic_add_custom(__nv_bfloat16 *address, __nv_b
     do {
         assumed = old;
         unsigned short old_val_raw = (size_t)address & 2 ? (old >> 16) : (old & 0xffff);
-        __nv_bfloat16 old_val = *reinterpret_cast<__nv_bfloat16 *>(&old_val_raw);
+        cuda_bfloat16 old_val = *reinterpret_cast<cuda_bfloat16 *>(&old_val_raw);

-        __nv_bfloat16 new_val = old_val + val;
+        cuda_bfloat16 new_val = old_val + val;
         unsigned short new_val_raw = *reinterpret_cast<unsigned short *>(&new_val);

         unsigned int new_int = (size_t)address & 2 ? (old & 0xffff) | (new_val_raw << 16)
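
Note: the bf16 fallback above (for `__CUDA_ARCH__ < 800`, where no native bf16 atomicAdd exists) emulates a 16-bit atomic by CAS-ing the containing aligned 32-bit word: `(size_t)address & 2` selects the upper or lower half-word, the updated bf16 value is spliced back in, and the loop retries until the CAS wins. The rename from `__nv_bfloat16` to `cuda_bfloat16` only changes the type spelling; the CAS logic is unchanged. The same trick in a self-contained form (illustrative sketch; `Combine` stands in for the bf16 addition in the real code):

// Sketch: generic 16-bit read-modify-write via 32-bit atomicCAS.
template <typename Combine>
__device__ void atomic_rmw16(unsigned short *address, unsigned short val,
                             Combine combine) {
    // Align down to the containing 32-bit word; bit 1 of the address
    // tells us which half of the word we own (little-endian layout).
    auto *word = reinterpret_cast<unsigned int *>(
        reinterpret_cast<size_t>(address) & ~size_t(3));
    bool upper = reinterpret_cast<size_t>(address) & 2;

    unsigned int old = *word, assumed;
    do {
        assumed = old;
        unsigned short cur = upper ? (assumed >> 16) : (assumed & 0xffff);
        unsigned short next = combine(cur, val);
        unsigned int replaced = upper
            ? (assumed & 0x0000ffffu) | (static_cast<unsigned>(next) << 16)
            : (assumed & 0xffff0000u) | next;
        old = atomicCAS(word, assumed, replaced); // retry if another thread won
    } while (assumed != old);
}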