src/infiniop/devices/metax/metax_common.h (2 additions, 0 deletions)
@@ -6,9 +6,11 @@
 #ifdef ENABLE_METAX_MC_API
 #include <mcblas/mcblas.h>
 #include <mcdnn/mcdnn.h>
+#include <mcr/mc_runtime.h>
 #else
 #include <hcblas/hcblas.h>
 #include <hcdnn/hcdnn.h>
+#include <hcr/hc_runtime.h>
 #endif
 #include <functional>
 #include <memory>

src/infiniop/devices/metax/metax_ht2mc.h (2 additions, 0 deletions)
@@ -101,4 +101,6 @@
 #define hcGraphDestroy mcGraphDestroy
 #define hcGraphExecDestroy mcGraphExecDestroy
 #define hcGraphLaunch mcGraphLaunch
+#define hcMemsetAsync mcMemsetAsync
+#define hcGetLastError mcGetLastError
 #endif

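Note: metax_ht2mc.h maps the hc-prefixed HPCC runtime names onto their mc-prefixed MACA equivalents, so op sources can use the hc spelling unconditionally; the two new defines cover the memset and error-query calls that the hc-converted kernels below rely on. A minimal sketch of the pattern (illustrative only, error handling trimmed):

// Compiles against either runtime: under ENABLE_METAX_MC_API the
// preprocessor rewrites hcMemsetAsync -> mcMemsetAsync and
// hcGetLastError -> mcGetLastError; otherwise hc_runtime.h provides
// the hc symbols directly.
infiniStatus_t zeroAsync(void *ptr, size_t bytes, hcStream_t stream) {
    hcMemsetAsync(ptr, 0, bytes, stream);
    if (hcGetLastError() != hcSuccess) {
        return INFINI_STATUS_INTERNAL_ERROR;
    }
    return INFINI_STATUS_SUCCESS;
}
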
src/infiniop/devices/metax/metax_kernel_common.h (4 additions, 0 deletions)
@@ -1,8 +1,12 @@
 #define INFINIOP_METAX_KERNEL __global__ void

 #ifdef ENABLE_METAX_MC_API
+#include <maca_bfloat16.h>
 #include <maca_fp16.h>
+#include <maca_fp8.h>
 #else
+#include <hpcc_bfloat16.h>
 #include <hpcc_fp16.h>
+#include <hpcc_fp8.h>
 #endif

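Note: with the bf16/fp8 headers centralized here, individual op kernels no longer include them directly (see the removals in the kernel.cuh files below). A hypothetical kernel source then needs only this header; a sketch, assuming the maca/hpcc fp16 headers expose the usual cuda_fp16.h-style conversions:

#include "metax_kernel_common.h"

// Illustrative only: scale a half-precision buffer in place. 'half',
// __half2float and __float2half are assumed to come from maca_fp16.h /
// hpcc_fp16.h, mirroring the cuda_fp16.h API.
INFINIOP_METAX_KERNEL scale_half(half *x, float s, size_t n) {
    size_t i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        x[i] = __float2half(__half2float(x[i]) * s);
    }
}
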
src/infiniop/ops/addcmul/cuda/kernel.cuh (0 additions, 2 deletions)
@@ -1,8 +1,6 @@
 #ifndef __ADDCMUL_CUDA_CUH__
 #define __ADDCMUL_CUDA_CUH__

-#include <cuda_bf16.h>
-#include <cuda_fp16.h>
 #include <type_traits>

 namespace op::addcmul::cuda {

src/infiniop/ops/addcmul/metax/addcmul_metax.maca (3 additions, 2 deletions)
@@ -1,3 +1,4 @@
+#include "../../../devices/metax/metax_handle.h"
 #include "../../../elementwise/metax/elementwise_metax.h"

 #include "addcmul_metax.h"
@@ -129,7 +130,7 @@ static inline infiniStatus_t launch_addcmul_kernel(
     auto *t1_ptr = reinterpret_cast<const T *>(inputs.at(1));
     auto *t2_ptr = reinterpret_cast<const T *>(inputs.at(2));

-    mcStream_t metax_stream = reinterpret_cast<mcStream_t>(stream);
+    hcStream_t metax_stream = reinterpret_cast<hcStream_t>(stream);

     constexpr uint32_t BLOCK_SIZE = 256;
     uint32_t grid = static_cast<uint32_t>((output_size + BLOCK_SIZE - 1) / BLOCK_SIZE);
@@ -146,7 +147,7 @@
         t2_ptr,
         desc->getValue());

-    CHECK_METAX(mcGetLastError());
+    CHECK_METAX(hcGetLastError());
     return INFINI_STATUS_SUCCESS;
 }

src/infiniop/ops/addr/cpu/addr_cpu.cc (1 addition, 1 deletion)
@@ -1,6 +1,6 @@
 #include "addr_cpu.h"
 #include "../../../devices/cpu/common_cpu.h"
-#include <spdlog/spdlog.h>

 namespace op::addr::cpu {
 Descriptor::~Descriptor() = default;

src/infiniop/ops/argwhere/cpu/argwhere_cpu.cc (22 additions, 13 deletions)
@@ -37,27 +37,36 @@ infiniStatus_t calculateArgWhere(
     const void *x) {

     const Tdata *x_data = reinterpret_cast<const Tdata *>(x);
-    // int64_t *y_data = reinterpret_cast<int64_t *>(y);
-    std::vector<size_t> positions;
-    // #pragma omp parallel for
+
+    std::vector<int64_t> positions;
+    const size_t ndim = info.shapes.size();
+
     for (size_t i = 0; i < info.num_elements; i++) {
-        size_t pos = 0, tem = i;
-        std::vector<size_t> position(info.strides.size());
-        for (size_t j = info.strides.size() - 1; j >= 0; j--) {
-            position[j] = tem % info.shapes[j];
-            tem /= info.shapes[j];
-            pos += position[j] * info.strides[j];
+        size_t pos = 0;
+        size_t tmp = i;
+
+        std::vector<int64_t> coord(ndim);
+
+        // unravel index
+        for (size_t j = ndim; j-- > 0;) {
+            coord[j] = tmp % info.shapes[j];
+            tmp /= info.shapes[j];
+            pos += coord[j] * info.strides[j];
         }
-        if (fabs(x_data[pos] - 0.0f) > 1e-5) {
-            for (auto p : position) {
-                positions.push_back(p);
+
+        // PyTorch semantics: != 0
+        if (x_data[pos] != Tdata(0)) {
+            for (size_t j = 0; j < ndim; j++) {
+                positions.push_back(coord[j]);
             }
         }
     }
+
+    *count = positions.size() / ndim;
+
     *y = new int64_t[positions.size()];
     memcpy(*y, positions.data(), positions.size() * sizeof(int64_t));
-    *count = positions.size() / info.strides.size();

     return INFINI_STATUS_SUCCESS;
 }
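
Note: the rewrite fixes a latent infinite loop: in the old inner loop, `j >= 0` is always true for an unsigned `size_t` index, so the decrement wrapped around; `j-- > 0` is the standard reverse-iteration idiom. It also replaces the approximate `fabs(x) > 1e-5` test with an exact `!= 0`, matching torch.argwhere. A standalone sketch of the unravel-index arithmetic, with hypothetical shape and strides (not values from the repo):

#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <vector>

// Sketch: unravel logical index i into per-dimension coordinates and a
// strided element offset, mirroring the loop in calculateArgWhere.
int main() {
    std::vector<size_t> shapes = {2, 3, 4};           // illustrative shape
    std::vector<std::ptrdiff_t> strides = {12, 4, 1}; // contiguous strides
    size_t i = 17;                                    // logical index

    size_t ndim = shapes.size(), pos = 0, tmp = i;
    std::vector<int64_t> coord(ndim);
    for (size_t j = ndim; j-- > 0;) { // last dim first; no size_t underflow
        coord[j] = tmp % shapes[j];
        tmp /= shapes[j];
        pos += coord[j] * strides[j];
    }
    // i = 17 -> coord = (1, 1, 1), pos = 17 for these contiguous strides
    std::printf("coord=(%lld,%lld,%lld) pos=%zu\n",
                (long long)coord[0], (long long)coord[1],
                (long long)coord[2], pos);
    return 0;
}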

src/infiniop/ops/argwhere/moore/argwhere_moore.mu (0 additions, 16 deletions)
@@ -3,16 +3,6 @@
 #include "argwhere_kernel.h"
 #include "argwhere_moore.h"
 #include "infinicore.h"
-#include <spdlog/spdlog.h>
-
-// template <typename T>
-// INFINIOP_MOORE_KERNEL parallel_block_argwhere(T *data, int64_t *results, size_t N,
-//                                               size_t M, const size_t *shapes,
-//                                               const ptrdiff_t *strides, size_t ndim,
-//                                               size_t *count) {
-//     parallel_block_argwhere_kernel<float><<<1, M / 2, M>>>(
-//         data, results, N, shapes, strides, ndim, count);
-// }

 infiniStatus_t launchKernel(const void *data, int64_t *results, size_t N,
                             size_t M, const size_t *shapes,
@@ -90,12 +80,6 @@ infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size,
     musaMemcpyAsync(*y, result, sizeof(int64_t) * (*count) * ndim,
                     musaMemcpyDeviceToHost, moore_stream);

-    // cudaStreamSynchronize(cuda_stream);
-    // for (size_t i = 0; i < (*count) * ndim; i++) {
-    //     spdlog::debug("(*y)[{}]:{}", i, static_cast<size_t *>(*y)[i]);
-    // }
-    // cudaFreeAsync(result, cuda_stream);
-    // cudaFreeAsync(count_cuda, cuda_stream);
     return INFINI_STATUS_SUCCESS;
 }

src/infiniop/ops/atanh/cuda/kernel.cuh (0 additions, 3 deletions)
@@ -1,9 +1,6 @@
 #ifndef __ATANH_CUDA_H__
 #define __ATANH_CUDA_H__

-#include <cuda_bf16.h>
-#include <cuda_fp16.h>
-
 namespace op::atanh::cuda {
 typedef struct AtanhOp {
 public:

src/infiniop/ops/binary_cross_entropy_with_logits/metax/binary_cross_entropy_with_logits_metax.maca
@@ -1,8 +1,9 @@
 #include "../../../devices/metax/metax_common.h"
 #include "../../../devices/metax/metax_handle.h"
+#include "../../../devices/metax/metax_kernel_common.h"

 #include "binary_cross_entropy_with_logits_metax.h"
 #include <mc_runtime.h>

 #include <type_traits>

 namespace op::bce_with_logits::metax {
@@ -191,7 +192,7 @@ infiniStatus_t Descriptor::calculate(
     const void *pos_weight,
     void *stream) const {

-    mcStream_t custream = (mcStream_t)stream;
+    hcStream_t custream = (hcStream_t)stream;
     size_t n = _info.num_elements;

     // F16/BF16 + reduction needs a float workspace
@@ -219,7 +220,7 @@
     case INFINI_DTYPE_F32: {
         // For reductions, zero the output location before computing
         if (_reduction != INFINIOP_REDUCTION_NONE) {
-            mcMemsetAsync(out, 0, sizeof(float), custream);
+            hcMemsetAsync(out, 0, sizeof(float), custream);
         }

         bce_logits_kernel<float, float><<<grid, block, 0, custream>>>(
@@ -255,7 +256,7 @@
             out_raw = out;
         } else {
             workspace_f = static_cast<float *>(workspace);
-            mcMemsetAsync(workspace_f, 0, sizeof(float), custream);
+            hcMemsetAsync(workspace_f, 0, sizeof(float), custream);
             out_raw = workspace_f;
         }
@@ -294,7 +295,7 @@
             out_raw = out;
         } else {
             workspace_f = static_cast<float *>(workspace);
-            mcMemsetAsync(workspace_f, 0, sizeof(float), custream);
+            hcMemsetAsync(workspace_f, 0, sizeof(float), custream);
             out_raw = workspace_f;
         }
@@ -324,8 +325,8 @@
         return INFINI_STATUS_BAD_TENSOR_DTYPE;
     }

-    mcError_t err = mcGetLastError();
-    if (err != mcSuccess) {
+    hcError_t err = hcGetLastError();
+    if (err != hcSuccess) {
         return INFINI_STATUS_INTERNAL_ERROR;
     }
     return INFINI_STATUS_SUCCESS;
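
Note: in each reduction branch above, the single-float accumulator (the output itself, or a float workspace for F16/BF16) is zeroed with hcMemsetAsync on the same stream before the kernel launches, since the kernel accumulates partial sums into it. A minimal sketch of that zero-then-accumulate pattern in CUDA spellings (hypothetical kernel and launcher, not the repo's):

// *out must be zeroed first; otherwise stale device memory is summed in.
__global__ void reduce_sum_kernel(const float *x, float *out, size_t n) {
    size_t i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        atomicAdd(out, x[i]); // each thread folds its element into *out
    }
}

void launch_reduce_sum(const float *x, float *out, size_t n, cudaStream_t stream) {
    cudaMemsetAsync(out, 0, sizeof(float), stream);
    dim3 block(256), grid(static_cast<unsigned>((n + 255) / 256));
    reduce_sum_kernel<<<grid, block, 0, stream>>>(x, out, n);
}
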
src/infiniop/ops/cdist/metax/cdist_metax.maca (5 additions, 3 deletions)
@@ -1,4 +1,6 @@
+#include "../../../devices/metax/metax_common.h"
 #include "../../../devices/metax/metax_handle.h"
+#include "../../../devices/metax/metax_kernel_common.h"
 #include "cdist_metax.h"
 #include <iostream>
 namespace op::cdist::metax {
@@ -126,7 +128,7 @@ infiniStatus_t Descriptor::calculate(
         return INFINI_STATUS_BAD_TENSOR_DTYPE;
     }

-    mcStream_t custream = (mcStream_t)stream;
+    hcStream_t custream = (hcStream_t)stream;
     dim3 block(16, 16);
     dim3 grid(
         static_cast<unsigned int>((_info.n + block.x - 1) / block.x),
@@ -151,8 +153,8 @@
         _info.y_matrix.col_stride,
         _p);

-    auto err = mcGetLastError();
-    if (err != mcSuccess) {
+    auto err = hcGetLastError();
+    if (err != hcSuccess) {
         return INFINI_STATUS_INTERNAL_ERROR;
     }

src/infiniop/ops/equal/cuda/kernel.cuh (0 additions, 7 deletions)
@@ -1,13 +1,6 @@
 #ifndef __EQUAL_CUDA_H__
 #define __EQUAL_CUDA_H__

-#if defined(__MACACC__)
-#include <maca_bfloat16.h>
-#include <maca_fp16.h>
-#else
-#include <cuda_bf16.h>
-#include <cuda_fp16.h>
-#endif
 #include <type_traits>

 namespace op::equal::cuda {

src/infiniop/ops/hardswish/cuda/kernel.cuh (0 additions, 7 deletions)
@@ -2,13 +2,6 @@
 #define __HARDSWISH_CUDA_H__

 #include <cmath>
-#if defined(__MACACC__)
-#include <maca_bfloat16.h>
-#include <maca_fp16.h>
-#else
-#include <cuda_bf16.h>
-#include <cuda_fp16.h>
-#endif

 namespace op::hardswish::cuda {

src/infiniop/ops/hardtanh/cuda/kernel.cuh (0 additions, 7 deletions)
@@ -1,13 +1,6 @@
 #ifndef __HARDTANH_CUDA_H__
 #define __HARDTANH_CUDA_H__

-#if defined(__MACACC__)
-#include <maca_bfloat16.h>
-#include <maca_fp16.h>
-#else
-#include <cuda_bf16.h>
-#include <cuda_fp16.h>
-#endif
 #include <type_traits>

 namespace op::hardtanh::cuda {

src/infiniop/ops/hypot/cuda/kernel.cuh (0 additions, 8 deletions)
@@ -3,14 +3,6 @@

 #include <cmath>
 #include <type_traits>
-#if ENABLE_METAX_API
-#include <maca_bfloat16.h>
-#include <maca_fp16.h>
-using nv_bfloat162 = __maca_bfloat162;
-#else
-#include <cuda_bf16.h>
-#include <cuda_fp16.h>
-#endif

 namespace op::hypot::cuda {

src/infiniop/ops/hypot/metax/hypot_metax.maca (1 addition, 1 deletion)
@@ -61,7 +61,7 @@ infiniStatus_t Descriptor::calculate(
     case INFINI_DTYPE_F16:
         return _device_info->calculate<256, cuda::HypotOp, half>(_info, workspace, output, inputs, stream);
     case INFINI_DTYPE_BF16:
-        return _device_info->calculate<256, cuda::HypotOp, nv_bfloat162>(_info, workspace, output, inputs, stream);
+        return _device_info->calculate<256, cuda::HypotOp, cuda_bfloat162>(_info, workspace, output, inputs, stream);
     case INFINI_DTYPE_F32:
         return _device_info->calculate<256, cuda::HypotOp, float>(_info, workspace, output, inputs, stream);
     case INFINI_DTYPE_F64:

src/infiniop/ops/index_add/cuda/kernel.cuh (3 additions, 13 deletions)
@@ -1,16 +1,6 @@
 #ifndef __INDEX_ADD_CUDA_H__
 #define __INDEX_ADD_CUDA_H__

-#if ENABLE_METAX_API
-#include <maca_bfloat16.h>
-#include <maca_fp16.h>
-#define __nv_bfloat16 __maca_bfloat16
-#define __nv_bfloat162 __maca_bfloat162
-#else
-#include <cuda_bf16.h>
-#include <cuda_fp16.h>
-#include <cuda_runtime.h>
-#endif
 #include <cstdint>

 namespace op::index_add::cuda {
@@ -40,7 +30,7 @@ __device__ __forceinline__ void atomic_add_custom(__half *address, __half val) {
 #endif
 }

-__device__ __forceinline__ void atomic_add_custom(__nv_bfloat16 *address, __nv_bfloat16 val) {
+__device__ __forceinline__ void atomic_add_custom(cuda_bfloat16 *address, cuda_bfloat16 val) {
 #if __CUDA_ARCH__ >= 800
     atomicAdd(address, val);
 #else
@@ -52,9 +42,9 @@ __device__ __forceinline__ void atomic_add_custom(__nv_bfloat16 *address, __nv_b
     do {
         assumed = old;
         unsigned short old_val_raw = (size_t)address & 2 ? (old >> 16) : (old & 0xffff);
-        __nv_bfloat16 old_val = *reinterpret_cast<__nv_bfloat16 *>(&old_val_raw);
+        cuda_bfloat16 old_val = *reinterpret_cast<cuda_bfloat16 *>(&old_val_raw);

-        __nv_bfloat16 new_val = old_val + val;
+        cuda_bfloat16 new_val = old_val + val;
         unsigned short new_val_raw = *reinterpret_cast<unsigned short *>(&new_val);

         unsigned int new_int = (size_t)address & 2 ? (old & 0xffff) | (new_val_raw << 16)
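
Note: the bf16 fallback above (for `__CUDA_ARCH__ < 800`, where no native bf16 atomicAdd exists) emulates a 16-bit atomic by CAS-ing the containing aligned 32-bit word: `(size_t)address & 2` selects the upper or lower half-word, the updated bf16 value is spliced back in, and the loop retries until the CAS wins. The rename from `__nv_bfloat16` to `cuda_bfloat16` only changes the type spelling; the CAS logic is unchanged. The same trick in a self-contained form (illustrative sketch; `Combine` stands in for the bf16 addition in the real code):

// Sketch: generic 16-bit read-modify-write via 32-bit atomicCAS.
template <typename Combine>
__device__ void atomic_rmw16(unsigned short *address, unsigned short val,
                             Combine combine) {
    // Align down to the containing 32-bit word; bit 1 of the address
    // tells us which half of the word we own (little-endian layout).
    auto *word = reinterpret_cast<unsigned int *>(
        reinterpret_cast<size_t>(address) & ~size_t(3));
    bool upper = reinterpret_cast<size_t>(address) & 2;

    unsigned int old = *word, assumed;
    do {
        assumed = old;
        unsigned short cur = upper ? (assumed >> 16) : (assumed & 0xffff);
        unsigned short next = combine(cur, val);
        unsigned int replaced = upper
            ? (assumed & 0x0000ffffu) | (static_cast<unsigned>(next) << 16)
            : (assumed & 0xffff0000u) | next;
        old = atomicCAS(word, assumed, replaced); // retry if another thread won
    } while (assumed != old);
}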