Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
120 changes: 119 additions & 1 deletion source/source_base/kernels/dsp/dsp_connector.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -403,6 +403,66 @@ void zgemm_mth_(const char* transa,
free_ht(bet);
} // zgemm that needn't malloc_ht or free_ht

void zgemm_pack_mth_(const char* transa,
const char* transb,
const int* m,
const int* n,
const int* k,
const std::complex<double>* alpha,
const std::complex<double>* a,
const int* lda,
const std::complex<double>* b,
const int* ldb,
const std::complex<double>* beta,
std::complex<double>* c,
const int* ldc,
int cluster_id)
{
const bool transa_not = (transa[0] == 'N' || transa[0] == 'n');
const bool transb_not = (transb[0] == 'N' || transb[0] == 'n');
Comment on lines +421 to +422
// const size_t a_elems = static_cast<size_t>(*lda) * (transa_not ? static_cast<size_t>(*k) : static_cast<size_t>(*m));
// const size_t b_elems = static_cast<size_t>(*ldb) * (transb_not ? static_cast<size_t>(*n) : static_cast<size_t>(*k));
const size_t c_elems = static_cast<size_t>(*ldc) * static_cast<size_t>(*n);

// std::complex<double>* A_dsp = static_cast<std::complex<double>*>(malloc_ht(a_elems * sizeof(std::complex<double>), cluster_id));
// std::complex<double>* B_dsp = static_cast<std::complex<double>*>(malloc_ht(b_elems * sizeof(std::complex<double>), cluster_id));
std::complex<double>* C_dsp = static_cast<std::complex<double>*>(malloc_ht(c_elems * sizeof(std::complex<double>), cluster_id));
std::complex<double>* alp = static_cast<std::complex<double>*>(malloc_ht(sizeof(std::complex<double>), cluster_id));
std::complex<double>* bet = static_cast<std::complex<double>*>(malloc_ht(sizeof(std::complex<double>), cluster_id));

// memcpy(A_dsp, a, a_elems * sizeof(std::complex<double>));
// memcpy(B_dsp, b, b_elems * sizeof(std::complex<double>));
memcpy(C_dsp, c, c_elems * sizeof(std::complex<double>));
*alp = *alpha;
*bet = *beta;
Comment on lines +433 to +437

mt_hthread_zgemm(CBLAS_ORDER::CblasColMajor,
convertBLASTranspose(transa),
convertBLASTranspose(transb),
*m,
*n,
*k,
alp,
a,
// A_dsp,
*lda,
b,
// B_dsp,
*ldb,
Comment on lines +439 to +451
bet,
// c,
C_dsp,
*ldc,
cluster_id);
memcpy(c, C_dsp, c_elems * sizeof(std::complex<double>));

// free_ht(A_dsp);
// free_ht(B_dsp);
free_ht(C_dsp);
free_ht(alp);
free_ht(bet);
}

void cgemm_mth_(const char* transa,
const char* transb,
const int* m,
Expand Down Expand Up @@ -443,6 +503,64 @@ void cgemm_mth_(const char* transa,
free_ht(bet);
} // cgemm that needn't malloc_ht or free_ht

/**
 * @brief Single-precision complex GEMM on the DSP, staging A, B and C through malloc_ht buffers.
 *
 * Copies A, B, C and the scalars alpha/beta into buffers obtained from
 * malloc_ht, calls mt_hthread_cgemm in column-major order on those buffers,
 * then copies the result back into the caller's C.
 *
 * Buffer sizes follow the column-major storage implied by the transpose flags:
 *   - A: (*lda) x (*k) elements when transa is 'N'/'n', otherwise (*lda) x (*m)
 *   - B: (*ldb) x (*n) elements when transb is 'N'/'n', otherwise (*ldb) x (*k)
 *   - C: (*ldc) x (*n) elements
 *
 * @param transa     Transpose flag for A; only the first character is inspected here.
 * @param transb     Transpose flag for B; only the first character is inspected here.
 * @param m          Pointer to the m dimension passed to mt_hthread_cgemm.
 * @param n          Pointer to the n dimension passed to mt_hthread_cgemm.
 * @param k          Pointer to the k dimension passed to mt_hthread_cgemm.
 * @param alpha      Pointer to the scalar alpha (copied into a malloc_ht buffer).
 * @param a          Matrix A (copied into a malloc_ht buffer).
 * @param lda        Pointer to the leading dimension of A.
 * @param b          Matrix B (copied into a malloc_ht buffer).
 * @param ldb        Pointer to the leading dimension of B.
 * @param beta       Pointer to the scalar beta (copied into a malloc_ht buffer).
 * @param c          Matrix C; read before the call and overwritten with the result.
 * @param ldc        Pointer to the leading dimension of C.
 * @param cluster_id Forwarded to malloc_ht and mt_hthread_cgemm.
 *
 * Returns immediately, leaving c untouched, when *m or *n is not positive.
 */
void cgemm_pack_mth_(const char* transa,
                     const char* transb,
                     const int* m,
                     const int* n,
                     const int* k,
                     const std::complex<float>* alpha,
                     const std::complex<float>* a,
                     const int* lda,
                     const std::complex<float>* b,
                     const int* ldb,
                     const std::complex<float>* beta,
                     std::complex<float>* c,
                     const int* ldc,
                     int cluster_id)
{
    // An empty result needs no work. This also keeps a non-positive dimension
    // from being converted to a huge size_t in the size computations below.
    if (*m <= 0 || *n <= 0)
    {
        return;
    }

    const bool transa_not = (transa[0] == 'N' || transa[0] == 'n');
    const bool transb_not = (transb[0] == 'N' || transb[0] == 'n');

    // Number of stored columns of each operand in column-major layout.
    const size_t a_cols = transa_not ? static_cast<size_t>(*k) : static_cast<size_t>(*m);
    const size_t b_cols = transb_not ? static_cast<size_t>(*n) : static_cast<size_t>(*k);

    const size_t a_bytes = static_cast<size_t>(*lda) * a_cols * sizeof(std::complex<float>);
    const size_t b_bytes = static_cast<size_t>(*ldb) * b_cols * sizeof(std::complex<float>);
    const size_t c_bytes = static_cast<size_t>(*ldc) * static_cast<size_t>(*n) * sizeof(std::complex<float>);

    // NOTE(review): the malloc_ht results below are used without a null check.
    std::complex<float>* A_dsp = static_cast<std::complex<float>*>(malloc_ht(a_bytes, cluster_id));
    std::complex<float>* B_dsp = static_cast<std::complex<float>*>(malloc_ht(b_bytes, cluster_id));
    std::complex<float>* C_dsp = static_cast<std::complex<float>*>(malloc_ht(c_bytes, cluster_id));
    std::complex<float>* alp = static_cast<std::complex<float>*>(malloc_ht(sizeof(std::complex<float>), cluster_id));
    std::complex<float>* bet = static_cast<std::complex<float>*>(malloc_ht(sizeof(std::complex<float>), cluster_id));

    // Stage all operands and the scalars.
    memcpy(A_dsp, a, a_bytes);
    memcpy(B_dsp, b, b_bytes);
    memcpy(C_dsp, c, c_bytes);
    *alp = *alpha;
    *bet = *beta;

    mt_hthread_cgemm(CBLAS_ORDER::CblasColMajor,
                     convertBLASTranspose(transa),
                     convertBLASTranspose(transb),
                     *m,
                     *n,
                     *k,
                     static_cast<const void*>(alp),
                     static_cast<const void*>(A_dsp),
                     *lda,
                     static_cast<const void*>(B_dsp),
                     *ldb,
                     static_cast<const void*>(bet),
                     static_cast<void*>(C_dsp),
                     *ldc,
                     cluster_id);

    // Copy the result back into the caller's matrix.
    memcpy(c, C_dsp, c_bytes);

    free_ht(A_dsp);
    free_ht(B_dsp);
    free_ht(C_dsp);
    free_ht(alp);
    free_ht(bet);
}

void sgemv_mth_(const char* transa,
const int* m,
const int* n,
Expand Down Expand Up @@ -570,4 +688,4 @@ void cgemv_mth_(const char* transa,
free_ht(alp);
free_ht(bet);
}
} // namespace mtfunc
} // namespace mtfunc
61 changes: 46 additions & 15 deletions source/source_base/kernels/dsp/dsp_connector.h
Original file line number Diff line number Diff line change
Expand Up @@ -61,20 +61,51 @@ void zgemm_mt_(const char* transa,
const int* ldc,
int cluster_id);

// Single-precision complex GEMM entry point of the `_mt_` family.
// All dimensions and scalars are passed by pointer; cluster_id is passed by value.
// NOTE(review): the definition is not visible in this diff - presumably it
// differs from cgemm_mth_ only in the DSP backend call used; confirm.
void cgemm_mt_(const char* transa,
const char* transb,
const int* m,
const int* n,
const int* k,
const std::complex<float>* alpha,
const std::complex<float>* a,
const int* lda,
const std::complex<float>* b,
const int* ldb,
const std::complex<float>* beta,
std::complex<float>* c,
const int* ldc,
int cluster_id);
// Double-precision complex GEMM that stages C, alpha and beta through
// malloc_ht buffers, calls mt_hthread_zgemm in column-major order, and copies
// the result back into c. The a and b pointers are passed through unchanged.
// cluster_id is forwarded to malloc_ht and mt_hthread_zgemm.
void zgemm_pack_mth_(const char* transa,
const char* transb,
const int* m,
const int* n,
const int* k,
const std::complex<double>* alpha,
const std::complex<double>* a,
const int* lda,
const std::complex<double>* b,
const int* ldb,
const std::complex<double>* beta,
std::complex<double>* c,
const int* ldc,
int cluster_id);


// Single-precision complex GEMM entry point of the `_mth_` family.
// All dimensions and scalars are passed by pointer; cluster_id is passed by value.
// NOTE(review): only the tail of the definition is visible in this diff;
// presumably the a/b/c pointers are handed to the DSP call without being
// copied into malloc_ht buffers (unlike cgemm_pack_mth_) - confirm.
void cgemm_mth_(const char* transa,
const char* transb,
const int* m,
const int* n,
const int* k,
const std::complex<float>* alpha,
const std::complex<float>* a,
const int* lda,
const std::complex<float>* b,
const int* ldb,
const std::complex<float>* beta,
std::complex<float>* c,
const int* ldc,
int cluster_id);

Comment on lines +79 to +94
// Single-precision complex GEMM that copies A, B, C, alpha and beta into
// malloc_ht buffers, calls mt_hthread_cgemm in column-major order on those
// buffers, and copies the result back into c.
// cluster_id is forwarded to malloc_ht and mt_hthread_cgemm.
void cgemm_pack_mth_(const char* transa,
const char* transb,
const int* m,
const int* n,
const int* k,
const std::complex<float>* alpha,
const std::complex<float>* a,
const int* lda,
const std::complex<float>* b,
const int* ldb,
const std::complex<float>* beta,
std::complex<float>* c,
const int* ldc,
int cluster_id);

void sgemv_mt_(const char* transa,
const int* m,
Expand Down Expand Up @@ -282,4 +313,4 @@ void dsp_dav_subspace_reduce(T* hcc, T* scc, int nbase, int nbase_x, int notconv
} // namespace mtfunc

#endif
#endif
#endif
16 changes: 12 additions & 4 deletions source/source_base/module_external/blas_connector_matrix.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,9 @@ void BlasConnector::gemm(const char transa,
#ifdef __DSP
else if (device_type == base_device::AbacusDevice_t::DspDevice)
{
mtfunc::cgemm_mth_(&transb, &transa, &n, &m, &k, &alpha, b, &ldb, a, &lda, &beta, c, &ldc, GlobalV::MY_RANK % PARAM.inp.dsp_count);
mtfunc::cgemm_pack_mth_(&transb, &transa, &n, &m, &k, &alpha, b, &ldb, a, &lda, &beta, c, &ldc, GlobalV::MY_RANK % PARAM.inp.dsp_count);
// cgemm_mth_ for raw dsp mth;
// cgemm_pack_mth_ for dsp mth with memcpy to DSP buffer
}
#endif
else if (device_type == base_device::AbacusDevice_t::GpuDevice)
Expand Down Expand Up @@ -158,7 +160,9 @@ void BlasConnector::gemm(const char transa,
#ifdef __DSP
else if (device_type == base_device::AbacusDevice_t::DspDevice)
{
mtfunc::zgemm_mth_(&transb, &transa, &n, &m, &k, &alpha, b, &ldb, a, &lda, &beta, c, &ldc, GlobalV::MY_RANK % PARAM.inp.dsp_count);
mtfunc::zgemm_pack_mth_(&transb, &transa, &n, &m, &k, &alpha, b, &ldb, a, &lda, &beta, c, &ldc, GlobalV::MY_RANK % PARAM.inp.dsp_count);
// zgemm_mth_ for raw dsp mth;
// zgemm_pack_mth_ for dsp mth with memcpy to DSP buffer
}
#endif
else if (device_type == base_device::AbacusDevice_t::GpuDevice)
Expand Down Expand Up @@ -277,7 +281,9 @@ void BlasConnector::gemm_cm(const char transa,
#ifdef __DSP
else if (device_type == base_device::AbacusDevice_t::DspDevice)
{
mtfunc::cgemm_mth_(&transa, &transb, &m, &n, &k, &alpha, a, &lda, b, &ldb, &beta, c, &ldc, GlobalV::MY_RANK % PARAM.inp.dsp_count);
mtfunc::cgemm_pack_mth_(&transa, &transb, &m, &n, &k, &alpha, a, &lda, b, &ldb, &beta, c, &ldc, GlobalV::MY_RANK % PARAM.inp.dsp_count);
// cgemm_mth_ for raw dsp mth;
// cgemm_pack_mth_ for dsp mth with memcpy to DSP buffer
}
#endif
#ifdef __CUDA
Expand Down Expand Up @@ -328,7 +334,9 @@ void BlasConnector::gemm_cm(const char transa,
#ifdef __DSP
else if (device_type == base_device::AbacusDevice_t::DspDevice)
{
mtfunc::zgemm_mth_(&transa, &transb, &m, &n, &k, &alpha, a, &lda, b, &ldb, &beta, c, &ldc, GlobalV::MY_RANK % PARAM.inp.dsp_count);
mtfunc::zgemm_pack_mth_(&transa, &transb, &m, &n, &k, &alpha, a, &lda, b, &ldb, &beta, c, &ldc, GlobalV::MY_RANK % PARAM.inp.dsp_count);
// zgemm_mth_ for raw dsp mth;
// zgemm_pack_mth_ for dsp mth with memcpy to DSP buffer
}
#endif
#ifdef __CUDA
Expand Down
Loading