From 87e01f8ca42c451c7ef034cd06e54a8f1366a072 Mon Sep 17 00:00:00 2001
From: Anna Maresova
Date: Tue, 5 May 2026 20:15:54 +0200
Subject: [PATCH] Enable SymSgdNative on arm64 via system BLAS

Provide an ARM-native libMklImports.so backed by the system BLAS
(via CMake find_package(BLAS)), with portable implementations of the
two MKL-specific sparse BLAS extensions (cblas_saxpyi, cblas_sdoti)
and stubs for DFTI (FFT) functions referenced by the managed
initializer but unused by SymSGD.

Fixes #5798

Co-Authored-By: Claude Opus 4.6 (1M context)
---
 src/Native/CMakeLists.txt                |  6 +-
 src/Native/MklImportsArm/CMakeLists.txt  | 35 ++++++++++++
 src/Native/MklImportsArm/MklImportsArm.c | 72 ++++++++++++++++++++++++
 src/Native/SymSgdNative/CMakeLists.txt   |  6 +-
 src/Native/SymSgdNative/SparseBLAS.h     | 16 ++++--
 5 files changed, 127 insertions(+), 8 deletions(-)
 create mode 100644 src/Native/MklImportsArm/CMakeLists.txt
 create mode 100644 src/Native/MklImportsArm/MklImportsArm.c

diff --git a/src/Native/CMakeLists.txt b/src/Native/CMakeLists.txt
index 9e3647ede1..846522a489 100644
--- a/src/Native/CMakeLists.txt
+++ b/src/Native/CMakeLists.txt
@@ -265,9 +265,11 @@ if(NOT ${ARCHITECTURE} MATCHES "arm.*")
   add_subdirectory(CpuMathNative)
   add_subdirectory(FastTreeNative)
   add_subdirectory(MklProxyNative)
-  # TODO: once we fix the 4 intel MKL methods, SymSgdNative will need to go back in.
   add_subdirectory(SymSgdNative)
-  endif()
+else()
+  add_subdirectory(MklImportsArm)
+  add_subdirectory(SymSgdNative)
+endif()
 
 if(${ARCHITECTURE} MATCHES "[xX].*64")
   add_subdirectory(OneDalNative)
diff --git a/src/Native/MklImportsArm/CMakeLists.txt b/src/Native/MklImportsArm/CMakeLists.txt
new file mode 100644
index 0000000000..ca068336bd
--- /dev/null
+++ b/src/Native/MklImportsArm/CMakeLists.txt
@@ -0,0 +1,35 @@
+project(MklImportsArm)
+
+# On ARM platforms, Intel MKL is not available. This target provides
+# a compatible libMklImports.so backed by the system BLAS (typically
+# OpenBLAS) with stubs for MKL-specific sparse BLAS and FFT functions.
+
+find_package(BLAS REQUIRED)
+
+set(SOURCES
+  MklImportsArm.c
+)
+
+if(NOT WIN32)
+  list(APPEND SOURCES ${VERSION_FILE_PATH})
+  set(CMAKE_SKIP_BUILD_RPATH FALSE)
+  set(CMAKE_BUILD_WITH_INSTALL_RPATH FALSE)
+  set(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE)
+  set(CMAKE_INSTALL_RPATH "$ORIGIN/")
+endif()
+
+add_library(MklImports SHARED ${SOURCES} ${RESOURCES})
+
+# PUBLIC so that targets linking MklImports also link the BLAS library,
+# which is where the standard CBLAS routines come from.
+# NOTE(review): MklImportsArm.c references no BLAS symbol itself; if the
+# toolchain links with --as-needed, the BLAS dependency may be dropped
+# from libMklImports.so. Confirm cblas_* still resolve through it.
+if(TARGET BLAS::BLAS)
+  # Imported target, provided by FindBLAS in CMake 3.18 and newer.
+  target_link_libraries(MklImports PUBLIC BLAS::BLAS)
+else()
+  target_link_libraries(MklImports PUBLIC ${BLAS_LIBRARIES})
+endif()
+
+install_library_and_symbols(MklImports)
diff --git a/src/Native/MklImportsArm/MklImportsArm.c b/src/Native/MklImportsArm/MklImportsArm.c
new file mode 100644
index 0000000000..06eb71d4cb
--- /dev/null
+++ b/src/Native/MklImportsArm/MklImportsArm.c
@@ -0,0 +1,72 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+// ARM replacement for Intel MKL (libMklImports.so).
+//
+// Standard CBLAS functions (sgemm, sgemv, saxpy, sdot, etc.) are not
+// defined in this file; they are expected to come from the BLAS library
+// this target links against (typically OpenBLAS).
+//
+// Sparse CBLAS extensions (saxpyi, sdoti) are provided here since
+// OpenBLAS does not include them.
+//
+// MKL DFTI (FFT) functions are stubbed — they are referenced by the
+// managed MKL Components initializer but not used by SymSGD. The stubs
+// return error codes so any actual FFT call fails cleanly rather than
+// crashing.
+
+// Mark every entry point as exported so the symbols stay visible on
+// Windows and in builds that default to hidden symbol visibility.
+#if defined(_WIN32)
+#define MKLIMPORTS_EXPORT __declspec(dllexport)
+#elif defined(__GNUC__)
+#define MKLIMPORTS_EXPORT __attribute__((visibility("default")))
+#else
+#define MKLIMPORTS_EXPORT
+#endif
+
+// --- Sparse BLAS (MKL extensions, not in OpenBLAS) ---
+
+// Scatter-add: y[indx[i]] += a * x[i] for each i in [0, nz).
+MKLIMPORTS_EXPORT void cblas_saxpyi(const int nz, const float a,
+                                    const float *x, const int *indx, float *y)
+{
+    for (int i = 0; i < nz; i++)
+        y[indx[i]] += a * x[i];
+}
+
+// Returns the sum of x[i] * y[indx[i]] for each i in [0, nz).
+MKLIMPORTS_EXPORT float cblas_sdoti(const int nz, const float *x,
+                                    const int *indx, const float *y)
+{
+    float result = 0.0f;
+    for (int i = 0; i < nz; i++)
+        result += x[i] * y[indx[i]];
+    return result;
+}
+
+// --- DFTI (FFT) stubs ---
+
+MKLIMPORTS_EXPORT const char* DftiErrorMessage(long status)
+{
+    return "DFTI not available (OpenBLAS arm64 build)";
+}
+
+MKLIMPORTS_EXPORT long DftiCreateDescriptor(void **h, int precision, int domain, int dim, ...)
+{
+    // Tolerate a null out-pointer; the call reports failure either way.
+    if (h)
+        *h = (void*)0;
+    return -1;
+}
+
+MKLIMPORTS_EXPORT long DftiSetValue(void *h, int param, ...)
+{
+    return -1;
+}
+
+MKLIMPORTS_EXPORT long DftiCommitDescriptor(void *h) { return -1; }
+MKLIMPORTS_EXPORT long DftiComputeForward(void *h, ...) { return -1; }
+MKLIMPORTS_EXPORT long DftiComputeBackward(void *h, ...) { return -1; }
+MKLIMPORTS_EXPORT long DftiFreeDescriptor(void **h) { return 0; }
diff --git a/src/Native/SymSgdNative/CMakeLists.txt b/src/Native/SymSgdNative/CMakeLists.txt
index 01652d2aab..f014b1db76 100644
--- a/src/Native/SymSgdNative/CMakeLists.txt
+++ b/src/Native/SymSgdNative/CMakeLists.txt
@@ -33,7 +33,11 @@ else()
   endif()
 endif()
 
-if(NOT ${ARCHITECTURE} MATCHES "arm.*")
+if(${ARCHITECTURE} MATCHES "arm.*")
+  # On ARM, MklImports is built from MklImportsArm (OpenBLAS-backed).
+  # Link against the CMake target directly.
+  set(MKL_LIBRARY MklImports)
+else()
   find_library(MKL_LIBRARY MklImports HINTS ${MKL_LIB_PATH})
 endif()
 
diff --git a/src/Native/SymSgdNative/SparseBLAS.h b/src/Native/SymSgdNative/SparseBLAS.h
index fdfa1740e2..10e0b6dc83 100644
--- a/src/Native/SymSgdNative/SparseBLAS.h
+++ b/src/Native/SymSgdNative/SparseBLAS.h
@@ -5,10 +5,16 @@
 #pragma once
 #include "../Stdafx.h"
 
-extern "C" float __cdecl cblas_sdot(const int vecSize, const float* denseVecX, const int incX, const float* denseVecY, const int incY);
-extern "C" float __cdecl cblas_sdoti(const int sparseVecSize, const float* sparseVecValues, const int* sparseVecIndices, float* denseVec);
-extern "C" void __cdecl cblas_saxpy(const int vecSize, const float coef, const float* denseVecX, const int incX, float* denseVecY, const int incY);
-extern "C" void __cdecl cblas_saxpyi(const int sparseVecSize, const float coef, const float* sparseVecValues, const int* sparseVecIndices, float* denseVec);
+#ifdef _WIN32
+#define CBLAS_CALLING_CONV __cdecl
+#else
+#define CBLAS_CALLING_CONV
+#endif
+
+extern "C" float CBLAS_CALLING_CONV cblas_sdot(const int vecSize, const float* denseVecX, const int incX, const float* denseVecY, const int incY);
+extern "C" float CBLAS_CALLING_CONV cblas_sdoti(const int sparseVecSize, const float* sparseVecValues, const int* sparseVecIndices, float* denseVec);
+extern "C" void CBLAS_CALLING_CONV cblas_saxpy(const int vecSize, const float coef, const float* denseVecX, const int incX, float* denseVecY, const int incY);
+extern "C" void CBLAS_CALLING_CONV cblas_saxpyi(const int sparseVecSize, const float coef, const float* sparseVecValues, const int* sparseVecIndices, float* denseVec);
 
 float SDOT(const int vecSize, const float* denseVecX, const float* denseVecY)
 {
@@ -28,4 +34,4 @@ void SAXPY(const int vecSize, const float* denseVecX, float* denseVecY, float co
 void SAXPYI(const int sparseVecSize, const int* sparseVecIndices, const float* sparseVecValues, float* denseVec, float coef)
 {
     cblas_saxpyi(sparseVecSize, coef, sparseVecValues, sparseVecIndices, denseVec);
-}
\ No newline at end of file
+}