diff --git a/.github/workflows/build_wheels.yml b/.github/workflows/build_wheels.yml index 8ffe5b0..810120c 100644 --- a/.github/workflows/build_wheels.yml +++ b/.github/workflows/build_wheels.yml @@ -114,7 +114,9 @@ jobs: path: ./wheelhouse/*.whl name: wheels-${{ matrix.os }} - # disabling QBLAS optimization for windows due to incompatibility with MSVC + # QBLAS is auto-disabled on Windows by meson.build (it uses GCC/POSIX-only + # constructs that MSVC does not support); the wheel falls back to the + # naive matmul kernel. No CFLAGS hack needed. build_wheels_windows: name: Build wheels on Windows runs-on: windows-latest @@ -153,9 +155,6 @@ jobs: CIBW_BUILD_VERBOSITY: "3" DISTUTILS_USE_SDK: "1" MSSdk: "1" - CIBW_ENVIRONMENT: > - CFLAGS="/DDISABLE_QUADBLAS $CFLAGS" - CXXFLAGS="/DDISABLE_QUADBLAS $CXXFLAGS" CIBW_REPAIR_WHEEL_COMMAND: 'delvewheel repair -w {dest_dir} {wheel} --add-path C:\sleef\bin' CIBW_TEST_COMMAND_WINDOWS: pip install numpy && pip install --no-deps {wheel} && pip install pytest pytest-run-parallel && pytest -s {project}/tests CIBW_TEST_EXTRAS: test diff --git a/.gitignore b/.gitignore index 6a64dd3..f5cec6c 100644 --- a/.gitignore +++ b/.gitignore @@ -146,3 +146,4 @@ compile_commands.json # docs /docs/_build/ +build_log.txt diff --git a/meson.build b/meson.build index d0aa2d8..506c41b 100644 --- a/meson.build +++ b/meson.build @@ -15,10 +15,7 @@ if is_windows add_project_arguments('-DWIN32', '-D_WINDOWS', language : ['c', 'cpp']) endif -qblas_dep = dependency('qblas', fallback: ['qblas', 'qblas_dep']) -# Try to find SLEEF system-wide first, fall back to subproject if not found -# Required SLEEF version (must match sleef.wrap revision) required_sleef_version = '3.9.0' # Don't use fallback here - we need to call subproject() explicitly later with disable_fma option sleef_dep = dependency('sleef', version: '>=' + required_sleef_version, required: false) @@ -30,7 +27,7 @@ if sleef_dep.found() and sleef_dep.version().startswith(required_sleef_version) # SLEEF found system-wide - verify quad-precision support cpp = meson.get_compiler('cpp') sleefquad_lib = cpp.find_library('sleefquad', required: false) - + if sleefquad_lib.found() sleefquad_test_code = ''' #include @@ -48,7 +45,7 @@ if sleef_dep.found() and sleef_dep.version().startswith(required_sleef_version) dependencies: [sleef_dep, sleefquad_lib], name: 'SLEEF quad-precision support' ) - + if quad_works sleefquad_dep = declare_dependency( dependencies: [sleef_dep, sleefquad_lib] @@ -80,6 +77,23 @@ else message('Proceeding with vendored SLEEF subproject instead') endif +# QBLAS does not build under MSVC (GCC-only flags, POSIX-only APIs, GCC +# built-ins for CPUID); force-disable it on Windows. Users on other +# platforms can opt out via -Ddisable_quadblas=true to fall back to the +# naive matmul kernel. +disable_quadblas = is_windows or get_option('disable_quadblas') +if disable_quadblas + if is_windows + message('QBLAS disabled (Windows / MSVC) - using naive matmul kernel') + else + message('QBLAS disabled by user option - using naive matmul kernel') + endif + add_project_arguments('-DDISABLE_QUADBLAS', language: ['c', 'cpp']) + qblas_dep = declare_dependency() +else + qblas_dep = dependency('qblas', fallback: ['qblas', 'qblas_dep']) +endif + incdir_numpy = run_command(py, ['-c', 'import numpy; print(numpy.get_include())'], check : true @@ -101,11 +115,14 @@ npymath_lib = c.find_library('npymath', dirs: npymath_path) dependencies = [py_dep, qblas_dep, sleef_dep, sleefquad_dep, npymath_lib] -# Add OpenMP dependency (optional, for threading) -openmp_dep = dependency('openmp', required: false, static: false) -if openmp_dep.found() - dependencies += openmp_dep -endif +# OpenMP is not used directly by any source in this package (#pragma omp +# is absent); it only matters because qblas's static lib has OpenMP +# objects baked in. qblas_dep already propagates the OpenMP requirement +# transitively, so adding a second dependency('openmp') here would put +# OpenMP into the compile-args closure twice. On Apple's clang++ that +# duplication leaves an orphan '-Xpreprocessor' in $ARGS which then +# pairs with '-MD' from the dep-gen flags ('-Xpreprocessor -MD'), and +# the preprocessor rejects -MD as unknown - failing all C++ compiles. # compiler flags for QBLAS compatibility if not is_windows diff --git a/meson.options b/meson.options index d871c14..41d7686 100644 --- a/meson.options +++ b/meson.options @@ -1,3 +1,9 @@ option('disable_fma', type: 'boolean', value: false, description: 'Disable FMA (Fused Multiply-Add) code paths' + - 'Set to true when building for older CPUs like Sandy Bridge that lack FMA support.') \ No newline at end of file + 'Set to true when building for older CPUs like Sandy Bridge that lack FMA support.') + +option('disable_quadblas', type: 'boolean', value: false, + description: 'Skip the QBLAS subproject and fall back to naive ' + + 'matmul kernels. Auto-enabled on Windows because ' + + 'QBLAS uses GCC/POSIX-only constructs that do not ' + + 'build under MSVC.') \ No newline at end of file diff --git a/src/csrc/quadblas_interface.cpp b/src/csrc/quadblas_interface.cpp index 9d8c762..f2d27f2 100644 --- a/src/csrc/quadblas_interface.cpp +++ b/src/csrc/quadblas_interface.cpp @@ -1,116 +1,74 @@ +// numpy-quaddtype shim around QBLAS. + #include "quadblas_interface.h" #include #include #ifndef DISABLE_QUADBLAS -#include "quadblas/quadblas.hpp" -#endif // DISABLE_QUADBLAS +#include +#endif extern "C" { +#ifndef DISABLE_QUADBLAS -#ifndef DISABLE_QUADBLAS +static inline QBLAS_LAYOUT to_layout(char c) { + return (c == 'C' || c == 'c') ? QblasColMajor : QblasRowMajor; +} +static inline QBLAS_TRANSPOSE to_trans(char c) { + if (c == 'T' || c == 't') return QblasTrans; + if (c == 'C' || c == 'c') return QblasConjTrans; + return QblasNoTrans; +} int -qblas_dot(size_t n, Sleef_quad *x, size_t incx, Sleef_quad *y, size_t incy, Sleef_quad *result) +qblas_dot(size_t n, Sleef_quad *x, size_t incx, + Sleef_quad *y, size_t incy, Sleef_quad *result) { if (!x || !y || !result || n == 0) { return -1; } - - try { - *result = QuadBLAS::dot(n, x, incx, y, incy); - return 0; - } - catch (...) { - return -1; - } + *result = cblas_qdot((int)n, x, (int)incx, y, (int)incy); + return 0; } int -qblas_gemv(char layout, char trans, size_t m, size_t n, Sleef_quad *alpha, Sleef_quad *A, - size_t lda, Sleef_quad *x, size_t incx, Sleef_quad *beta, Sleef_quad *y, size_t incy) +qblas_gemv(char layout, char trans, size_t m, size_t n, + Sleef_quad *alpha, Sleef_quad *A, size_t lda, + Sleef_quad *x, size_t incx, + Sleef_quad *beta, Sleef_quad *y, size_t incy) { if (!alpha || !A || !x || !beta || !y || m == 0 || n == 0) { return -1; } - - try { - // Convert layout - QuadBLAS::Layout qblas_layout; - if (layout == 'R' || layout == 'r') { - qblas_layout = QuadBLAS::Layout::RowMajor; - } - else if (layout == 'C' || layout == 'c') { - qblas_layout = QuadBLAS::Layout::ColMajor; - } - else { - return -1; // Invalid layout - } - - // Handle transpose (swap dimensions for transpose) - size_t actual_m = m, actual_n = n; - if (trans == 'T' || trans == 't' || trans == 'C' || trans == 'c') { - std::swap(actual_m, actual_n); - // For transpose, we need to adjust the layout - if (qblas_layout == QuadBLAS::Layout::RowMajor) { - qblas_layout = QuadBLAS::Layout::ColMajor; - } - else { - qblas_layout = QuadBLAS::Layout::RowMajor; - } - } - - // Call QBLAS GEMV - QuadBLAS::gemv(qblas_layout, actual_m, actual_n, *alpha, A, lda, x, incx, *beta, y, incy); - - return 0; - } - catch (...) { - return -1; - } + cblas_qgemv(to_layout(layout), to_trans(trans), + (int)m, (int)n, + *alpha, A, (int)lda, + x, (int)incx, + *beta, y, (int)incy); + return 0; } int -qblas_gemm(char layout, char transa, char transb, size_t m, size_t n, size_t k, Sleef_quad *alpha, - Sleef_quad *A, size_t lda, Sleef_quad *B, size_t ldb, Sleef_quad *beta, Sleef_quad *C, - size_t ldc) +qblas_gemm(char layout, char transa, char transb, + size_t m, size_t n, size_t k, + Sleef_quad *alpha, Sleef_quad *A, size_t lda, + Sleef_quad *B, size_t ldb, + Sleef_quad *beta, Sleef_quad *C, size_t ldc) { if (!alpha || !A || !B || !beta || !C || m == 0 || n == 0 || k == 0) { return -1; } - - try { - QuadBLAS::Layout qblas_layout; - if (layout == 'R' || layout == 'r') { - qblas_layout = QuadBLAS::Layout::RowMajor; - } - else if (layout == 'C' || layout == 'c') { - qblas_layout = QuadBLAS::Layout::ColMajor; - } - else { - return -1; // Invalid layout - } - - // For now, we only support no transpose - // TODO: Implement transpose support if needed - if ((transa != 'N' && transa != 'n') || (transb != 'N' && transb != 'n')) { - return -1; // Transpose not implemented yet - } - - QuadBLAS::gemm(qblas_layout, m, n, k, *alpha, A, lda, B, ldb, *beta, C, ldc); - - return 0; - } - catch (...) { - return -1; - } + cblas_qgemm(to_layout(layout), to_trans(transa), to_trans(transb), + (int)m, (int)n, (int)k, + *alpha, A, (int)lda, B, (int)ldb, + *beta, C, (int)ldc); + return 0; } int qblas_supports_backend(QuadBackendType backend) { - // QBLAS only supports SLEEF backend return (backend == BACKEND_SLEEF) ? 1 : 0; } @@ -121,113 +79,94 @@ py_quadblas_set_num_threads(PyObject *self, PyObject *args) if (!PyArg_ParseTuple(args, "i", &num_threads)) { return NULL; } - if (num_threads <= 0) { PyErr_SetString(PyExc_ValueError, "Number of threads must be positive"); return NULL; } - - QuadBLAS::set_num_threads(num_threads); + qblas_set_num_threads(num_threads); Py_RETURN_NONE; } PyObject * py_quadblas_get_num_threads(PyObject *self, PyObject *args) { - int num_threads = QuadBLAS::get_num_threads(); - return PyLong_FromLong(num_threads); + return PyLong_FromLong(qblas_get_num_threads()); } PyObject * py_quadblas_get_version(PyObject *self, PyObject *args) { - return PyUnicode_FromString("QuadBLAS 1.0.0 - High Performance Quad Precision BLAS"); + /* qblas_get_version() returns "QBLAS X.Y.Z"; pair it with the + * runtime-detected SIMD tier so callers can confirm what's active. */ + const char *ver = qblas_get_version(); + const char *tier = qblas_get_dispatch_tier(); + char buf[256]; + PyOS_snprintf(buf, sizeof buf, "%s (dispatch: %s)", ver, tier); + return PyUnicode_FromString(buf); } int _quadblas_set_num_threads(int num_threads) { - QuadBLAS::set_num_threads(num_threads); + qblas_set_num_threads(num_threads); return 0; } int _quadblas_get_num_threads(void) { - int num_threads = QuadBLAS::get_num_threads(); - return num_threads; + return qblas_get_num_threads(); } -#else // DISABLE_QUADBLAS - +#else /* DISABLE_QUADBLAS */ int qblas_dot(size_t n, Sleef_quad *x, size_t incx, Sleef_quad *y, size_t incy, Sleef_quad *result) -{ - return -1; // QBLAS is disabled, dot product not available -} +{ return -1; } int qblas_gemv(char layout, char trans, size_t m, size_t n, Sleef_quad *alpha, Sleef_quad *A, size_t lda, Sleef_quad *x, size_t incx, Sleef_quad *beta, Sleef_quad *y, size_t incy) -{ - return -1; // QBLAS is disabled, GEMV not available -} +{ return -1; } int qblas_gemm(char layout, char transa, char transb, size_t m, size_t n, size_t k, Sleef_quad *alpha, Sleef_quad *A, size_t lda, Sleef_quad *B, size_t ldb, Sleef_quad *beta, Sleef_quad *C, size_t ldc) -{ - return -1; // QBLAS is disabled, GEMM not available -} +{ return -1; } -int -qblas_supports_backend(QuadBackendType backend) -{ - return -1; // QBLAS is disabled, backend support not available -} +int qblas_supports_backend(QuadBackendType backend) { return -1; } PyObject * py_quadblas_set_num_threads(PyObject *self, PyObject *args) { - // raise error PyErr_SetString(PyExc_NotImplementedError, "QuadBLAS is disabled"); return NULL; } - PyObject * py_quadblas_get_num_threads(PyObject *self, PyObject *args) { - // raise error PyErr_SetString(PyExc_NotImplementedError, "QuadBLAS is disabled"); return NULL; } - PyObject * py_quadblas_get_version(PyObject *self, PyObject *args) { - // raise error PyErr_SetString(PyExc_NotImplementedError, "QuadBLAS is disabled"); return NULL; } -int -_quadblas_set_num_threads(int num_threads) +int _quadblas_set_num_threads(int num_threads) { - // raise error PyErr_SetString(PyExc_NotImplementedError, "QuadBLAS is disabled"); return -1; } - -int -_quadblas_get_num_threads(void) +int _quadblas_get_num_threads(void) { - // raise error PyErr_SetString(PyExc_NotImplementedError, "QuadBLAS is disabled"); return -1; } -#endif // DISABLE_QUADBLAS +#endif /* DISABLE_QUADBLAS */ -} // extern "C" \ No newline at end of file +} /* extern "C" */ diff --git a/subprojects/packagefiles/qblas/meson.build b/subprojects/packagefiles/qblas/meson.build deleted file mode 100644 index b7a972f..0000000 --- a/subprojects/packagefiles/qblas/meson.build +++ /dev/null @@ -1,8 +0,0 @@ -project('qblas', meson_version: '>=1.1') - -qblas_inc = include_directories('include') - -qblas_dep = declare_dependency( - include_directories: qblas_inc, - version: meson.project_version() -) diff --git a/subprojects/qblas.wrap b/subprojects/qblas.wrap index d8c2a89..019cb0f 100644 --- a/subprojects/qblas.wrap +++ b/subprojects/qblas.wrap @@ -1,8 +1,7 @@ [wrap-git] -directory=qblas -url=https://github.com/SwayamInSync/QBLAS.git -revision=42126fd78cbc04e9b031475fe39f4f46eaa51e01 -patch_directory = qblas +directory = qblas +url = https://github.com/SwayamInSync/QBLAS.git +revision = 8deb36b67ae4d2c81dfca2ceac8957deb8f23c9b [provide] qblas = qblas_dep