diff --git a/.clang-format b/.clang-format index 6d5353f9..4f33a094 100644 --- a/.clang-format +++ b/.clang-format @@ -1,3 +1,4 @@ +--- Language: Cpp # BasedOnStyle: Google AccessModifierOffset: -1 @@ -17,22 +18,22 @@ AllowShortLoopsOnASingleLine: true AlwaysBreakAfterDefinitionReturnType: None AlwaysBreakAfterReturnType: None AlwaysBreakBeforeMultilineStrings: true -AlwaysBreakTemplateDeclarations: Yes -BinPackArguments: false +AlwaysBreakTemplateDeclarations: true +BinPackArguments: false BinPackParameters: false BraceWrapping: - AfterClass: false + AfterClass: false AfterControlStatement: false - AfterEnum: false - AfterFunction: false - AfterNamespace: false - AfterObjCDeclaration: false - AfterStruct: false - AfterUnion: false - AfterExternBlock: false - BeforeCatch: false - BeforeElse: false - IndentBraces: false + AfterEnum: false + AfterFunction: false + AfterNamespace: false + AfterObjCDeclaration: false + AfterStruct: false + AfterUnion: false + AfterExternBlock: false + BeforeCatch: false + BeforeElse: false + IndentBraces: false # disabling the below splits, else, they'll just add to the vertical length of source files! 
SplitEmptyFunction: false SplitEmptyRecord: false @@ -61,23 +62,23 @@ FixNamespaceComments: true ForEachMacros: IncludeBlocks: Preserve IncludeCategories: - - Regex: '^<.*\.h>' - Priority: 1 - - Regex: '^<.*' - Priority: 2 - - Regex: '.*' - Priority: 3 -IncludeIsMainRegex: '([-_](test|unittest))?$' + - Regex: ^<.*\.h> + Priority: 1 + - Regex: ^<.* + Priority: 2 + - Regex: .* + Priority: 3 +IncludeIsMainRegex: ([-_](test|unittest))?$ IndentCaseLabels: true IndentPPDirectives: None -IndentWidth: 2 +IndentWidth: 2 IndentWrappedFunctionNames: false InsertBraces: true JavaScriptQuotes: Leave JavaScriptWrapImports: true KeepEmptyLinesAtTheStartOfBlocks: false MacroBlockBegin: '' -MacroBlockEnd: '' +MacroBlockEnd: '' MaxEmptyLinesToKeep: 1 NamespaceIndentation: None ObjCBinPackProtocolList: Never @@ -95,14 +96,7 @@ PenaltyReturnTypeOnItsOwnLine: 200 PointerAlignment: Left RawStringFormats: - Language: Cpp - Delimiters: - - cc - - CC - - cpp - - Cpp - - CPP - - 'c++' - - 'C++' + Delimiters: [cc, CC, cpp, Cpp, CPP, c++, C++] CanonicalDelimiter: '' # Enabling comment reflow causes doxygen comments to be messed up in their formats! 
ReflowComments: true diff --git a/.flake8 b/.flake8 deleted file mode 100644 index 04eea09b..00000000 --- a/.flake8 +++ /dev/null @@ -1,10 +0,0 @@ -[flake8] -ignore = - # 'foo' is too complex (N) - C901, - # continuation line missing indentation or outdented - E122, - E203, E501, - F403, F821, W503 -max-line-length = 80 -max-complexity = 18 diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index b24026ed..b42e51d7 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,31 +1,44 @@ +--- repos: - - repo: https://github.com/pre-commit/mirrors-mypy - rev: 'v1.5.1' - hooks: - - id: mypy - language: system - pass_filenames: false - args: ['legate_sparse'] - - repo: https://github.com/psf/black - rev: 23.9.1 - hooks: - - id: black - - repo: https://github.com/PyCQA/isort - rev: 5.12.0 - hooks: - - id: isort - args: ["--profile", "black"] - - repo: https://github.com/PyCQA/flake8 - rev: 6.1.0 - hooks: - - id: flake8 - args: [--config=.flake8] - - repo: https://github.com/pre-commit/mirrors-clang-format - rev: 'v16.0.6' # Use the sha / tag you want to point at - hooks: - - id: clang-format - files: \.(cu|cuh|h|cc|inl)$ - types_or: [] - + - repo: https://github.com/adrienverge/yamllint + rev: v1.38.0 + hooks: + - id: yamllint + types: [yaml] + args: [-c, ./scripts/pre-commit/yamllint.yml] + exclude: meta\.yaml$ + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v6.0.0 + hooks: + - id: check-json # checks that all json files have proper syntax + - id: check-toml # checks that all toml files have proper syntax + - id: end-of-file-fixer # check all files end in a newline + # handled by clang-format + exclude_types: [c, c++, cuda] + - id: pretty-format-json + args: [--autofix, --indent=4] + - id: trailing-whitespace # remove trailing whitespace + # don't mess up diff files + exclude: ^src/cmake/patches/.*\.diff$ + # handled by clang-format + exclude_types: [c, c++, cuda] + - id: check-symlinks + - id: check-executables-have-shebangs + - 
id: check-merge-conflict + - id: check-shebang-scripts-are-executable + - repo: https://github.com/astral-sh/ruff-pre-commit + rev: v0.15.9 + hooks: + - id: ruff-format + - id: ruff + args: ["--config=./pyproject.toml", "--fix"] + - repo: https://github.com/pre-commit/mirrors-clang-format + rev: v22.1.2 # Use the sha / tag you want to point at + hooks: + - id: clang-format + files: \.(cu|cuh|h|cc|inl)$ + types_or: [] +ci: + autoupdate_schedule: quarterly default_language_version: - python: python3 + python: python3 diff --git a/.style.yapf b/.style.yapf index 02b96779..df2b8071 100644 --- a/.style.yapf +++ b/.style.yapf @@ -339,4 +339,3 @@ split_penalty_logical_operator=300 # Use the Tab character for indentation. use_tabs=False - diff --git a/CMakeLists.txt b/CMakeLists.txt index c32254c3..d91a3119 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -60,7 +60,7 @@ include(rapids-find) ################################### # Project -set(legate_sparse_version 25.07.00) +set(legate_sparse_version 26.02.00) set(CMAKE_CXX_FLAGS_DEBUG "-O0 -g") set(CMAKE_CUDA_FLAGS_DEBUG "-O0 -g") @@ -110,4 +110,3 @@ if(CMAKE_GENERATOR STREQUAL "Ninja") endfunction() add_touch_legate_sparse_ninja_build_target() endif() - diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 7b663aad..8b55877c 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -55,25 +55,25 @@ git push -u origin : ``` Developer Certificate of Origin Version 1.1 - + Copyright (C) 2004, 2006 The Linux Foundation and its contributors. 1 Letterman Drive Suite D4700 San Francisco, CA, 94129 - + Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed. 
``` ``` Developer's Certificate of Origin 1.1 - + By making a contribution to this project, I certify that: - + (a) The contribution was created in whole or in part by me and I have the right to submit it under the open source license indicated in the file; or - + (b) The contribution is based upon previous work that, to the best of my knowledge, is covered under an appropriate open source license and I have the right under that license to submit that work with modifications, whether created in whole or in part by me, under the same open source license (unless I am permitted to submit under a different license), as indicated in the file; or - + (c) The contribution was provided directly to me by some other person who certified (a), (b) or (c) and I have not modified it. - + (d) I understand and agree that this project and the contribution are public and that a record of the contribution (including all personal information I submit with it, including my sign-off) is maintained indefinitely and may be redistributed consistent with this project or the open source license(s) involved. - ``` \ No newline at end of file + ``` diff --git a/LICENSE b/LICENSE index 4947287f..f433b1a5 100644 --- a/LICENSE +++ b/LICENSE @@ -174,4 +174,4 @@ incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. - END OF TERMS AND CONDITIONS \ No newline at end of file + END OF TERMS AND CONDITIONS diff --git a/README.md b/README.md index b03ea8c6..5cc45556 100644 --- a/README.md +++ b/README.md @@ -21,27 +21,27 @@ limitations under the License. Legate Sparse is a [Legate](https://github.com/nv-legate/legate) library that aims to provide a distributed and accelerated drop-in replacement for the [scipy.sparse](https://docs.scipy.org/doc/scipy/reference/sparse.html) library -on top of the [Legate](https://github.com/nv-legate/legate) runtime. 
-Legate Sparse interoperates with +on top of the [Legate](https://github.com/nv-legate/legate) runtime. +Legate Sparse interoperates with [cuPyNumeric](https://github.com/nv-legate/cupynumeric), -a distributed and accelerated drop-in replacement +a distributed and accelerated drop-in replacement for [NumPy](https://numpy.org/doc/stable/reference/index.html#reference), to enable writing programs that operate on distributed dense and sparse arrays. -Take a look at the `examples` directory for some applications that can +Take a look at the `examples` directory for some applications that can use Legate Sparse. We have implemented an explicit partial-differential equation (PDE) [solver](examples/pde.py). More complex and interesting applications are on the way -- stay tuned! -Legate Sparse is currently in alpha and supports a subset of APIs -and options from scipy.sparse, so if you need an API, please open -an issue and give us a summary of its usage. +Legate Sparse is currently in alpha and supports a subset of APIs +and options from scipy.sparse, so if you need an API, please open +an issue and give us a summary of its usage. # Installation -To use Legate Sparse, `legate` and `cupynumeric` libraries have to be installed. -They can be installed either by pulling the respective conda packages -or by manually building from source. For more information, -see build instructions for [Legate](https://github.com/nv-legate/legate) +To use Legate Sparse, `legate` and `cupynumeric` libraries have to be installed. +They can be installed either by pulling the respective conda packages +or by manually building from source. For more information, +see build instructions for [Legate](https://github.com/nv-legate/legate) and [cuPyNumeric](https://github.com/nv-legate/cupynumeric). Follow the steps in this section. @@ -51,7 +51,7 @@ Follow the steps in this section. 
The `legate-sparse` conda package already depends on `legate` and `cupynumeric`, and it will install these dependencies automatically. -To create a new environment and install: +To create a new environment and install: ``` conda create -n myenv -c legate -c conda-forge legate-sparse ``` @@ -65,9 +65,9 @@ conda install -c legate -c conda-forge legate-sparse To write programs using Legate Sparse, import the `legate_sparse` module, which contains methods and types found in `scipy.sparse`. Note that the module is imported as `legate_sparse` -and not `legate.sparse`. Here is an example program saved as `main.py`. +and not `legate.sparse`. Here is an example program saved as `main.py`. -For more details on how to run legate programs, check +For more details on how to run legate programs, check our [documentation](https://docs.nvidia.com/cupynumeric). To run the application on a single GPU, use this command: @@ -79,10 +79,10 @@ import legate_sparse as sparse import cupynumeric as np # number of diagonals in the matrix (including main diagonal) -n_diagonals = 3 +n_diagonals = 3 # number of rows in the matrix -nrows = 5 +nrows = 5 # generate two tridiaonal matrices (n_diagonals=3) and multiply them A = sparse.diags( @@ -102,13 +102,13 @@ B = sparse.diags( ) # spGEMM operation: multiplication of two sparse matrices -C = A @ B +C = A @ B print(C.todense()) print() # spMV operation: multiplication of a sparse matrix and a dense vector x = np.ones(nrows) -C = A @ x +C = A @ x print(C) assert np.array_equal(A.todense().sum(axis=1), C) diff --git a/cmake/thirdparty/get_cudss.cmake b/cmake/thirdparty/get_cudss.cmake new file mode 100644 index 00000000..0ebfc199 --- /dev/null +++ b/cmake/thirdparty/get_cudss.cmake @@ -0,0 +1,28 @@ +#============================================================================= +# Copyright 2022-2024 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the 
License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +#============================================================================= + +function(find_or_configure_cudss) + + if(TARGET cudss) + return() + endif() + + # cuDSS provides its own CMake config, so we use find_package directly + find_package(cudss REQUIRED) + +endfunction() + +find_or_configure_cudss() diff --git a/cmake/thirdparty/get_legate.cmake b/cmake/thirdparty/get_legate.cmake index 727671fd..142bd22a 100644 --- a/cmake/thirdparty/get_legate.cmake +++ b/cmake/thirdparty/get_legate.cmake @@ -18,56 +18,19 @@ function(find_or_configure_legate) set(oneValueArgs VERSION REPOSITORY BRANCH EXCLUDE_FROM_ALL) cmake_parse_arguments(PKG "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) - include("${rapids-cmake-dir}/export/detail/parse_version.cmake") - rapids_export_parse_version(${PKG_VERSION} legate PKG_VERSION) - include("${rapids-cmake-dir}/cpm/detail/package_details.cmake") rapids_cpm_package_details(legate version git_repo git_branch shallow exclude_from_all) - set(version ${PKG_VERSION}) - set(exclude_from_all ${PKG_EXCLUDE_FROM_ALL}) - if(PKG_BRANCH) - set(git_branch "${PKG_BRANCH}") - endif() - if(PKG_REPOSITORY) - set(git_repo "${PKG_REPOSITORY}") - endif() + # Normalize version to match conda pkg naming (e.g., 26.01.00 -> 26.01.0) + string(REPLACE "00" "0" version "${version}") set(FIND_PKG_ARGS GLOBAL_TARGETS legate::legate BUILD_EXPORT_SET legate-sparse-exports INSTALL_EXPORT_SET legate-sparse-exports) - # First try to find legate via find_package() - # so the `Legion_USE_*` variables are visible - # Use QUIET find by 
default. - set(_find_mode QUIET) - # If legate_DIR/legate_ROOT are defined as something other than empty or NOTFOUND - # use a REQUIRED find so that the build does not silently download legate. - if(legate_DIR OR legate_ROOT) - set(_find_mode REQUIRED) - endif() - rapids_find_package(legate ${version} EXACT CONFIG ${_find_mode} ${FIND_PKG_ARGS}) - - if(legate_FOUND) - message(STATUS "CPM: using local package legate@${version}") - else() - include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/Modules/cpm_helpers.cmake) - get_cpm_git_args(legate_cpm_git_args REPOSITORY ${git_repo} BRANCH ${git_branch}) - - message(VERBOSE "legate.sparse: legate version: ${version}") - message(VERBOSE "legate.sparse: legate git_repo: ${git_repo}") - message(VERBOSE "legate.sparse: legate git_branch: ${git_branch}") - message(VERBOSE "legate.sparse: legate exclude_from_all: ${exclude_from_all}") - message(VERBOSE "legate.sparse: legate legate_cpm_git_args: ${legate_cpm_git_args}") - - rapids_cpm_find(legate ${version} ${FIND_PKG_ARGS} - CPM_ARGS - ${legate_cpm_git_args} - FIND_PACKAGE_ARGUMENTS EXACT - EXCLUDE_FROM_ALL ${exclude_from_all} - ) - endif() + # Require legate to be pre-installed; do not fall back to cloning. 
+ rapids_find_package(legate ${version} EXACT CONFIG REQUIRED ${FIND_PKG_ARGS}) set(Legion_USE_CUDA ${Legion_USE_CUDA} PARENT_SCOPE) set(Legion_USE_OpenMP ${Legion_USE_OpenMP} PARENT_SCOPE) diff --git a/cmake/versions.json b/cmake/versions.json index 6c5440f4..85c7e7ae 100644 --- a/cmake/versions.json +++ b/cmake/versions.json @@ -1,24 +1,24 @@ { - "packages" : { - "legate" : { - "repo": "legate.internal", - "org": "nv-legate", - "version": "25.07.00", - "git_url" : "git@github.com:nv-legate/legate.git", - "git_shallow": false, - "always_download": false, - "git_tag" : "a46dc3d5b176ff9546bc831409c394c1bbc3b936", - "anaconda_label": "main" - }, - "cupynumeric" : { - "repo": "cupynumeric.internal", - "org": "nv-legate", - "version": "25.07.00", - "git_url" : "git@github.com:nv-legate/cupynumeric", - "git_shallow": false, - "always_download": false, - "git_tag" : "6132d8450049a7abd7786fb4d60444eb5b4e25db", - "anaconda_label": "main" + "packages": { + "cupynumeric": { + "always_download": false, + "anaconda_label": "main", + "git_shallow": false, + "git_tag": "ae1c787828a9327ad00a076739706f41d196a043", + "git_url": "git@github.com:nv-legate/cupynumeric.internal", + "org": "nv-legate", + "repo": "cupynumeric.internal", + "version": "26.01.00" + }, + "legate": { + "always_download": false, + "anaconda_label": "main", + "git_shallow": false, + "git_tag": "3ccb639605eecd8e9fee52c2d7d56ea799f4864e", + "git_url": "git@github.com:nv-legate/legate.internal.git", + "org": "nv-legate", + "repo": "legate.internal", + "version": "26.01.00" + } } - } } diff --git a/conda/conda-build/build.sh b/conda/conda-build/build.sh old mode 100644 new mode 100755 diff --git a/conda/conda-build/conda_build_config.yaml b/conda/conda-build/conda_build_config.yaml index ada8dda2..a67aaba9 100644 --- a/conda/conda-build/conda_build_config.yaml +++ b/conda/conda-build/conda_build_config.yaml @@ -10,6 +10,14 @@ python: - 3.12 - 3.13 +# Pin sysroot glibc to match Legate's current baseline and avoid 
newer +# toolchains needing RELR-aware binutils. +c_stdlib: + - sysroot + +c_stdlib_version: + - "2.28" + numpy_version: # Not 2.1.0 which segfaults on asarray() sometimes, see # https://github.com/numpy/numpy/pull/27249 @@ -17,3 +25,6 @@ numpy_version: cmake_version: - ">=3.20.1,!=3.23.0" + +cuda_compiler: + - cuda-nvcc diff --git a/conda/conda-build/meta.yaml b/conda/conda-build/meta.yaml index 9bdad28f..209abd04 100644 --- a/conda/conda-build/meta.yaml +++ b/conda/conda-build/meta.yaml @@ -10,8 +10,11 @@ ## The placeholder version is strictly for making two-pass conda build process. ## It should not be used for any other purpose, and this is not a default version. {% set placeholder_version = '0.0.0.dev' %} -{% set default_cuda_version = '12.2.2' %} -{% set cuda_version='.'.join(environ.get('CUDA', default_cuda_version).split('.')[:2]) %} +{% set legate_cuda_version = environ.get('LEGATE_CUDA_VERSION') %} +{% if not legate_cuda_version %} +invalid_yaml_missing_cuda_version: LEGATE_CUDA_VERSION must be set +{% endif %} +{% set cuda_version='.'.join(legate_cuda_version.split('.')[:2]) %} {% set cuda_major=cuda_version.split('.')[0]|int %} {% set py_version=environ.get('CONDA_PY', '') %} @@ -100,14 +103,16 @@ requirements: - make - ninja - cmake {{ cmake_version }} - - {{ compiler('c') }} =11.2 - - {{ compiler('cxx') }} =11.2 - # the nvcc requirement is necessary because it contains crt/host_config.h used by cuda runtime. This is a packaging bug that has been reported. - - cuda-nvcc - # cudart needed for CPU and GPU builds because of curand - - cuda-cudart-dev + - {{ stdlib("c") }} + - {{ compiler('c') }} =14 + - {{ compiler('cxx') }} =14 + - pkg-config +{% if gpu_enabled_bool %} - cuda-version ={{ cuda_version }} + - {{ compiler('cuda') }} + - cuda-cudart-dev - libcusparse-dev +{% endif %} host: @@ -127,11 +132,12 @@ requirements: # legate, there may not be a cupynumeric package that is compatible. 
So, we # list cupynumeric here to get a pair of legate and cupynumeric that are # compatible. + - cuda-version ={{ cuda_version }} - cuda-cccl - libcusparse - - cuda-version ={{ cuda_version }} + - libcudss-dev + - nccl <2.29 - cuda-cudart - - nccl {% endif %} run: @@ -142,6 +148,9 @@ requirements: {% if gpu_enabled_bool %} - libnvjitlink - libcusparse + # ship the NCCL comm layer so multi-GPU cudss runs can load libcudss_commlayer_nccl.so + - libcudss-commlayer-nccl + - nccl >=2.0,<2.29 # Pin to all minor versions of CUDA newer than the one built against, within the same major version. # cuda-version constrains the CUDA runtime version and ensures a compatible driver is available - {{ pin_compatible('cuda-version', min_pin='x.x', max_pin='x') }} diff --git a/examples/common.py b/examples/common.py index 99174ed6..e7cfb396 100644 --- a/examples/common.py +++ b/examples/common.py @@ -11,15 +11,25 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import annotations import argparse import importlib +from typing import TYPE_CHECKING, Any, Protocol, cast -import numpy -from typing_extensions import Protocol +if TYPE_CHECKING: + from types import ModuleType + import numpy.typing as npt + from legate.timing._lib.timing import PyTime + from legate_sparse import csr_array -def get_arg_number(arg): +np: ModuleType +sparse: ModuleType +linalg: ModuleType + + +def get_arg_number(arg: str) -> int: """Parse a string argument that may contain size suffixes. Parameters @@ -68,11 +78,11 @@ class Timer(Protocol): for measuring execution time in the examples. """ - def start(self): + def start(self) -> None: """Start timing.""" ... - def stop(self): + def stop(self) -> float: """Stop timing and return duration. Blocks execution until everything before it has completed. 
@@ -92,19 +102,21 @@ class LegateTimer(Timer): measurement of GPU operations. """ - def __init__(self): - self._start = None + def __init__(self) -> None: + self._start: PyTime | None = None - def start(self): + def start(self) -> None: """Start timing using Legate's time function.""" from legate.timing import time self._start = time() - def stop(self): + def stop(self) -> float: """Stop timing and return duration in milliseconds.""" from legate.timing import time + assert self._start is not None + _end = time() return (_end - self._start) / 1000.0 @@ -116,24 +128,26 @@ class CuPyTimer(Timer): in CuPy applications. """ - def __init__(self): - self._start_event = None + def __init__(self) -> None: + self._start_event: Any | None = None - def start(self): + def start(self) -> None: """Start timing using CUDA events.""" - from cupy import cuda + from cupy import cuda # type: ignore [import-untyped] self._start_event = cuda.Event() self._start_event.record() - def stop(self): + def stop(self) -> float: """Stop timing and return duration in milliseconds.""" from cupy import cuda + assert self._start_event is not None + end_event = cuda.Event() end_event.record() end_event.synchronize() - return cuda.get_elapsed_time(self._start_event, end_event) + return cast(float, cuda.get_elapsed_time(self._start_event, end_event)) class NumPyTimer(Timer): @@ -143,19 +157,21 @@ class NumPyTimer(Timer): of CPU operations in NumPy/SciPy applications. 
""" - def __init__(self): - self._start_time = None + def __init__(self) -> None: + self._start_time: float | None = None - def start(self): + def start(self) -> None: """Start timing using perf_counter_ns.""" from time import perf_counter_ns self._start_time = perf_counter_ns() / 1000.0 - def stop(self): + def stop(self) -> float: """Stop timing and return duration in milliseconds.""" from time import perf_counter_ns + assert self._start_time is not None + end_time = perf_counter_ns() / 1000.0 return (end_time - self._start_time) / 1000.0 @@ -171,32 +187,31 @@ class DummyScope: that may or may not use resource scoping. """ - def __init__(self): - ... + def __init__(self) -> None: ... - def __enter__(self): + def __enter__(self) -> None: """Enter the context (no-op).""" ... - def __exit__(self, _, __, ___): + def __exit__(self, _: Any, __: Any, ___: Any) -> None: """Exit the context (no-op).""" ... - def __getitem__(self, item): + def __getitem__(self, item: Any) -> DummyScope: """Return self for any indexing (no-op).""" return self - def count(self, _): + def count(self, _: Any) -> int: """Return 1 for any count operation.""" return 1 @property - def preferred_kind(self): + def preferred_kind(self) -> None: """Return None for preferred kind.""" return None -def get_phase_procs(use_legate: bool): +def get_phase_procs(use_legate: bool) -> tuple[Any, Any]: """Get processor configurations for different phases of computation. Parameters @@ -252,7 +267,9 @@ def get_phase_procs(use_legate: bool): return DummyScope(), DummyScope() -def parse_common_args(): +def parse_common_args() -> tuple[ + str, Timer, ModuleType, ModuleType, ModuleType, bool +]: """Parse common command line arguments for example scripts. 
Returns @@ -274,6 +291,8 @@ def parse_common_args(): - "cupy": Uses cupy, cupyx.scipy.sparse, and cupyx.scipy.sparse.linalg - "scipy": Uses numpy, scipy.sparse, and scipy.sparse.linalg """ + global np, sparse, linalg + parser = argparse.ArgumentParser() parser.add_argument( "--package", @@ -283,6 +302,8 @@ def parse_common_args(): ) args, _ = parser.parse_known_args() + timer: Timer + if args.package == "legate": timer = LegateTimer() np_name = "cupynumeric" @@ -306,9 +327,9 @@ def parse_common_args(): use_legate = False - globals()["np"] = importlib.import_module(np_name) - globals()["sparse"] = importlib.import_module(sp_name) - globals()["linalg"] = importlib.import_module(lg_name) + np = importlib.import_module(np_name) + sparse = importlib.import_module(sp_name) + linalg = importlib.import_module(lg_name) return args.package, timer, np, sparse, linalg, use_legate @@ -317,7 +338,9 @@ def parse_common_args(): # # `diags` construct csr from dia array, while when from_diags=False # we construct csr arrya directly - might be slightly faster -def banded_matrix(N, nnz_per_row, from_diags=False): +def banded_matrix( + N: int, nnz_per_row: int, from_diags: bool = False +) -> csr_array: """Construct a banded matrix with 1.0 as values. 
Parameters @@ -375,7 +398,9 @@ def banded_matrix(N, nnz_per_row, from_diags=False): pred = np.arange(nnz_per_row - half_nnz, nnz_per_row + 1) post = np.flip(pred) - nnz_arr = np.concatenate((pred, np.ones(main_rows) * nnz_per_row, post)) + nnz_arr = np.concatenate( + (pred, np.ones(main_rows) * nnz_per_row, post) + ) row_offsets = np.zeros(N + 1).astype(sparse.coord_ty) row_offsets[1 : N + 1] = np.cumsum(nnz_arr) nnz = row_offsets[-1] @@ -399,7 +424,12 @@ def banded_matrix(N, nnz_per_row, from_diags=False): ) -def stencil_grid(S, grid, dtype=None, format=None): +def stencil_grid( + S: Any, + grid: tuple[int, int], + dtype: npt.dtype[Any] | None = None, + format: str | None = None, +) -> csr_array: """Construct a sparse matrix resulting from a stencil discretization on rectilinear grids. @@ -437,6 +467,8 @@ def stencil_grid(S, grid, dtype=None, format=None): >>> A = stencil_grid(S, (3, 3)) >>> print(A.toarray()) """ + import numpy + N_v = int(numpy.prod(grid)) # number of vertices in the mesh N_s = int((S != 0).sum(dtype=int)) # number of nonzero stencil entries @@ -497,7 +529,7 @@ def stencil_grid(S, grid, dtype=None, format=None): return sparse.dia_array((data, diags), shape=(N_v, N_v)).tocsr() -def poisson2D(N): +def poisson2D(N: int) -> csr_array: """Construct the 2D Poisson matrix. Parameters @@ -536,7 +568,9 @@ def poisson2D(N): diag_size = N * N - 1 first = np.full((N - 1), -1.0) chunks = np.concatenate([np.zeros(1), first]) - diag_a = np.concatenate([first, np.tile(chunks, (diag_size - (N - 1)) // N)]) + diag_a = np.concatenate( + [first, np.tile(chunks, (diag_size - (N - 1)) // N)] + ) diag_g = -1.0 * np.ones(N * (N - 1)) diag_c = 4.0 * np.ones(N * N) @@ -549,7 +583,7 @@ def poisson2D(N): return sparse.diags(diagonals, offsets, dtype=np.float64).tocsr() -def diffusion2D(N, epsilon=1.0, theta=0.0): +def diffusion2D(N: int, epsilon: float = 1.0, theta: float = 0.0) -> csr_array: """Construct a 2D diffusion matrix with anisotropy. 
Parameters diff --git a/examples/direct_solve_banded_system.py b/examples/direct_solve_banded_system.py new file mode 100644 index 00000000..1876acfa --- /dev/null +++ b/examples/direct_solve_banded_system.py @@ -0,0 +1,86 @@ +import argparse +from common import get_arg_number, parse_common_args + +"""Sparse Direct Solve Benchmark. + +This script benchmarks sparse direct solve for a banded system of equations + +""" + + +def create_system_of_eqns(nrows, dtype): + """ + Creates a system of equations A*x = b where: + - A has 4 on the main diagonal (k=0), 1 on the first and second upper diagonal (k=1, 2) + - and 1 on the first lower diagonal (k=-1) + - The solution x is [1, 2, 3, ..., nrows] + - b is computed as A @ x + """ + + main_diag = np.full(nrows, 4.0) + upper1_diag = np.ones(nrows - 1) + upper2_diag = np.ones(nrows - 2) + lower1_diag = np.ones(nrows - 1) + + A = sparse.diags( + [lower1_diag, main_diag, upper1_diag, upper2_diag], + offsets=[-1, 0, 1, 2], + shape=(nrows, nrows), + dtype=np.float64, + format="csr", + ) + x_expected = np.arange(1, nrows + 1, dtype=dtype) + b = A @ x_expected + + return (A, b, x_expected) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + + parser.add_argument( + "-n", + "--nrows", + type=str, + default="12", + dest="nrows", + help="Number of rows in the generated matrix (accepts suffixes 'k', 'm', 'g')", + ) + + parser.add_argument( + "--nwarmups", + type=int, + default=2, + dest="nwarmups", + help="Number of warmup iterations before spsolve is timed", + ) + + args, _ = parser.parse_known_args() + package, timer, np, sparse, _, _ = parse_common_args() + + nrows = get_arg_number(args.nrows) + nwarmups = args.nwarmups + + assert nrows > 0, "Matrix must contain atleast one row" + assert nwarmups >= 0, "Warmup iterations must be >= 0" + + timer.start() + A, b, x_expected = create_system_of_eqns(nrows, np.float64) + elapsed_time_setup = timer.stop() + + for _ in range(nwarmups): + x = sparse.linalg.spsolve(A, b) + + 
timer.start() + x = sparse.linalg.spsolve(A, b) + elapsed_time_solve = timer.stop() + + error_l2_norm = np.linalg.norm(x_expected - x) / np.linalg.norm(x_expected) + + print(f"Dimension of A : {A.shape}") + print(f"Dimension of b : {b.shape}") + print(f"Dimension of x : {x.shape}") + print(f"NNZ of A : {A.nnz}") + print(f"Elapsed time for setup (ms) : {elapsed_time_setup}") + print(f"Elapsed time for solve (ms) : {elapsed_time_solve}") + print(f"Error in solution : {error_l2_norm}") diff --git a/examples/gmg.py b/examples/gmg.py deleted file mode 100644 index 6491ef59..00000000 --- a/examples/gmg.py +++ /dev/null @@ -1,492 +0,0 @@ -# Copyright 2022-2024 NVIDIA Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -# Portions of this file are also subject to the following license: -# -# The MIT License (MIT) -# -# Copyright (c) 2008-2015 PyAMG Developers -# -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included in all -# copies or substantial portions of the Software. 
-# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. - -import argparse - -# for some small data manipulations on host -import numpy -from common import diffusion2D, get_phase_procs, parse_common_args, poisson2D - - -def max_eigenvalue(A, iters=15): - # Compute eigenvector associated with maximum eigenvalue via power - # iteration. This is the same as Steven's imp for estimating spectral - # radius. - x1 = np.random.rand(A.shape[1]).reshape(-1, 1) - for _ in range(iters): - x1 = A @ x1 - x1 /= np.linalg.norm(x1) - # Compute and return max eigenvalue via Raleigh quotient. - # This is np.dot(A @ x1, x1) / np.dot(x1, x1) - # but since x1 is a unit vector, we can assume denominator is 1. - return np.dot(x1.T, A @ x1).item() - - -class GMG(object): - """ - Geometric Multigrid solver for the 2D Poisson problem. 
- - - Source on correctness of restriction / prolongation operators: [1] - - Sources on V-cycle algorithm: [1, 2, 3, 4] - - Source on preconditioned conjugate gradient and Gauss-Seidel smoothing: [4] - - [1] https://www.researchgate.net/publication/220690328_A_Multigrid_Tutorial_2nd_Edition - [2] https://github.com/pyamg/pyamg - [3] http://www.cs.columbia.edu/cg/pdfs/28_GPUSim.pdf - [4] https://netlib.org/utk/people/JackDongarra/PAPERS/HPCG-benchmark.pdf - """ # noqa: E501 - - def __init__(self, A, shape, levels, smoother, gridop, machine): - self.A = A - self.shape = shape - self.N = numpy.prod(self.shape) - self.levels = levels - self.restriction_op = { - "injection": injection_operator, - "linear": linear_operator, - }[gridop] - self.smoother = {"jacobi": WeightedJacobi}[smoother]() - self.operators = self.compute_operators(A) - self.temp = None - self.machine = machine - self.proc_kind = machine.preferred_target - - def compute_operators(self, A): - operators = [] - dim = self.N - self.smoother.init_level_params(A, 0) - for level in range(self.levels): - R, dim = self.compute_restriction_level(dim) - P = R.T - # assert sparse.issparse(P) - A = R @ A @ P - # assert sparse.issparse(A) - self.smoother.init_level_params(A, level + 1) - operators.append((R, A, P)) - return operators - - def cycle(self, r): - # Kick off the cycle with the top-level machine. - # TODO (marsaev): there are issues with scoping - # disabling it for now - return self._cycle(self.A, r, 0, self.machine) - - def _cycle(self, A, r, level, machine): - if level == self.levels - 1: - return self.smoother.coarse(A, r, None, level=level) - x = None - # Do one pre-smoothing iteration. - R, coarse_A, P = self.operators[level] - x = self.smoother.pre(A, r, x, level=level) - # Compute the residual. - fine_r = r - A.dot(x) - - # Restrict the residual. 
- if use_legate: - # TODO (marsaev): there col-split splmv optimization - coarse_r = R.dot(fine_r) - else: - coarse_r = R.dot(fine_r) - - # Compute coarse solution using a subset of the machine. - # TODO (marsaev): there are issues with scoping - # disabling it for now - coarse_x = self._cycle(coarse_A, coarse_r, level + 1, self.machine) - - fine_x = P @ coarse_x - x_corrected = x + fine_x - # Do one post-smoothing iteration. - return self.smoother.post(A, r, x_corrected, level=level) - - def compute_restriction_level(self, fine_dim): - return self.restriction_op(fine_dim) - - def linear_operator(self): - return linalg.LinearOperator( - self.A.shape, dtype=float, matvec=lambda r: self.cycle(r) - ) - - -class WeightedJacobi(object): - def __init__(self, omega=4.0 / 3.0): - # Basically, similar solution to PyAMG. - self.level_params = [] - self._init_omega = omega - - def init_level_params(self, A, level): - D_inv = 1.0 / A.diagonal() - # We need to create a new sparse matrix with just this modified - # diagonal of A. sparse.eye doesn't have this nob, but we can take - # the output of sparse.eye and mess with it to get the matrix - # that we want. 
- D_inv_nnz = min(A.shape[0], A.shape[1]) - D_inv_mat = sparse.csr_array( - ( - np.ones(D_inv_nnz).astype(A.dtype), - ( - np.arange(D_inv_nnz).astype(sparse.coord_ty), - np.arange(D_inv_nnz).astype(sparse.coord_ty), - ), - ), - shape=A.shape, - dtype=A.dtype, - copy=False, - ) - """ - sparse.eye( - A.shape[0], n=A.shape[1], dtype=A.dtype, format="csr" - ) - """ - D_inv_mat.data = 1.0 / D_inv - spectral_radius = max_eigenvalue(A @ D_inv_mat, 1) - omega = self._init_omega / spectral_radius - self.level_params.append((omega, D_inv)) - assert len(self.level_params) - 1 == level - - def __call__(self, A, r, x, level): - omega, D_inv = self.level_params[level] - return (1 - omega) * x + omega * (r - A @ x + x / D_inv) * D_inv - - def pre(self, A, r, x, level): - if x is not None: - raise Exception("Expected x is None.") - omega, D_inv = self.level_params[level] - return omega * r * D_inv - - def post(self, A, r, x, level): - omega, D_inv = self.level_params[level] - return x + omega * (r - A @ x) * D_inv - - def coarse(self, A, r, x, level): - return self.pre(A, r, x, level) - # return sparse.linalg.spsolve(A, r) - - -def injection_operator(fine_dim): - fine_shape = (int(np.sqrt(fine_dim)),) * 2 - coarse_shape = fine_shape[0] // 2, fine_shape[1] // 2 - coarse_dim = numpy.prod(coarse_shape) - Rp = np.arange(coarse_dim + 1) - Rx = np.ones((coarse_dim,), dtype=np.float64) - ij = np.arange(coarse_dim, dtype=np.int64) - i = ij % coarse_shape[1] - j = ij // coarse_shape[1] - Rj = 2 * i + 2 * j * coarse_shape[1] - R = sparse.csr_matrix((Rx, Rj, Rp), shape=(coarse_dim, fine_dim), dtype=np.float64) - return R, coarse_dim - - -def linear_operator(fine_dim): - fine_shape = (int(np.sqrt(fine_dim)),) * 2 - coarse_shape = fine_shape[0] // 2, fine_shape[1] // 2 - coarse_dim = np.prod(coarse_shape) - # Construct CSR directly. - Rp = numpy.empty(coarse_dim + 1, dtype=np.int64) - # Get an upper bound on the total number of non-zeroes, and construct Rj - # and Rx based on this bound. 
Computing this value exactly is tedious and - # the extra allocation can be truncated at the end. We won't need more - # than 9*coarse_dim rows. - nnz = 9 * coarse_dim - Rj = numpy.empty((nnz,), dtype=np.int64) - Rx = numpy.empty((nnz,), dtype=np.float64) - p = 0 - - def flatten(i, j): - return i * fine_shape[1] + j - - for ij in range(coarse_dim): - Rp[ij] = p - # For linear interpolation, - # we have 9 points over which to average in the 2d case. - # The coefficient matrix will act as a stencil operator. - i, j = (ij // coarse_shape[1]), (ij % coarse_shape[1]) - # Corners. - # r[2*i-1, 2*j-1] = 1/16 - # r[2*i-1, 2*j+1] = 1/16 - # r[2*i+1, 2*j-1] = 1/16 - # r[2*i+1, 2*j+1] = 1/16 - # Edges. - # r[2*i, 2*j+1] = 2/16 - # r[2*i, 2*j-1] = 2/16 - # r[2*i-1, 2*j] = 2/16 - # r[2*i+1, 2*j] = 2/16 - # Center. - # r[2 * i, 2 * j] = 4/16 - # Ensure indices are constructed in order. - # Assumes row-major ordering. - if 0 <= 2 * i - 1: - if 0 <= 2 * j - 1: - # top-left - Rj[p], Rx[p] = flatten(2 * i - 1, 2 * j - 1), 1 / 16 - p += 1 - # top-middle - Rj[p], Rx[p] = flatten(2 * i - 1, 2 * j), 2 / 16 - p += 1 - if 2 * j + 1 < fine_dim: - # top-right - Rj[p], Rx[p] = flatten(2 * i - 1, 2 * j + 1), 1 / 16 - p += 1 - if 0 <= 2 * j - 1: - # middle-left - Rj[p], Rx[p] = flatten(2 * i, 2 * j - 1), 2 / 16 - p += 1 - # middle-middle - Rj[p], Rx[p] = flatten(2 * i, 2 * j), 4 / 16 - p += 1 - if 2 * j + 1 < fine_dim: - # middle-right - Rj[p], Rx[p] = flatten(2 * i, 2 * j + 1), 2 / 16 - p += 1 - if 2 * i + 1 < fine_dim: - if 0 <= 2 * j - 1: - # bottom-left - Rj[p], Rx[p] = flatten(2 * i + 1, 2 * j - 1), 1 / 16 - p += 1 - # bottom-middle - Rj[p], Rx[p] = flatten(2 * i + 1, 2 * j), 2 / 16 - p += 1 - if 2 * j + 1 < fine_dim: - # bottom-right - Rj[p], Rx[p] = flatten(2 * i + 1, 2 * j + 1), 1 / 16 - p += 1 - - Rp[coarse_dim] = p - Rx, Rj, Rp = np.array(Rx[:p]), np.array(Rj[:p]), np.array(Rp) - R = sparse.csr_matrix((Rx[:p], Rj[:p], Rp), shape=(coarse_dim, fine_dim)) - return R, coarse_dim - - -def 
required_driver_memory(N): - NN = N * N - fine_shape = (int(np.sqrt(NN)),) * 2 - coarse_shape = fine_shape[0] // 2, fine_shape[1] // 2 - coarse_dim = numpy.prod(coarse_shape) - nnz = 9 * coarse_dim - elements = nnz + coarse_dim + 1 - bytes = elements * 8 - mb = bytes / 10**6 - print("Max required driver memory for N=%d is %fMB" % (N, mb)) - - -def print_diagnostics(operators): - """Print basic statistics about the multigrid hierarchy.""" - output = "MultilevelSolver\n" - output += f"Number of Levels: {len(operators)}\n" - # output += f"Operator Complexity: {operator_complexity(levels):6.3f}\n" - # output += f"Grid Complexity: {grid_complexity(levels):6.3f}\n" - - total_nnz = sum(level[1].nnz for level in operators) - - # 123456712345678901 123456789012 123456789 - # 0 10000 49600 [52.88%] - output += " level unknowns nonzeros\n" - for n, level in enumerate(operators): - A = level[1] - ratio = 100 * A.nnz / total_nnz - output += f"{n:>6} {A.shape[1]:>11} {A.nnz:>12} [{ratio:2.2f}%]\n" - - print(output) - - -def execute(N, data, smoother, gridop, levels, maxiter, tol, verbose, warmup, timer): - build, solve = get_phase_procs(use_legate) - - if warmup: - tA = diffusion2D(64, epsilon=0.1, theta=np.pi / 4) - tB = tA.T - tC = tB @ tA # noqa: F841 - - # Generate matrix - timer.start() - if data == "poisson": - A = poisson2D(N) - b = np.random.rand(N**2) - elif data == "diffusion": - A = diffusion2D(N) - b = np.random.rand(N**2) - else: - raise NotImplementedError(data) - print(f"GMG: {A.shape}") - print(f"Data creation time: {timer.stop()} ms") - - assert smoother == "jacobi", "Only Jacobi smoother is currently supported." 
- - if verbose: - - def callback(x): - print(f"Residual: {np.linalg.norm(b - (A @ x))}") - - else: - callback = None - - required_driver_memory(N) - # Setup - timer.start() - mg_solver = GMG( - A=A, - shape=(N, N), - levels=levels, - smoother=smoother, - gridop=gridop, - machine=solve, - ) - M = mg_solver.linear_operator() - print(f"GMG init time: {timer.stop()} ms") - - print_diagnostics(mg_solver.operators) - - # Warm up the runtime. - float( - np.linalg.norm( - A.dot( - np.zeros( - A.shape[1], - ) - ) - ) - ) - float( - np.linalg.norm( - M.matvec( - np.zeros( - M.shape[1], - ) - ) - ) - ) - # Make another call to random here as well. - float(np.linalg.norm(np.random.rand(b.shape[0]))) - - # Solve - timer.start() - x, iters = linalg.cg(A, b, rtol=tol, maxiter=maxiter, M=M, callback=callback) - total = timer.stop() - - norm_ini = np.linalg.norm(b) - norm_res = np.linalg.norm(b - (A @ x)) - - # Check convergence with relative tolerance - convergence_status = True if norm_res <= norm_ini * tol else False - print(f"Dimension of A : {A.shape}") - print(f"Did the solution converge : {convergence_status}") - print(f"Final relative residual norm : {norm_res / norm_ini}") - print(f"Number of iterations : {iters}") - print(f"Total elapsed time (ms) : {total}") - print(f"Time per iteration (ms) : {total / iters}") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "-n", - "--num", - type=int, - default=16, - dest="N", - help="number of elements in one dimension", - ) - parser.add_argument( - "-d", - "--data", - dest="data", - choices=["poisson", "diffusion"], - type=str, - default="poisson", - help="The problem instance to solve.", - ) - parser.add_argument( - "-s", - "--smoother", - dest="smoother", - choices=["jacobi"], - type=str, - default="jacobi", - help="Smoother to use.", - ) - parser.add_argument( - "-g", - "--gridop", - dest="gridop", - choices=["linear", "injection"], - type=str, - default="injection", - help="Intergrid 
transfer operator to use.", - ) - parser.add_argument( - "-l", - "--levels", - dest="levels", - type=int, - default=2, - help="Number of multigrid levels.", - ) - parser.add_argument( - "-m", - "--maxiter", - type=int, - default=200, - dest="maxiter", - help="bound the maximum number of iterations", - ) - parser.add_argument( - "-v", - "--verbose", - dest="verbose", - action="store_true", - help="print verbose output", - ) - parser.add_argument( - "--tol", - type=float, - default=1e-10, - dest="tol", - help="Convergence relative norm check threshold", - ) - - parser.add_argument( - "-w", - "--warmup", - dest="warmup", - action="store_true", - help="Perform some Warmup operations before running timings", - ) - - args, _ = parser.parse_known_args() - _, timer, np, sparse, linalg, use_legate = parse_common_args() - execute(**vars(args), timer=timer) diff --git a/examples/matrix_power.py b/examples/matrix_power.py index cc52c08b..69a807aa 100644 --- a/examples/matrix_power.py +++ b/examples/matrix_power.py @@ -32,11 +32,17 @@ --package: Backend to use (legate, cupy, scipy) """ +from __future__ import annotations + import argparse from functools import reduce +from typing import TYPE_CHECKING + +from common import Timer, get_arg_number, parse_common_args -import numpy.typing as npt -from common import get_arg_number, parse_common_args +if TYPE_CHECKING: + import numpy.typing as npt + from legate_sparse import csr_array # global states random_seed, rng global random_seed, rng @@ -46,7 +52,9 @@ # ---------------------------- -def create_csr_with_nnz_per_row(nrows, nnz_per_row: int, dtype: npt.DTypeLike = None): +def create_csr_with_nnz_per_row( + nrows: int, nnz_per_row: int, dtype: npt.DTypeLike | None = None +) -> csr_array: """Create a CSR matrix with a prescribed number of nonzeros in each row. 
Parameters @@ -84,7 +92,9 @@ def create_csr_with_nnz_per_row(nrows, nnz_per_row: int, dtype: npt.DTypeLike = return matrix -def create_csr_with_nnz_total(nrows, nnz_total, dtype: npt.DTypeLike = None): +def create_csr_with_nnz_total( + nrows: int, nnz_total: int, dtype: npt.DTypeLike | None = None +) -> csr_array: """Create a CSR matrix with a prescribed total number of nonzeros. Parameters @@ -113,7 +123,9 @@ def create_csr_with_nnz_total(nrows, nnz_total, dtype: npt.DTypeLike = None): coo_rows = rng.integers(0, nrows, nnz_total) coo_cols = rng.integers(0, ncols, nnz_total) vals = np.ones(nnz_total, dtype=dtype) - matrix = sparse.csr_matrix((vals, (coo_rows, coo_cols)), shape=(nrows, ncols)) + matrix = sparse.csr_matrix( + (vals, (coo_rows, coo_cols)), shape=(nrows, ncols) + ) return matrix @@ -123,7 +135,9 @@ def create_csr_with_nnz_total(nrows, nnz_total, dtype: npt.DTypeLike = None): # ------------------------ -def compute_A_power_k(A, timer, nwarmups: int = 2, k: int = 4): +def compute_A_power_k( + A: csr_array, timer: Timer, nwarmups: int = 2, k: int = 4 +) -> None: """Compute A^k and measure performance. 
Parameters @@ -180,7 +194,9 @@ def compute_A_power_k(A, timer, nwarmups: int = 2, k: int = 4): print( f"Elapsed time for spgemm for hop {hop} (ms) : {elapsed_time_spgemm[hop]}" ) - print(f"Elapsed time for copy for hop {hop} (ms) : {elapsed_time_copy[hop]}") + print( + f"Elapsed time for copy for hop {hop} (ms) : {elapsed_time_copy[hop]}" + ) if __name__ == "__main__": @@ -243,13 +259,11 @@ def compute_A_power_k(A, timer, nwarmups: int = 2, k: int = 4): ) args, _ = parser.parse_known_args() - _, timer, np, sparse, linalg, use_legate = parse_common_args() + package, timer, np, sparse, linalg, use_legate = parse_common_args() nrows = get_arg_number(args.nrows) nnz_total = get_arg_number(args.nnz_total) - # this is a global variable - global random_seed, rng random_seed = args.random_seed if args.same_sparsity_for_cpu_and_gpu: @@ -277,4 +291,6 @@ def compute_A_power_k(A, timer, nwarmups: int = 2, k: int = 4): compute_A_power_k(A, timer, int(args.nwarmups), int(args.k)) - print(f"Elapsed time in matrix creation (ms) : {elapsed_time_matrix_gen}") + print( + f"Elapsed time in matrix creation (ms) : {elapsed_time_matrix_gen}" + ) diff --git a/examples/pde.py b/examples/pde.py index d9ca0095..6745ee32 100644 --- a/examples/pde.py +++ b/examples/pde.py @@ -38,16 +38,19 @@ --package: Backend to use (legate, cupy, scipy) """ +from __future__ import annotations + # This PDE solving application is derived from # https://aquaulb.github.io/book_solving_pde_mooc/solving_pde_mooc/notebooks/05_IterativeMethods/05_01_Iteration_and_2D.html. 
import argparse import sys +from typing import Any -from common import get_phase_procs, parse_common_args +from common import Timer, get_phase_procs, parse_common_args -def d2_mat_dirichlet_2d(nx, ny, dx, dy): +def d2_mat_dirichlet_2d(nx: int, ny: int, dx: float, dy: float) -> Any: """ Constructs the matrix for the centered second-order accurate second-order derivative for Dirichlet boundary conditions in 2D @@ -114,7 +117,7 @@ def d2_mat_dirichlet_2d(nx, ny, dx, dy): return d2mat -def p_exact_2d(X, Y): +def p_exact_2d(X: Any, Y: Any) -> Any: """Computes the exact solution of the Poisson equation in the domain [0, 1]x[-0.5, 0.5] with rhs: b = (np.sin(np.pi * X) * np.cos(np.pi * Y) + @@ -133,14 +136,26 @@ def p_exact_2d(X, Y): exact solution of the Poisson equation """ - sol = -1.0 / (2.0 * np.pi**2) * np.sin(np.pi * X) * np.cos(np.pi * Y) - 1.0 / ( - 50.0 * np.pi**2 - ) * np.sin(5.0 * np.pi * X) * np.cos(5.0 * np.pi * Y) + sol = -1.0 / (2.0 * np.pi**2) * np.sin(np.pi * X) * np.cos( + np.pi * Y + ) - 1.0 / (50.0 * np.pi**2) * np.sin(5.0 * np.pi * X) * np.cos( + 5.0 * np.pi * Y + ) return sol -def execute(nx, ny, plot, plot_fname, throughput, tol, max_iters, warmup_iters, timer): +def execute( + nx: int, + ny: int, + plot: bool, + plot_fname: str, + throughput: bool, + tol: float, + max_iters: int, + warmup_iters: int, + timer: Timer, +) -> None: # Grid parameters. xmin, xmax = 0.0, 1.0 # limits in the x direction ymin, ymax = -0.5, 0.5 # limits in the y direction @@ -181,9 +196,9 @@ def execute(nx, ny, plot, plot_fname, throughput, tol, max_iters, warmup_iters, # Compute the rhs. Note that we non-dimensionalize the coordinates # x and y with the size of the domain in their respective dire- # ctions. - b = np.sin(np.pi * X) * np.cos(np.pi * Y) + np.sin(5.0 * np.pi * X) * np.cos( - 5.0 * np.pi * Y - ) + b = np.sin(np.pi * X) * np.cos(np.pi * Y) + np.sin( + 5.0 * np.pi * X + ) * np.cos(5.0 * np.pi * Y) # b is currently a 2D array. 
We need to convert it to a column-major # ordered 1D array. This is done with the flatten numpy function. @@ -194,7 +209,7 @@ def execute(nx, ny, plot, plot_fname, throughput, tol, max_iters, warmup_iters, # count combinations as well. Even more annoyingly, doing any sort # of flatten results in some bad assignment of equivalence sets within # Legion's dependence analysis. So if we're just testing solve - # throughput, use an array of all ones. + # throughput, use an array of all ones. if throughput: n = b.shape[0] - 2 bflat = np.ones((n * n,)) @@ -218,7 +233,7 @@ def execute(nx, ny, plot, plot_fname, throughput, tol, max_iters, warmup_iters, # If we're testing throughput, run only the prescribed number of iterations. if throughput: if use_legate: - p_sol, iters = linalg.cg(A, bflat, rtol=tol, maxiter=max_iters, conv_test_iters=max_iters) + p_sol, iters = linalg.cg( + A, + bflat, + rtol=tol, + maxiter=max_iters, + conv_test_iters=max_iters, + ) else: p_sol, iters = linalg.cg(A, bflat, rtol=tol, maxiter=max_iters) else: @@ -242,8 +263,12 @@ def execute(nx, ny, plot, plot_fname, throughput, tol, max_iters, warmup_iters, # Check convergence with relative tolerance convergence_status = True if norm_res <= norm_ini * tol else False - print(f"Did the solution converge : {convergence_status}") - print(f"Final relative residual norm : {norm_res / norm_ini}") + print( + f"Did the solution converge : {convergence_status}" + ) + print( + f"Final relative residual norm : {norm_res / norm_ini}" + ) if iters > 0: print(f"Number of iterations : {iters}") print(f"Time per iteration (ms) : {total / iters}") @@ -321,10 +346,14 @@ def execute(nx, ny, plot, plot_fname, throughput, tol, max_iters, warmup_iters, ) args, _ = parser.parse_known_args() - _, timer, np, sparse, linalg, use_legate = parse_common_args() + package, timer, np, sparse, linalg, use_legate = parse_common_args() - if args.throughput and (args.max_iters is None or args.warmup_iters is None): - print("Must 
provide --max-iters and --warmup-iters when using --throughput.") + if args.throughput and ( + args.max_iters is None or args.warmup_iters is None + ): + print( + "Must provide --max-iters and --warmup-iters when using --throughput." + ) sys.exit(1) execute(**vars(args), timer=timer) diff --git a/examples/poisson_5point_example.py b/examples/poisson_5point_example.py new file mode 100644 index 00000000..e2685a17 --- /dev/null +++ b/examples/poisson_5point_example.py @@ -0,0 +1,193 @@ +# Copyright 2022-2025 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Solve Poisson equation: -∇²u = f(x,y) on domain [0,1]×[0,1] +With Dirichlet boundary conditions: u = 0 on boundary. We use +a manufactured solution approach u(x,y) = sin(2πx) * sin(2πy) +and use that to compute the RHS. +""" + +from __future__ import annotations + +import argparse + +from common import parse_common_args, get_arg_number + + +def create_poisson_mat(n, h): + """ + Create the 2D Poisson equation discretization matrix using 5-point stencil. 
+ + The 5-point stencil for -∇²u at point (i,j) is: + -u_{i,j-1} - u_{i-1,j} + 4*u_{i,j} - u_{i+1,j} - u_{i,j+1} = h²*f_{i,j} + + Parameters + ---------- + n : int + Number of interior grid points in each direction (n×n grid) + h : float + Grid spacing (h = 1/(n+1)) + + Returns + ------- + A : sparse CSR matrix + The discretization matrix of shape (n**2, n**2) + """ + N = n * n # Total number of unknowns + + # stencil: + # -1 + # -1 4 -1 + # -1 + main_diag = 4.0 * np.ones(N) / (h * h) + off_diag1 = -1.0 * np.ones(N - 1) / (h * h) + off_diag2 = -1.0 * np.ones(N - n) / (h * h) + + # cupynumeric doesn't support non-unit strides in indexing, + # so use a mask array to set every "n" elements to zero + zero_out_indices = np.array(range(n - 1, N - 1, n), dtype=int) + off_diag1[zero_out_indices] = 0.0 + + # The offsets : [-n, -1, 0, 1, n ] + # correspond to : [below, left, center, right, above ] + diagonals = [off_diag2, off_diag1, main_diag, off_diag1, off_diag2] + offsets = [-n, -1, 0, 1, n] + + # Create the sparse matrix and convert to CSR format + return sparse.diags( + diagonals, offsets, shape=(N, N), dtype=np.float64, format="csr" + ) + + +def manufactured_solution(x, y): + "u(x,y) = sin(2πx) * sin(2πy) satisfies u=0 on the boundary of [0,1]×[0,1]" + return np.sin(2 * np.pi * x) * np.sin(2 * np.pi * y) + + +def compute_rhs(x, y): + """ + Compute the right-hand side f(x,y) for the manufactured solution. + + For u(x,y) = sin(2πx) * sin(2πy), we have: + -∇²u = 8π² * sin(2πx) * sin(2πy) = f(x,y) + """ + return 8 * np.pi**2 * np.sin(2 * np.pi * x) * np.sin(2 * np.pi * y) + + +def solve_poisson_2d(n, verbose=True) -> float: + """ + Solve the 2D Poisson equation with Dirichlet boundary conditions. 
+ + Parameters + ---------- + n : int + Number of interior grid points in each direction + verbose : bool + Whether to print detailed output + + Returns + ------- + error : float + The L2 error between numerical and analytical solutions + """ + h = 1.0 / (n + 1) + + if verbose: + print(f"Solving 2D Poisson equation on {n}×{n} grid") + print(f"Grid spacing h = {h:.6f}") + print(f"Total unknowns: {n * n}") + + # Create grid points (interior points only) and flatten it + x = np.linspace(h, 1 - h, n) + y = np.linspace(h, 1 - h, n) + X, Y = np.meshgrid(x, y, indexing="ij") + X_flat = X.flatten() + Y_flat = Y.flatten() + + A = create_poisson_mat(n, h) + b = compute_rhs(X_flat, Y_flat) + + if verbose: + print(f"Matrix shape : {A.shape}") + print(f"Matrix non-zeros : {A.nnz}") + print(f"Sparsity : {A.nnz / (n * n) ** 2:.6f}") + print("\nSolving linear system Ax = b using spsolve...") + + x_numerical = linalg.spsolve(A, b) + x_analytical = manufactured_solution(X_flat, Y_flat) + + error_vec = x_numerical - x_analytical + l2_error = np.linalg.norm(error_vec) * h # Scale by h for L2 norm + l_inf_error = np.max(np.abs(error_vec)) + relative_error = l2_error / (np.linalg.norm(x_analytical) * h) + + residual = A @ x_numerical - b + residual_norm = np.linalg.norm(residual) + + if verbose: + print("\nResults:") + print(f"L2 error : {l2_error:.6e}") + print(f"L∞ error : {l_inf_error:.6e}") + print(f"Relative L2 error: {relative_error:.6e}") + print(f"Residual norm ||Ax - b||: {residual_norm:.6e}") + + return l2_error + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Solve 2D Poisson equation with 5-point stencil" + ) + parser.add_argument( + "--size", + "-n", + type=str, + default="32", + help="Number of interior grid points in each direction (default: 32)", + ) + parser.add_argument( + "--verbose", + action="store_true", + help="Use this argument for verbose output", + ) + + args, _ = parser.parse_known_args() + package, timer, np, sparse, linalg, 
use_legate = parse_common_args() + + n_interior = get_arg_number(args.size) + + solve_poisson_2d(n_interior, verbose=args.verbose) + + print("\n" + "=" * 60) + print("Verification: Testing with smaller grid for convergence check") + print("=" * 60) + + # Perform convergence tests + n1, n2 = n_interior, n_interior * 2 + l2_error1 = solve_poisson_2d(n1, verbose=False) + l2_error2 = solve_poisson_2d(n2, verbose=False) + + convergence_rate = np.log2(l2_error1 / l2_error2) + print(f"Grid refinement : {n1}×{n1} → {n2}×{n2}") + print(f"Error reduction factor : {l2_error1 / l2_error2:.3f}") + print(f"Convergence rate : {convergence_rate:.3f}") + print("Expected rate for 5-point stencil: ~2.0") + + if abs(convergence_rate - 2.0) < 0.5: + print( + "\n✓ Solution verified: convergence rate is close to expected value" + ) + else: + print("\n⚠ Warning: convergence rate differs from expected value") diff --git a/examples/spgemm_microbenchmark.py b/examples/spgemm_microbenchmark.py index e30c05dd..0f97be69 100644 --- a/examples/spgemm_microbenchmark.py +++ b/examples/spgemm_microbenchmark.py @@ -32,12 +32,24 @@ --package: Backend to use (legate, cupy, scipy) """ +from __future__ import annotations + import argparse +from typing import TYPE_CHECKING + +from common import ( + Timer, + banded_matrix, + get_arg_number, + get_phase_procs, + parse_common_args, +) -from common import banded_matrix, get_arg_number, get_phase_procs, parse_common_args +if TYPE_CHECKING: + from legate_sparse import csr_array -def spgemm_dispatch(A, B): +def spgemm_dispatch(A: csr_array, B: csr_array) -> csr_array: """Dispatch sparse matrix-matrix multiplication operation. Parameters @@ -61,7 +73,9 @@ def spgemm_dispatch(A, B): return C -def get_matrices(N, nnz_per_row, fname1, fname2): +def get_matrices( + N: int, nnz_per_row: int, fname1: str, fname2: str +) -> tuple[csr_array, csr_array]: """Get matrices for SpGEMM benchmark. 
Parameters @@ -100,7 +114,15 @@ def get_matrices(N, nnz_per_row, fname1, fname2): return A, A.copy() -def run_spgemm(N, nnz_per_row, fname1, fname2, iters, stable, timer): +def run_spgemm( + N: int, + nnz_per_row: int, + fname1: str, + fname2: str, + iters: int, + stable: bool, + timer: Timer, +) -> None: """Run sparse matrix-matrix multiplication benchmark. Parameters @@ -229,7 +251,7 @@ def run_spgemm(N, nnz_per_row, fname1, fname2, iters, stable, timer): ) args, _ = parser.parse_known_args() - _, timer, np, sparse, linalg, use_legate = parse_common_args() + package, timer, np, sparse, linalg, use_legate = parse_common_args() init_procs, bench_procs = get_phase_procs(use_legate) diff --git a/examples/spmv_microbenchmark.py b/examples/spmv_microbenchmark.py index c6f11ff8..b449b026 100644 --- a/examples/spmv_microbenchmark.py +++ b/examples/spmv_microbenchmark.py @@ -34,13 +34,27 @@ --package: Backend to use (legate, cupy, scipy) """ +from __future__ import annotations + import argparse +from typing import TYPE_CHECKING, Any + +from common import ( + Timer, + banded_matrix, + get_arg_number, + get_phase_procs, + parse_common_args, +) -from common import banded_matrix, get_arg_number, get_phase_procs, parse_common_args +if TYPE_CHECKING: + from legate_sparse import csr_array # Writing to pre-allocated array is preferred -def spmv_dispatch(A, x, y, i, repartition): +def spmv_dispatch( + A: csr_array, x: Any, y: Any, i: int, repartition: bool +) -> None: """Dispatch sparse matrix-vector multiplication operation. Parameters @@ -77,7 +91,9 @@ def spmv_dispatch(A, x, y, i, repartition): y = A @ x -def run_spmv(A, iters, repartition, timer): +def run_spmv( + A: csr_array, iters: int, repartition: bool, timer: Timer +) -> None: """Run sparse matrix-vector multiplication benchmark. 
Parameters @@ -105,9 +121,9 @@ def run_spmv(A, iters, repartition, timer): x = np.ones((A.shape[1],)) y = np.zeros((A.shape[0],)) - assert not repartition or ( - A.shape[0] == A.shape[1] - ), "Matrix should be square for switching x and y" + assert not repartition or (A.shape[0] == A.shape[1]), ( + "Matrix should be square for switching x and y" + ) # Warm up runs warmup_iters = 5 @@ -186,7 +202,7 @@ def run_spmv(A, iters, repartition, timer): ) args, _ = parser.parse_known_args() - _, timer, np, sparse, linalg, use_legate = parse_common_args() + package, timer, np, sparse, linalg, use_legate = parse_common_args() init_procs, bench_procs = get_phase_procs(use_legate) diff --git a/install.py b/install.py index c46e03a7..6be4fdae 100755 --- a/install.py +++ b/install.py @@ -109,7 +109,9 @@ def was_previously_built_with_different_build_isolation( legate_sparse_build_dir is not None and os.path.exists(legate_sparse_build_dir) and os.path.exists( - cmake_cache := os.path.join(legate_sparse_build_dir, "CMakeCache.txt") + cmake_cache := os.path.join( + legate_sparse_build_dir, "CMakeCache.txt" + ) ) ): try: @@ -298,9 +300,15 @@ def validate_path(path): cmake_flags += ["--log-level=%s" % ("DEBUG" if debug else "VERBOSE")] cmake_flags += f"""\ --DCMAKE_BUILD_TYPE={( - "Debug" if debug else "RelWithDebInfo" if debug_release else "Release" -)} +-DCMAKE_BUILD_TYPE={ + ( + "Debug" + if debug + else "RelWithDebInfo" + if debug_release + else "Release" + ) + } -DBUILD_SHARED_LIBS=ON -DBUILD_MARCH={str(march)} -DCMAKE_CUDA_ARCHITECTURES={str(arch)} @@ -345,7 +353,9 @@ def validate_path(path): } ) - execute_command(pip_install_cmd, verbose, cwd=legate_sparse_dir, env=cmd_env) + execute_command( + pip_install_cmd, verbose, cwd=legate_sparse_dir, env=cmd_env + ) def driver(): diff --git a/legate_sparse/__init__.py b/legate_sparse/__init__.py index c8f44589..35d65fb9 100644 --- a/legate_sparse/__init__.py +++ b/legate_sparse/__init__.py @@ -17,12 +17,15 @@ """ -import scipy.sparse as 
_sp # type: ignore +from __future__ import annotations + +import scipy.sparse as _sp from .coverage import clone_module # noqa: F401 from .csr import csr_array, csr_matrix # noqa: F401 from .dia import dia_array, dia_matrix # noqa: F401 -from .module import * # noqa: F401 +from .module import * # noqa: F401,F403 +from .construct import block_array # noqa: F401 clone_module(_sp, globals()) diff --git a/legate_sparse/_version.py b/legate_sparse/_version.py index b50be7bd..ff2762af 100644 --- a/legate_sparse/_version.py +++ b/legate_sparse/_version.py @@ -69,7 +69,9 @@ def decorate(f): return decorate -def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, env=None): +def run_command( + commands, args, cwd=None, verbose=False, hide_stderr=False, env=None +): """Call the given command(s).""" assert isinstance(commands, list) process = None @@ -263,7 +265,9 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, runner=run_command): env.pop("GIT_DIR", None) runner = functools.partial(runner, env=env) - _, rc = runner(GITS, ["rev-parse", "--git-dir"], cwd=root, hide_stderr=True) + _, rc = runner( + GITS, ["rev-parse", "--git-dir"], cwd=root, hide_stderr=True + ) if rc != 0: if verbose: print("Directory %s not under git control" % root) @@ -292,7 +296,9 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, runner=run_command): pieces["short"] = full_out[:7] # maybe improved later pieces["error"] = None - branch_name, rc = runner(GITS, ["rev-parse", "--abbrev-ref", "HEAD"], cwd=root) + branch_name, rc = runner( + GITS, ["rev-parse", "--abbrev-ref", "HEAD"], cwd=root + ) # --abbrev-ref was added in git-1.6.3 if rc != 0 or branch_name is None: raise NotThisMethod("'git rev-parse --abbrev-ref' returned error") @@ -341,7 +347,9 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, runner=run_command): mo = re.search(r"^(.+)-(\d+)-g([0-9a-f]+)$", git_describe) if not mo: # unparsable. Maybe git-describe is misbehaving? 
- pieces["error"] = "unable to parse git-describe output: '%s'" % describe_out + pieces["error"] = ( + "unable to parse git-describe output: '%s'" % describe_out + ) return pieces # tag @@ -370,7 +378,9 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, runner=run_command): pieces["distance"] = int(count_out) # total number of commits # commit date: see ISO-8601 comment in git_versions_from_keywords() - date = runner(GITS, ["show", "-s", "--format=%ci", "HEAD"], cwd=root)[0].strip() + date = runner(GITS, ["show", "-s", "--format=%ci", "HEAD"], cwd=root)[ + 0 + ].strip() # Use only the last line. Previous lines may contain GPG signature # information. date = date.splitlines()[-1] @@ -458,7 +468,9 @@ def render_pep440_pre(pieces): if pieces["closest-tag"]: if pieces["distance"]: # update the post release segment - tag_version, post_version = pep440_split_post(pieces["closest-tag"]) + tag_version, post_version = pep440_split_post( + pieces["closest-tag"] + ) rendered = tag_version if post_version is not None: rendered += ".post%d.dev%d" % ( @@ -647,7 +659,9 @@ def get_versions(): verbose = cfg.verbose try: - return git_versions_from_keywords(get_keywords(), cfg.tag_prefix, verbose) + return git_versions_from_keywords( + get_keywords(), cfg.tag_prefix, verbose + ) except NotThisMethod: pass diff --git a/legate_sparse/base.py b/legate_sparse/base.py index c9d99a31..fd01a266 100644 --- a/legate_sparse/base.py +++ b/legate_sparse/base.py @@ -44,8 +44,12 @@ # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+from __future__ import annotations -import cupynumeric +from typing import TYPE_CHECKING + +import cupynumeric as cn +import numpy as np from legate.core import LogicalStore, align from .config import SparseOpCode, rect1 @@ -58,6 +62,13 @@ store_to_cupynumeric_array, ) +if TYPE_CHECKING: + from typing import Any, Callable + + import numpy.typing as npt + + from cupynumeric.types import CastingKind + # CompressedBase is a base class for several different kinds of sparse # matrices, such as CSR, CSC, COO and DIA. @@ -74,8 +85,28 @@ class CompressedBase: Use specific format classes like csr_array instead. """ + shape: tuple[int, ...] + pos: LogicalStore + dtype: npt.dtype[Any] + format: str + crd: LogicalStore + _data: cn.ndarray + + def __init__(self, *args: Any, **kw: Any) -> None: + super().__init__(*args, **kw) + + @property + def data(self) -> cn.ndarray: + return self._data + + @property + def size(self) -> int: + raise NotImplementedError + @classmethod - def nnz_to_pos_cls(cls, q_nnz: LogicalStore): + def nnz_to_pos_cls( + cls, q_nnz: LogicalStore + ) -> tuple[LogicalStore, cn.ndarray]: """Convert non-zero counts to position arrays. This class method converts an array of non-zero counts per row/column @@ -93,15 +124,13 @@ def nnz_to_pos_cls(cls, q_nnz: LogicalStore): is the total number of non-zeros. """ q_nnz_arr = store_to_cupynumeric_array(q_nnz) - cs = cupynumeric.cumsum(q_nnz_arr) + cs = cn.cumsum(q_nnz_arr) cs_shifted = cs - q_nnz_arr cs_store = get_store_from_cupynumeric_array(cs) cs_shifted_store = get_store_from_cupynumeric_array(cs_shifted) # Zip the scan result into a rect1 region for the pos. 
pos = runtime.create_store( - rect1, # type: ignore - shape=(q_nnz.shape[0],), - optimize_scalar=False, + rect1, shape=(q_nnz.shape[0],), optimize_scalar=False ) task = runtime.create_auto_task(SparseOpCode.ZIP_TO_RECT1) pos_var = task.add_output(pos) @@ -113,7 +142,9 @@ def nnz_to_pos_cls(cls, q_nnz: LogicalStore): # Don't convert cs[-1] to an int to avoid blocking. return pos, cs[-1] - def nnz_to_pos(self, q_nnz: LogicalStore): + def nnz_to_pos( + self, q_nnz: LogicalStore + ) -> tuple[LogicalStore, cn.ndarray]: """Convert non-zero counts to position arrays for this instance. Parameters @@ -129,7 +160,12 @@ def nnz_to_pos(self, q_nnz: LogicalStore): """ return CompressedBase.nnz_to_pos_cls(q_nnz) - def asformat(self, format, copy=False): + def copy(self) -> CompressedBase: + raise NotImplementedError() + + def asformat( + self, format: str | None, copy: bool = False + ) -> CompressedBase: """Convert the matrix to a specified format. Parameters @@ -158,7 +194,9 @@ def asformat(self, format, copy=False): return self else: try: - convert_method = getattr(self, "to" + format) + convert_method: Callable[..., CompressedBase] = getattr( + self, "to" + format + ) except AttributeError as e: raise ValueError("Format {} is unknown.".format(format)) from e @@ -169,7 +207,12 @@ def asformat(self, format, copy=False): return convert_method() # The implementation of sum is mostly lifted from scipy.sparse. - def sum(self, axis=None, dtype=None, out=None): + def sum( + self, + axis: int | None = None, + dtype: npt.dtype[Any] | None = None, + out: cn.ndarray | None = None, + ) -> cn.ndarray: """Sum the matrix elements over a given axis. 
Parameters @@ -237,10 +280,10 @@ def sum(self, axis=None, dtype=None, out=None): # TODO: (marsaev) currently not supported as we don't have rmatmul yet # (need CSC to have easier sum over columns) raise NotImplementedError - ret = self.__rmatmul__(cupynumeric.ones((1, m), dtype=res_dtype)) + # ret = self.__rmatmul__(cn.ones((1, m), dtype=res_dtype)) else: # sum over rows - ret = self @ cupynumeric.ones((n, 1), dtype=res_dtype) + ret = self @ cn.ones((n, 1), dtype=res_dtype) if out is not None and out.shape != ret.shape: raise ValueError("dimensions do not match") @@ -248,7 +291,7 @@ def sum(self, axis=None, dtype=None, out=None): return ret.sum(axis=axis, dtype=dtype, out=out) # needed by _data_matrix - def _with_data(self, data, copy=True): + def _with_data(self, data: Any, copy: bool = True) -> CompressedBase: """Returns a matrix object with the same sparsity structure as self, but with different data. @@ -290,8 +333,13 @@ def _with_data(self, data, copy=True): copy=False, ) - def astype(self, dtype, casting="unsafe", copy=True): - dtype = cupynumeric.dtype(dtype) + def astype( + self, + dtype: npt.dtype[Any], + casting: CastingKind = "unsafe", + copy: bool = True, + ) -> CompressedBase: + dtype = np.dtype(dtype) # if type doesn't match, create a matrix copy with casted data array if self.dtype != dtype: return self._with_data( @@ -304,24 +352,24 @@ def astype(self, dtype, casting="unsafe", copy=True): # These univariate ufuncs preserve zeros. 
_ufuncs_with_fixed_point_at_zero = frozenset( [ - cupynumeric.sin, - cupynumeric.tan, - cupynumeric.arcsin, - cupynumeric.arctan, - cupynumeric.sinh, - cupynumeric.tanh, - cupynumeric.arcsinh, - cupynumeric.arctanh, - cupynumeric.rint, - cupynumeric.sign, - cupynumeric.expm1, - cupynumeric.log1p, - cupynumeric.deg2rad, - cupynumeric.rad2deg, - cupynumeric.floor, - cupynumeric.ceil, - cupynumeric.trunc, - cupynumeric.sqrt, + cn.sin, + cn.tan, + cn.arcsin, + cn.arctan, + cn.sinh, + cn.tanh, + cn.arcsinh, + cn.arctanh, + cn.rint, + cn.sign, + cn.expm1, + cn.log1p, + cn.deg2rad, + cn.rad2deg, + cn.floor, + cn.ceil, + cn.trunc, + cn.sqrt, ] ) @@ -329,14 +377,14 @@ def astype(self, dtype, casting="unsafe", copy=True): for npfunc in _ufuncs_with_fixed_point_at_zero: name = npfunc.__name__ - def _create_method(op): - def method(self): + def _create_method(op: Callable[[Any], Any]) -> Callable[[Any], Any]: + def method(self: Any) -> Any: result = op(self.data) return self._with_data(result) - method.__doc__ = "Element-wise %s.\n\nSee `numpy.%s` for more information." % ( - name, - name, + method.__doc__ = ( + "Element-wise %s.\n\nSee `numpy.%s` for more information." + % (name, name) ) method.__name__ = name @@ -345,56 +393,8 @@ def method(self): setattr(CompressedBase, name, _create_method(npfunc)) -# DenseSparseBase is a base class for sparse matrices that have a TACO -# format of {Dense, Sparse}. For our purposes, that means CSC and CSR -# matrices. -class DenseSparseBase: - """Base class for sparse matrices with dense-sparse format. - - This class provides functionality for sparse matrices that have a TACO - format of {Dense, Sparse}, which includes CSR and CSC matrices. - - Notes - ----- - This is an internal base class and should not be instantiated directly. - Use specific format classes like csr_array instead. 
- """ - - def __init__(self): - """Initialize the DenseSparseBase class.""" - self._balanced_pos_partition = None - - # consider using _with_data() here - @classmethod - def make_with_same_nnz_structure(cls, mat, arg, shape=None, dtype=None): - """Create a new matrix with the same non-zero structure as mat. - - Parameters - ---------- - mat : sparse matrix - The reference matrix whose structure to copy. - arg : array_like - The data for the new matrix. - shape : tuple, optional - The shape of the new matrix. If None, uses mat.shape. - dtype : dtype, optional - The data type of the new matrix. If None, uses mat.dtype. - - Returns - ------- - sparse matrix - A new matrix with the same structure as mat but with data from arg. - """ - if shape is None: - shape = mat.shape - if dtype is None: - dtype = mat.dtype - result = cls(arg, shape=shape, dtype=dtype) - return result - - # unpack_rect1_store unpacks a rect1 store into two int64 stores. -def unpack_rect1_store(pos): +def unpack_rect1_store(pos: LogicalStore) -> tuple[LogicalStore, LogicalStore]: """Unpack a rect1 store into two int64 stores. This function unpacks the compressed position array used in CSR/CSC @@ -423,7 +423,9 @@ def unpack_rect1_store(pos): # pack_to_rect1_store packs two int64 stores into a rect1 store. -def pack_to_rect1_store(lo, hi, output=None): +def pack_to_rect1_store( + lo: LogicalStore, hi: LogicalStore, output: LogicalStore | None = None +) -> LogicalStore: """Pack two int64 stores into a rect1 store. This function packs separate start and end position arrays into the diff --git a/legate_sparse/config.py b/legate_sparse/config.py index 8c601981..ab146521 100644 --- a/legate_sparse/config.py +++ b/legate_sparse/config.py @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+from __future__ import annotations import os import platform @@ -29,6 +30,10 @@ class _LegateSparseSharedLib: implements the core sparse matrix operations. """ + LEGATE_SPARSE_LOAD_CUDALIBS: int + LEGATE_SPARSE_UNLOAD_CUDALIBS: int + + LEGATE_SPARSE_CSR_TO_DENSE: int LEGATE_SPARSE_DENSE_TO_CSR: int LEGATE_SPARSE_DENSE_TO_CSR_NNZ: int LEGATE_SPARSE_ZIP_TO_RECT_1: int @@ -49,6 +54,9 @@ class _LegateSparseSharedLib: LEGATE_SPARSE_SPGEMM_CSR_CSR_CSR: int LEGATE_SPARSE_SPGEMM_CSR_CSR_CSR_GPU: int LEGATE_SPARSE_AXPBY: int + LEGATE_SPARSE_SPSOLVE: int + LEGATE_SPARSE_GEAM_CSR_CSR_SYMBOLIC: int + LEGATE_SPARSE_GEAM_CSR_CSR_COMPUTE: int def dlopen_no_autoclose(ffi: Any, lib_path: str) -> Any: @@ -88,7 +96,7 @@ class LegateSparseLib: library with the Legate runtime. """ - def __init__(self, name): + def __init__(self, name: str) -> None: """Initialize the Legate sparse library. Parameters @@ -98,9 +106,6 @@ def __init__(self, name): """ self.name = name self.runtime = None - self.shared_object = None - - self.name = name shared_lib_path = self.get_shared_library() assert shared_lib_path is not None @@ -118,7 +123,9 @@ def __init__(self, name): def register(self) -> None: """Register the library with the Legate runtime.""" - callback = getattr(self.shared_object, "legate_sparse_perform_registration") + callback = getattr( + self.shared_object, "legate_sparse_perform_registration" + ) callback() def get_shared_library(self) -> str: @@ -131,7 +138,9 @@ def get_shared_library(self) -> str: """ from legate_sparse.install_info import libpath - return os.path.join(libpath, "liblegate_sparse" + self.get_library_extension()) + return os.path.join( + libpath, "liblegate_sparse" + self.get_library_extension() + ) def get_legate_library(self) -> Library: """Get the Legate library object. 
@@ -181,7 +190,14 @@ def get_library_extension() -> str: """Name of the Legate sparse library.""" sparse_lib = LegateSparseLib(SPARSE_LIB_NAME) -sparse_lib.register() + +# Guard against double registration (can happen during Sphinx documentation builds) +try: + sparse_lib.register() +except Exception: + # Library may already be registered from a previous import + pass + _sparse = sparse_lib.shared_object # has to be called after register() _library = sparse_lib.get_legate_library() @@ -225,6 +241,10 @@ class SparseOpCode(IntEnum): SPGEMM_CSR_CSR_CSR = _sparse.LEGATE_SPARSE_SPGEMM_CSR_CSR_CSR SPGEMM_CSR_CSR_CSR_GPU = _sparse.LEGATE_SPARSE_SPGEMM_CSR_CSR_CSR_GPU + SPSOLVE = _sparse.LEGATE_SPARSE_SPSOLVE + GEAM_CSR_CSR_SYMBOLIC = _sparse.LEGATE_SPARSE_GEAM_CSR_CSR_SYMBOLIC + GEAM_CSR_CSR_COMPUTE = _sparse.LEGATE_SPARSE_GEAM_CSR_CSR_COMPUTE + # Register some types for us to use. rect1 = types.rect_type(1) diff --git a/legate_sparse/construct.py b/legate_sparse/construct.py new file mode 100644 index 00000000..89a16c6e --- /dev/null +++ b/legate_sparse/construct.py @@ -0,0 +1,260 @@ +# Copyright 2022-2024 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Portions of this file are also subject to the following license: +# +# Copyright (c) 2001-2002 Enthought, Inc. 2003-2022, SciPy Developers. +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# 1. Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials provided +# with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from __future__ import annotations + +import cupynumeric as cn + +from .csr import csr_array + + +def _block(blocks, format="csr", dtype=None): + """Build a sparse CSR array from sparse sub-blocks using COO intermediate. + + 1. Extracts (row, col, data) from each block + 2. Adjusts indices by block offsets + 3. Concatenates all coordinates + 4. 
Builds CSR from COO format + """ + if format != "csr": + raise ValueError("Only 'csr' format is supported for block_array") + + if not isinstance(blocks, (list, tuple)): + blocks = list(blocks) + + blocks = [ + list(row) if isinstance(row, (list, tuple)) else [row] + for row in blocks + ] + + n_block_rows = len(blocks) + if n_block_rows == 0: + raise ValueError("blocks cannot be empty") + + n_block_cols = len(blocks[0]) + if n_block_cols == 0: + raise ValueError("blocks cannot be empty") + + # Row height and col width for a sub-block looks like this. + # +--------------+ + # | ^ | + # | | row height | + # | v | + # +--------------+ + # <- col width -> + + # store row heights and col widths of each sub-block + row_heights = [None] * n_block_rows + col_widths = [None] * n_block_cols + + for i in range(n_block_rows): + for j in range(n_block_cols): + block = blocks[i][j] + if block is None: + continue + + if not isinstance(block, csr_array): + raise TypeError( + f"blocks[{i}][{j}] must be a csr_array or None, " + f"got {type(block).__name__}" + ) + + block_nrows, block_ncols = block.shape + + # Check/set row height for this block row. + # The row heights of all the sub-blocks in a row of the input + # should be the same, else we can't concatenate horizontally + if row_heights[i] is None: + row_heights[i] = block_nrows + elif row_heights[i] != block_nrows: + raise ValueError( + f"blocks[{i}][{j}] has {block_nrows} rows, " + f"expected {row_heights[i]}" + ) + + # Check/set column width for this block column. + # The col widths of all the sub-blocks in a col of the input + # should be the same, else we can't concatenate vertically + if col_widths[j] is None: + col_widths[j] = block_ncols + elif col_widths[j] != block_ncols: + raise ValueError( + f"blocks[{i}][{j}] has {block_ncols} columns, " + f"expected {col_widths[j]}" + ) + + # The input can have None instead of a csr matrix. 
To correctly compute + # the row offsets for those cases, we set the row height to 0 if the + # input is None. + row_heights = cn.array([h if h is not None else 0 for h in row_heights]) + col_widths = cn.array([w if w is not None else 0 for w in col_widths]) + + # Compute the no. or rows and cols in the output matrix. + total_nrows = cn.sum(row_heights).item() + total_ncols = cn.sum(col_widths).item() + + # When the output matrix is empty, we don't need to concatenate. + if total_nrows == 0 or total_ncols == 0: + result_dtype = dtype if dtype is not None else cn.float64 + return csr_array((total_nrows, total_ncols), dtype=result_dtype) + + row_offsets = cn.concatenate([cn.array([0]), cn.cumsum(row_heights)]) + col_offsets = cn.concatenate([cn.array([0]), cn.cumsum(col_widths)]) + + if dtype is None: + dtypes = [b.dtype for row in blocks for b in row if b is not None] + dtype = cn.result_type(*dtypes) if dtypes else cn.float64 + + all_rows = [] + all_cols = [] + all_data = [] + + # Populate the concatenated (rows, cols, data) arrays for the + # output matrix. The outer loop concatenates the sub-blocks vertically + # while the inner loop concatenates them horizontally. This is done + # without creating any intermediate csr representation. + for i in range(n_block_rows): + row_offset = row_offsets[i].item() + + for j in range(n_block_cols): + block = blocks[i][j] + + # If block is empty, the (rows, cols, data) of the output matrix + # doesn't get modified, so we continue with the loop. + if block is None: + continue + + col_offset = col_offsets[j].item() + block_nrows = block.shape[0] + + indptr = block.indptr + indices = block.indices + data = block.data + + # Empty csr matrices don't modify the output matrix either, so we + # continue with the loop. + if data.size == 0: + continue + + # Expand the indptr array to store the row indices. + # For each row r, repeating r by (indptr[r+1] - indptr[r]) times + # the needed storage to store non-zero entries. 
+ nnz_per_row = cn.diff(indptr) + block_rows = cn.repeat(cn.arange(block_nrows), nnz_per_row) + + # After concatenating the matrices, we get one block matrix that + # can be represented by (rows, cols, data) arrays. Note that + # we have to add the offsets for both the row and col indices + # that correspond to the non-zero in the previous sub-block as + # concatenate them horizontally. This is because the output matrix + # is going to be represented as one giant CSR matrix. + all_rows.append(block_rows + row_offset) + all_cols.append(indices + col_offset) + all_data.append(data) + + if not all_data: + result_dtype = dtype if dtype is not None else cn.float64 + return csr_array((total_nrows, total_ncols), dtype=result_dtype) + + concatenated_rows = cn.concatenate(all_rows) + concatenated_cols = cn.concatenate(all_cols) + concatenated_data = cn.concatenate(all_data).astype(dtype) + + return csr_array( + (concatenated_data, (concatenated_rows, concatenated_cols)), + shape=(total_nrows, total_ncols), + dtype=dtype, + ) + + +def block_array(blocks, format="csr", dtype=None): + """Build a sparse array from sparse sub-blocks. + + Parameters + ---------- + blocks : array_like + A 2-D array-like of shape (M, N) where each element is a sparse + CSR array or None. None elements are treated as zero matrices. + format : str, optional + Output format. Currently only 'csr' is supported. Default is 'csr'. + dtype : dtype, optional + Data type of the output array. If None, inferred from the blocks. + + Returns + ------- + csr_array + A sparse CSR array formed by combining the sub-blocks. + + Raises + ------ + ValueError + - If `format` is not 'csr'. + - If `blocks` is empty (has zero rows or zero columns). + - If sub-blocks in the same row have different numbers of rows. + - If sub-blocks in the same column have different numbers of columns. + TypeError + - If any non-None block is not a csr_array. 
+ + Notes + ----- + This function may not be performant when the number of sub-blocks is large, + as it iterates over all blocks sequentially to extract and concatenate their + COO coordinates. + + Examples + -------- + >>> import legate_sparse as sparse + >>> A = sparse.csr_array([[1, 2], [3, 4]]) + >>> B = sparse.csr_array([[5], [6]]) + >>> C = sparse.csr_array([[7, 8, 9]]) + >>> result = sparse.block_array([[A, B], [C, None]]) + >>> result.todense() + array([[1, 2, 5], + [3, 4, 6], + [7, 8, 9]]) + """ + return _block(blocks, format, dtype) diff --git a/legate_sparse/coverage.py b/legate_sparse/coverage.py index 8765044e..a6fa2bae 100644 --- a/legate_sparse/coverage.py +++ b/legate_sparse/coverage.py @@ -16,7 +16,7 @@ from functools import wraps from types import FunctionType, MethodDescriptorType, MethodType, ModuleType -from typing import Any, Container, Mapping, Optional, cast +from typing import Any, Callable, Container, Mapping, TypeVar, cast from legate.core import track_provenance from typing_extensions import Protocol @@ -27,7 +27,7 @@ def filter_namespace( ns: Mapping[str, Any], *, - omit_names: Optional[Container[str]] = None, + omit_names: Container[str] | None = None, omit_types: tuple[type, ...] = (), ) -> dict[str, Any]: omit_names = omit_names or set() @@ -43,8 +43,7 @@ def should_wrap(obj: object) -> bool: class AnyCallable(Protocol): - def __call__(self, *args: Any, **kwargs: Any) -> Any: - ... + def __call__(self, *args: Any, **kwargs: Any) -> Any: ... def wrap(func: AnyCallable) -> Any: @@ -56,7 +55,9 @@ def wrapper(*args: Any, **kwargs: Any) -> Any: return wrapper -def clone_module(origin_module: ModuleType, new_globals: dict[str, Any]) -> None: +def clone_module( + origin_module: ModuleType, new_globals: dict[str, Any] +) -> None: """Copy attributes from one module to another, excluding submodules Function types are wrapped with a decorator to report API calls. 
All @@ -84,7 +85,10 @@ def clone_module(origin_module: ModuleType, new_globals: dict[str, Any]) -> None new_globals[attr] = wrapped -def clone_scipy_arr_kind(origin_class: type) -> Any: +T = TypeVar("T") + + +def clone_scipy_arr_kind(origin_class: type) -> Callable[[T], T]: """Copy attributes from an origin class to the input class. Method types are wrapped with a decorator to report API calls. All @@ -92,7 +96,7 @@ def clone_scipy_arr_kind(origin_class: type) -> Any: """ - def body(cls: type): + def body(cls: T) -> T: for attr, value in cls.__dict__.items(): # Only need to wrap things that are in the origin class to begin # with diff --git a/legate_sparse/csr.py b/legate_sparse/csr.py index 3008356e..051298e9 100644 --- a/legate_sparse/csr.py +++ b/legate_sparse/csr.py @@ -44,12 +44,14 @@ # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+from __future__ import annotations import warnings +from typing import TYPE_CHECKING, cast -import cupynumeric -import numpy -import scipy # type: ignore +import cupynumeric as cn +import numpy as np +import scipy from legate.core import ( ImageComputationHint, Scalar, @@ -60,12 +62,7 @@ types, ) -from .base import ( - CompressedBase, - DenseSparseBase, - pack_to_rect1_store, - unpack_rect1_store, -) +from .base import CompressedBase, pack_to_rect1_store, unpack_rect1_store from .config import SparseOpCode, rect1 from .coverage import clone_scipy_arr_kind from .runtime import runtime @@ -75,22 +72,31 @@ SUPPORTED_DATATYPES, array_from_store_or_array, cast_arr, - cast_to_common_type, cast_to_store, copy_store, + find_common_type, find_last_user_stacklevel, get_storage_type, get_store_from_cupynumeric_array, + is_dense, is_dtype_supported, is_scalar_like, + is_sparse, sort_by_rows_then_cols, store_from_store_or_array, store_to_cupynumeric_array, ) +if TYPE_CHECKING: + from typing import Any, Callable + + import numpy.typing as npt + + from cupynumeric.types import CastingKind + @clone_scipy_arr_kind(scipy.sparse.csr_array) -class csr_array(CompressedBase, DenseSparseBase): +class csr_array(CompressedBase): """Compressed Sparse Row array. This can be instantiated in several ways: @@ -187,7 +193,13 @@ class csr_array(CompressedBase, DenseSparseBase): [4, 5, 6]]) """ - def __init__(self, arg, shape=None, dtype=None, copy=False): + def __init__( + self, + arg: Any, + shape: tuple[int, ...] | None = None, + dtype: npt.dtype[Any] | None = None, + copy: bool = False, + ) -> None: """Initialize a CSR array. 
Parameters @@ -233,11 +245,11 @@ def __init__(self, arg, shape=None, dtype=None, copy=False): # Note that cupynumeric.dtype(None) returns float64, so make # sure dtype is passed to csr_array if it is known apriori, # especially when copying the matrix - dtype = cupynumeric.dtype(dtype) + dtype = np.dtype(dtype) # If from numpy.array - convert to cupynumeric array first - if isinstance(arg, numpy.ndarray): - arg = cupynumeric.array(arg) + if isinstance(arg, np.ndarray): + arg = cn.array(arg) # from scipy.sparse.csr_array if isinstance(arg, scipy.sparse.csr_array) or isinstance( @@ -247,7 +259,7 @@ def __init__(self, arg, shape=None, dtype=None, copy=False): arg = (arg.data, arg.indices, arg.indptr) # from dense cupynumeric array - if isinstance(arg, cupynumeric.ndarray): + if isinstance(arg, cn.ndarray): assert arg.ndim == 2 shape = arg.shape @@ -257,18 +269,18 @@ def __init__(self, arg, shape=None, dtype=None, copy=False): src_store = get_store_from_cupynumeric_array(arg) q_nnz = runtime.create_store(nnz_ty, shape=Shape((shape[0],))) - task = runtime.create_auto_task(SparseOpCode.DENSE_TO_CSR_NNZ) + task1 = runtime.create_auto_task(SparseOpCode.DENSE_TO_CSR_NNZ) promoted_q_nnz = q_nnz.promote(1, shape[1]) - nnz_per_row_part = task.add_output(promoted_q_nnz) - src_part = task.add_input(src_store) - task.add_constraint(broadcast(nnz_per_row_part, (1,))) - task.add_constraint(align(nnz_per_row_part, src_part)) - task.execute() + nnz_per_row_part = task1.add_output(promoted_q_nnz) + src_part = task1.add_input(src_store) + task1.add_constraint(broadcast(nnz_per_row_part, (1,))) + task1.add_constraint(align(nnz_per_row_part, src_part)) + task1.execute() # Assemble the output CSR array using the non-zeros per row. - self.pos, nnz = self.nnz_to_pos(q_nnz) + self.pos, nnz_scalar = self.nnz_to_pos(q_nnz) # Block and convert the nnz future into an int. 
- nnz = int(nnz) + nnz = int(nnz_scalar) self.crd = runtime.create_store(coord_ty, shape=((nnz,))) self.vals = runtime.create_store(arg.dtype, shape=((nnz,))) @@ -276,14 +288,14 @@ def __init__(self, arg, shape=None, dtype=None, copy=False): # and 2-D input array, our only option is launch single process # which will handle all of the data, which makes this funciton not usable # on scale. - task = runtime.create_manual_task(SparseOpCode.DENSE_TO_CSR, (1,)) + task2 = runtime.create_manual_task(SparseOpCode.DENSE_TO_CSR, (1,)) promoted_pos = self.pos.promote(1, shape[1]) - task.add_input(promoted_pos) - src_part = task.add_input(src_store) - task.add_output(self.crd) - task.add_output(self.vals) - task.execute() + task2.add_input(promoted_pos) + task2.add_input(src_store) + task2.add_output(self.crd) + task2.add_output(self.vals) + task2.execute() # we ignore dtype (TODO: is this behaviour matches SciPy?) and use arg.dtype dtype = arg.dtype @@ -298,7 +310,9 @@ def __init__(self, arg, shape=None, dtype=None, copy=False): self.canonical_format = arg.canonical_format elif isinstance(arg, tuple): - dtype, shape = self._init_from_tuple_inputs(arg, dtype, shape, copy) + dtype, shape = self._init_from_tuple_inputs( + arg, dtype, shape, copy + ) else: raise NotImplementedError("Can't convert to CSR from the input") @@ -315,13 +329,19 @@ def __init__(self, arg, shape=None, dtype=None, copy=False): if dtype is None: dtype = temp_vals_type if temp_vals_type is not dtype: - self.data = self.data.astype(dtype) - if not isinstance(dtype, numpy.dtype): - dtype = numpy.dtype(dtype) + self._data = self._data.astype(dtype) + if not isinstance(dtype, np.dtype): + dtype = np.dtype(dtype) # Saving the type self._dtype = dtype - def _init_from_tuple_inputs(self, arg, dtype, shape, copy): + def _init_from_tuple_inputs( + self, + arg: tuple[Any, ...], + dtype: npt.dtype[Any] | None, + shape: tuple[int, ...] 
| None, + copy: bool, + ) -> tuple[npt.dtype[Any], tuple[int, ...]]: """Initialize CSR array from tuple inputs. This internal method handles the various tuple-based constructor formats: @@ -333,9 +353,9 @@ def _init_from_tuple_inputs(self, arg, dtype, shape, copy): ---------- arg : tuple The input tuple in one of the supported formats. - dtype : dtype, optional + dtype : dtype The desired data type. - shape : tuple, optional + shape : tuple The shape of the array. copy : bool Whether to copy the input data. @@ -353,12 +373,14 @@ def _init_from_tuple_inputs(self, arg, dtype, shape, copy): If the tuple format is not supported. """ - def _get_empty_csr(dtype, nrows_plus_one): + def _get_empty_csr( + dtype: npt.dtype[Any] | None, nrows_plus_one: int + ) -> tuple[cn.ndarray, cn.ndarray, cn.ndarray]: """Helper function to create empty CSR arrays.""" return ( - cupynumeric.zeros(0, dtype=dtype), - cupynumeric.zeros(0, dtype=coord_ty), - cupynumeric.zeros(nrows_plus_one, dtype=coord_ty), + cn.zeros(0, dtype=dtype), + cn.zeros(0, dtype=coord_ty), + cn.zeros(nrows_plus_one, dtype=coord_ty), ) # Couple of options here @@ -367,16 +389,14 @@ def _get_empty_csr(dtype, nrows_plus_one): # csr_array((M, N), [dtype]) if not isinstance(arg[1], tuple): (M, N) = arg - if not isinstance(M, (int, numpy.integer)) or not isinstance( - N, (int, numpy.integer) + if not isinstance(M, (int, np.integer)) or not isinstance( + N, (int, np.integer) ): NotImplementedError( "Input tuple for empty CSR ctor should be it's shape" ) shape = arg - dtype = ( - cupynumeric.float64 if dtype is None else cupynumeric.dtype(dtype) - ) + dtype = np.float64 if dtype is None else np.dtype(dtype) # and pass this to next ctor arg = _get_empty_csr(dtype, M + 1) @@ -394,12 +414,12 @@ def _get_empty_csr(dtype, nrows_plus_one): copy = False else: # if passed numpy arrays - convert them - if isinstance(st_row, numpy.ndarray): - st_row = cupynumeric.array(st_row) - if isinstance(st_col, numpy.ndarray): - st_col = 
cupynumeric.array(st_col) - if isinstance(st_data, numpy.ndarray): - st_data = cupynumeric.array(st_data) + if isinstance(st_row, np.ndarray): + st_row = cn.array(st_row) + if isinstance(st_col, np.ndarray): + st_col = cn.array(st_col) + if isinstance(st_data, np.ndarray): + st_data = cn.array(st_data) if not self.indices_sorted: # NOTE that CSR format does not require sorting the data @@ -407,9 +427,15 @@ def _get_empty_csr(dtype, nrows_plus_one): # sorted by rows and then by columns, so we sort the data # by columns as well - row_array = array_from_store_or_array(st_row, copy=copy) - col_array = array_from_store_or_array(st_col, copy=copy) - new_data = array_from_store_or_array(st_data, copy=copy) + row_array = array_from_store_or_array( + st_row, copy=copy + ) + col_array = array_from_store_or_array( + st_col, copy=copy + ) + new_data = array_from_store_or_array( + st_data, copy=copy + ) indices = sort_by_rows_then_cols(row_array, col_array) @@ -417,10 +443,10 @@ def _get_empty_csr(dtype, nrows_plus_one): row_array = row_array[indices] col_array = col_array[indices] - row_offsets = cupynumeric.append( - cupynumeric.array([0]), - cupynumeric.cumsum( - cupynumeric.bincount(row_array, minlength=shape[0]) + row_offsets = cn.append( + cn.array([0]), + cn.cumsum( + cn.bincount(row_array, minlength=shape[0]) ), ) @@ -432,10 +458,10 @@ def _get_empty_csr(dtype, nrows_plus_one): else: # we need to convert row indices to row offsets/indptr row_array = array_from_store_or_array(st_row) - row_offsets = cupynumeric.append( - cupynumeric.array([0]), - cupynumeric.cumsum( - cupynumeric.bincount(row_array, minlength=shape[0]) + row_offsets = cn.append( + cn.array([0]), + cn.cumsum( + cn.bincount(row_array, minlength=shape[0]) ), ) if copy: @@ -452,12 +478,12 @@ def _get_empty_csr(dtype, nrows_plus_one): (data, indices, indptr) = arg # if passed numpy arrays - convert them - if isinstance(data, numpy.ndarray): - data = cupynumeric.array(data) - if isinstance(indices, 
numpy.ndarray): - indices = cupynumeric.array(indices).astype(coord_ty) - if isinstance(indptr, numpy.ndarray): - indptr = cupynumeric.array(indptr).astype(coord_ty) + if isinstance(data, np.ndarray): + data = cn.array(data) + if isinstance(indices, np.ndarray): + indices = cn.array(indices).astype(coord_ty) + if isinstance(indptr, np.ndarray): + indptr = cn.array(indptr).astype(coord_ty) # checking that shape matches with expectations for row_offsets if indptr.shape[0] == shape[0] + 1: @@ -470,8 +496,12 @@ def _get_empty_csr(dtype, nrows_plus_one): ) # copy explicitly, just in case (there are paths that won't create temp object) # For crd we enforce our internal type - self.crd = store_from_store_or_array(cast_arr(indices, coord_ty), copy) - self.vals = store_from_store_or_array(cast_to_store(data), copy) + self.crd = store_from_store_or_array( + cast_arr(indices, coord_ty), copy + ) + self.vals = store_from_store_or_array( + cast_to_store(data), copy + ) # Otherwise we assume that we are passing pos store from existing csr_array # This is internal only functionality, and we assume here only Store or cupynumeric.array @@ -487,15 +517,22 @@ def _get_empty_csr(dtype, nrows_plus_one): dtype = get_storage_type(data) + assert dtype is not None + assert shape is not None + return dtype, shape + # correct return type value on this subclass + def _with_data(self, data: Any, copy: bool = True) -> csr_array: + return cast(csr_array, super()._with_data(data, copy)) + @property - def dim(self): + def dim(self) -> int: """Number of dimensions (always 2 for CSR arrays).""" return self.ndim @property - def nnz(self): + def nnz(self) -> int: """Number of stored values, including explicit zeros. Returns @@ -506,7 +543,12 @@ def nnz(self): return self.vals.shape[0] @property - def dtype(self): + def size(self) -> int: + """Number of stored values""" + return self.nnz + + @property + def dtype(self) -> npt.dtype[Any]: """Data type of the array. 
Returns @@ -518,7 +560,7 @@ def dtype(self): return self._dtype # Enable direct operation on the values array. - def get_data(self): + def get_data(self) -> cn.ndarray: """Get the data array of the CSR matrix. Returns @@ -529,7 +571,7 @@ def get_data(self): return store_to_cupynumeric_array(self.vals) # From array, - def set_data(self, data): + def set_data(self, data: cn.ndarray) -> None: """Set the data array of the CSR matrix. Parameters @@ -542,9 +584,9 @@ def set_data(self, data): AssertionError If data is not a cupynumeric.ndarray. """ - if isinstance(data, numpy.ndarray): - data = cupynumeric.array(data) - assert isinstance(data, cupynumeric.ndarray) + if isinstance(data, np.ndarray): + data = cn.array(data) + assert isinstance(data, cn.ndarray) self.vals = get_store_from_cupynumeric_array(data) self._dtype = data.dtype @@ -553,7 +595,7 @@ def set_data(self, data): ) # Enable direct operation on the indices array. - def get_indices(self): + def get_indices(self) -> cn.ndarray: """Get the column indices array of the CSR matrix. Returns @@ -563,7 +605,7 @@ def get_indices(self): """ return store_to_cupynumeric_array(self.crd) - def set_indices(self, indices): + def set_indices(self, indices: cn.ndarray) -> None: """Set the column indices array of the CSR matrix. Parameters @@ -581,19 +623,21 @@ def set_indices(self, indices): Setting new indices will mark the matrix as not having sorted indices and not being in canonical format. 
""" - if isinstance(indices, numpy.ndarray): - indices = cupynumeric.array(indices) - assert isinstance(indices, cupynumeric.ndarray) + if isinstance(indices, np.ndarray): + indices = cn.array(indices) + assert isinstance(indices, cn.ndarray) self.crd = get_store_from_cupynumeric_array(indices) # we can't guarantee new indices are sorted self.canonical_format = False self.indices_sorted = False indices = property( - fget=get_indices, fset=set_indices, doc="CSR format index array of the matrix" + fget=get_indices, + fset=set_indices, + doc="CSR format index array of the matrix", ) - def get_indptr(self): + def get_indptr(self) -> cn.ndarray: """Get the index pointer array of the CSR matrix. Returns @@ -605,14 +649,14 @@ def get_indptr(self): """ row_start_st, row_end_st = unpack_rect1_store(self.pos) row_start = store_to_cupynumeric_array(row_start_st) - return cupynumeric.append(row_start, [self.nnz]) + return cn.append(row_start, [self.nnz]) # Disallow changing intptrs directly indptr = property( fget=get_indptr, doc="CSR format index pointer array of the matrix" ) - def _get_row_indices(self): + def _get_row_indices(self) -> cn.ndarray: """Helper routine that converts pos to row indices. This internal method expands the compressed row storage format's position @@ -638,7 +682,7 @@ def _get_row_indices(self): task.execute() return store_to_cupynumeric_array(row_indices) - def has_sorted_indices(self): + def has_sorted_indices(self) -> bool: """Determine whether the matrix has sorted indices. Returns @@ -648,7 +692,7 @@ def has_sorted_indices(self): """ return self.indices_sorted - def has_canonical_format(self): + def has_canonical_format(self) -> bool: """Determine whether the matrix is in canonical format. Returns @@ -665,7 +709,7 @@ def has_canonical_format(self): return self.canonical_format # The rest of the methods - def diagonal(self, k=0): + def diagonal(self, k: int = 0) -> cn.ndarray: """Return the k-th diagonal of the matrix. 
Parameters @@ -691,7 +735,7 @@ def diagonal(self, k=0): """ rows, cols = self.shape if k <= -rows or k >= cols: - return cupynumeric.empty(0, dtype=self.dtype) + return cn.empty(0, dtype=self.dtype) output = runtime.create_store( self.dtype, shape=Shape((min(rows + min(k, 0), cols - max(k, 0)),)) ) @@ -713,7 +757,9 @@ def diagonal(self, k=0): task.execute() return store_to_cupynumeric_array(output) - def todense(self, order=None, out=None): + def todense( + self, order: str | None = None, out: cn.ndarray | None = None + ) -> cn.ndarray: """Return a dense matrix representation of this matrix. Parameters @@ -744,25 +790,25 @@ def todense(self, order=None, out=None): if order is not None: raise NotImplementedError if out is not None: - out = cupynumeric.array(out) + out = cn.array(out) if out.dtype != self.dtype: raise ValueError( f"Output type {out.dtype} is not consistent with dtype {self.dtype}" ) - out = get_store_from_cupynumeric_array(out) + out_store = get_store_from_cupynumeric_array(out) elif out is None: - out = runtime.create_store(self.dtype, shape=self.shape) + out_store = runtime.create_store(self.dtype, shape=self.shape) task = runtime.create_manual_task(SparseOpCode.CSR_TO_DENSE, (1,)) self.pos.promote(1, self.shape[1]) - task.add_output(out) + task.add_output(out_store) task.add_input(self.pos) task.add_input(self.crd) task.add_input(self.vals) task.execute() - return store_to_cupynumeric_array(out) + return store_to_cupynumeric_array(out_store) - def multiply(self, other): + def multiply(self, other: Any) -> csr_array: """Point-wise multiplication by another matrix, vector, or scalar. Parameters @@ -779,9 +825,9 @@ def multiply(self, other): ----- This is equivalent to the * operator. """ - return self * other + return cast(csr_array, self * other) - def __rmul__(self, other): + def __rmul__(self, other: Any) -> csr_array: """Right multiplication by a scalar. 
Parameters @@ -794,10 +840,10 @@ def __rmul__(self, other): csr_array The result of the multiplication. """ - return self * other + return cast(csr_array, self * other) # This is an element-wise operation now. - def __mul__(self, other): + def __mul__(self, other: Any) -> csr_array: """Element-wise multiplication. Parameters @@ -820,10 +866,10 @@ def __mul__(self, other): Currently only supports scalar multiplication. Array multiplication is not implemented. """ - if isinstance(other, numpy.ndarray): - other = cupynumeric.array(other) + if isinstance(other, np.ndarray): + other = cn.array(other) - if cupynumeric.ndim(other) == 0: + if cn.ndim(other) == 0: # If we have a scalar, then do an element-wise multiply on the # values array. new_vals = store_to_cupynumeric_array(self.vals) * other @@ -832,7 +878,7 @@ def __mul__(self, other): raise NotImplementedError # rmatmul represents the operation other @ self. - def __rmatmul__(self, other): + def __rmatmul__(self, other: Any) -> cn.ndarray | csr_array: """Right matrix multiplication (other @ self). Parameters @@ -858,7 +904,7 @@ def __rmatmul__(self, other): # Handle dense @ CSR raise NotImplementedError - def __matmul__(self, other): + def __matmul__(self, other: Any) -> cn.ndarray | csr_array: """Matrix multiplication (self @ other). Parameters @@ -877,7 +923,9 @@ def __matmul__(self, other): """ return self.dot(other) - def _compare_scalar(self, other, op): + def _compare_scalar( + self, other: object, op: Callable[..., cn.ndarray] + ) -> csr_array: """Helper method for element-wise comparison operations with scalars. This methods returns a boolean CSR array with True values where the comparison for op returns True. 
@@ -898,7 +946,7 @@ def _compare_scalar(self, other, op): mask = op(store_to_cupynumeric_array(self.vals), other) col_indices = store_to_cupynumeric_array(self.crd)[mask] row_indices = self._get_row_indices()[mask] - vals = cupynumeric.ones(row_indices.size, dtype=bool) + vals = cn.ones(row_indices.size, dtype=bool) # NOTE: # If the data was already sorted by rows and cols in self, @@ -906,12 +954,10 @@ def _compare_scalar(self, other, op): # but there's no clean way to pass to the class that the data # is already sorted return csr_array( - (vals, (row_indices, col_indices)), - shape=self.shape, - dtype=bool, + (vals, (row_indices, col_indices)), shape=self.shape, dtype=bool ) - def __gt__(self, other): + def __gt__(self, other: object) -> csr_array: """Element-wise greater than comparison with a scalar value. This operates only on the existing non-zero elements of the matrix. @@ -936,9 +982,9 @@ def __gt__(self, other): >>> A = csr_array(...) >>> mask = A > 0.5 # Returns boolean CSR array """ - return self._compare_scalar(other, cupynumeric.greater) + return self._compare_scalar(other, cn.greater) - def __lt__(self, other): + def __lt__(self, other: object) -> csr_array: """Element-wise less than comparison with a scalar value. This operates only on the existing non-zero elements of the matrix. @@ -963,9 +1009,9 @@ def __lt__(self, other): >>> A = csr_array(...) >>> mask = A < 0.5 # Returns boolean CSR array """ - return self._compare_scalar(other, cupynumeric.less) + return self._compare_scalar(other, cn.less) - def __ge__(self, other): + def __ge__(self, other: object) -> csr_array: """Element-wise greater than or equal comparison with a scalar value. This operates only on the existing non-zero elements of the matrix. @@ -990,9 +1036,9 @@ def __ge__(self, other): >>> A = csr_array(...) 
>>> mask = A >= 0.5 # Returns boolean CSR array """ - return self._compare_scalar(other, cupynumeric.greater_equal) + return self._compare_scalar(other, cn.greater_equal) - def __le__(self, other): + def __le__(self, other: object) -> csr_array: """Element-wise less than or equal comparison with a scalar value. This operates only on the existing non-zero elements of the matrix. @@ -1017,9 +1063,9 @@ def __le__(self, other): >>> A = csr_array(...) >>> mask = A <= 0.5 # Returns boolean CSR array """ - return self._compare_scalar(other, cupynumeric.less_equal) + return self._compare_scalar(other, cn.less_equal) - def __eq__(self, other): + def __eq__(self, other: object) -> csr_array: # type: ignore [override] """Element-wise equality comparison with a scalar value. This operates only on the existing non-zero elements of the matrix. @@ -1044,9 +1090,9 @@ def __eq__(self, other): >>> A = csr_array(...) >>> mask = A == 0.5 # Returns boolean CSR array """ - return self._compare_scalar(other, cupynumeric.equal) + return self._compare_scalar(other, cn.equal) - def __ne__(self, other): + def __ne__(self, other: object) -> csr_array: # type: ignore [override] """Element-wise not equal comparison with a scalar value. This operates only on the existing non-zero elements of the matrix. @@ -1071,9 +1117,11 @@ def __ne__(self, other): >>> A = csr_array(...) >>> mask = A != 0.5 # Returns boolean CSR array """ - return self._compare_scalar(other, cupynumeric.not_equal) + return self._compare_scalar(other, cn.not_equal) - def __setitem__(self, key, value): + def __setitem__( + self, key: csr_array | csr_matrix, value: Any + ) -> csr_array: """Set values in the matrix using a boolean CSR mask. 
Parameters @@ -1118,7 +1166,9 @@ def __setitem__(self, key, value): assert key.shape == self.shape assert key.dtype == bool - value_store = runtime.legate_runtime.create_store_from_scalar(Scalar(value)) + value_store = runtime.legate_runtime.create_store_from_scalar( + Scalar(value) + ) # launch c++ task task = runtime.create_auto_task(SparseOpCode.CSR_INDEXING_CSR) @@ -1144,7 +1194,76 @@ def __setitem__(self, key, value): return self - def dot(self, other, out=None): + def __neg__(self) -> csr_array: + """Return -self (negation of all values).""" + return self._with_data( + -store_to_cupynumeric_array(self.vals), copy=True + ) + + # self - other + def __sub__(self, other) -> csr_array: + if is_scalar_like(other): + if other == 0: + return self.copy() + raise NotImplementedError( + "Subtraction of a scalar from a Legate Sparse array " + "will break sparsity and is not supported." + "Use the method data() to manipulate only the nonzeros." + ) + elif is_sparse(other): + if other.shape != self.shape: + raise ValueError( + "Inconsistent shapes: ({self.shape}, {other.shape})" + ) + return geam(self, other, 1.0, -1.0, None) + elif is_dense(other): + return self.todense() - other + + return NotImplemented + + # other - self + def __rsub__(self, other: csr_array) -> csr_array: + if is_scalar_like(other): + if other == 0: + return -self.copy() + raise NotImplementedError( + "Subtraction of a scalar from a Legate Sparse array " + "will break sparsity and is not supported." + "Use the method data() to manipulate only the nonzeros." + ) + elif is_dense(other): + return other - self.todense() + + return NotImplemented + + # self + other + def __add__(self, other): + if is_scalar_like(other): + if other == 0: + return self.copy() + raise NotImplementedError( + "Addition of a scalar to a Legate Sparse array " + "will break sparsity and is not supported." + "Use the method data() to manipulate only the nonzeros." 
+ ) + elif is_sparse(other): + if other.shape != self.shape: + raise ValueError( + "Inconsistent shapes: ({self.shape}, {other.shape})" + ) + return geam(self, other, 1.0, 1.0, None) + elif is_dense(other): + return self.todense() + other + + return NotImplemented + + # other + self + def __radd__(self, other): + return self.__add__(other) + + def dot( + self, other: cn.ndarray | csr_array, out: cn.ndarray | None = None + ) -> cn.ndarray | csr_array: """Ordinary dot product. Parameters @@ -1200,7 +1319,7 @@ def dot(self, other, out=None): """ # If output specified - it should be cupynumeric array if out is not None: - assert isinstance(out, cupynumeric.ndarray) + assert isinstance(out, cn.ndarray) # only floating point operations are supported by cusparse at the moment if runtime.num_gpus > 0: @@ -1214,10 +1333,12 @@ def dot(self, other, out=None): raise NotImplementedError(msg) # If other.shape = (M,) then it's SpMV - if len(other.shape) == 1 or (len(other.shape) == 2 and other.shape[1] == 1): + if len(other.shape) == 1 or ( + len(other.shape) == 2 and other.shape[1] == 1 + ): # convert X to the cupynumeric array if needed - if not isinstance(other, cupynumeric.ndarray): - other = cupynumeric.array(other) + if not isinstance(other, cn.ndarray): + other = cn.array(other) assert self.shape[1] == other.shape[0] # for the case of X shape == (M, 1) other_originally_2d = False @@ -1233,11 +1354,13 @@ def dot(self, other, out=None): category=RuntimeWarning, stacklevel=level, ) - other = cupynumeric.array(other) + other = cn.array(other) # Coerce A and x into a common type. Use that coerced type # to find the type of the output. 
- A, x = cast_to_common_type(self, other) + common_dtype = find_common_type(self, other) + A = self.astype(common_dtype, copy=False) + x = other.astype(common_dtype, copy=False) if out is None: y = store_to_cupynumeric_array( runtime.create_store(A.dtype, shape=(self.shape[0],)) @@ -1272,12 +1395,16 @@ def dot(self, other, out=None): if out is not None: raise ValueError("Cannot provide out for CSRxCSR matmul.") assert self.shape[1] == other.shape[0] - return spgemm_csr_csr_csr(*cast_to_common_type(self, other)) + common_dtype = find_common_type(self, other) + return spgemm_csr_csr_csr( + self.astype(common_dtype, copy=False), + other.astype(common_dtype, copy=False), + ) else: raise NotImplementedError # Misc - def _getpos(self): + def _getpos(self) -> list[tuple[int, int]]: """Helper method to get row start and end positions. This internal method unpacks the compressed row storage format's position array @@ -1295,7 +1422,7 @@ def _getpos(self): row_end = store_to_cupynumeric_array(row_end_st) return [(i, j) for (i, j) in zip(row_start, row_end)] - def copy(self): + def copy(self) -> csr_array: """Returns a copy of this matrix. Returns @@ -1305,7 +1432,7 @@ def copy(self): """ return csr_array(self, dtype=self.dtype) - def conj(self, copy=True): + def conj(self, copy: bool = True) -> csr_array: """Element-wise complex conjugate. Parameters @@ -1329,7 +1456,9 @@ def conj(self, copy=True): get_store_from_cupynumeric_array(self.data.conj()), copy=False ) - def transpose(self, axes=None, copy=False): + def transpose( + self, axes: Any | None = None, copy: bool = False + ) -> csr_array: """Reverses the dimensions of the sparse matrix. 
Parameters @@ -1373,7 +1502,9 @@ def transpose(self, axes=None, copy=False): task.execute() # sort - sort_mask = cupynumeric.argsort(self.crd, kind="stable") + sort_mask = cn.argsort( + store_to_cupynumeric_array(self.crd), kind="stable" + ) new_rows = self.get_indices()[sort_mask] new_ci = store_to_cupynumeric_array(rows_expanded)[sort_mask] new_data = self.get_data()[sort_mask] @@ -1388,7 +1519,7 @@ def transpose(self, axes=None, copy=False): T = property(transpose, doc="Transpose of the matrix") - def asformat(self, format, copy=False): + def asformat(self, format: str | None, copy: bool = False) -> csr_array: """Convert this matrix to a specified format. Parameters @@ -1417,7 +1548,16 @@ def asformat(self, format, copy=False): else: raise NotImplementedError("Only CSR format is supported right now") - def tocsr(self, copy=False): + # correct return type value on this subclass + def astype( + self, + dtype: npt.dtype[Any], + casting: CastingKind = "unsafe", + copy: bool = True, + ) -> csr_array: + return cast(csr_array, super().astype(dtype, casting, copy)) + + def tocsr(self, copy: bool = False) -> csr_array: """Convert this matrix to a CSR matrix. Parameters @@ -1439,7 +1579,7 @@ def tocsr(self, copy=False): return self.copy().tocsr(copy=False) return self - def nonzero(self): + def nonzero(self) -> tuple[cn.ndarray, cn.ndarray]: """Return the indices of the non-zero elements. 
Returns @@ -1455,13 +1595,15 @@ def nonzero(self): """ task = runtime.create_auto_task(SparseOpCode.EXPAND_POS_TO_COORDINATES) - row_indices = runtime.create_store(coord_ty, shape=self.crd.shape) - row_indices_part = task.add_output(row_indices) + row_indices_store = runtime.create_store( + coord_ty, shape=self.crd.shape + ) + row_indices_part = task.add_output(row_indices_store) pos_part = task.add_input(self.pos) task.add_constraint(image(pos_part, row_indices_part)) task.execute() - row_indices = store_to_cupynumeric_array(row_indices) + row_indices = store_to_cupynumeric_array(row_indices_store) col_indices = store_to_cupynumeric_array(self.crd) vals_array = store_to_cupynumeric_array(self.vals) mask = vals_array != 0.0 @@ -1474,7 +1616,7 @@ def nonzero(self): # spmv computes y = A @ x. -def spmv(A: csr_array, x: cupynumeric.ndarray, y: cupynumeric.ndarray): +def spmv(A: csr_array, x: cn.ndarray, y: cn.ndarray) -> None: """Perform sparse matrix vector product y = A @ x. Parameters @@ -1506,10 +1648,16 @@ def spmv(A: csr_array, x: cupynumeric.ndarray, y: cupynumeric.ndarray): x_var = task.add_input(x_store) task.add_constraint(align(y_var, pos_var)) - task.add_constraint(image(pos_var, crd_var, hint=ImageComputationHint.FIRST_LAST)) - task.add_constraint(image(pos_var, vals_var, hint=ImageComputationHint.FIRST_LAST)) + task.add_constraint( + image(pos_var, crd_var, hint=ImageComputationHint.FIRST_LAST) + ) + task.add_constraint( + image(pos_var, vals_var, hint=ImageComputationHint.FIRST_LAST) + ) # exact or approximate image to X - task.add_constraint(image(crd_var, x_var, hint=ImageComputationHint.MIN_MAX)) + task.add_constraint( + image(crd_var, x_var, hint=ImageComputationHint.MIN_MAX) + ) task.execute() @@ -1553,7 +1701,7 @@ def spgemm_csr_csr_csr(A: csr_array, B: csr_array) -> csr_array: if runtime.num_gpus > 0: # replacement for the ImagePartition functor to get dense image # for rows of B, run separate task for this - pos_rect = 
runtime.create_store(rect1, shape=(A.shape[0],)) # type: ignore + pos_rect = runtime.create_store(rect1, shape=(A.shape[0],)) task = runtime.create_auto_task(SparseOpCode.FAST_IMAGE_RANGE) A_pos_part = task.add_input(A.pos) A_crd_part = task.add_input(A.crd) @@ -1566,7 +1714,7 @@ def spgemm_csr_csr_csr(A: csr_array, B: csr_array) -> csr_array: task.execute() - pos = runtime.create_store(rect1, shape=(A.shape[0],)) # type: ignore + pos = runtime.create_store(rect1, shape=(A.shape[0],)) crd = runtime.create_store(coord_ty, ndim=1) vals = runtime.create_store(A.dtype, ndim=1) @@ -1605,7 +1753,9 @@ def spgemm_csr_csr_csr(A: csr_array, B: csr_array) -> csr_array: # Array class should provide this functionality task.add_constraint(align(A_pos_part, B_pos_image_part)) task.add_constraint( - image(B_pos_image_part, B_pos_part, hint=ImageComputationHint.MIN_MAX) + image( + B_pos_image_part, B_pos_part, hint=ImageComputationHint.MIN_MAX + ) ) task.add_constraint( @@ -1659,11 +1809,11 @@ def spgemm_csr_csr_csr(A: csr_array, B: csr_array) -> csr_array: task.execute() - pos, nnz = CompressedBase.nnz_to_pos_cls(q_nnz) + pos, nnz_value = CompressedBase.nnz_to_pos_cls(q_nnz) # Block and convert the nnz future into an int. - nnz = int(nnz) - crd = runtime.create_store(coord_ty, shape=(nnz,)) - vals = runtime.create_store(A.dtype, shape=(nnz,)) + nnz = int(nnz_value) + crd = runtime.create_store(coord_ty, shape=Shape((nnz,))) + vals = runtime.create_store(A.dtype, shape=Shape((nnz,))) task = runtime.create_auto_task(SparseOpCode.SPGEMM_CSR_CSR_CSR) C_pos_part_out = task.add_output(pos) @@ -1692,7 +1842,124 @@ def spgemm_csr_csr_csr(A: csr_array, B: csr_array) -> csr_array: task.add_constraint(image(B_pos_part, B_vals_part)) task.execute() + return csr_array((vals, crd, pos), shape=(A.shape[0], B.shape[1])) + + +def geam(A: csr_array, B: csr_array, alpha: Any, beta: Any, C=None): + """Compute C = alpha * A + beta * B for CSR matrices. 
+ + Parameters + ---------- + A : csr_array + First input sparse matrix. + B : csr_array + Second input sparse matrix. Must have same shape as A. + alpha : scalar-like + Scalar multiplier for A. Will be cast to A.dtype. + beta : scalar-like + Scalar multiplier for B. Will be cast to A.dtype. + C : csr_array, optional + Output sparse matrix. If provided, must have the correct sparsity + pattern to hold the result. If None, a new matrix is allocated. + + Returns + ------- + csr_array + The result C = alpha * A + beta * B. + + Notes + ----- + If C is provided, it is the user's responsibility to ensure the sparsity + pattern matches the result. Behavior is undefined otherwise. + + alpha and beta may be integers, floats, or complex values. They are + converted to A.dtype before computation. For complex inputs, A.dtype + should be a complex dtype to preserve the imaginary component. + """ + + if C is None: + perform_symbolic_phase = True + else: + # If C is provided, assume it has the correct sparsity pattern + assert isinstance(C, csr_array), "C must be a Legate Sparse CSR array" + perform_symbolic_phase = False + + # Symbolic phase: compute the sparsity pattern of the result + if perform_symbolic_phase: + nnz_per_row = runtime.create_store(nnz_ty, A.pos.shape) + task = runtime.create_auto_task(SparseOpCode.GEAM_CSR_CSR_SYMBOLIC) + A_pos_part = task.add_input(A.pos) + A_crd_part = task.add_input(A.crd) + B_pos_part = task.add_input(B.pos) + B_crd_part = task.add_input(B.crd) + nnz_per_row_part = task.add_output(nnz_per_row) + + task.add_constraint(image(A_pos_part, A_crd_part)) + task.add_constraint(image(B_pos_part, B_crd_part)) + task.add_constraint(align(A_pos_part, B_pos_part)) + task.add_constraint(align(A_pos_part, nnz_per_row_part)) + + task.execute() + + # Compute C_pos from nnz_per_row using the helper from CompressedBase + C_pos, nnz_scalar = CompressedBase.nnz_to_pos_cls(nnz_per_row) + nnz_total = int(nnz_scalar) + + # Allocate output arrays if needed + if 
perform_symbolic_phase: + C_vals = runtime.create_store(A.dtype, shape=(nnz_total,)) + C_crd = runtime.create_store(coord_ty, shape=(nnz_total,)) + else: + C_vals = C.vals + C_crd = C.crd + C_pos = C.pos + + # Create scalar stores for alpha and beta + alpha_store = runtime.legate_runtime.create_store_from_scalar( + Scalar(A.dtype.type(alpha)) + ) + beta_store = runtime.legate_runtime.create_store_from_scalar( + Scalar(A.dtype.type(beta)) + ) + + # Compute phase: C = alpha * A + beta * B + task = runtime.create_auto_task(SparseOpCode.GEAM_CSR_CSR_COMPUTE) + + # Inputs (order must match C++ template expectations) + A_pos_part = task.add_input(A.pos) + A_crd_part = task.add_input(A.crd) + A_vals_part = task.add_input(A.vals) + B_pos_part = task.add_input(B.pos) + B_crd_part = task.add_input(B.crd) + B_vals_part = task.add_input(B.vals) + + # C_pos is an INPUT (already computed in symbolic phase) + C_pos_part = task.add_input(C_pos) + + # C_crd and C_vals are outputs + C_crd_part = task.add_output(C_crd) + C_vals_part = task.add_output(C_vals) + + # Scalar inputs (alpha and beta) + task.add_input(alpha_store) + task.add_input(beta_store) + + # Align row partitions: A, B, C all partitioned by the same rows + task.add_constraint(align(A_pos_part, B_pos_part)) + task.add_constraint(align(A_pos_part, C_pos_part)) + + # Image constraints: crd and vals are partitioned via pos + task.add_constraint(image(A_pos_part, A_crd_part)) + task.add_constraint(image(A_pos_part, A_vals_part)) + task.add_constraint(image(B_pos_part, B_crd_part)) + task.add_constraint(image(B_pos_part, B_vals_part)) + task.add_constraint(image(C_pos_part, C_crd_part)) + task.add_constraint(image(C_pos_part, C_vals_part)) + + task.execute() + + if perform_symbolic_phase: return csr_array( - (vals, crd, pos), - shape=Shape((A.shape[0], B.shape[1])), + (C_vals, C_crd, C_pos), shape=A.shape, dtype=A.dtype, copy=False ) + return C diff --git a/legate_sparse/dia.py b/legate_sparse/dia.py index 
20f2dc5c..4fc035ca 100644 --- a/legate_sparse/dia.py +++ b/legate_sparse/dia.py @@ -44,10 +44,13 @@ # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +from __future__ import annotations -import cupynumeric -import numpy -import scipy # type: ignore +from typing import TYPE_CHECKING + +import cupynumeric as cn +import numpy as np +import scipy from .base import CompressedBase from .coverage import clone_scipy_arr_kind @@ -59,6 +62,11 @@ store_to_cupynumeric_array, ) +if TYPE_CHECKING: + from typing import Any + + import numpy.typing as npt + # Temporary implementation for matrix generation in examples @clone_scipy_arr_kind(scipy.sparse.dia_array) @@ -128,7 +136,13 @@ class dia_array(CompressedBase): [0, 7, 9]]) """ - def __init__(self, arg, shape=None, dtype=None, copy=False): + def __init__( + self, + arg: tuple[cn.ndarray, cn.ndarray], + shape: tuple[int, ...] | None = None, + dtype: npt.dtype[Any] | None = None, + copy: bool = False, + ) -> None: """Initialize a DIA array. Parameters @@ -169,14 +183,14 @@ def __init__(self, arg, shape=None, dtype=None, copy=False): assert isinstance(arg, tuple) data, offsets = arg if isinstance(offsets, int): - offsets = cupynumeric.full((1,), offsets) + offsets = cn.full((1,), offsets) data, offsets = cast_arr(data), cast_arr(offsets) if dtype is not None: data = data.astype(dtype) dtype = data.dtype assert dtype is not None - if not isinstance(dtype, numpy.dtype): - dtype = numpy.dtype(dtype) + if not isinstance(dtype, np.dtype): + dtype = np.dtype(dtype) self.dtype = dtype # Ensure that we don't accidentally include ndarray @@ -185,10 +199,10 @@ def __init__(self, arg, shape=None, dtype=None, copy=False): # legate under the hood. 
self.shape = tuple(int(i) for i in shape) self._offsets = get_store_from_cupynumeric_array(offsets, copy=copy) - self._data = get_store_from_cupynumeric_array(data, copy=copy) + self._store = get_store_from_cupynumeric_array(data, copy=copy) @property - def nnz(self): + def nnz(self) -> int: """Number of stored values, including explicit zeros. Returns @@ -211,7 +225,7 @@ def nnz(self): return int(nnz) @property - def data(self): + def data(self) -> cn.ndarray: """Get the data array of the DIA matrix. Returns @@ -220,10 +234,10 @@ def data(self): The data array containing the diagonal values. Each row represents a diagonal, with shape (n_diagonals, max_diagonal_length). """ - return store_to_cupynumeric_array(self._data) + return store_to_cupynumeric_array(self._store) @property - def offsets(self): + def offsets(self) -> cn.ndarray: """Get the offsets array of the DIA matrix. Returns @@ -235,7 +249,7 @@ def offsets(self): """ return store_to_cupynumeric_array(self._offsets) - def copy(self): + def copy(self) -> dia_array: """Returns a copy of this matrix. Returns @@ -243,11 +257,13 @@ def copy(self): dia_array A copy of the matrix with the same data and structure. """ - data = cupynumeric.array(self.data) - offsets = cupynumeric.array(self.offsets) + data = cn.array(self.data) + offsets = cn.array(self.offsets) return dia_array((data, offsets), shape=self.shape, dtype=self.dtype) - def transpose(self, axes=None, copy=False): + def transpose( + self, axes: tuple[int, ...] | None = None, copy: bool = False + ) -> dia_array: """Reverses the dimensions of the sparse matrix. 
Parameters @@ -295,13 +311,13 @@ def transpose(self, axes=None, copy=False): offsets = -self.offsets # re-align the data matrix - r = cupynumeric.arange(len(offsets), dtype=coord_ty)[:, None] - c = cupynumeric.arange(num_rows, dtype=coord_ty) - (offsets % max_dim)[:, None] + r = cn.arange(len(offsets), dtype=coord_ty)[:, None] + c = cn.arange(num_rows, dtype=coord_ty) - (offsets % max_dim)[:, None] pad_amount = max(0, max_dim - self.data.shape[1]) - data = cupynumeric.hstack( + data = cn.hstack( ( self.data, - cupynumeric.zeros( + cn.zeros( (self.data.shape[0], pad_amount), dtype=self.data.dtype ), ) @@ -316,7 +332,7 @@ def transpose(self, axes=None, copy=False): T = property(transpose, doc="Transpose of the matrix") - def tocsr(self, copy=False): + def tocsr(self, copy: bool = False) -> csr_array: """Convert this matrix to a CSR matrix. Parameters @@ -341,7 +357,7 @@ def tocsr(self, copy=False): return self.transpose(copy=copy)._tocsr_transposed(copy=False) # This routine is lifted from scipy.sparse's converter. - def _tocsr_transposed(self, copy=False): + def _tocsr_transposed(self, copy: bool = False) -> csr_array: """Convert the transposed DIA matrix to CSR format. This internal method converts a transposed DIA matrix to CSR format. 
@@ -374,7 +390,7 @@ def _tocsr_transposed(self, copy=False): num_rows, num_cols = self.shape num_offsets, offset_len = self.data.shape - offset_inds = cupynumeric.arange(offset_len) + offset_inds = cn.arange(offset_len) row = offset_inds - self.offsets[:, None] mask = row >= 0 @@ -383,14 +399,14 @@ def _tocsr_transposed(self, copy=False): mask &= self.data != 0 idx_dtype = coord_ty - indptr = cupynumeric.zeros(num_cols + 1, dtype=idx_dtype) + indptr = cn.zeros(num_cols + 1, dtype=idx_dtype) # note that the output dtype in a reduction (e.g, sum) determines # the dtype of the accumulator that is used in the reduction # in cupynumeric, it looks like the output dtype is set to the src # dtype if unspecified and that results in the output not performing # an integer sum. But we want the integer sum, so specify # dtype as idx_dtype to mask.sum() - indptr[1 : offset_len + 1] = cupynumeric.cumsum( + indptr[1 : offset_len + 1] = cn.cumsum( mask.sum(axis=0, dtype=idx_dtype)[:num_cols] ) if offset_len < num_cols: @@ -398,7 +414,10 @@ def _tocsr_transposed(self, copy=False): indices = row.T[mask.T].astype(idx_dtype, copy=False) data = self.data.T[mask.T] return csr_array( - (data, indices, indptr), shape=self.shape, dtype=self.dtype, copy=False + (data, indices, indptr), + shape=self.shape, + dtype=self.dtype, + copy=False, ) diff --git a/legate_sparse/gallery.py b/legate_sparse/gallery.py index 371a4c44..e5583e6e 100644 --- a/legate_sparse/gallery.py +++ b/legate_sparse/gallery.py @@ -66,15 +66,29 @@ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN # THE SOFTWARE. 
+from __future__ import annotations +from typing import TYPE_CHECKING -import cupynumeric -import numpy +import cupynumeric as cn +import numpy as np +from .csr import csr_array from .dia import dia_array +if TYPE_CHECKING: + from typing import Any, Sequence -def diags(diagonals, offsets=0, shape=None, format=None, dtype=None): + import numpy.typing as npt + + +def diags( + diagonals: Sequence[cn.ndarray], + offsets: Sequence[int] | int = 0, + shape: tuple[int, ...] | None = None, + format: str | None = None, + dtype: npt.dtype[Any] | None = None, +) -> csr_array | dia_array: """Construct a sparse matrix from diagonals. Parameters @@ -159,22 +173,25 @@ def diags(diagonals, offsets=0, shape=None, format=None, dtype=None): [ 0., 0., 0., 0.]]) """ # if offsets is not a sequence, assume that there's only one diagonal - if numpy.isscalar(offsets): + diags: list[cn.ndarray] + if np.isscalar(offsets): # now check that there's actually only one diagonal - if len(diagonals) == 0 or numpy.isscalar(diagonals[0]): - diagonals = [cupynumeric.atleast_1d(diagonals)] + if len(diagonals) == 0 or np.isscalar(diagonals[0]): + diags = [cn.atleast_1d(diagonals)] # type: ignore [list-item, arg-type] else: raise ValueError("Different number of diagonals and offsets.") else: - diagonals = list(map(cupynumeric.atleast_1d, diagonals)) + diags = cn.atleast_1d(*diagonals) # type: ignore [assignment] + + assert not isinstance(offsets, int) # Basic check - if len(diagonals) != len(offsets): + if len(diags) != len(offsets): raise ValueError("Different number of diagonals and offsets.") # Determine shape, if omitted if shape is None: - m = len(diagonals[0]) + abs(int(offsets[0])) + m = len(diags[0]) + abs(int(offsets[0])) shape = (m, m) # Determine data type, if omitted @@ -187,34 +204,38 @@ def diags(diagonals, offsets=0, shape=None, format=None, dtype=None): # Construct data array m, n = shape - M = max([min(m + offset, n - offset) + max(0, offset) for offset in offsets]) + M = max( + [min(m + 
offset, n - offset) + max(0, offset) for offset in offsets] + ) M = max(0, M) - data_arr = cupynumeric.zeros((len(offsets), M), dtype=dtype) + data_arr = cn.zeros((len(offsets), M), dtype=dtype) K = min(m, n) - for j, diagonal in enumerate(diagonals): + for j, diag in enumerate(diags): offset = int(offsets[j]) k = max(0, offset) length = min(m + offset, n - offset, K) if length < 0: - raise ValueError("Offset %d (index %d) out of bounds" % (offset, j)) + raise ValueError( + "Offset %d (index %d) out of bounds" % (offset, j) + ) try: - data_arr[j, k : k + length] = diagonal[..., :length] + data_arr[j, k : k + length] = diag[..., :length] except ValueError as e: - if len(diagonal) != length and len(diagonal) != 1: + if len(diag) != length and len(diag) != 1: raise ValueError( "Diagonal length (index %d: %d at offset %d) does not " "agree with matrix size (%d, %d)." - % (j, len(diagonal), offset, m, n) + % (j, len(diag), offset, m, n) ) from e raise # We importantly don't perform this conversion to cupynumeric (involving # an attach operation) until we're done indexing into the list. This # avoid a cupynumeric crash involving restrictions in attach in pde.py. 
- offsets = cupynumeric.atleast_1d(offsets) - dia = dia_array((data_arr, offsets), shape=(m, n), dtype=dtype) + offsets_array: cn.ndarray = cn.atleast_1d(offsets) # type: ignore [arg-type, assignment] + dia = dia_array((data_arr, offsets_array), shape=(m, n), dtype=dtype) if format == "csr": return dia.tocsr() return dia diff --git a/legate_sparse/install_info.py.in b/legate_sparse/install_info.py.in index 84799ee4..3ad3ecd1 100644 --- a/legate_sparse/install_info.py.in +++ b/legate_sparse/install_info.py.in @@ -11,9 +11,13 @@ # IMPORTANT: # * install_info.py is a generated file and should not be modified by hand +from __future__ import annotations + def get_libpath(): - import os, sys, platform + import os + import platform + import sys join = os.path.join exists = os.path.exists dirname = os.path.dirname @@ -32,10 +36,10 @@ def get_libpath(): return None return ( - find_liblegate_sparse(join(cn_path, "build", "lib")) or - find_liblegate_sparse(join(dirname(dirname(dirname(cn_path))), "lib")) or - find_liblegate_sparse(join(dirname(dirname(sys.executable)), "lib")) or - "" + find_liblegate_sparse(join(cn_path, "build", "lib")) + or find_liblegate_sparse(join(dirname(dirname(dirname(cn_path))), "lib")) + or find_liblegate_sparse(join(dirname(dirname(sys.executable)), "lib")) + or "" ) diff --git a/legate_sparse/io.py b/legate_sparse/io.py index ecaf8e3c..6ce90ba5 100644 --- a/legate_sparse/io.py +++ b/legate_sparse/io.py @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import annotations import numpy as np from legate.core import track_provenance, types @@ -22,8 +23,8 @@ from .utils import store_to_cupynumeric_array -@track_provenance(runtime.sparse_library) -def mmread(source): +@track_provenance() +def mmread(source: str) -> csr_array: """Read a sparse matrix from a Matrix Market (.mtx) file. 
Parameters @@ -59,28 +60,34 @@ def mmread(source): # TODO (rohany): We'll assume for now that all of the nodes in the system # can access the file passed in, so we don't need to worry about where this # task gets mapped to. - rows = runtime.create_store(coord_ty, ndim=1) - cols = runtime.create_store(coord_ty, ndim=1) - vals = runtime.create_store(float64, ndim=1) - m = runtime.create_store(coord_ty, optimize_scalar=True, shape=(1,)) - n = runtime.create_store(coord_ty, optimize_scalar=True, shape=(1,)) - nnz = runtime.create_store(nnz_ty, optimize_scalar=True, shape=(1,)) + rows_store = runtime.create_store(coord_ty, ndim=1) + cols_store = runtime.create_store(coord_ty, ndim=1) + vals_store = runtime.create_store(float64, ndim=1) + m_store = runtime.create_store(coord_ty, optimize_scalar=True, shape=(1,)) + n_store = runtime.create_store(coord_ty, optimize_scalar=True, shape=(1,)) + nnz_store = runtime.create_store(nnz_ty, optimize_scalar=True, shape=(1,)) task = runtime.create_auto_task(SparseOpCode.READ_MTX_TO_COO) - task.add_output(m) - task.add_output(n) - task.add_output(nnz) - task.add_output(rows) - task.add_output(cols) - task.add_output(vals) + task.add_output(m_store) + task.add_output(n_store) + task.add_output(nnz_store) + task.add_output(rows_store) + task.add_output(cols_store) + task.add_output(vals_store) task.add_scalar_arg(source, types.string_type) task.execute() - m = int(np.asarray(m.get_physical_store().get_inline_allocation())[0]) - n = int(np.asarray(n.get_physical_store().get_inline_allocation())[0]) - nnz = int(np.asarray(nnz.get_physical_store().get_inline_allocation())[0]) + m = int( + np.asarray(m_store.get_physical_store().get_inline_allocation())[0] + ) + n = int( + np.asarray(n_store.get_physical_store().get_inline_allocation())[0] + ) + nnz = int( + np.asarray(nnz_store.get_physical_store().get_inline_allocation())[0] + ) # Slice down each store from the resulting size into the actual size. 
sl = slice(0, nnz) - rows = store_to_cupynumeric_array(rows.slice(0, sl)) - cols = store_to_cupynumeric_array(cols.slice(0, sl)) - vals = store_to_cupynumeric_array(vals.slice(0, sl)) + rows = store_to_cupynumeric_array(rows_store.slice(0, sl)) + cols = store_to_cupynumeric_array(cols_store.slice(0, sl)) + vals = store_to_cupynumeric_array(vals_store.slice(0, sl)) return csr_array((vals, (rows, cols)), shape=(m, n)) diff --git a/legate_sparse/linalg.py b/legate_sparse/linalg.py index 82aa0edb..789cd2a9 100644 --- a/legate_sparse/linalg.py +++ b/legate_sparse/linalg.py @@ -93,15 +93,30 @@ """ +from __future__ import annotations + import inspect import warnings +from typing import TYPE_CHECKING, Protocol -import cupynumeric as np -from legate.core import track_provenance, types +import cupynumeric as cn +import numpy as np +from legate.core import align, image, track_provenance, types from .config import SparseOpCode from .runtime import runtime -from .utils import get_store_from_cupynumeric_array +from .utils import get_store_from_cupynumeric_array, store_to_cupynumeric_array + +if TYPE_CHECKING: + from typing import Any + + import numpy.typing as npt + + +class LOCallable(Protocol): + def __call__( + self, x: cn.ndarray, out: cn.ndarray | None = None + ) -> cn.ndarray: ... # We have to implement our own / copy the LinearOperator class from @@ -196,7 +211,7 @@ class LinearOperator: ndim = 2 - def __new__(cls, *args, **kwargs): + def __new__(cls, *args: Any, **kwargs: Any) -> LinearOperator: if cls is LinearOperator: # Operate as _CustomLinearOperator factory. return super(LinearOperator, cls).__new__(_CustomLinearOperator) @@ -216,7 +231,9 @@ def __new__(cls, *args, **kwargs): return obj - def __init__(self, dtype, shape): + def __init__( + self, dtype: npt.dtype[Any] | None, shape: tuple[int, ...] + ) -> None: """Initialize this LinearOperator. To be called by subclasses. 
``dtype`` may be None; ``shape`` should @@ -229,13 +246,21 @@ def __init__(self, dtype, shape): self.dtype = dtype self.shape = shape - def _init_dtype(self): + def _init_dtype(self) -> None: """Called from subclasses at the end of the __init__ routine.""" if self.dtype is None: - v = np.zeros(self.shape[-1]) - self.dtype = np.asarray(self.matvec(v)).dtype + v = cn.zeros(self.shape[-1]) + self.dtype = cn.asarray(self.matvec(v)).dtype + + def _matmat( + self, x: cn.ndarray, out: cn.ndarray | None = None + ) -> cn.ndarray: + """Default matrix-matrix multiplication handler.""" + raise NotImplementedError - def _matvec(self, x, out=None): + def _matvec( + self, x: cn.ndarray, out: cn.ndarray | None = None + ) -> cn.ndarray: """Default matrix-vector multiplication handler. If self is a linear operator of shape (M, N), then this method will @@ -247,7 +272,9 @@ def _matvec(self, x, out=None): """ raise NotImplementedError - def matvec(self, x, out=None): + def matvec( + self, x: cn.ndarray, out: cn.ndarray | None = None + ) -> cn.ndarray: """Matrix-vector multiplication. Performs the operation y=A*x where A is an MxN linear @@ -275,7 +302,7 @@ def matvec(self, x, out=None): if x.shape != (N,) and x.shape != (N, 1): raise ValueError("dimension mismatch") - y = np.asarray(self._matvec(x, out=out)) + y = cn.asarray(self._matvec(x, out=out)) if x.ndim == 1: # TODO (hme): This is a cuPyNumeric bug, reshape should accept an @@ -288,11 +315,15 @@ def matvec(self, x, out=None): return y - def _rmatvec(self, x, out=None): + def _rmatvec( + self, x: cn.ndarray, out: cn.ndarray | None = None + ) -> cn.ndarray: """Default implementation of _rmatvec; defers to adjoint.""" raise NotImplementedError - def rmatvec(self, x, out=None): + def rmatvec( + self, x: cn.ndarray, out: cn.ndarray | None = None + ) -> cn.ndarray: """Adjoint matrix-vector multiplication. 
Performs the operation y = A^H * x where A is an MxN linear @@ -320,14 +351,16 @@ def rmatvec(self, x, out=None): if x.shape != (M,) and x.shape != (M, 1): raise ValueError("dimension mismatch") - y = np.asarray(self._rmatvec(x, out=out)) + y = cn.asarray(self._rmatvec(x, out=out)) if x.ndim == 1: y = y.reshape(N) elif x.ndim == 2: y = y.reshape(N, 1) else: - raise ValueError("invalid shape returned by user-defined rmatvec()") + raise ValueError( + "invalid shape returned by user-defined rmatvec()" + ) return y @@ -337,88 +370,104 @@ def rmatvec(self, x, out=None): class _CustomLinearOperator(LinearOperator): """Linear operator defined in terms of user-specified operations.""" + _matvec_impl: LOCallable + _rmatvec_impl: LOCallable | None + def __init__( self, - shape, - matvec, - rmatvec=None, - matmat=None, - dtype=None, - rmatmat=None, - ): + shape: tuple[int, ...], + matvec: LOCallable, + rmatvec: LOCallable | None = None, + matmat: LOCallable | None = None, + dtype: npt.dtype[Any] | None = None, + rmatmat: LOCallable | None = None, + ) -> None: super().__init__(dtype, shape) self.args = () - self.__matvec_impl = matvec - self.__rmatvec_impl = rmatvec + self._matvec_impl = matvec + self._rmatvec_impl = rmatvec # Check if the implementations of matvec and rmatvec have the out= # parameter. 
- self._matvec_has_out = self._has_out(self.__matvec_impl) - self._rmatvec_has_out = self._has_out(self.__rmatvec_impl) + self._matvec_has_out = self._has_out(self._matvec_impl) + self._rmatvec_has_out = self._has_out(self._rmatvec_impl) self._init_dtype() - def _matvec(self, x, out=None): + def _matvec( + self, x: cn.ndarray, out: cn.ndarray | None = None + ) -> cn.ndarray: if self._matvec_has_out: - return self.__matvec_impl(x, out=out) + return self._matvec_impl(x, out=out) else: if out is None: - return self.__matvec_impl(x) + return self._matvec_impl(x) else: - out[:] = self.__matvec_impl(x) + out[:] = self._matvec_impl(x) return out - def _rmatvec(self, x, out=None): - func = self.__rmatvec_impl + def _rmatvec( + self, x: cn.ndarray, out: cn.ndarray | None = None + ) -> cn.ndarray: + assert self._rmatvec_impl is not None + func = self._rmatvec_impl if func is None: raise NotImplementedError("rmatvec is not defined") if self._rmatvec_has_out: - return self.__rmatvec_impl(x, out=out) + return self._rmatvec_impl(x, out=out) else: if out is None: - return self.__rmatvec_impl(x) + return self._rmatvec_impl(x) else: - result = self.__rmatvec_impl(x) + result = self._rmatvec_impl(x) out[:] = result return out - def _has_out(self, o): + def _has_out(self, o: LOCallable | None) -> bool: if o is None: return False sig = inspect.signature(o) - for key, param in sig.parameters.items(): - if key == "out": - return True - return False + return "out" in sig.parameters # _SparseMatrixLinearOperator is an overload of LinearOperator to wrap # sparse matrices as a linear operator. It caches the conjugate transpose # of the sparse matrices to avoid repeat conversions. 
class _SparseMatrixLinearOperator(LinearOperator): - def __init__(self, A): + AH: cn.ndarray | None + + def __init__(self, A: cn.ndarray) -> None: self.A = A self.AH = None super().__init__(A.dtype, A.shape) - def _matvec(self, x, out=None): + def _matvec( + self, x: cn.ndarray, out: cn.ndarray | None = None + ) -> cn.ndarray: return self.A.dot(x, out=out) - def _rmatvec(self, x, out=None): + def _rmatvec( + self, x: cn.ndarray, out: cn.ndarray | None = None + ) -> cn.ndarray: if self.AH is None: - self.AH = self.A.T.conj(copy=False) + self.AH = self.A.T.conj() + assert self.AH is not None return self.AH.dot(x, out=out) # IdentityOperator is a no-op linear operator, and is lifted from # scipy.sparse. class IdentityOperator(LinearOperator): - def __init__(self, shape, dtype=None): + def __init__( + self, shape: tuple[int, ...], dtype: npt.dtype[Any] | None = None + ) -> None: super().__init__(dtype, shape) - def _matvec(self, x, out=None): + def _matvec( + self, x: cn.ndarray, out: cn.ndarray | None = None + ) -> cn.ndarray: # If out is specified, copy the input into the output. if out is not None: out[:] = x @@ -428,7 +477,9 @@ def _matvec(self, x, out=None): # the input to avoid silently aliasing the input array. return x.copy() - def _rmatvec(self, x, out=None): + def _rmatvec( + self, x: cn.ndarray, out: cn.ndarray | None = None + ) -> cn.ndarray: # If out is specified, copy the input into the output. if out is not None: out[:] = x @@ -439,7 +490,7 @@ def _rmatvec(self, x, out=None): return x.copy() -def make_linear_operator(A): +def make_linear_operator(A: Any | LinearOperator) -> LinearOperator: """Convert a matrix to a LinearOperator. Parameters @@ -473,7 +524,14 @@ def make_linear_operator(A): # future operations to compute new futures, and avoids # allocating unnecessary futures. 
@track_provenance(nested=True) -def cg_axpby(y, x, a, b, isalpha=True, negate=False): +def cg_axpby( + y: cn.ndarray, + x: cn.ndarray, + a: cn.ndarray, + b: cn.ndarray, + isalpha: bool = True, + negate: bool = False, +) -> cn.ndarray: """Perform fused vector operation for CG solvers. This function performs the operation y = alpha * x + beta * y where @@ -526,7 +584,12 @@ def cg_axpby(y, x, a, b, isalpha=True, negate=False): return y -def _get_atol_rtol(b_norm, tol=None, atol=0.0, rtol=1e-5): +def _get_atol_rtol( + b_norm: float | cn.ndarray, + tol: float | None = None, + atol: float = 0.0, + rtol: float = 1e-5, +) -> tuple[float, float]: """Compute absolute and relative tolerances for convergence. Parameters @@ -561,17 +624,17 @@ def _get_atol_rtol(b_norm, tol=None, atol=0.0, rtol=1e-5): def cg( - A, - b, - x0=None, - tol=None, - maxiter=None, - M=None, - callback=None, - atol=0.0, - rtol=1e-5, - conv_test_iters=25, -): + A: Any | LinearOperator, + b: cn.ndarray, + x0: cn.ndarray | None = None, + tol: float | None = None, + maxiter: int | None = None, + M: Any | LinearOperator | None = None, + callback: Any | None = None, + atol: float = 0.0, + rtol: float = 1e-5, + conv_test_iters: int = 25, +) -> tuple[cn.ndarray, int]: """Solve a linear system using the Conjugate Gradient method. 
Parameters @@ -631,8 +694,8 @@ def cg( assert len(b.shape) == 1 or (len(b.shape) == 2 and b.shape[1] == 1) assert len(A.shape) == 2 and A.shape[0] == A.shape[1] - bnrm2 = np.linalg.norm(b) - atol, _ = _get_atol_rtol(bnrm2, tol, atol, rtol) + b_norm = cn.linalg.norm(b) + atol, _ = _get_atol_rtol(b_norm, tol, atol, rtol) n = b.shape[0] if maxiter is None: @@ -644,15 +707,15 @@ def cg( if M is None else make_linear_operator(M) ) - x = np.zeros(n) if x0 is None else x0.copy() - p = np.zeros(n) + x = cn.zeros(n) if x0 is None else x0.copy() + p = cn.zeros(n) # This implementation is adapted from CuPy's CG solve: # https://github.com/cupy/cupy/blob/master/cupyx/scipy/sparse/linalg/_iterative.py. # # Hold onto several temps to store allocations used in each iteration. r = b - A.matvec(x) iters = 0 - rho = 0 + rho: int | cn.ndarray = 0 z = None q = None @@ -679,9 +742,9 @@ def cg( iters += 1 if callback is not None: callback(x) - if (iters % conv_test_iters == 0 or iters == (maxiter - 1)) and np.linalg.norm( - r - ) < atol: + if ( + iters % conv_test_iters == 0 or iters == (maxiter - 1) + ) and cn.linalg.norm(r) < atol: converged = True # Test convergence every conv_test_iters iterations. break @@ -696,19 +759,19 @@ def cg( # This implementation of GMRES is lifted from the cupy implementation: # https://github.com/cupy/cupy/blob/9d2e2381ae7f33a42291d1bf8271484c9d2a55ac/cupyx/scipy/sparse/linalg/_iterative.py#L94. 
def gmres( - A, - b, - x0=None, - tol=None, - restart=None, - maxiter=None, - M=None, - callback=None, - restrt=None, - atol=0.0, - callback_type=None, - rtol=1e-5, -): + A: Any | LinearOperator, + b: cn.ndarray, + x0: cn.ndarray | None = None, + tol: float | None = None, + restart: int | None = None, + maxiter: int | None = None, + M: Any | LinearOperator | None = None, + callback: Any = None, + restrt: int | None = None, + atol: float = 0.0, + callback_type: str | None = None, + rtol: float = 1e-5, +) -> tuple[cn.ndarray, int]: """Solve a linear system using the Generalized Minimal Residual method. Parameters @@ -796,10 +859,10 @@ def gmres( if M is None else make_linear_operator(M) ) - x = np.zeros(n) if x0 is None else x0.copy() + x = cn.zeros(n) if x0 is None else x0.copy() - bnrm2 = np.linalg.norm(b) - atol, _ = _get_atol_rtol(bnrm2, tol, atol, rtol) + b_norm = cn.linalg.norm(b) + atol, _ = _get_atol_rtol(b_norm, tol, atol, rtol) if maxiter is None: maxiter = n * 10 @@ -813,11 +876,11 @@ def gmres( if callback is None: callback_type = None - V = np.empty((n, restart), dtype=A.dtype) - H = np.zeros((restart + 1, restart), dtype=A.dtype) - e = np.zeros((restart + 1,), dtype=A.dtype) + V = cn.empty((n, restart), dtype=A.dtype) + H: Any = cn.zeros((restart + 1, restart), dtype=A.dtype) + e: Any = cn.zeros((restart + 1,), dtype=A.dtype) - def compute_hu(u, j): + def compute_hu(u: cn.ndarray, j: int) -> tuple[cn.ndarray, cn.ndarray]: """Compute Householder transformation for Arnoldi iteration. 
Parameters @@ -847,7 +910,7 @@ def compute_hu(u, j): while True: mx = M.matvec(x) r = b - A.matvec(mx) - r_norm = np.linalg.norm(r) + r_norm = cn.linalg.norm(r) if callback_type == "x": callback(mx) elif callback_type == "pr_norm" and iters > 0: @@ -863,14 +926,14 @@ def compute_hu(u, j): z = M.matvec(v) u = A.matvec(z) H[: j + 1, j], u = compute_hu(u, j) - H[j + 1, j] = np.linalg.norm(u) + H[j + 1, j] = cn.linalg.norm(u) if j + 1 < restart: v = u / H[j + 1, j] V[:, j + 1] = v # Note: The least-square solution to equation Hy = e is computed on CPU # because it is faster if tha matrix size is small. - ret = np.linalg.lstsq(H, e) + ret = cn.linalg.lstsq(H, e) # type: ignore [attr-defined] y = ret[0] x += V @ y iters += restart @@ -879,3 +942,258 @@ def compute_hu(u, j): if iters == maxiter and not (r_norm <= atol): info = iters return mx, info + + +def spsolve(A: Any, b: np.ndarray) -> np.ndarray: + """ + Solve a linear system of equation Ax=b by factorizing A + + Parameters + ---------- + A : csr_array + Input sparse matrix of shape (N, N). + b : cupynumeric.ndarray + Dense vector of shape (N,). + + Returns + ------- + x : cupynumeric.ndarray + Dense vector of shape (N,), that solves A x = b. + + Raises + ------ + RuntimeError + If attempted to solve on any configuration other than one GPU + ValueError + If the RHS is not one-dimensional + + Notes + ----- + This function uses cuDSS to perform the sparse direct solve, which + computes the reordering on Host. + + """ + + # TODO: + # Support multi-dimensional RHS. Note that cuDSS only supports + # column-major order for x and b, so we need to update the + # mapper for those stores. Partitioning constraints will also need to + # be changed since alignment constraints will need both stores + # to be of the same dimension (e.g., we cannot align pos (1D) + # and b (say, 2D) without manipulating the stores + + # NOTE: multi-gpu runs might hang with cuda < 13.0.0. 
+ # For multi-gpu runs, the user is expected to set the path to + # libcudss_comm_nccl.so in the env CUDSS_COMM_LIB + if runtime.num_gpus == 0: + raise RuntimeError("spsolve is currently supported only for GPU(s)") + + if b.ndim != 1: + raise ValueError(f"RHS must be 1D. Dimension of b is: {b.ndim}") + + b_store = get_store_from_cupynumeric_array(b) + x_store = runtime.create_store(b.dtype, shape=(A.shape[1],)) + + task = runtime.create_auto_task(SparseOpCode.SPSOLVE) + + pos_part = task.add_input(A.pos) + crd_part = task.add_input(A.crd) + vals_part = task.add_input(A.vals) + b_part = task.add_input(b_store) + x_part = task.add_output(x_store) + task.add_scalar_arg(A.shape[0], types.uint64) # global nrows + task.add_scalar_arg(A.vals.size, types.uint64) # global nnz + + # Add communicator + task.add_communicator("nccl") + + # Since we don't support multi-gpu or multi-cpu runs, these constraints + # are not particularly relevant right now, but they enable + # debugging the multi-gpu hang. The matrix and the vectors are + # partitioned row-wise without any sparsity-dependent constraints + # that is typical in other API implementations in legate-sparse + # that use mathlibs (e.g., cuSparse). This passes on the responsibility + # of inserting appropriate communication primitives to the + # underlying math library, cuDSS. 
This is why we don't constrain the
+    # partition of x to the image of crd (e.g., like in SpMv in csr.py)
+    task.add_constraint(image(pos_part, crd_part))
+    task.add_constraint(image(pos_part, vals_part))
+    task.add_constraint(align(x_part, pos_part))
+    task.add_constraint(align(b_part, pos_part))
+
+    task.execute()
+
+    return store_to_cupynumeric_array(x_store)
+
+
+# this function has been adapted from cupy's implementation of `eigsh`:
+# https://github.com/cupy/cupy/blob/v13.6.0/cupyx/scipy/sparse/linalg/_eigen.py
+def eigsh(
+    a,
+    k=6,
+    *,
+    which="LM",
+    v0=None,
+    ncv=None,
+    maxiter=None,
+    tol=0,
+    return_eigenvectors=True,
+):
+    def _lanczos(a, V, u, alpha, beta, i_start, i_end):
+        for i in range(i_start, i_end):
+            u[...] = a.matvec(V[i])
+            alpha[i] = cn.dot(V[i].conj(), u)
+
+            # Full reorthogonalization with "twice is enough" strategy
+            # for improved numerical stability. This matches the approach
+            # used in robust Lanczos implementations.
+            # First pass
+            coeffs = V[: i + 1].conj() @ u
+            u -= coeffs @ V[: i + 1]
+            # Second pass for numerical stability
+            coeffs2 = V[: i + 1].conj() @ u
+            u -= coeffs2 @ V[: i + 1]
+
+            beta[i] = cn.linalg.norm(u)
+            if i >= i_end - 1:
+                break
+            V[i + 1] = u / beta[i]
+
+    def _eigsh_solve_ritz(alpha, beta, beta_k, k, which):
+        # Note: This is done on the CPU using numpy, following CuPy's approach.
+        # This avoids numerical issues that can occur with GPU-based eigh
+        # on small tridiagonal matrices from the thick-restart Lanczos.
+ alpha_np = np.array(alpha) + beta_np = np.array(beta) + t = np.diag(alpha_np) + t = t + np.diag(beta_np[:-1], k=1) + t = t + np.diag(beta_np[:-1], k=-1) + if beta_k is not None: + beta_k_np = np.array(beta_k) + t[k, :k] = beta_k_np + t[:k, k] = beta_k_np + w, s = np.linalg.eigh(t) + + # Pick-up k ritz-values and ritz-vectors + if which == "LA": + idx = np.argsort(w) + wk = w[idx[-k:]] + sk = s[:, idx[-k:]] + elif which == "LM": + idx = np.argsort(np.absolute(w)) + wk = w[idx[-k:]] + sk = s[:, idx[-k:]] + elif which == "SA": + idx = np.argsort(w) + wk = w[idx[:k]] + sk = s[:, idx[:k]] + # Convert back to cupynumeric arrays + return cn.array(wk), cn.array(sk) + + # Convert to LinearOperator for uniform matvec interface + a = make_linear_operator(a) + n = a.shape[0] + if a.ndim != 2 or a.shape[0] != a.shape[1]: + raise ValueError("expected square matrix (shape: {})".format(a.shape)) + if a.dtype.char not in "fdFD": + raise TypeError("unsupprted dtype (actual: {})".format(a.dtype)) + if k <= 0: + raise ValueError("k must be greater than 0 (actual: {})".format(k)) + if k >= n: + raise ValueError("k must be smaller than n (actual: {})".format(k)) + if which not in ("LM", "LA", "SA"): + raise ValueError( + "which must be 'LM','LA'or'SA' (actual: {})".format(which) + ) + if ncv is None: + ncv = min(max(2 * k, k + 32), n - 1) + else: + ncv = min(max(ncv, k + 2), n - 1) + if maxiter is None: + maxiter = 10 * n + if tol == 0: + tol = cn.finfo(a.dtype).eps + + if k + 1 == ncv: + raise ValueError( + f"k must be smaller than ncv - 1 (k + 1 < ncv < n)." 
+ f" ncv: {ncv}, k: {k}, n: {n}" + ) + + alpha = cn.zeros((ncv,), dtype=a.dtype) + beta = cn.zeros((ncv,), dtype=a.dtype.char.lower()) + V = cn.empty((ncv, n), dtype=a.dtype) + + if v0 is None: + u = cn.random.random((n,)).astype(a.dtype) + V[0] = u / cn.linalg.norm(u) + else: + u = v0 + V[0] = v0 / cn.linalg.norm(v0) + + _lanczos(a, V, u, alpha, beta, 0, ncv) + + iter_current = ncv + w, s = _eigsh_solve_ritz(alpha, beta, None, k, which) + x = V.T @ s + + beta_k = beta[-1] * s[-1, :] + res = cn.linalg.norm(beta_k) + + iter_increment = ncv - k + # Track initial beta scale for detecting relative breakdown + # When beta[k] is too small relative to the typical beta values, + # the thick restart becomes numerically unstable + initial_beta_scale = cn.max(cn.abs(beta[:-1])) + + while res > tol and iter_current < maxiter: + beta[:k] = 0 + alpha[:k] = w + V[:k] = x.T + + # Full reorthogonalization with "twice is enough" (same as in _lanczos) + coeffs = V[:k].conj() @ u + u = u - coeffs @ V[:k] + coeffs2 = V[:k].conj() @ u + u = u - coeffs2 @ V[:k] + + u_norm = cn.linalg.norm(u) + # Check for numerical breakdown: if u_norm is too small relative + # to initial scale, the thick restart becomes numerically unstable. + # A ratio < 0.1 indicates potential numerical issues. + if u_norm < 0.1 * initial_beta_scale: + # Accept current eigenvalues as converged + break + + V[k] = u / u_norm + u[...] 
= a.matvec(V[k]) + alpha[k] = cn.dot(V[k].conj(), u) + u -= alpha[k] * V[k] + u -= V[:k].T @ beta_k + beta[k] = cn.linalg.norm(u) + + # Check for numerical breakdown after computing beta[k] + # If beta[k] is very small relative to initial scale, + # continuing will cause numerical instability + if beta[k] < 0.1 * initial_beta_scale: + # Accept current eigenvalues as converged + break + + # note that this can run into Out of bounds error + # in legate if `k` is not properly constrained + # in the initial part of the algorithm + V[k + 1] = u / beta[k] + + _lanczos(a, V, u, alpha, beta, k + 1, ncv) + w, s = _eigsh_solve_ritz(alpha, beta, beta_k, k, which) + x = V.T @ s + beta_k = beta[-1] * s[-1, :] + res = cn.linalg.norm(beta_k) + + iter_current += iter_increment + + if return_eigenvectors: + idx = cn.argsort(w) + return w[idx], x[:, idx] + else: + return cn.sort(w) diff --git a/legate_sparse/module.py b/legate_sparse/module.py index 56f22fa1..2fe4c5dd 100644 --- a/legate_sparse/module.py +++ b/legate_sparse/module.py @@ -44,7 +44,9 @@ # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +from __future__ import annotations +from typing import Any from .csr import csr_array # noqa: F401 from .dia import dia_array # noqa: F401 @@ -56,10 +58,11 @@ # returns whether or not an object is a legate sparse created sparse matrix. -def _is_sparse_matrix(obj) -> bool: +def _is_sparse_matrix(obj: Any) -> bool: return any((isinstance(obj, csr_array), isinstance(obj, dia_array))) -def isspmatrix(obj) -> bool: + +def isspmatrix(obj: Any) -> bool: """Check if an object is a legate sparse matrix. Parameters @@ -81,7 +84,7 @@ def isspmatrix(obj) -> bool: return _is_sparse_matrix(obj) -def issparse(obj) -> bool: +def issparse(obj: Any) -> bool: """Check if an object is a legate sparse matrix. 
Parameters @@ -104,7 +107,7 @@ def issparse(obj) -> bool: # Variants for each particular format type. -def isspmatrix_csr(obj): +def isspmatrix_csr(obj: Any) -> bool: """Check if an object is a CSR sparse matrix. Parameters diff --git a/legate_sparse/runtime.py b/legate_sparse/runtime.py index e7a3dc41..84e12d3e 100644 --- a/legate_sparse/runtime.py +++ b/legate_sparse/runtime.py @@ -30,11 +30,12 @@ from .config import SparseOpCode, _library if TYPE_CHECKING: - from typing import Optional, Union + from typing import Any import numpy.typing as npt + from legate.core import Library -TO_CORE_DTYPES = { +TO_CORE_DTYPES: dict[npt.DTypeLike, types.Type] = { np.dtype(np.bool_): types.bool_, np.dtype(np.int8): types.int8, np.dtype(np.int16): types.int16, @@ -54,7 +55,7 @@ # TODO (marsaev): rename to SparseRuntime to avoid confusion? class Runtime: - def __init__(self, sparse_library): + def __init__(self, sparse_library: Library) -> None: self.sparse_library = sparse_library self.legate_runtime = get_legate_runtime() self.legate_machine = get_machine() @@ -66,25 +67,25 @@ def __init__(self, sparse_library): task = self.legate_runtime.create_manual_task( self.sparse_library, SparseOpCode.LOAD_CUDALIBS, - launch_shape=Shape((self.num_gpus,)), + launch_shape=(self.num_gpus,), ) task.execute() self.legate_runtime.issue_execution_fence(block=True) @property - def num_procs(self): + def num_procs(self) -> int: return self.legate_machine.count(self.legate_machine.preferred_target) @property - def num_gpus(self): + def num_gpus(self) -> int: return self.legate_machine.count(TaskTarget.GPU) def create_store( self, - ty: Union[npt.DTypeLike], - shape: Optional[Union[tuple[int, ...], Shape]] = None, + ty: npt.dtype[Any] | types.Type, + shape: Shape | tuple[int, ...] 
| None = None, optimize_scalar: bool = False, - ndim: Optional[int] = None, + ndim: int | None = None, ) -> LogicalStore: core_ty = TO_CORE_DTYPES[ty] if isinstance(ty, np.dtype) else ty return self.legate_runtime.create_store( @@ -92,11 +93,13 @@ def create_store( ) # only OpCode - def create_auto_task(self, OpCode) -> AutoTask: - return self.legate_runtime.create_auto_task(self.sparse_library, OpCode) + def create_auto_task(self, OpCode: int) -> AutoTask: + return self.legate_runtime.create_auto_task( + self.sparse_library, OpCode + ) # OpCode and launch domains - def create_manual_task(self, OpCode, *args) -> ManualTask: + def create_manual_task(self, OpCode: int, *args: Any) -> ManualTask: return self.legate_runtime.create_manual_task( self.sparse_library, OpCode, *args ) diff --git a/legate_sparse/settings.py b/legate_sparse/settings.py index 31e48a0c..7d518777 100644 --- a/legate_sparse/settings.py +++ b/legate_sparse/settings.py @@ -14,7 +14,12 @@ # from __future__ import annotations -from legate.util.settings import PrioritizedSetting, Settings, convert_bool +from legate.util.settings import ( + PrioritizedSetting, + Settings, + convert_bool, + convert_str, +) __all__ = ("settings",) @@ -32,5 +37,15 @@ class SparseRuntimeSettings(Settings): """, ) + cudss_commnccl_loc: PrioritizedSetting[bool] = PrioritizedSetting( + "cudss-comm-lib", + "CUDSS_COMM_LIB", + default="", + convert=convert_str, + help=""" + For multi-gpu runs, set CUDSS_COMM_LIB env to /path/to/libcudss_commlayer_nccl.so + """, + ) + settings = SparseRuntimeSettings() diff --git a/legate_sparse/types.py b/legate_sparse/types.py index 923767f2..a566f617 100644 --- a/legate_sparse/types.py +++ b/legate_sparse/types.py @@ -11,26 +11,27 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+from __future__ import annotations -import numpy +import numpy as np # Define some common types. Hopefully as we make more # progress in generalizing the compute kernels, we can # remove this code. -coord_ty = numpy.dtype(numpy.int64) +coord_ty = np.dtype(np.int64) """Data type for coordinate indices in sparse matrices (int64).""" -nnz_ty = numpy.dtype(numpy.uint64) +nnz_ty = np.dtype(np.uint64) """Data type for non-zero counts in sparse matrices (uint64).""" -float64 = numpy.dtype(numpy.float64) +float64 = np.dtype(np.float64) """64-bit floating point data type.""" -int32 = numpy.dtype(numpy.int32) +int32 = np.dtype(np.int32) """32-bit integer data type.""" -int64 = numpy.dtype(numpy.int64) +int64 = np.dtype(np.int64) """64-bit integer data type.""" -uint64 = numpy.dtype(numpy.uint64) +uint64 = np.dtype(np.uint64) """64-bit unsigned integer data type.""" diff --git a/legate_sparse/utils.py b/legate_sparse/utils.py index 2c072f2b..31b28a28 100644 --- a/legate_sparse/utils.py +++ b/legate_sparse/utils.py @@ -11,26 +11,29 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+from __future__ import annotations import math import traceback -from typing import Any +from typing import TYPE_CHECKING, cast -import cupynumeric -import numpy +import cupynumeric as cn +import numpy as np from legate.core import LogicalStore import legate_sparse from .runtime import runtime +if TYPE_CHECKING: + from typing import Any + + import numpy.typing as npt + + from .csr import csr_array + # Datatypes that spmv and spgemm operations are supported for -SUPPORTED_DATATYPES = ( - numpy.float32, - numpy.float64, - numpy.complex64, - numpy.complex128, -) +SUPPORTED_DATATYPES = (np.float32, np.float64, np.complex64, np.complex128) """Supported datatypes for sparse matrix operations (SpMV and SpGEMM).""" @@ -59,7 +62,7 @@ def find_last_user_stacklevel() -> int: # store_to_cupynumeric_array converts a store to a cuPyNumeric array. -def store_to_cupynumeric_array(store: LogicalStore): +def store_to_cupynumeric_array(store: LogicalStore) -> cn.ndarray: """Convert a LogicalStore to a cupynumeric array. Parameters @@ -72,13 +75,12 @@ def store_to_cupynumeric_array(store: LogicalStore): cupynumeric.ndarray The cupynumeric array representation of the store. """ - return cupynumeric.asarray(store) + return cn.asarray(store) # get_store_from_cupynumeric_array extracts a store from a cuPyNumeric array. def get_store_from_cupynumeric_array( - arr: cupynumeric.ndarray, - copy=False, + arr: cn.ndarray, copy: bool = False ) -> LogicalStore: """Extract a LogicalStore from a cupynumeric array. @@ -96,17 +98,17 @@ def get_store_from_cupynumeric_array( """ if copy: # If requested to make a copy, do so. - arr = cupynumeric.array(arr) + arr = cn.array(arr) data = arr.__legate_data_interface__["data"] array = data[next(iter(data))] store = array.data - return store + return cast(LogicalStore, store) # cast_to_store attempts to cast an arbitrary object into a store. 
-def cast_to_store(arr): +def cast_to_store(arr: cn.ndarray | LogicalStore) -> LogicalStore: """Cast an arbitrary object to a LogicalStore. Parameters @@ -126,16 +128,18 @@ def cast_to_store(arr): """ if isinstance(arr, LogicalStore): return arr - if isinstance(arr, numpy.ndarray): - arr = cupynumeric.array(arr) - if isinstance(arr, cupynumeric.ndarray): + if isinstance(arr, np.ndarray): + arr = cn.array(arr) + if isinstance(arr, cn.ndarray): return get_store_from_cupynumeric_array(arr) raise NotImplementedError # cast_arr attempts to cast an arbitrary object into a cupynumeric # ndarray, with an optional desired type. -def cast_arr(arr, dtype=None): +def cast_arr( + arr: cn.ndarray | LogicalStore, dtype: npt.dtype[Any] | None = None +) -> cn.ndarray: """Cast an arbitrary object to a cupynumeric array. Parameters @@ -152,14 +156,16 @@ def cast_arr(arr, dtype=None): """ if isinstance(arr, LogicalStore): arr = store_to_cupynumeric_array(arr) - elif not isinstance(arr, cupynumeric.ndarray): - arr = cupynumeric.array(arr) + elif not isinstance(arr, cn.ndarray): + arr = cn.array(arr) if dtype is not None: arr = arr.astype(dtype) return arr -def find_common_type(*args): +def find_common_type( + *args: cn.ndarray | csr_array | np.ndarray, +) -> npt.dtype[Any]: """Find the common data type for a set of arrays. This function performs a similar analysis to cupynumeric.ndarray.find_common_type @@ -190,33 +196,10 @@ def find_common_type(*args): scalar_types.append(array.dtype) else: array_types.append(array.dtype) - return numpy.result_type(*array_types, *scalar_types) - - -def cast_to_common_type(*args): - """Cast all arguments to the same common data type. - - Parameters - ---------- - *args : array_like - Arrays to cast to a common type. - - Returns - ------- - tuple - Tuple of arrays, all cast to the same common data type. 
+ return np.result_type(*array_types, *scalar_types) - Notes - ----- - This function first finds the common type using find_common_type, - then casts each input to that type. If all arguments are already - the common type, this will be a no-op. - """ - common_type = find_common_type(*args) - return tuple(arg.astype(common_type, copy=False) for arg in args) - -def factor_int(n): +def factor_int(n: int) -> tuple[int, int]: """Split an integer into two close factors. Parameters @@ -242,7 +225,9 @@ def factor_int(n): return val, val2 -def broadcast_store(store: LogicalStore, shape: Any) -> LogicalStore: +def broadcast_store( + store: LogicalStore, shape: tuple[int, ...] +) -> LogicalStore: """Broadcast a LogicalStore to the desired shape. Parameters @@ -294,12 +279,14 @@ def copy_store(store: LogicalStore) -> LogicalStore: LogicalStore A new LogicalStore with the same data as the input. """ - res = runtime.create_store(store.type, store.shape) # type: ignore + res = runtime.create_store(store.type, store.shape) runtime.legate_runtime.issue_copy(res, store) return res -def store_from_store_or_array(src, copy=False) -> LogicalStore: # type: ignore +def store_from_store_or_array( + src: LogicalStore | cn.ndarray, copy: bool = False +) -> LogicalStore: """Get LogicalStore from a LogicalStore or array, potentially creating a copy. Parameters @@ -319,15 +306,19 @@ def store_from_store_or_array(src, copy=False) -> LogicalStore: # type: ignore AssertionError If the input type is not supported. 
""" - if isinstance(src, cupynumeric.ndarray): + if isinstance(src, cn.ndarray): return get_store_from_cupynumeric_array(src, copy) elif isinstance(src, LogicalStore): return copy_store(src) if copy else src else: - AssertionError("Wrong type for 'store_from_store_or_array()' utility") + raise AssertionError( + "Wrong type for 'store_from_store_or_array()' utility" + ) -def array_from_store_or_array(src, copy=False) -> cupynumeric.ndarray: # type: ignore +def array_from_store_or_array( + src: LogicalStore | cn.ndarray, copy: bool = False +) -> cn.ndarray: """Get array from a LogicalStore or array, potentially creating a copy. Parameters @@ -347,7 +338,7 @@ def array_from_store_or_array(src, copy=False) -> cupynumeric.ndarray: # type: AssertionError If the input type is not supported. """ - if isinstance(src, cupynumeric.ndarray): + if isinstance(src, cn.ndarray): return src.copy() if copy else src elif isinstance(src, LogicalStore): return ( @@ -356,11 +347,12 @@ def array_from_store_or_array(src, copy=False) -> cupynumeric.ndarray: # type: else store_to_cupynumeric_array(src) ) else: - AssertionError("Wrong type for 'array_from_store_or_array()' utility") - # type: ignore + raise AssertionError( + "Wrong type for 'array_from_store_or_array()' utility" + ) -def get_storage_type(src): +def get_storage_type(src: LogicalStore | cn.ndarray) -> npt.dtype[Any]: """Get the storage type of an object. Parameters @@ -378,18 +370,17 @@ def get_storage_type(src): AssertionError If the input type is not supported. 
""" - if isinstance(src, cupynumeric.ndarray): + if isinstance(src, cn.ndarray): return src.dtype elif isinstance(src, LogicalStore): # there is legate.core to_core_dtype(), but here we need the opposite # doing via array now return cast_arr(src).dtype else: - AssertionError("Wrong type for 'get_storage_type()' utility") - # type: ignore + raise AssertionError("Wrong type for 'get_storage_type()' utility") -def is_dtype_supported(dtype: numpy.dtype) -> bool: +def is_dtype_supported(dtype: npt.dtype[Any]) -> bool: """Check if a datatype supports SpMV and SpGEMM operations. Parameters @@ -409,7 +400,7 @@ def is_dtype_supported(dtype: numpy.dtype) -> bool: return dtype in SUPPORTED_DATATYPES -def is_dense(x) -> bool: +def is_dense(x: Any) -> bool: """Check if an object is a dense cupynumeric array. Parameters @@ -422,10 +413,10 @@ def is_dense(x) -> bool: bool True if x is a cupynumeric.ndarray, False otherwise. """ - return isinstance(x, cupynumeric.ndarray) + return isinstance(x, cn.ndarray) -def is_scalar_like(x) -> bool: +def is_scalar_like(x: Any) -> bool: """Check if an object is a scalar-like type. Parameters @@ -445,10 +436,10 @@ def is_scalar_like(x) -> bool: """ if isinstance(x, str): return False - return cupynumeric.isscalar(x) or (is_dense(x) and x.ndim == 0) + return cn.isscalar(x) or (is_dense(x) and x.ndim == 0) -def is_sparse(x) -> bool: +def is_sparse(x: Any) -> bool: """Check if an object is a legate sparse matrix. Parameters @@ -464,7 +455,7 @@ def is_sparse(x) -> bool: return legate_sparse.isspmatrix(x) -def sort_by_rows_then_cols(rows: cupynumeric.ndarray, cols: cupynumeric.ndarray): +def sort_by_rows_then_cols(rows: cn.ndarray, cols: cn.ndarray) -> cn.ndarray: """Sort indices by rows first, then by columns. 
This function is a quick and dirty hack that does what np.lexsort does @@ -501,7 +492,7 @@ def sort_by_rows_then_cols(rows: cupynumeric.ndarray, cols: cupynumeric.ndarray) # note that the lexsort reverses the order of key, # so this would be equivalent to np.lexsort((cols, rows)) - indices = cupynumeric.argsort(cols, kind="stable") - order = cupynumeric.argsort(rows[indices], kind="stable") + indices = cn.argsort(cols, kind="stable") + order = cn.argsort(rows[indices], kind="stable") return indices[order] diff --git a/legate_sparse_cpp.cmake b/legate_sparse_cpp.cmake index 6a90e3b3..2f37b63d 100644 --- a/legate_sparse_cpp.cmake +++ b/legate_sparse_cpp.cmake @@ -105,6 +105,7 @@ if(Legion_USE_CUDA) ) include(cmake/thirdparty/get_nccl.cmake) + include(cmake/thirdparty/get_cudss.cmake) endif() # End From cupynumeric @@ -134,7 +135,7 @@ list(APPEND legate_sparse_SOURCES src/legate_sparse/array/csr/spmv.cc src/legate_sparse/array/csr/spgemm_csr_csr_csr.cc src/legate_sparse/array/csr/indexing.cc - + src/legate_sparse/array/util/unzip_rect.cc src/legate_sparse/array/util/zip_to_rect.cc @@ -142,6 +143,8 @@ list(APPEND legate_sparse_SOURCES src/legate_sparse/io/mtx_to_coo.cc src/legate_sparse/linalg/axpby.cc + src/legate_sparse/linalg/spsolve.cc + src/legate_sparse/array/csr/geam.cc ) if(Legion_USE_OpenMP) @@ -154,6 +157,7 @@ if(Legion_USE_OpenMP) src/legate_sparse/array/csr/spmv_omp.cc src/legate_sparse/array/csr/spgemm_csr_csr_csr_omp.cc src/legate_sparse/array/csr/indexing_omp.cc + src/legate_sparse/array/csr/geam_omp.cc src/legate_sparse/array/util/unzip_rect_omp.cc src/legate_sparse/array/util/zip_to_rect_omp.cc @@ -164,7 +168,7 @@ endif() if(Legion_USE_CUDA) list(APPEND legate_sparse_SOURCES - src/legate_sparse/cudalibs.cu + src/legate_sparse/cudalibs.cu src/legate_sparse/array/conv/dense_to_csr.cu src/legate_sparse/array/conv/csr_to_dense.cu @@ -174,19 +178,21 @@ if(Legion_USE_CUDA) src/legate_sparse/array/csr/spmv.cu src/legate_sparse/array/csr/spgemm_csr_csr_csr.cu 
src/legate_sparse/array/csr/indexing.cu + src/legate_sparse/array/csr/geam.cu src/legate_sparse/array/util/unzip_rect.cu src/legate_sparse/array/util/zip_to_rect.cu - + src/legate_sparse/partition/fast_image_partition.cu src/legate_sparse/linalg/axpby.cu + src/legate_sparse/linalg/spsolve.cu ) endif() list(APPEND legate_sparse_SOURCES - + # This must always be the last file! # It guarantees we do our registration callback # only after all task variants are recorded @@ -237,17 +243,21 @@ set_target_properties(legate_sparse CUDA_STANDARD_REQUIRED ON LIBRARY_OUTPUT_DIRECTORY lib) +# NOTE: For multi-GPU runs, the env CUDSS_COMM_LIB must be set to path to libcudss_commlayer_nccl.so +# conda install -c conda-forge libcudss libcudss-dev libcudss-commlayer-nccl +# should install it in ${CONDA_PREFIX}/lib/ target_link_libraries(legate_sparse PUBLIC legate::legate $ # do we need to put this dependency here? # what is the correct target? # cupynumeric::cupynumeric - PRIVATE + PRIVATE # Add Conda library and include paths $ $ $ + $ $) diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 00000000..93344517 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,118 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +[build-system] +requires = [ + "wheel", + "ninja", + "setuptools", + "scikit-build>=0.13.1", + "cmake>=3.30.4", +] +build-backend = "setuptools.build_meta" + +[tool.pytest.ini_options] +addopts = "--capture=sys" +cache_dir = "./.cache/pytest" + +[tool.mypy] +python_version = "3.11" +cache_dir = "./.cache/mypy" + +pretty = true +show_error_codes = true +show_error_context = true +show_column_numbers = true + +namespace_packages = true +ignore_missing_imports = false + +disallow_any_unimported = true +disallow_any_expr = false +disallow_any_decorated = false +disallow_any_explicit = false +disallow_any_generics = true +disallow_subclassing_any = true + +disallow_untyped_calls = true +disallow_untyped_defs = true +disallow_incomplete_defs = true +check_untyped_defs = true +disallow_untyped_decorators = true +no_implicit_optional = true +strict_optional = true + +warn_redundant_casts = true +warn_unused_ignores = false +warn_no_return = true +warn_return_any = true +warn_unreachable = true + +ignore_errors = false + +allow_untyped_globals = false +allow_redefinition = false +implicit_reexport = true +strict_equality = true + +warn_unused_configs = true + +[[tool.mypy.overrides]] +# ignore auto-generated files +# or files depending on auto-generated field +module = [ + "legate_sparse.install_info", + "legate_sparse._version", + "legate._version", + "legate.__main__", + "legate.install_info", +] +ignore_errors = true + +[tool.ruff] +cache-dir = "./.cache/ruff" +extend-exclude = [ + "arch-*", + "*-arch", + "venv", + "*venv", + "build", +] +line-length = 79 +src = [".", "legate_sparse"] + +[tool.ruff.format] +skip-magic-trailing-comma = true + +[tool.ruff.lint.isort.sections] +legion = ["legion_cffi", "legion_top"] +legate = ["legate"] +testing = ["pytest", "pytest_mock"] + +[tool.ruff.lint.isort] +known-third-party = ["numpy", "scipy"] +known-first-party = ["legate_sparse"] +length-sort-straight = true +combine-as-imports = true +split-on-trailing-comma = false 
+required-imports = ["from __future__ import annotations"] +section-order = [ + "future", + "standard-library", + "third-party", + "legion", + "legate", + "first-party", + "local-folder", +] diff --git a/scripts/memlog_analysis.py b/scripts/memlog_analysis.py old mode 100644 new mode 100755 index ee8bd3c6..e16a5369 --- a/scripts/memlog_analysis.py +++ b/scripts/memlog_analysis.py @@ -16,10 +16,10 @@ # Parse the log file allocations = parse_memlog('memlog.txt') - + # Export to CSV export_to_csv(allocations, 'memory_analysis.csv') - + # Create visualizations (requires pandas, matplotlib, seaborn) visualize_allocations(allocations) """ # noqa: W293 @@ -116,7 +116,9 @@ def export_to_csv( # If unique_mb_only is enabled, check for similar memory sizes if unique_mb_only: is_similar = any( - are_similar_sizes(mb_size, seen_size, threshold_percent) + are_similar_sizes( + mb_size, seen_size, threshold_percent + ) for seen_size in seen_mb_sizes ) if is_similar: @@ -145,7 +147,9 @@ def export_to_csv( ) -def export_to_excel(allocations: List[BufferAllocation], output_file: str) -> bool: +def export_to_excel( + allocations: List[BufferAllocation], output_file: str +) -> bool: """ Export memory allocation data to formatted Excel file. 
@@ -299,7 +303,9 @@ def visualize_allocations( """ if not all([PANDAS_AVAILABLE, MATPLOTLIB_AVAILABLE, SEABORN_AVAILABLE]): print("Error: Visualization requires pandas, matplotlib, and seaborn.") - print("Please install them with: pip install pandas matplotlib seaborn") + print( + "Please install them with: pip install pandas matplotlib seaborn" + ) return False # Convert to DataFrame @@ -360,7 +366,9 @@ def visualize_allocations( else: # Memory usage by description (top 10) plt.subplot(2, 2, 1) - top_descriptions = df.groupby("Description")["Size_MB"].sum().nlargest(10) + top_descriptions = ( + df.groupby("Description")["Size_MB"].sum().nlargest(10) + ) sns.barplot(x=top_descriptions.values, y=top_descriptions.index) plt.title("Top 10 Memory Usage by Description") plt.xlabel("Memory (MB)") @@ -372,7 +380,9 @@ def visualize_allocations( plt.title("Memory Distribution by Type") plt.tight_layout() - plt.savefig(f"{output_dir}/memory_analysis.png", dpi=300, bbox_inches="tight") + plt.savefig( + f"{output_dir}/memory_analysis.png", dpi=300, bbox_inches="tight" + ) plt.close() return True @@ -383,7 +393,9 @@ def main(): from memlog_parser import parse_memlog - parser = argparse.ArgumentParser(description="Analyze memory allocation logs") + parser = argparse.ArgumentParser( + description="Analyze memory allocation logs" + ) parser.add_argument("file", help="Path to the memory log file") parser.add_argument( "--output-dir", default=".", help="Directory to save output files" diff --git a/scripts/memlog_cli.py b/scripts/memlog_cli.py old mode 100644 new mode 100755 index ef45a129..94cf430c --- a/scripts/memlog_cli.py +++ b/scripts/memlog_cli.py @@ -14,7 +14,11 @@ import os import sys -from memlog_analysis import export_to_csv, export_to_excel, visualize_allocations +from memlog_analysis import ( + export_to_csv, + export_to_excel, + visualize_allocations, +) from memlog_parser import ( filter_allocations, parse_memlog, @@ -49,8 +53,12 @@ def check_dependencies(format: str) 
-> bool: import pandas # noqa: F401 import seaborn # noqa: F401 except ImportError: - print("Error: Visualization requires pandas, matplotlib, and seaborn.") - print("Please install them with: pip install pandas matplotlib seaborn") + print( + "Error: Visualization requires pandas, matplotlib, and seaborn." + ) + print( + "Please install them with: pip install pandas matplotlib seaborn" + ) return False return True diff --git a/scripts/memlog_parser.py b/scripts/memlog_parser.py old mode 100644 new mode 100755 index 0024b41d..854b3a72 --- a/scripts/memlog_parser.py +++ b/scripts/memlog_parser.py @@ -60,7 +60,9 @@ class BufferAllocation: def total_bytes(self) -> int: """Calculate total bytes allocated including data type size.""" - type_size = TYPE_SIZES.get(self.type, 1) # Default to 1 byte if type not found + type_size = TYPE_SIZES.get( + self.type, 1 + ) # Default to 1 byte if type not found return self.size * type_size def total_mb(self) -> float: @@ -68,7 +70,9 @@ def total_mb(self) -> float: return self.total_bytes() / (1024 * 1024) -def are_similar_sizes(size1: float, size2: float, threshold_percent: float) -> bool: +def are_similar_sizes( + size1: float, size2: float, threshold_percent: float +) -> bool: """ Check if two sizes are similar within the given percentage threshold. 
@@ -190,7 +194,10 @@ def filter_allocations( filtered = [] for alloc in allocations: - if alloc.description not in ignore_descriptions and alloc.total_mb() >= min_mb: + if ( + alloc.description not in ignore_descriptions + and alloc.total_mb() >= min_mb + ): filtered.append(alloc) return filtered @@ -215,9 +222,9 @@ def print_description_group( max_bytes = max(alloc.total_bytes() for alloc in allocs) print(f"\n{desc}:") print( - f" Total bytes (includes non-unique allocs): {desc_total_bytes / (1024*1024):.2f} MB" + f" Total bytes (includes non-unique allocs): {desc_total_bytes / (1024 * 1024):.2f} MB" ) - print(f" Max bytes : {max_bytes / (1024*1024):.2f} MB") + print(f" Max bytes : {max_bytes / (1024 * 1024):.2f} MB") # Track seen entries for this description seen_entries = set() @@ -275,9 +282,9 @@ def print_size_group( print(f"\nSize: {size} elements:") print( - f" Total bytes (includes non-unique allocs): {size_total_bytes / (1024*1024):.2f} MB" + f" Total bytes (includes non-unique allocs): {size_total_bytes / (1024 * 1024):.2f} MB" ) - print(f" Max bytes : {max_bytes / (1024*1024):.2f} MB") + print(f" Max bytes : {max_bytes / (1024 * 1024):.2f} MB") for alloc in allocs: mb_size = alloc.total_mb() diff --git a/scripts/pre-commit/yamllint.yml b/scripts/pre-commit/yamllint.yml new file mode 100644 index 00000000..2017e01d --- /dev/null +++ b/scripts/pre-commit/yamllint.yml @@ -0,0 +1,6 @@ +--- +extends: default +rules: + truthy: + ignore: ".github/workflows/*.yml" + line-length: disable diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 00000000..eed372ce --- /dev/null +++ b/setup.cfg @@ -0,0 +1,7 @@ +[mypy] +python_version = 3.11 +strict = True +implicit_reexport = true + +[mypy-legate_sparse._version] +ignore_errors = True diff --git a/setup.py b/setup.py old mode 100644 new mode 100755 index 68efb75c..c358d32e --- a/setup.py +++ b/setup.py @@ -38,7 +38,7 @@ setup( name="legate-sparse", - version="25.07.00", + version="26.02.00", description="An 
Aspiring Drop-In Replacement for SciPy Sparse module at Scale", author="NVIDIA Corporation", license="Apache 2.0", @@ -52,10 +52,7 @@ "Programming Language :: Python :: 3.12", "Programming Language :: Python :: 3.13", ], - packages=find_packages( - where=".", - include=["legate_sparse*"], - ), + packages=find_packages(where=".", include=["legate_sparse*"]), include_package_data=True, zip_safe=False, ) diff --git a/src/legate_sparse/array/conv/csr_to_dense.cc b/src/legate_sparse/array/conv/csr_to_dense.cc index de9a8958..5ff66d63 100644 --- a/src/legate_sparse/array/conv/csr_to_dense.cc +++ b/src/legate_sparse/array/conv/csr_to_dense.cc @@ -23,6 +23,9 @@ using namespace legate; template struct CSRToDenseImplBody { + TaskContext context; + explicit CSRToDenseImplBody(TaskContext context) : context(context) {} + using INDEX_TY = type_of; using VAL_TY = type_of; diff --git a/src/legate_sparse/array/conv/csr_to_dense.cu b/src/legate_sparse/array/conv/csr_to_dense.cu index 2d3c159f..de98b015 100644 --- a/src/legate_sparse/array/conv/csr_to_dense.cu +++ b/src/legate_sparse/array/conv/csr_to_dense.cu @@ -48,6 +48,9 @@ __global__ void CSRtoDenseKernel(size_t rows, template <> struct CSRToDenseImpl { + TaskContext context; + explicit CSRToDenseImpl(TaskContext context) : context(context) {} + template void operator()(CSRToDenseArgs& args) const { @@ -64,7 +67,7 @@ struct CSRToDenseImpl { return; } - auto stream = get_cached_stream(); + auto stream = context.get_task_stream(); auto B_domain = B_pos.domain(); auto rows = B_domain.hi()[0] - B_domain.lo()[0] + 1; diff --git a/src/legate_sparse/array/conv/csr_to_dense_omp.cc b/src/legate_sparse/array/conv/csr_to_dense_omp.cc index ec5da532..d048e0d6 100644 --- a/src/legate_sparse/array/conv/csr_to_dense_omp.cc +++ b/src/legate_sparse/array/conv/csr_to_dense_omp.cc @@ -23,6 +23,9 @@ using namespace legate; template struct CSRToDenseImplBody { + TaskContext context; + explicit CSRToDenseImplBody(TaskContext context) : 
context(context) {} + using INDEX_TY = type_of; using VAL_TY = type_of; diff --git a/src/legate_sparse/array/conv/csr_to_dense_template.inl b/src/legate_sparse/array/conv/csr_to_dense_template.inl index 58529312..9fb8d4dd 100644 --- a/src/legate_sparse/array/conv/csr_to_dense_template.inl +++ b/src/legate_sparse/array/conv/csr_to_dense_template.inl @@ -31,6 +31,9 @@ struct CSRToDenseImplBody; template struct CSRToDenseImpl { + TaskContext context; + explicit CSRToDenseImpl(TaskContext context) : context(context) {} + template void operator()(CSRToDenseArgs& args) const { @@ -45,7 +48,7 @@ struct CSRToDenseImpl { if (args.A_vals.domain().empty()) { return; } - CSRToDenseImplBody()( + CSRToDenseImplBody{context}( A_vals, B_pos, B_crd, B_vals, args.A_vals.shape<2>()); } }; @@ -61,7 +64,7 @@ static void csr_to_dense_template(TaskContext context) CSRToDenseArgs args{outputs[0], context.inputs()[0], context.inputs()[1], context.inputs()[2]}; index_type_value_type_dispatch( - args.B_crd.code(), args.A_vals.code(), CSRToDenseImpl{}, args); + args.B_crd.code(), args.A_vals.code(), CSRToDenseImpl{context}, args); } } // namespace sparse diff --git a/src/legate_sparse/array/conv/dense_to_csr.cc b/src/legate_sparse/array/conv/dense_to_csr.cc index 3304b558..410b37d8 100644 --- a/src/legate_sparse/array/conv/dense_to_csr.cc +++ b/src/legate_sparse/array/conv/dense_to_csr.cc @@ -23,6 +23,9 @@ using namespace legate; template struct DenseToCSRNNZImplBody { + TaskContext context; + explicit DenseToCSRNNZImplBody(TaskContext context) : context(context) {} + using VAL_TY = type_of; void operator()(const AccessorWO& nnz, @@ -43,6 +46,9 @@ struct DenseToCSRNNZImplBody { template struct DenseToCSRImplBody { + TaskContext context; + explicit DenseToCSRImplBody(TaskContext context) : context(context) {} + using INDEX_TY = type_of; using VAL_TY = type_of; diff --git a/src/legate_sparse/array/conv/dense_to_csr.cu b/src/legate_sparse/array/conv/dense_to_csr.cu index e38d906b..34698a4d 
100644 --- a/src/legate_sparse/array/conv/dense_to_csr.cu +++ b/src/legate_sparse/array/conv/dense_to_csr.cu @@ -44,6 +44,9 @@ __global__ void denseToCSRNNZKernel(size_t rows, template <> struct DenseToCSRNNZImpl { + TaskContext context; + explicit DenseToCSRNNZImpl(TaskContext context) : context(context) {} + template void operator()(DenseToCSRNNZArgs& args) const { @@ -57,7 +60,7 @@ struct DenseToCSRNNZImpl { return; } - auto stream = get_cached_stream(); + auto stream = context.get_task_stream(); // #if (CUSPARSE_VER_MAJOR < 11 || (CUSPARSE_VER_MAJOR == 11 && CUSPARSE_VER_MINOR < 2)) #if 1 @@ -149,6 +152,9 @@ __global__ void denseToCSRKernel(size_t rows, template <> struct DenseToCSRImpl { + TaskContext context; + explicit DenseToCSRImpl(TaskContext context) : context(context) {} + template void operator()(DenseToCSRArgs& args) const { @@ -166,7 +172,7 @@ struct DenseToCSRImpl { } // Get context sensitive objects. - auto stream = get_cached_stream(); + auto stream = context.get_task_stream(); auto B_domain = B_vals.domain(); auto rows = B_domain.hi()[0] - B_domain.lo()[0] + 1; diff --git a/src/legate_sparse/array/conv/dense_to_csr_omp.cc b/src/legate_sparse/array/conv/dense_to_csr_omp.cc index 78e060de..7de5334d 100644 --- a/src/legate_sparse/array/conv/dense_to_csr_omp.cc +++ b/src/legate_sparse/array/conv/dense_to_csr_omp.cc @@ -23,6 +23,9 @@ using namespace legate; template struct DenseToCSRNNZImplBody { + TaskContext context; + explicit DenseToCSRNNZImplBody(TaskContext context) : context(context) {} + using VAL_TY = type_of; void operator()(const AccessorWO& nnz, @@ -44,6 +47,9 @@ struct DenseToCSRNNZImplBody { template struct DenseToCSRImplBody { + TaskContext context; + explicit DenseToCSRImplBody(TaskContext context) : context(context) {} + using INDEX_TY = type_of; using VAL_TY = type_of; diff --git a/src/legate_sparse/array/conv/dense_to_csr_template.inl b/src/legate_sparse/array/conv/dense_to_csr_template.inl index 31c81686..bbf98cb4 100644 --- 
a/src/legate_sparse/array/conv/dense_to_csr_template.inl +++ b/src/legate_sparse/array/conv/dense_to_csr_template.inl @@ -32,6 +32,9 @@ struct DenseToCSRNNZImplBody; template struct DenseToCSRNNZImpl { + TaskContext context; + explicit DenseToCSRNNZImpl(TaskContext context) : context(context) {} + template void operator()(DenseToCSRNNZArgs& args) const { @@ -43,7 +46,7 @@ struct DenseToCSRNNZImpl { if (args.nnz.domain().empty()) { return; } - DenseToCSRNNZImplBody()(nnz, B_vals, args.B_vals.shape<2>()); + DenseToCSRNNZImplBody{context}(nnz, B_vals, args.B_vals.shape<2>()); } }; @@ -52,6 +55,9 @@ struct DenseToCSRImplBody; template struct DenseToCSRImpl { + TaskContext context; + explicit DenseToCSRImpl(TaskContext context) : context(context) {} + template void operator()(DenseToCSRArgs& args) const { @@ -66,7 +72,7 @@ struct DenseToCSRImpl { if (args.A_pos.domain().empty()) { return; } - DenseToCSRImplBody()( + DenseToCSRImplBody{context}( A_pos, A_crd, A_vals, B_vals, args.B_vals.shape<2>()); } }; @@ -78,7 +84,7 @@ static void dense_to_csr_nnz_template(TaskContext context) context.output(0), // nnz_per_row context.input(0) // B_vals }; - value_type_dispatch(args.B_vals.code(), DenseToCSRNNZImpl{}, args); + value_type_dispatch(args.B_vals.code(), DenseToCSRNNZImpl{context}, args); } template @@ -92,7 +98,7 @@ static void dense_to_csr_template(TaskContext context) }; index_type_value_type_dispatch( - args.A_crd.code(), args.A_vals.code(), DenseToCSRImpl{}, args); + args.A_crd.code(), args.A_vals.code(), DenseToCSRImpl{context}, args); } } // namespace sparse diff --git a/src/legate_sparse/array/conv/pos_to_coordinates.cc b/src/legate_sparse/array/conv/pos_to_coordinates.cc index 7cadb10e..20773a22 100644 --- a/src/legate_sparse/array/conv/pos_to_coordinates.cc +++ b/src/legate_sparse/array/conv/pos_to_coordinates.cc @@ -23,6 +23,9 @@ using namespace legate; template struct ExpandPosToCoordinatesImplBody { + TaskContext context; + explicit 
ExpandPosToCoordinatesImplBody(TaskContext context) : context(context) {} + using INDEX_TY = type_of; void operator()(const AccessorRO, 1>& pos, diff --git a/src/legate_sparse/array/conv/pos_to_coordinates.cu b/src/legate_sparse/array/conv/pos_to_coordinates.cu index c74a5c3f..ced2335d 100644 --- a/src/legate_sparse/array/conv/pos_to_coordinates.cu +++ b/src/legate_sparse/array/conv/pos_to_coordinates.cu @@ -44,13 +44,16 @@ __global__ void fill_row_indices(size_t rows, template struct ExpandPosToCoordinatesImplBody { + TaskContext context; + explicit ExpandPosToCoordinatesImplBody(TaskContext context) : context(context) {} + using INDEX_TY = type_of; void operator()(const AccessorRO, 1>& pos, const AccessorWO& row_indices, const Rect<1>& rect) { - auto stream = get_cached_stream(); + auto stream = context.get_task_stream(); auto blocks = get_num_blocks_1d(rect.volume()); size_t rows = rect.volume(); diff --git a/src/legate_sparse/array/conv/pos_to_coordinates_omp.cc b/src/legate_sparse/array/conv/pos_to_coordinates_omp.cc index 80da99a5..01d51002 100644 --- a/src/legate_sparse/array/conv/pos_to_coordinates_omp.cc +++ b/src/legate_sparse/array/conv/pos_to_coordinates_omp.cc @@ -23,6 +23,9 @@ using namespace legate; template struct ExpandPosToCoordinatesImplBody { + TaskContext context; + explicit ExpandPosToCoordinatesImplBody(TaskContext context) : context(context) {} + using INDEX_TY = type_of; void operator()(const AccessorRO, 1>& pos, diff --git a/src/legate_sparse/array/conv/pos_to_coordinates_template.inl b/src/legate_sparse/array/conv/pos_to_coordinates_template.inl index 39142e70..160bc53f 100644 --- a/src/legate_sparse/array/conv/pos_to_coordinates_template.inl +++ b/src/legate_sparse/array/conv/pos_to_coordinates_template.inl @@ -28,6 +28,9 @@ struct ExpandPosToCoordinatesImplBody; template struct ExpandPosToCoordinatesImpl { + TaskContext context; + explicit ExpandPosToCoordinatesImpl(TaskContext context) : context(context) {} + template void 
operator()(ExpandPosToCoordinatesArgs& args) const { @@ -41,7 +44,8 @@ struct ExpandPosToCoordinatesImpl { if (pos_domain.empty() || row_indices_domain.empty()) { return; } - ExpandPosToCoordinatesImplBody()(pos, row_indices, args.pos.shape<1>()); + ExpandPosToCoordinatesImplBody{context}( + pos, row_indices, args.pos.shape<1>()); } }; @@ -52,7 +56,7 @@ static void pos_to_coordinates_template(TaskContext context) context.outputs()[0], context.inputs()[0], }; - index_type_dispatch(args.row_indices.code(), ExpandPosToCoordinatesImpl(), args); + index_type_dispatch(args.row_indices.code(), ExpandPosToCoordinatesImpl{context}, args); } } // namespace sparse diff --git a/src/legate_sparse/array/csr/geam.cc b/src/legate_sparse/array/csr/geam.cc new file mode 100644 index 00000000..ced2d73b --- /dev/null +++ b/src/legate_sparse/array/csr/geam.cc @@ -0,0 +1,79 @@ +#include "legate_sparse/array/csr/geam.h" +#include "legate_sparse/array/csr/geam_template.inl" +#include "legate_sparse/array/csr/geam_kernels.h" + +namespace sparse { +using namespace legate; + +template +struct GeamComputeImplBody { + TaskContext context; + explicit GeamComputeImplBody(TaskContext context) : context(context) {} + + using INDEX_TY = type_of; + using VAL_TY = type_of; + + void operator()(const AccessorRO, 1>& A_pos, + const AccessorRO& A_crd, + const AccessorRO& A_vals, + const AccessorRO, 1>& B_pos, + const AccessorRO& B_crd, + const AccessorRO& B_vals, + const AccessorRO, 1>& C_pos, + const AccessorWO& C_crd, + const AccessorWO& C_vals, + const AccessorRO& alpha, + const AccessorRO& beta, + const Rect<1>& rect) + { + VAL_TY alpha_val = alpha[0]; + VAL_TY beta_val = beta[0]; + + for (size_t row = rect.lo[0]; row < rect.hi[0] + 1; row++) { + geam_compute_row( + row, A_pos, A_crd, A_vals, B_pos, B_crd, B_vals, C_pos, C_crd, C_vals, alpha_val, beta_val); + } + } +}; + +template +struct GeamSymbolicImplBody { + TaskContext context; + explicit GeamSymbolicImplBody(TaskContext context) : 
context(context) {} + + using INDEX_TY = type_of; + + void operator()(const AccessorRO, 1>& A_pos, + const AccessorRO& A_crd, + const AccessorRO, 1>& B_pos, + const AccessorRO& B_crd, + const AccessorRW& nnz_per_row, + const Rect<1>& rect) + { + for (size_t row = rect.lo[0]; row < rect.hi[0] + 1; row++) { + nnz_per_row[row] = geam_symbolic_row(row, A_pos, A_crd, B_pos, B_crd); + } + } +}; + +/* static */ void GeamCSRCSRSymbolic::cpu_variant(legate::TaskContext context) +{ + geam_csr_csr_symbolic_template(context); +} + +/* static */ void GeamCSRCSRCompute::cpu_variant(legate::TaskContext context) +{ + geam_csr_csr_compute_template(context); +} + +namespace // unnamed +{ +static const auto sparse_reg_task_ = []() -> char { + GeamCSRCSRSymbolic::register_variants(); + GeamCSRCSRCompute::register_variants(); + return 0; +}(); + +} // namespace + +} // namespace sparse diff --git a/src/legate_sparse/array/csr/geam.cu b/src/legate_sparse/array/csr/geam.cu new file mode 100644 index 00000000..fdf467d3 --- /dev/null +++ b/src/legate_sparse/array/csr/geam.cu @@ -0,0 +1,144 @@ +/* Copyright 2022-2024 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + */ + +#include "legate_sparse/array/csr/geam.h" +#include "legate_sparse/array/csr/geam_template.inl" +#include "legate_sparse/array/csr/geam_kernels.h" +#include "legate_sparse/util/cuda_help.h" + +namespace sparse { +using namespace legate; + +// GPU kernel for symbolic phase: compute nnz_per_row +template +__global__ void geam_symbolic_kernel(const size_t nrows, + const AccessorRO, 1> A_pos, + const AccessorRO A_crd, + const AccessorRO, 1> B_pos, + const AccessorRO B_crd, + const AccessorRW nnz_per_row) +{ + const size_t row = global_tid_1d(); + if (row >= nrows) { + return; + } + + nnz_per_row[row] = geam_symbolic_row(row, A_pos, A_crd, B_pos, B_crd); +} + +// GPU kernel for compute phase: C = alpha * A + beta * B +template +__global__ void geam_compute_kernel(const size_t nrows, + const AccessorRO, 1> A_pos, + const AccessorRO A_crd, + const AccessorRO A_vals, + const AccessorRO, 1> B_pos, + const AccessorRO B_crd, + const AccessorRO B_vals, + const AccessorRO, 1> C_pos, + const AccessorWO C_crd, + const AccessorWO C_vals, + const AccessorRO alpha_acc, + const AccessorRO beta_acc) +{ + const size_t row = global_tid_1d(); + if (row >= nrows) { + return; + } + + VAL_TY alpha = alpha_acc[0]; + VAL_TY beta = beta_acc[0]; + + geam_compute_row( + row, A_pos, A_crd, A_vals, B_pos, B_crd, B_vals, C_pos, C_crd, C_vals, alpha, beta); +} + +// GPU implementation of the symbolic phase +template +struct GeamSymbolicImplBody { + TaskContext context; + explicit GeamSymbolicImplBody(TaskContext context) : context(context) {} + + using INDEX_TY = type_of; + + void operator()(const AccessorRO, 1>& A_pos, + const AccessorRO& A_crd, + const AccessorRO, 1>& B_pos, + const AccessorRO& B_crd, + const AccessorRW& nnz_per_row, + const Rect<1>& rect) + { + auto stream = context.get_task_stream(); + auto nrows = rect.hi[0] - rect.lo[0] + 1; + auto num_blocks = get_num_blocks_1d(nrows); + + if (nrows == 0) { + return; + } + + geam_symbolic_kernel<<>>( + nrows, A_pos, A_crd, B_pos, 
B_crd, nnz_per_row); + LEGATE_SPARSE_CHECK_CUDA_STREAM(stream); + } +}; + +/*static*/ void GeamCSRCSRSymbolic::gpu_variant(TaskContext context) +{ + geam_csr_csr_symbolic_template(context); +} + +// GPU implementation of the compute phase +template +struct GeamComputeImplBody { + TaskContext context; + explicit GeamComputeImplBody(TaskContext context) : context(context) {} + + using INDEX_TY = type_of; + using VAL_TY = type_of; + + void operator()(const AccessorRO, 1>& A_pos, + const AccessorRO& A_crd, + const AccessorRO& A_vals, + const AccessorRO, 1>& B_pos, + const AccessorRO& B_crd, + const AccessorRO& B_vals, + const AccessorRO, 1>& C_pos, + const AccessorWO& C_crd, + const AccessorWO& C_vals, + const AccessorRO& alpha, + const AccessorRO& beta, + const Rect<1>& rect) + { + auto stream = context.get_task_stream(); + auto nrows = rect.hi[0] - rect.lo[0] + 1; + auto num_blocks = get_num_blocks_1d(nrows); + + if (nrows == 0) { + return; + } + + geam_compute_kernel<<>>( + nrows, A_pos, A_crd, A_vals, B_pos, B_crd, B_vals, C_pos, C_crd, C_vals, alpha, beta); + LEGATE_SPARSE_CHECK_CUDA_STREAM(stream); + } +}; + +/*static*/ void GeamCSRCSRCompute::gpu_variant(TaskContext context) +{ + geam_csr_csr_compute_template(context); +} + +} // namespace sparse diff --git a/src/legate_sparse/array/csr/geam.h b/src/legate_sparse/array/csr/geam.h new file mode 100644 index 00000000..6329f307 --- /dev/null +++ b/src/legate_sparse/array/csr/geam.h @@ -0,0 +1,91 @@ +/* Copyright 2022-2024 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +#pragma once + +#include "legate_sparse/sparse.h" +#include "legate_sparse/sparse_c.h" +#include "legate.h" + +namespace sparse { + +struct GeamCSRCSRSymbolicArgs { + // Symbolic phase: compute the sparsity pattern of C = alpha * A + beta * B + // This phase only needs the positions and coordinates, not the values or scalars + const legate::PhysicalStore& A_pos; + const legate::PhysicalStore& A_crd; + const legate::PhysicalStore& B_pos; + const legate::PhysicalStore& B_crd; + const legate::PhysicalStore& nnz_per_row; // output: number of non-zeros per row +}; + +struct GeamCSRCSRComputeArgs { + // Compute phase: compute the output C where C = alpha * A + beta * B + // Inputs + const legate::PhysicalStore& A_pos; + const legate::PhysicalStore& A_crd; + const legate::PhysicalStore& A_vals; + const legate::PhysicalStore& B_pos; + const legate::PhysicalStore& B_crd; + const legate::PhysicalStore& B_vals; + + // C_pos is an INPUT (computed in symbolic phase, read-only here) + const legate::PhysicalStore& C_pos; + + // C_crd and C_vals are outputs + const legate::PhysicalStore& C_crd; + const legate::PhysicalStore& C_vals; + + // Scalar constants + const legate::PhysicalStore& alpha; + const legate::PhysicalStore& beta; +}; + +class GeamCSRCSRCompute : public SparseTask { + public: + static inline const auto TASK_CONFIG = + legate::TaskConfig{legate::LocalTaskID{LEGATE_SPARSE_GEAM_CSR_CSR_COMPUTE}}; + + public: + static void cpu_variant(legate::TaskContext context); + +#ifdef LEGATE_USE_OPENMP + static void omp_variant(legate::TaskContext context); +#endif + +#ifdef LEGATE_USE_CUDA + static void gpu_variant(legate::TaskContext context); +#endif +}; + +class GeamCSRCSRSymbolic : public SparseTask { + public: + static inline const auto TASK_CONFIG = + legate::TaskConfig{legate::LocalTaskID{LEGATE_SPARSE_GEAM_CSR_CSR_SYMBOLIC}}; + + public: + static void 
cpu_variant(legate::TaskContext context); + +#ifdef LEGATE_USE_OPENMP + static void omp_variant(legate::TaskContext context); +#endif + +#ifdef LEGATE_USE_CUDA + static void gpu_variant(legate::TaskContext context); +#endif +}; + +} // namespace sparse diff --git a/src/legate_sparse/array/csr/geam_kernels.h b/src/legate_sparse/array/csr/geam_kernels.h new file mode 100644 index 00000000..6ae7c6f0 --- /dev/null +++ b/src/legate_sparse/array/csr/geam_kernels.h @@ -0,0 +1,129 @@ +/* Copyright 2022-2024 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + */ + +#pragma once + +#include "legate_sparse/util/typedefs.h" +#include "legate.h" + +namespace sparse { +using namespace legate; + +// ============================================================================= +// Symbolic Phase: Compute nnz per row for C = A + B +// ============================================================================= + +// Computes the number of non-zeros in a single row of C = A + B +template +LEGATE_HOST_DEVICE inline nnz_ty geam_symbolic_row(size_t row, + const AccessorRO, 1> A_pos, + const AccessorRO A_crd, + const AccessorRO, 1> B_pos, + const AccessorRO B_crd) +{ + size_t A_pos_start = A_pos[row].lo; + size_t A_pos_end = A_pos[row].hi + 1; + size_t B_pos_start = B_pos[row].lo; + size_t B_pos_end = B_pos[row].hi + 1; + + size_t a_pos = A_pos_start; + size_t b_pos = B_pos_start; + nnz_ty count = 0; + + // Merge sorted column indices and count unique entries + while (a_pos < A_pos_end && b_pos < B_pos_end) { + if (A_crd[a_pos] < B_crd[b_pos]) { + a_pos++; + } else if (A_crd[a_pos] > B_crd[b_pos]) { + b_pos++; + } else { + a_pos++; + b_pos++; + } + count++; + } + + // Add remaining elements + count += (A_pos_end - a_pos) + (B_pos_end - b_pos); + return count; +} + +// ============================================================================= +// Compute Phase: Compute C = alpha * A + beta * B for a single row +// ============================================================================= + +// Computes a single row of C = alpha * A + beta * B +template +LEGATE_HOST_DEVICE inline void geam_compute_row(size_t row, + const AccessorRO, 1> A_pos, + const AccessorRO A_crd, + const AccessorRO A_vals, + const AccessorRO, 1> B_pos, + const AccessorRO B_crd, + const AccessorRO B_vals, + const AccessorRO, 1> C_pos, + const AccessorWO C_crd, + const AccessorWO C_vals, + VAL_TY alpha, + VAL_TY beta) +{ + size_t A_pos_start = A_pos[row].lo; + size_t A_pos_end = A_pos[row].hi + 1; + size_t B_pos_start = B_pos[row].lo; + size_t 
B_pos_end = B_pos[row].hi + 1; + size_t C_pos_start = C_pos[row].lo; + + size_t a_pos = A_pos_start; + size_t b_pos = B_pos_start; + size_t c_pos = C_pos_start; + + // Merge sorted column indices and compute values + while (a_pos < A_pos_end && b_pos < B_pos_end) { + if (A_crd[a_pos] < B_crd[b_pos]) { + C_crd[c_pos] = A_crd[a_pos]; + C_vals[c_pos] = alpha * A_vals[a_pos]; + a_pos++; + } else if (A_crd[a_pos] > B_crd[b_pos]) { + C_crd[c_pos] = B_crd[b_pos]; + C_vals[c_pos] = beta * B_vals[b_pos]; + b_pos++; + } else { + C_crd[c_pos] = A_crd[a_pos]; + C_vals[c_pos] = alpha * A_vals[a_pos] + beta * B_vals[b_pos]; + a_pos++; + b_pos++; + } + c_pos++; + } + + // Add remaining elements from A + while (a_pos < A_pos_end) { + C_crd[c_pos] = A_crd[a_pos]; + C_vals[c_pos] = alpha * A_vals[a_pos]; + a_pos++; + c_pos++; + } + + // Add remaining elements from B + while (b_pos < B_pos_end) { + C_crd[c_pos] = B_crd[b_pos]; + C_vals[c_pos] = beta * B_vals[b_pos]; + b_pos++; + c_pos++; + } +} + +} // namespace sparse diff --git a/src/legate_sparse/array/csr/geam_omp.cc b/src/legate_sparse/array/csr/geam_omp.cc new file mode 100644 index 00000000..26c52361 --- /dev/null +++ b/src/legate_sparse/array/csr/geam_omp.cc @@ -0,0 +1,87 @@ +/* Copyright 2022-2024 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + */ + +#include "legate_sparse/array/csr/geam.h" +#include "legate_sparse/array/csr/geam_template.inl" +#include "legate_sparse/array/csr/geam_kernels.h" + +namespace sparse { +using namespace legate; + +template +struct GeamSymbolicImplBody { + TaskContext context; + explicit GeamSymbolicImplBody(TaskContext context) : context(context) {} + + using INDEX_TY = type_of; + + void operator()(const AccessorRO, 1>& A_pos, + const AccessorRO& A_crd, + const AccessorRO, 1>& B_pos, + const AccessorRO& B_crd, + const AccessorRW& nnz_per_row, + const Rect<1>& rect) + { +#pragma omp parallel for + for (size_t row = rect.lo[0]; row < rect.hi[0] + 1; row++) { + nnz_per_row[row] = geam_symbolic_row(row, A_pos, A_crd, B_pos, B_crd); + } + } +}; + +/* static */ void GeamCSRCSRSymbolic::omp_variant(TaskContext context) +{ + geam_csr_csr_symbolic_template(context); +} + +template +struct GeamComputeImplBody { + TaskContext context; + explicit GeamComputeImplBody(TaskContext context) : context(context) {} + + using INDEX_TY = type_of; + using VAL_TY = type_of; + + void operator()(const AccessorRO, 1>& A_pos, + const AccessorRO& A_crd, + const AccessorRO& A_vals, + const AccessorRO, 1>& B_pos, + const AccessorRO& B_crd, + const AccessorRO& B_vals, + const AccessorRO, 1>& C_pos, + const AccessorWO& C_crd, + const AccessorWO& C_vals, + const AccessorRO& alpha, + const AccessorRO& beta, + const Rect<1>& rect) + { + VAL_TY alpha_val = alpha[0]; + VAL_TY beta_val = beta[0]; + +#pragma omp parallel for + for (size_t row = rect.lo[0]; row < rect.hi[0] + 1; row++) { + geam_compute_row( + row, A_pos, A_crd, A_vals, B_pos, B_crd, B_vals, C_pos, C_crd, C_vals, alpha_val, beta_val); + } + } +}; + +/* static */ void GeamCSRCSRCompute::omp_variant(TaskContext context) +{ + geam_csr_csr_compute_template(context); +} + +} // namespace sparse diff --git a/src/legate_sparse/array/csr/geam_template.inl b/src/legate_sparse/array/csr/geam_template.inl new file mode 100644 index 00000000..0dc13513 --- 
/dev/null +++ b/src/legate_sparse/array/csr/geam_template.inl @@ -0,0 +1,139 @@ +/* Copyright 2022-2024 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +#pragma once + +#include "legate_sparse/array/csr/geam.h" +#include "legate_sparse/util/dispatch.h" +#include "legate_sparse/util/typedefs.h" + +namespace sparse { +using namespace legate; + +// ============================================================================ +// Symbolic phase templates +// ============================================================================ + +template +struct GeamSymbolicImplBody; + +template +struct GeamSymbolicImpl { + TaskContext context; + explicit GeamSymbolicImpl(TaskContext context) : context(context) {} + + template + void operator()(const GeamCSRCSRSymbolicArgs& args) + { + using INDEX_TY = type_of; + + auto A_pos = args.A_pos.read_accessor, 1>(); + auto A_crd = args.A_crd.read_accessor(); + auto B_pos = args.B_pos.read_accessor, 1>(); + auto B_crd = args.B_crd.read_accessor(); + + auto nnz_per_row = args.nnz_per_row.read_write_accessor(); + + GeamSymbolicImplBody{context}( + A_pos, A_crd, B_pos, B_crd, nnz_per_row, args.A_pos.shape<1>()); + } +}; + +template +static void geam_csr_csr_symbolic_template(TaskContext context) +{ + GeamCSRCSRSymbolicArgs args{ + context.inputs()[0], // A_pos + context.inputs()[1], // A_crd + context.inputs()[2], // B_pos + context.inputs()[3], // B_crd + context.outputs()[0], // nnz_per_row + }; + + 
index_type_dispatch(args.A_crd.code(), GeamSymbolicImpl{context}, args); +} + +// ============================================================================ +// Compute phase templates +// ============================================================================ + +template +struct GeamComputeImplBody; + +template +struct GeamComputeImpl { + TaskContext context; + explicit GeamComputeImpl(TaskContext context) : context(context) {} + + template + void operator()(const GeamCSRCSRComputeArgs& args) + { + using INDEX_TY = type_of; + using VAL_TY = type_of; + + auto A_pos = args.A_pos.read_accessor, 1>(); + auto A_crd = args.A_crd.read_accessor(); + auto A_vals = args.A_vals.read_accessor(); + auto B_pos = args.B_pos.read_accessor, 1>(); + auto B_crd = args.B_crd.read_accessor(); + auto B_vals = args.B_vals.read_accessor(); + + // C_pos is read-only (computed in symbolic phase) + auto C_pos = args.C_pos.read_accessor, 1>(); + auto C_crd = args.C_crd.write_accessor(); + auto C_vals = args.C_vals.write_accessor(); + + // Read scalar values + auto alpha = args.alpha.read_accessor(); + auto beta = args.beta.read_accessor(); + + GeamComputeImplBody{context}(A_pos, + A_crd, + A_vals, + B_pos, + B_crd, + B_vals, + C_pos, + C_crd, + C_vals, + alpha, + beta, + args.A_pos.shape<1>()); + } +}; + +template +static void geam_csr_csr_compute_template(TaskContext context) +{ + GeamCSRCSRComputeArgs args{ + context.inputs()[0], // A_pos + context.inputs()[1], // A_crd + context.inputs()[2], // A_vals + context.inputs()[3], // B_pos + context.inputs()[4], // B_crd + context.inputs()[5], // B_vals + context.inputs()[6], // C_pos (read-only, computed in symbolic phase) + context.outputs()[0], // C_crd + context.outputs()[1], // C_vals + context.inputs()[7], // alpha + context.inputs()[8], // beta + }; + + index_type_value_type_dispatch( + args.A_crd.code(), args.A_vals.code(), GeamComputeImpl{context}, args); +} + +} // namespace sparse diff --git 
a/src/legate_sparse/array/csr/get_diagonal.cc b/src/legate_sparse/array/csr/get_diagonal.cc index cace6438..47a8c7d1 100644 --- a/src/legate_sparse/array/csr/get_diagonal.cc +++ b/src/legate_sparse/array/csr/get_diagonal.cc @@ -23,6 +23,9 @@ using namespace legate; template struct GetCSRDiagonalImplBody { + TaskContext context; + explicit GetCSRDiagonalImplBody(TaskContext context) : context(context) {} + using INDEX_TY = type_of; using VAL_TY = type_of; diff --git a/src/legate_sparse/array/csr/get_diagonal.cu b/src/legate_sparse/array/csr/get_diagonal.cu index 15e5e8a4..f0a32dfd 100644 --- a/src/legate_sparse/array/csr/get_diagonal.cu +++ b/src/legate_sparse/array/csr/get_diagonal.cu @@ -45,6 +45,9 @@ __global__ void compute_diag_kernel(size_t rows, template struct GetCSRDiagonalImplBody { + TaskContext context; + explicit GetCSRDiagonalImplBody(TaskContext context) : context(context) {} + using INDEX_TY = type_of; using VAL_TY = type_of; @@ -54,7 +57,7 @@ struct GetCSRDiagonalImplBody { const AccessorRO& vals, const Rect<1>& rect) { - auto stream = get_cached_stream(); + auto stream = context.get_task_stream(); auto blocks = get_num_blocks_1d(rect.volume()); compute_diag_kernel <<>>(rect.volume(), rect.lo[0], diag, pos, crd, vals); diff --git a/src/legate_sparse/array/csr/get_diagonal_omp.cc b/src/legate_sparse/array/csr/get_diagonal_omp.cc index ad698eed..c3d114ef 100644 --- a/src/legate_sparse/array/csr/get_diagonal_omp.cc +++ b/src/legate_sparse/array/csr/get_diagonal_omp.cc @@ -23,6 +23,9 @@ using namespace legate; template struct GetCSRDiagonalImplBody { + TaskContext context; + explicit GetCSRDiagonalImplBody(TaskContext context) : context(context) {} + using INDEX_TY = type_of; using VAL_TY = type_of; diff --git a/src/legate_sparse/array/csr/get_diagonal_template.inl b/src/legate_sparse/array/csr/get_diagonal_template.inl index 74ac61bb..0cee5e9a 100644 --- a/src/legate_sparse/array/csr/get_diagonal_template.inl +++ 
b/src/legate_sparse/array/csr/get_diagonal_template.inl @@ -29,6 +29,9 @@ struct GetCSRDiagonalImplBody; template struct GetCSRDiagonalImpl { + TaskContext context; + explicit GetCSRDiagonalImpl(TaskContext context) : context(context) {} + template void operator()(GetCSRDiagonalArgs& args) const { @@ -45,7 +48,7 @@ struct GetCSRDiagonalImpl { return; } - GetCSRDiagonalImplBody()( + GetCSRDiagonalImplBody{context}( diag, pos, crd, vals, args.diag.shape<1>()); } }; @@ -56,6 +59,6 @@ static void get_csr_diagonal_template(TaskContext context) auto inputs = context.inputs(); GetCSRDiagonalArgs args{context.outputs()[0], inputs[0], inputs[1], inputs[2]}; index_type_value_type_dispatch( - args.crd.code(), args.diag.code(), GetCSRDiagonalImpl{}, args); + args.crd.code(), args.diag.code(), GetCSRDiagonalImpl{context}, args); } } // namespace sparse diff --git a/src/legate_sparse/array/csr/indexing.cc b/src/legate_sparse/array/csr/indexing.cc index f40c901b..1ec4f57a 100644 --- a/src/legate_sparse/array/csr/indexing.cc +++ b/src/legate_sparse/array/csr/indexing.cc @@ -23,6 +23,9 @@ using namespace legate; template struct CSRIndexingCSRImplBody { + TaskContext context; + explicit CSRIndexingCSRImplBody(TaskContext context) : context(context) {} + using INDEX_TY = type_of; using VAL_TY = type_of; diff --git a/src/legate_sparse/array/csr/indexing.cu b/src/legate_sparse/array/csr/indexing.cu index 25e96097..68d03ffd 100644 --- a/src/legate_sparse/array/csr/indexing.cu +++ b/src/legate_sparse/array/csr/indexing.cu @@ -84,6 +84,9 @@ __global__ void csr_indexing_csr_kernel(const size_t num_rows, template struct CSRIndexingCSRImplBody { + TaskContext context; + explicit CSRIndexingCSRImplBody(TaskContext context) : context(context) {} + using INDEX_TY = type_of; using VAL_TY = type_of; @@ -98,9 +101,7 @@ struct CSRIndexingCSRImplBody { // Get the number of rows in the matrix size_t num_rows = rect.hi[0] - rect.lo[0] + 1; - std::cout << "GPU variant" << std::endl; - - auto stream = 
get_cached_stream(); + auto stream = context.get_task_stream(); auto blocks = get_num_blocks_1d(rect.volume()); csr_indexing_csr_kernel<<>>( num_rows, A_pos, A_crd, A_vals, mask_pos, mask_crd, value); diff --git a/src/legate_sparse/array/csr/indexing_omp.cc b/src/legate_sparse/array/csr/indexing_omp.cc index c429481f..a96fc270 100644 --- a/src/legate_sparse/array/csr/indexing_omp.cc +++ b/src/legate_sparse/array/csr/indexing_omp.cc @@ -23,6 +23,9 @@ using namespace legate; template struct CSRIndexingCSRImplBody { + TaskContext context; + explicit CSRIndexingCSRImplBody(TaskContext context) : context(context) {} + using INDEX_TY = type_of; using VAL_TY = type_of; @@ -34,7 +37,6 @@ struct CSRIndexingCSRImplBody { const AccessorRO& value, const Rect<1>& rect) { - std::cout << "OMP variant" << std::endl; #pragma omp parallel for for (size_t row = rect.lo[0]; row < rect.hi[0] + 1; row++) { size_t j_pos_start = A_pos[row].lo; diff --git a/src/legate_sparse/array/csr/indexing_template.inl b/src/legate_sparse/array/csr/indexing_template.inl index 381ca45b..e73efa60 100644 --- a/src/legate_sparse/array/csr/indexing_template.inl +++ b/src/legate_sparse/array/csr/indexing_template.inl @@ -28,6 +28,9 @@ struct CSRIndexingCSRImplBody; template struct CSRIndexingCSRImpl { + TaskContext context; + explicit CSRIndexingCSRImpl(TaskContext context) : context(context) {} + template void operator()(const CSRIndexingCSRArgs& args) { @@ -44,7 +47,7 @@ struct CSRIndexingCSRImpl { auto value = args.value.read_accessor(); // TODO: Rect is based on A_pos.shape, is that correct? 
- CSRIndexingCSRImplBody()( + CSRIndexingCSRImplBody{context}( A_pos, A_crd, A_vals, key_pos, key_crd, value, args.A_pos.shape<1>()); } }; @@ -62,7 +65,7 @@ static void csr_indexing_csr_template(TaskContext context) }; index_type_value_type_dispatch( - args.A_crd.code(), args.A_vals.code(), CSRIndexingCSRImpl(), args); + args.A_crd.code(), args.A_vals.code(), CSRIndexingCSRImpl{context}, args); } } // namespace sparse diff --git a/src/legate_sparse/array/csr/spgemm_csr_csr_csr.cc b/src/legate_sparse/array/csr/spgemm_csr_csr_csr.cc index 6c4945de..71728397 100644 --- a/src/legate_sparse/array/csr/spgemm_csr_csr_csr.cc +++ b/src/legate_sparse/array/csr/spgemm_csr_csr_csr.cc @@ -27,6 +27,9 @@ using namespace legate; template struct SpGEMMCSRxCSRxCSRNNZImplBody { + TaskContext context; + explicit SpGEMMCSRxCSRxCSRNNZImplBody(TaskContext context) : context(context) {} + using INDEX_TY = type_of; void operator()(const AccessorWO& nnz, @@ -94,6 +97,9 @@ struct SpGEMMCSRxCSRxCSRNNZImplBody { template struct SpGEMMCSRxCSRxCSRImplBody { + TaskContext context; + explicit SpGEMMCSRxCSRxCSRImplBody(TaskContext context) : context(context) {} + using INDEX_TY = type_of; using VAL_TY = type_of; diff --git a/src/legate_sparse/array/csr/spgemm_csr_csr_csr.cu b/src/legate_sparse/array/csr/spgemm_csr_csr_csr.cu index 66827db6..3aa14a26 100644 --- a/src/legate_sparse/array/csr/spgemm_csr_csr_csr.cu +++ b/src/legate_sparse/array/csr/spgemm_csr_csr_csr.cu @@ -41,10 +41,10 @@ __global__ void cast_and_offset(size_t elems, DST* dst, const SRC* src, int64_t dst[idx] = static_cast(src[idx] - offset); } -int64_t local_offset_from_nnz(ncclComm_t comm, coord_t task_id, coord_t task_num, int64_t A_nnz) +int64_t local_offset_from_nnz( + ncclComm_t comm, coord_t task_id, coord_t task_num, int64_t A_nnz, cudaStream_t stream) { ThrustAllocator alloc(Memory::GPU_FB_MEM); - auto stream = get_cached_stream(); auto policy = thrust::cuda::par(alloc).on(stream); auto buf = CREATE_BUFFER(int64_t, task_num, 
Memory::GPU_FB_MEM, "nnz_reduce_buf"); auto nnz_reduce_buf = buf.ptr(0); @@ -67,6 +67,9 @@ int64_t local_offset_from_nnz(ncclComm_t comm, coord_t task_id, coord_t task_num } struct SpGEMMCSRxCSRxCSRGPUImpl { + TaskContext context; + explicit SpGEMMCSRxCSRxCSRGPUImpl(TaskContext context) : context(context) {} + template void operator()(SpGEMMCSRxCSRxCSRGPUArgs& args, coord_t task_id, coord_t task_size) const { @@ -106,7 +109,7 @@ struct SpGEMMCSRxCSRxCSRGPUImpl { // Get context sensitive objects. auto handle = get_cusparse(); - auto stream = get_cached_stream(); + auto stream = context.get_task_stream(); CHECK_CUSPARSE(cusparseSetStream(handle, stream)); auto B_rows = B_pos.domain().get_volume(); @@ -331,7 +334,7 @@ struct SpGEMMCSRxCSRxCSRGPUImpl { //@TODO (marsaev): we don't really need nccl comm here // latency for 1 int and host comm should be much better ncclComm_t* comm = args.comms[0].get(); - offset_nnz = local_offset_from_nnz(*comm, task_id, task_num, A_nnz); + offset_nnz = local_offset_from_nnz(*comm, task_id, task_num, A_nnz, stream); } // Convert the A_indptr array into a pos array. @@ -472,7 +475,7 @@ struct SpGEMMCSRxCSRxCSRGPUImpl { //@TODO (marsaev): we don't really need nccl comm here // latency for 1 int and host comm should be much better ncclComm_t* comm = args.comms[0].get(); - offset_nnz = local_offset_from_nnz(*comm, task_id, task_num, A_nnz); + offset_nnz = local_offset_from_nnz(*comm, task_id, task_num, A_nnz, stream); } // Convert the A_indptr array into a pos array. 
@@ -524,7 +527,7 @@ struct SpGEMMCSRxCSRxCSRGPUImpl { context.communicators()}; index_type_floating_point_value_type_dispatch(args.A_crd.code(), args.A_vals.code(), - SpGEMMCSRxCSRxCSRGPUImpl{}, + SpGEMMCSRxCSRxCSRGPUImpl{context}, args, context.get_task_index()[0], context.get_launch_domain().hi()[0]); diff --git a/src/legate_sparse/array/csr/spgemm_csr_csr_csr_omp.cc b/src/legate_sparse/array/csr/spgemm_csr_csr_csr_omp.cc index e6ac4ef6..addf5d59 100644 --- a/src/legate_sparse/array/csr/spgemm_csr_csr_csr_omp.cc +++ b/src/legate_sparse/array/csr/spgemm_csr_csr_csr_omp.cc @@ -28,6 +28,9 @@ using namespace legate; template struct SpGEMMCSRxCSRxCSRNNZImplBody { + TaskContext context; + explicit SpGEMMCSRxCSRxCSRNNZImplBody(TaskContext context) : context(context) {} + using INDEX_TY = type_of; void operator()(const AccessorWO& nnz, @@ -96,6 +99,9 @@ struct SpGEMMCSRxCSRxCSRNNZImplBody { template struct SpGEMMCSRxCSRxCSRImplBody { + TaskContext context; + explicit SpGEMMCSRxCSRxCSRImplBody(TaskContext context) : context(context) {} + using INDEX_TY = type_of; using VAL_TY = type_of; diff --git a/src/legate_sparse/array/csr/spgemm_csr_csr_csr_template.inl b/src/legate_sparse/array/csr/spgemm_csr_csr_csr_template.inl index c958752a..0b99743a 100644 --- a/src/legate_sparse/array/csr/spgemm_csr_csr_csr_template.inl +++ b/src/legate_sparse/array/csr/spgemm_csr_csr_csr_template.inl @@ -32,6 +32,9 @@ struct SpGEMMCSRxCSRxCSRNNZImplBody; template struct SpGEMMCSRxCSRxCSRNNZImpl { + TaskContext context; + explicit SpGEMMCSRxCSRxCSRNNZImpl(TaskContext context) : context(context) {} + template void operator()(SpGEMMCSRxCSRxCSRNNZArgs& args) const { @@ -43,7 +46,7 @@ struct SpGEMMCSRxCSRxCSRNNZImpl { auto C_pos = args.C_pos.read_accessor, 1>(); auto C_crd = args.C_crd.read_accessor(); - SpGEMMCSRxCSRxCSRNNZImplBody()( + SpGEMMCSRxCSRxCSRNNZImplBody{context}( nnz, B_pos, B_crd, C_pos, C_crd, args.B_pos.shape<1>(), args.C_crd.shape<1>()); } }; @@ -53,6 +56,9 @@ struct 
SpGEMMCSRxCSRxCSRImplBody; template struct SpGEMMCSRxCSRxCSRImpl { + TaskContext context; + explicit SpGEMMCSRxCSRxCSRImpl(TaskContext context) : context(context) {} + template void operator()(SpGEMMCSRxCSRxCSRArgs& args) const { @@ -69,17 +75,17 @@ struct SpGEMMCSRxCSRxCSRImpl { auto C_crd = args.C_crd.read_accessor(); auto C_vals = args.C_vals.read_accessor(); - SpGEMMCSRxCSRxCSRImplBody()(A_pos, - A_crd, - A_vals, - B_pos, - B_crd, - B_vals, - C_pos, - C_crd, - C_vals, - args.B_pos.shape<1>(), - args.C_crd.shape<1>()); + SpGEMMCSRxCSRxCSRImplBody{context}(A_pos, + A_crd, + A_vals, + B_pos, + B_crd, + B_vals, + C_pos, + C_crd, + C_vals, + args.B_pos.shape<1>(), + args.C_crd.shape<1>()); } }; @@ -95,7 +101,7 @@ static void spgemm_csr_csr_csr_nnz_template(TaskContext context) inputs[3], }; - index_type_dispatch(args.B_crd.code(), SpGEMMCSRxCSRxCSRNNZImpl{}, args); + index_type_dispatch(args.B_crd.code(), SpGEMMCSRxCSRxCSRNNZImpl{context}, args); } template @@ -115,7 +121,7 @@ static void spgemm_csr_csr_csr_template(TaskContext context) inputs[5], }; index_type_floating_point_value_type_dispatch( - args.A_crd.code(), args.A_vals.code(), SpGEMMCSRxCSRxCSRImpl{}, args); + args.A_crd.code(), args.A_vals.code(), SpGEMMCSRxCSRxCSRImpl{context}, args); } } // namespace sparse diff --git a/src/legate_sparse/array/csr/spmv.cc b/src/legate_sparse/array/csr/spmv.cc index d9efa4fd..42d2576c 100644 --- a/src/legate_sparse/array/csr/spmv.cc +++ b/src/legate_sparse/array/csr/spmv.cc @@ -23,6 +23,9 @@ using namespace legate; template struct CSRSpMVRowSplitImplBody { + TaskContext context; + explicit CSRSpMVRowSplitImplBody(TaskContext context) : context(context) {} + using INDEX_TY = type_of; using VAL_TY = type_of; diff --git a/src/legate_sparse/array/csr/spmv.cu b/src/legate_sparse/array/csr/spmv.cu index f2c5f1a1..536a18d8 100644 --- a/src/legate_sparse/array/csr/spmv.cu +++ b/src/legate_sparse/array/csr/spmv.cu @@ -29,6 +29,9 @@ namespace sparse { template <> struct 
CSRSpMVRowSplitImpl { + TaskContext context; + explicit CSRSpMVRowSplitImpl(TaskContext context) : context(context) {} + template void operator()(CSRSpMVRowSplitArgs& args) const { @@ -48,7 +51,7 @@ struct CSRSpMVRowSplitImpl { // Get context sensitive objects. auto handle = get_cusparse(); - auto stream = get_cached_stream(); + auto stream = context.get_task_stream(); CHECK_CUSPARSE(cusparseSetStream(handle, stream)); // Older cusparse has bug when output vector is not aligned to 16 bytes @@ -109,7 +112,7 @@ struct CSRSpMVRowSplitImpl { CHECK_CUSPARSE(cusparseCreateDnVec( &cusparse_y, y_domain_size /* size */, output_ptr, cusparseDataType())); - auto cusparse_A = makeCuSparseCSR(A_pos, A_crd, A_vals, cols); + auto cusparse_A = makeCuSparseCSR(A_pos, A_crd, A_vals, cols, stream); // Make the CUSPARSE calls. VAL_TY alpha = 1.0; @@ -153,7 +156,7 @@ struct CSRSpMVRowSplitImpl { workspacePtr)); // if we used temporary buffer, copy result to output if (y_aligned) { - LEGATE_CHECK_CUDA(cudaMemcpyAsync( + LEGATE_SPARSE_CHECK_CUDA(cudaMemcpyAsync( y_raw_ptr, output_ptr, y_domain_size * sizeof(VAL_TY), cudaMemcpyDeviceToDevice, stream)); } // Destroy the created objects. 
@@ -170,7 +173,7 @@ struct CSRSpMVRowSplitImpl { CSRSpMVRowSplitArgs args{context.outputs()[0], inputs[0], inputs[1], inputs[2], inputs[3]}; index_type_floating_point_value_type_dispatch( - args.A_crd.code(), args.y.code(), CSRSpMVRowSplitImpl{}, args); + args.A_crd.code(), args.y.code(), CSRSpMVRowSplitImpl{context}, args); } } // namespace sparse diff --git a/src/legate_sparse/array/csr/spmv_omp.cc b/src/legate_sparse/array/csr/spmv_omp.cc index 40b84e83..2937d848 100644 --- a/src/legate_sparse/array/csr/spmv_omp.cc +++ b/src/legate_sparse/array/csr/spmv_omp.cc @@ -23,6 +23,9 @@ using namespace legate; template struct CSRSpMVRowSplitImplBody { + TaskContext context; + explicit CSRSpMVRowSplitImplBody(TaskContext context) : context(context) {} + using INDEX_TY = type_of; using VAL_TY = type_of; diff --git a/src/legate_sparse/array/csr/spmv_template.inl b/src/legate_sparse/array/csr/spmv_template.inl index f339e2d1..ba55490a 100644 --- a/src/legate_sparse/array/csr/spmv_template.inl +++ b/src/legate_sparse/array/csr/spmv_template.inl @@ -31,6 +31,9 @@ struct CSRSpMVRowSplitImplBody; template struct CSRSpMVRowSplitImpl { + TaskContext context; + explicit CSRSpMVRowSplitImpl(TaskContext context) : context(context) {} + template void operator()(CSRSpMVRowSplitArgs& args) const { @@ -48,7 +51,7 @@ struct CSRSpMVRowSplitImpl { return; } - CSRSpMVRowSplitImplBody()( + CSRSpMVRowSplitImplBody{context}( y, A_pos, A_crd, A_vals, x, args.y.shape<1>()); } }; @@ -60,7 +63,7 @@ static void csr_spmv_row_split_template(TaskContext context) CSRSpMVRowSplitArgs args{context.outputs()[0], inputs[0], inputs[1], inputs[2], inputs[3]}; index_type_value_type_dispatch( - args.A_crd.code(), args.y.code(), CSRSpMVRowSplitImpl{}, args); + args.A_crd.code(), args.y.code(), CSRSpMVRowSplitImpl{context}, args); } } // namespace sparse diff --git a/src/legate_sparse/array/util/scale_rect.cc b/src/legate_sparse/array/util/scale_rect.cc index c2d2df90..50bbedd8 100644 --- 
a/src/legate_sparse/array/util/scale_rect.cc +++ b/src/legate_sparse/array/util/scale_rect.cc @@ -23,6 +23,9 @@ using namespace legate; template <> struct ScaleRect1ImplBody { + TaskContext context; + explicit ScaleRect1ImplBody(TaskContext context) : context(context) {} + void operator()(const AccessorRW, 1>& output, const int64_t scale, const Rect<1>& rect) { for (coord_t i = rect.lo[0]; i < rect.hi[0] + 1; i++) { diff --git a/src/legate_sparse/array/util/scale_rect.cu b/src/legate_sparse/array/util/scale_rect.cu index 22132340..fc07bcd6 100644 --- a/src/legate_sparse/array/util/scale_rect.cu +++ b/src/legate_sparse/array/util/scale_rect.cu @@ -38,11 +38,14 @@ __global__ void scale_rect1_kernel(size_t elems, template <> struct ScaleRect1ImplBody { + TaskContext context; + explicit ScaleRect1ImplBody(TaskContext context) : context(context) {} + void operator()(const AccessorRW, 1>& output, const int64_t scale, const Rect<1>& rect) { auto elems = rect.volume(); auto blocks = get_num_blocks_1d(elems); - auto stream = get_cached_stream(); + auto stream = context.get_task_stream(); scale_rect1_kernel<<>>(elems, rect.lo, output, scale); LEGATE_SPARSE_CHECK_CUDA_STREAM(stream); } diff --git a/src/legate_sparse/array/util/scale_rect_omp.cc b/src/legate_sparse/array/util/scale_rect_omp.cc index fc850bf3..1783d335 100644 --- a/src/legate_sparse/array/util/scale_rect_omp.cc +++ b/src/legate_sparse/array/util/scale_rect_omp.cc @@ -23,6 +23,9 @@ using namespace legate; template <> struct ScaleRect1ImplBody { + TaskContext context; + explicit ScaleRect1ImplBody(TaskContext context) : context(context) {} + void operator()(const AccessorRW, 1>& output, const int64_t scale, const Rect<1>& rect) { #pragma omp parallel for schedule(static) diff --git a/src/legate_sparse/array/util/scale_rect_template.inl b/src/legate_sparse/array/util/scale_rect_template.inl index 11724c24..512dc2c8 100644 --- a/src/legate_sparse/array/util/scale_rect_template.inl +++ 
b/src/legate_sparse/array/util/scale_rect_template.inl @@ -29,13 +29,16 @@ struct ScaleRect1ImplBody; template struct ScaleRect1Impl { + TaskContext context; + explicit ScaleRect1Impl(TaskContext context) : context(context) {} + void operator()(ScaleRect1Args& args) const { auto output = args.out.read_write_accessor, 1>(); if (args.out.domain().empty()) { return; } - ScaleRect1ImplBody()(output, args.scale, args.out.shape<1>()); + ScaleRect1ImplBody{context}(output, args.scale, args.out.shape<1>()); } }; @@ -45,7 +48,7 @@ static void scale_rect_1_template(TaskContext context) auto task = context.task_; auto scale = task->futures[0].get_result(); ScaleRect1Args args{context.outputs()[0], scale}; - ScaleRect1Impl{}(args); + ScaleRect1Impl{context}(args); } } // namespace sparse diff --git a/src/legate_sparse/array/util/unzip_rect.cc b/src/legate_sparse/array/util/unzip_rect.cc index 1272e9cc..08170da7 100644 --- a/src/legate_sparse/array/util/unzip_rect.cc +++ b/src/legate_sparse/array/util/unzip_rect.cc @@ -23,6 +23,9 @@ using namespace legate; template <> struct UnZipRect1ImplBody { + TaskContext context; + explicit UnZipRect1ImplBody(TaskContext context) : context(context) {} + void operator()(const AccessorWO& out1, const AccessorWO& out2, const AccessorRO, 1>& in, diff --git a/src/legate_sparse/array/util/unzip_rect.cu b/src/legate_sparse/array/util/unzip_rect.cu index 28067190..d7964c01 100644 --- a/src/legate_sparse/array/util/unzip_rect.cu +++ b/src/legate_sparse/array/util/unzip_rect.cu @@ -39,6 +39,9 @@ __global__ void unzip_rect1_kernel(size_t elems, template <> struct UnZipRect1ImplBody { + TaskContext context; + explicit UnZipRect1ImplBody(TaskContext context) : context(context) {} + void operator()(const AccessorWO& out1, const AccessorWO& out2, const AccessorRO, 1>& in, @@ -46,7 +49,7 @@ struct UnZipRect1ImplBody { { auto elems = rect.volume(); auto blocks = get_num_blocks_1d(elems); - auto stream = get_cached_stream(); + auto stream = 
context.get_task_stream(); unzip_rect1_kernel<<>>(elems, rect.lo, out1, out2, in); LEGATE_SPARSE_CHECK_CUDA_STREAM(stream); } diff --git a/src/legate_sparse/array/util/unzip_rect_omp.cc b/src/legate_sparse/array/util/unzip_rect_omp.cc index e57c43cd..b0345795 100644 --- a/src/legate_sparse/array/util/unzip_rect_omp.cc +++ b/src/legate_sparse/array/util/unzip_rect_omp.cc @@ -23,6 +23,9 @@ using namespace legate; template <> struct UnZipRect1ImplBody { + TaskContext context; + explicit UnZipRect1ImplBody(TaskContext context) : context(context) {} + void operator()(const AccessorWO& out1, const AccessorWO& out2, const AccessorRO, 1>& in, diff --git a/src/legate_sparse/array/util/unzip_rect_template.inl b/src/legate_sparse/array/util/unzip_rect_template.inl index d8bd9d2e..2c97d28d 100644 --- a/src/legate_sparse/array/util/unzip_rect_template.inl +++ b/src/legate_sparse/array/util/unzip_rect_template.inl @@ -29,6 +29,9 @@ struct UnZipRect1ImplBody; template struct UnZipRect1Impl { + TaskContext context; + explicit UnZipRect1Impl(TaskContext context) : context(context) {} + void operator()(UnZipRect1Args& args) const { auto out1 = args.out1.write_accessor(); @@ -37,7 +40,7 @@ struct UnZipRect1Impl { if (args.in.domain().empty()) { return; } - UnZipRect1ImplBody()(out1, out2, in, args.in.shape<1>()); + UnZipRect1ImplBody{context}(out1, out2, in, args.in.shape<1>()); } }; @@ -46,7 +49,7 @@ static void unzip_rect_1_template(TaskContext context) { auto outputs = context.outputs(); UnZipRect1Args args{outputs[0], outputs[1], context.inputs()[0]}; - UnZipRect1Impl{}(args); + UnZipRect1Impl{context}(args); } } // namespace sparse diff --git a/src/legate_sparse/array/util/zip_to_rect.cc b/src/legate_sparse/array/util/zip_to_rect.cc index c8871583..dcbd8dfb 100644 --- a/src/legate_sparse/array/util/zip_to_rect.cc +++ b/src/legate_sparse/array/util/zip_to_rect.cc @@ -23,6 +23,9 @@ using namespace legate; template struct ZipToRect1ImplBody { + TaskContext context; + explicit 
ZipToRect1ImplBody(TaskContext context) : context(context) {} + void operator()(const AccessorWO, 1>& output, const AccessorRO& lo, const AccessorRO& hi, diff --git a/src/legate_sparse/array/util/zip_to_rect.cu b/src/legate_sparse/array/util/zip_to_rect.cu index 697422e0..393c5860 100644 --- a/src/legate_sparse/array/util/zip_to_rect.cu +++ b/src/legate_sparse/array/util/zip_to_rect.cu @@ -39,12 +39,15 @@ __global__ void zip_rect1_kernel(size_t elems, template struct ZipToRect1ImplBody { + TaskContext context; + explicit ZipToRect1ImplBody(TaskContext context) : context(context) {} + void operator()(const AccessorWO, 1>& output, const AccessorRO& lo, const AccessorRO& hi, const Rect<1>& rect) { - auto stream = get_cached_stream(); + auto stream = context.get_task_stream(); auto elems = rect.volume(); auto blocks = get_num_blocks_1d(elems); zip_rect1_kernel<<>>(elems, rect.lo, output, lo, hi); diff --git a/src/legate_sparse/array/util/zip_to_rect_omp.cc b/src/legate_sparse/array/util/zip_to_rect_omp.cc index 03738d36..0b72d8d3 100644 --- a/src/legate_sparse/array/util/zip_to_rect_omp.cc +++ b/src/legate_sparse/array/util/zip_to_rect_omp.cc @@ -23,6 +23,9 @@ using namespace legate; template struct ZipToRect1ImplBody { + TaskContext context; + explicit ZipToRect1ImplBody(TaskContext context) : context(context) {} + void operator()(const AccessorWO, 1>& output, const AccessorRO& lo, const AccessorRO& hi, diff --git a/src/legate_sparse/array/util/zip_to_rect_template.inl b/src/legate_sparse/array/util/zip_to_rect_template.inl index c53411e8..7ebad169 100644 --- a/src/legate_sparse/array/util/zip_to_rect_template.inl +++ b/src/legate_sparse/array/util/zip_to_rect_template.inl @@ -29,6 +29,9 @@ struct ZipToRect1ImplBody; template struct ZipToRect1Impl { + TaskContext context; + explicit ZipToRect1Impl(TaskContext context) : context(context) {} + void operator()(ZipToRect1Args& args) const { auto output = args.out.write_accessor, 1>(); @@ -37,7 +40,7 @@ struct 
ZipToRect1Impl { if (args.out.domain().empty()) { return; } - ZipToRect1ImplBody()(output, lo, hi, args.out.shape<1>()); + ZipToRect1ImplBody{context}(output, lo, hi, args.out.shape<1>()); } }; @@ -47,10 +50,10 @@ static void zip_to_rect_1_template(TaskContext context) auto inputs = context.inputs(); ZipToRect1Args args{context.outputs()[0], inputs[0], inputs[1]}; if (inputs[0].data().type().code() == legate::Type::Code::INT64) { - ZipToRect1Impl{}(args); + ZipToRect1Impl{context}(args); } else { assert(inputs[0].data().type().code() == legate::Type::Code::UINT64); - ZipToRect1Impl{}(args); + ZipToRect1Impl{context}(args); } } diff --git a/src/legate_sparse/cffi.h b/src/legate_sparse/cffi.h index cdd53926..584311aa 100644 --- a/src/legate_sparse/cffi.h +++ b/src/legate_sparse/cffi.h @@ -45,6 +45,13 @@ enum LegateSparseOpCode { // like iterative linear solvers. LEGATE_SPARSE_AXPBY, + // Sparse direct linear solve + LEGATE_SPARSE_SPSOLVE, + + // Computes GEAM: alpha * A + beta * B = C + LEGATE_SPARSE_GEAM_CSR_CSR_SYMBOLIC, + LEGATE_SPARSE_GEAM_CSR_CSR_COMPUTE, + // nonzero API LEGATE_SPARSE_NONZERO, diff --git a/src/legate_sparse/cudalibs.cu b/src/legate_sparse/cudalibs.cu index 6ec45bd5..d2607fb2 100644 --- a/src/legate_sparse/cudalibs.cu +++ b/src/legate_sparse/cudalibs.cu @@ -22,7 +22,7 @@ namespace sparse { -CUDALibraries::CUDALibraries() : finalized_(false), cusparse_(nullptr) {} +CUDALibraries::CUDALibraries() : finalized_(false), cusparse_(nullptr), cudss_(nullptr) {} CUDALibraries::~CUDALibraries() { finalize(); } @@ -34,6 +34,9 @@ void CUDALibraries::finalize() if (cusparse_ != nullptr) { finalize_cusparse(); } + if (cudss_ != nullptr) { + finalize_cudss(); + } finalized_ = true; } @@ -51,6 +54,20 @@ cusparseHandle_t CUDALibraries::get_cusparse() return this->cusparse_; } +void CUDALibraries::finalize_cudss() +{ + CHECK_CUDSS(cudssDestroy(cudss_)); + cudss_ = nullptr; +} + +cudssHandle_t CUDALibraries::get_cudss() +{ + if (this->cudss_ == nullptr) { + 
CHECK_CUDSS(cudssCreate(&this->cudss_)); + } + return this->cudss_; +} + static CUDALibraries& get_cuda_libraries(legate::Processor proc) { if (proc.kind() != legate::Processor::TOC_PROC) { @@ -63,16 +80,18 @@ static CUDALibraries& get_cuda_libraries(legate::Processor proc) return cuda_libraries[proc_id]; } -legate::cuda::StreamView get_cached_stream() +cusparseHandle_t get_cusparse() { - return legate::cuda::StreamPool::get_stream_pool().get_stream(); + const auto proc = legate::Processor::get_executing_processor(); + auto& lib = get_cuda_libraries(proc); + return lib.get_cusparse(); } -cusparseHandle_t get_cusparse() +cudssHandle_t get_cudss() { const auto proc = legate::Processor::get_executing_processor(); auto& lib = get_cuda_libraries(proc); - return lib.get_cusparse(); + return lib.get_cudss(); } class LoadCUDALibsTask : public SparseTask { @@ -86,6 +105,7 @@ class LoadCUDALibsTask : public SparseTask { const auto proc = legate::Processor::get_executing_processor(); auto& lib = get_cuda_libraries(proc); lib.get_cusparse(); + lib.get_cudss(); } }; diff --git a/src/legate_sparse/cudalibs.h b/src/legate_sparse/cudalibs.h index 5a387200..47596a79 100644 --- a/src/legate_sparse/cudalibs.h +++ b/src/legate_sparse/cudalibs.h @@ -33,13 +33,16 @@ struct CUDALibraries { public: void finalize(); cusparseHandle_t get_cusparse(); + cudssHandle_t get_cudss(); private: void finalize_cusparse(); + void finalize_cudss(); private: bool finalized_; cusparseHandle_t cusparse_; + cudssHandle_t cudss_; }; } // namespace sparse diff --git a/src/legate_sparse/linalg/axpby.cc b/src/legate_sparse/linalg/axpby.cc index 547ad927..43e99520 100644 --- a/src/legate_sparse/linalg/axpby.cc +++ b/src/legate_sparse/linalg/axpby.cc @@ -23,6 +23,9 @@ using namespace legate; template struct AXPBYImplBody { + TaskContext context; + explicit AXPBYImplBody(TaskContext context) : context(context) {} + using VAL_TY = type_of; void operator()(const AccessorRW& y, diff --git 
a/src/legate_sparse/linalg/axpby.cu b/src/legate_sparse/linalg/axpby.cu index 784e77a3..f7ee1feb 100644 --- a/src/legate_sparse/linalg/axpby.cu +++ b/src/legate_sparse/linalg/axpby.cu @@ -48,6 +48,9 @@ __global__ void axpby_kernel(size_t elems, template struct AXPBYImplBody { + TaskContext context; + explicit AXPBYImplBody(TaskContext context) : context(context) {} + using VAL_TY = type_of; void operator()(const AccessorRW& y, @@ -58,7 +61,7 @@ struct AXPBYImplBody { { auto elems = rect.volume(); auto blocks = get_num_blocks_1d(elems); - auto stream = get_cached_stream(); + auto stream = context.get_task_stream(); axpby_kernel <<>>(elems, rect.lo[0], y, x, a, b); LEGATE_SPARSE_CHECK_CUDA_STREAM(stream); diff --git a/src/legate_sparse/linalg/axpby_omp.cc b/src/legate_sparse/linalg/axpby_omp.cc index eb019b5c..c5569fba 100644 --- a/src/legate_sparse/linalg/axpby_omp.cc +++ b/src/legate_sparse/linalg/axpby_omp.cc @@ -23,6 +23,9 @@ using namespace legate; template struct AXPBYImplBody { + TaskContext context; + explicit AXPBYImplBody(TaskContext context) : context(context) {} + using VAL_TY = type_of; void operator()(const AccessorRW& y, diff --git a/src/legate_sparse/linalg/axpby_template.inl b/src/legate_sparse/linalg/axpby_template.inl index 8651672d..d110f6de 100644 --- a/src/legate_sparse/linalg/axpby_template.inl +++ b/src/legate_sparse/linalg/axpby_template.inl @@ -29,6 +29,9 @@ struct AXPBYImplBody; template struct AXPBYImpl { + TaskContext context; + explicit AXPBYImpl(TaskContext context) : context(context) {} + template void operator()(AXPBYArgs& args) const { @@ -42,15 +45,15 @@ struct AXPBYImpl { } if (args.isalpha) { if (args.negate) { - AXPBYImplBody()(y, x, a, b, args.y.shape<1>()); + AXPBYImplBody{context}(y, x, a, b, args.y.shape<1>()); } else { - AXPBYImplBody()(y, x, a, b, args.y.shape<1>()); + AXPBYImplBody{context}(y, x, a, b, args.y.shape<1>()); } } else { if (args.negate) { - AXPBYImplBody()(y, x, a, b, args.y.shape<1>()); + 
AXPBYImplBody{context}(y, x, a, b, args.y.shape<1>()); } else { - AXPBYImplBody()(y, x, a, b, args.y.shape<1>()); + AXPBYImplBody{context}(y, x, a, b, args.y.shape<1>()); } } } @@ -67,7 +70,7 @@ static void axpby_template(TaskContext context) context.scalars()[0].value(), context.scalars()[1].value(), }; - value_type_dispatch(args.y.code(), AXPBYImpl{}, args); + value_type_dispatch(args.y.code(), AXPBYImpl{context}, args); } } // namespace sparse diff --git a/src/legate_sparse/linalg/spsolve.cc b/src/legate_sparse/linalg/spsolve.cc new file mode 100644 index 00000000..446d9e04 --- /dev/null +++ b/src/legate_sparse/linalg/spsolve.cc @@ -0,0 +1,34 @@ +/* Copyright 2022-2024 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + */ + +#include "legate_sparse/linalg/spsolve.h" +#include "legate_sparse/util/dispatch.h" +#include "legate_sparse/util/legate_utils.h" + +namespace sparse { + +using namespace legate; + +namespace // unnamed +{ +static const auto sparse_reg_task_ = []() -> char { + SpSolve::register_variants(); + return 0; +}(); + +} // namespace + +} // namespace sparse diff --git a/src/legate_sparse/linalg/spsolve.cu b/src/legate_sparse/linalg/spsolve.cu new file mode 100644 index 00000000..ace6fe5e --- /dev/null +++ b/src/legate_sparse/linalg/spsolve.cu @@ -0,0 +1,184 @@ +/* Copyright 2022-2024 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + */ + +#include "legate_sparse/linalg/spsolve.h" +#include "legate_sparse/util/cusparse_utils.h" +#include "legate_sparse/util/cudss_utils.h" +#include "legate_sparse/util/dispatch.h" +#include "legate_sparse/util/legate_utils.h" + +namespace sparse { + +struct SpSolveImpl { + TaskContext context; + explicit SpSolveImpl(TaskContext context) : context(context) {} + + template + void operator()(SpSolveArgs& args, int num_gpus) const + { + using INDEX_TY = type_of; + using VAL_TY = type_of; + + auto& A_pos = args.A_pos; + auto& A_crd = args.A_crd; + auto& A_vals = args.A_vals; + auto& b = args.b; + auto& x = args.x; // output + auto comms = args.comms; + uint64_t nrows_g = args.nrows_g; + uint64_t nnz_g = args.nnz_g; + uint64_t ncols_g = nrows_g; + + int hybrid_mode = 0; // 0 = GPU-only execution in cuDSS + + // cuDSS handle and stream set + auto handle = get_cudss(); + auto stream = context.get_task_stream(); + CHECK_CUDSS(cudssSetStream(handle, stream)); + + // create configuration and data objects + cudssConfig_t config; + cudssData_t solverData; + + CHECK_CUDSS(cudssConfigCreate(&config)); + CHECK_CUDSS(cudssConfigSet(config, CUDSS_CONFIG_HYBRID_MODE, &hybrid_mode, sizeof(int))); + CHECK_CUDSS(cudssDataCreate(handle, &solverData)); + + // A x = b + // (m, n) (n, 1) = (m, 1); m = nrows, n = ncols + // _l: local (e.g., shape of the partitioned array) + // _g: global (e.g., global shape of the array) + + int64_t nrows_l = A_pos.domain().get_volume(); + int64_t ncols_l = x.domain().get_volume(); + int64_t nnz_l = A_vals.domain().get_volume(); + + int64_t nrhs = 1; // Number of right-hand side + int64_t ldb = nrows_g; // leading dimension of b + int64_t ldx = ncols_g; // leading dimension of x + + auto A_indptr = CREATE_BUFFER(int64_t, nrows_l + 1, Memory::GPU_FB_MEM, "A_indptr"); + { + auto blocks = get_num_blocks_1d(nrows_l); + convertGlobalPosToLocalIndPtr<<>>( + nrows_l, A_pos.read_accessor, 1>().ptr(A_pos.domain().lo()), A_indptr.ptr(0)); + } + + 
CHECK_CUDSS(cudssSetStream(handle, stream)); + + cudssMatrix_t mat_A, vec_b, vec_x; + CHECK_CUDSS(cudssMatrixCreateCsr(&mat_A, // pointer to the matrix + nrows_g, // number of rows + ncols_g, // number of columns + nnz_g, // number of non-zeros + (void*)A_indptr.ptr(0), // offsets, + nullptr, // end index if start index was used + getPtrFromStore(A_crd), // column indices + getPtrFromStore(A_vals), // values + cudssIndexType(), // indexType + cudssDataType(), // valueType + CUDSS_MTYPE_GENERAL, // matrix type + CUDSS_MVIEW_FULL, // matrix view + CUDSS_BASE_ZERO // indexBase + )); + + // NOTE: + // nrhs should be derived from b (b.shape[1]) and MUST be 1 right now. + // When we support multi-dimensional right-hand sides, we need to + // make sure that a column major order is chosen in the mapper + + auto x_ptr = getPtrFromStore(x); + + // Create dense output vector, x, of shape (ncol_g, nrhs) + CHECK_CUDSS(cudssMatrixCreateDn(&vec_x, + ncols_g, // number of rows + nrhs, // number of RHS, set to 1 + ldx, // Leading dimension of x + (void*)x_ptr, // Values of the dense matrix + cudssDataType(), // Data type of the dense vector + CUDSS_LAYOUT_COL_MAJOR) // Layout + ); + + auto b_ptr = getPtrFromStore(b); + + // Create dense RHS vector, b, of shape (nrows_g, nrhs) + CHECK_CUDSS(cudssMatrixCreateDn(&vec_b, + nrows_g, // number of rows + nrhs, // number of RHS, set to 1 + ldb, // Leading dimension of b + (void*)b_ptr, // Values of the dense matrix + cudssDataType(), // Data type of the dense vector + CUDSS_LAYOUT_COL_MAJOR) // Layout + ); + + // Matrix and Vectors are partitioned row-wise + if (num_gpus > 1) { + ncclComm_t* comm = comms[0].get(); + cudssMatrixSetDistributionRow1d(mat_A, + static_cast(A_pos.domain().lo()[0]), + static_cast(A_pos.domain().hi()[0])); + cudssMatrixSetDistributionRow1d( + vec_b, static_cast(b.domain().lo()[0]), static_cast(b.domain().hi()[0])); + cudssMatrixSetDistributionRow1d( + vec_x, static_cast(x.domain().lo()[0]), 
static_cast(x.domain().hi()[0])); + + // path to libcudss_commlayer_nccl.so is obtained from the env CUDSS_COMM_LIB + CHECK_CUDSS(cudssSetCommLayer(handle, nullptr)); + CHECK_CUDSS(cudssDataSet(handle, solverData, CUDSS_DATA_COMM, comm, sizeof(ncclComm_t*))); + } + + // Solve + CHECK_CUDSS( + cudssExecute(handle, CUDSS_PHASE_ANALYSIS, config, solverData, mat_A, vec_x, vec_b)); + + CHECK_CUDSS( + cudssExecute(handle, CUDSS_PHASE_FACTORIZATION, config, solverData, mat_A, vec_x, vec_b)); + + CHECK_CUDSS(cudssExecute(handle, CUDSS_PHASE_SOLVE, config, solverData, mat_A, vec_x, vec_b)); + + // Destroy matrix, vectors, and setup + CHECK_CUDSS(cudssMatrixDestroy(mat_A)); + CHECK_CUDSS(cudssMatrixDestroy(vec_x)); + CHECK_CUDSS(cudssMatrixDestroy(vec_b)); + CHECK_CUDSS(cudssDataDestroy(handle, solverData)); + CHECK_CUDSS(cudssConfigDestroy(config)); + + LEGATE_SPARSE_CHECK_CUDA(cudaStreamSynchronize(stream)); + } +}; + +/* static */ void SpSolve::gpu_variant(TaskContext context) +{ + auto inputs = context.inputs(); + auto outputs = context.outputs(); + auto comms = context.communicators(); + + SpSolveArgs args{inputs[0], // A_pos + inputs[1], // A_crd + inputs[2], // A_vals + inputs[3], // b + outputs[0], // x + context.scalars()[0].value(), // nrows_g + context.scalars()[1].value(), // nnz_g + comms}; + int num_gpus = static_cast(context.get_launch_domain().hi()[0]) + 1; + index_type_floating_point_value_type_dispatch( + args.A_crd.code(), args.A_vals.code(), SpSolveImpl{context}, args, num_gpus); +} + +using namespace legate; + +} // namespace sparse diff --git a/src/legate_sparse/linalg/spsolve.h b/src/legate_sparse/linalg/spsolve.h new file mode 100644 index 00000000..68908f3e --- /dev/null +++ b/src/legate_sparse/linalg/spsolve.h @@ -0,0 +1,48 @@ +/* Copyright 2022-2024 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +#pragma once + +#include "legate_sparse/sparse.h" +#include "legate_sparse/sparse_c.h" +#include "legate.h" + +namespace sparse { + +struct SpSolveArgs { + const legate::PhysicalStore& A_pos; + const legate::PhysicalStore& A_crd; + const legate::PhysicalStore& A_vals; + const legate::PhysicalStore& b; + const legate::PhysicalStore& x; // output + const uint64_t nrows_g; // global number of rows + const uint64_t nnz_g; // global number of nonzeros + std::vector comms; +}; + +class SpSolve : public SparseTask { + public: + static inline const auto TASK_CONFIG = + legate::TaskConfig{legate::LocalTaskID{LEGATE_SPARSE_SPSOLVE}}; + static constexpr legate::VariantOptions GPU_VARIANT_OPTIONS = + legate::VariantOptions{}.with_has_allocations(true); + +#ifdef LEGATE_USE_CUDA + static void gpu_variant(legate::TaskContext ctx); +#endif +}; + +} // namespace sparse diff --git a/src/legate_sparse/mapper/mapper.cc b/src/legate_sparse/mapper/mapper.cc index 6357d898..8c330b9d 100644 --- a/src/legate_sparse/mapper/mapper.cc +++ b/src/legate_sparse/mapper/mapper.cc @@ -126,6 +126,10 @@ std::optional LegateSparseMapper::allocation_pool_size(const Task& return std::nullopt; } + case LEGATE_SPARSE_SPSOLVE: { + return std::nullopt; + } + default: { // Handle any unhandled enum values LEGATE_ABORT("Unsupported Legate Sparse task_id: " + std::to_string(task_id)); diff --git a/src/legate_sparse/partition/fast_image_partition.cu b/src/legate_sparse/partition/fast_image_partition.cu index 47a79606..1825bce0 100644 --- 
a/src/legate_sparse/partition/fast_image_partition.cu +++ b/src/legate_sparse/partition/fast_image_partition.cu @@ -28,6 +28,9 @@ using namespace legate; template struct FastImageRangeImplBody { + TaskContext context; + explicit FastImageRangeImplBody(TaskContext context) : context(context) {} + using INDEX_TY = type_of; void operator()(const AccessorWO, 1>& out_pos, @@ -37,7 +40,7 @@ struct FastImageRangeImplBody { const Rect<1>& bounds) { ThrustAllocator alloc(Memory::GPU_FB_MEM); - auto stream = get_cached_stream(); + auto stream = context.get_task_stream(); auto thrust_exec_policy = thrust::cuda::par(alloc).on(stream); thrust::pair result = thrust::minmax_element( diff --git a/src/legate_sparse/partition/fast_image_partition_template.inl b/src/legate_sparse/partition/fast_image_partition_template.inl index 4e74731e..2b7cb3e6 100644 --- a/src/legate_sparse/partition/fast_image_partition_template.inl +++ b/src/legate_sparse/partition/fast_image_partition_template.inl @@ -30,6 +30,9 @@ struct FastImageRangeImplBody; template struct FastImageRangeImpl { + TaskContext context; + explicit FastImageRangeImpl(TaskContext context) : context(context) {} + template void operator()(FastImageRangeArgs& args) const { @@ -43,7 +46,7 @@ struct FastImageRangeImpl { if (args.input_crd.domain().empty()) { return; } - FastImageRangeImplBody()( + FastImageRangeImplBody{context}( output_pos, input_pos, input_crd, args.input_pos.shape<1>(), args.input_crd.shape<1>()); } }; @@ -52,7 +55,7 @@ template static void fast_image_range_template(TaskContext context) { FastImageRangeArgs args{context.output(0), context.input(0), context.input(1)}; - index_type_dispatch(args.input_crd.code(), FastImageRangeImpl{}, args); + index_type_dispatch(args.input_crd.code(), FastImageRangeImpl{context}, args); } } // namespace sparse diff --git a/src/legate_sparse/util/cuda_help.h b/src/legate_sparse/util/cuda_help.h index d009f9d6..61e83da0 100644 --- a/src/legate_sparse/util/cuda_help.h +++ 
b/src/legate_sparse/util/cuda_help.h @@ -18,9 +18,13 @@ #include #include "legate.h" -#include "legate/cuda/cuda.h" -#include "legate/cuda/stream_pool.h" + +// For sparse matrix ops like spGEMM and spMv #include + +// For direct solvers +#include + #include #define THREADS_PER_BLOCK 128 @@ -31,6 +35,12 @@ check_cusparse(result, __FILE__, __LINE__); \ } while (false) +#define CHECK_CUDSS(expr) \ + do { \ + cudssStatus_t result = (expr); \ + check_cudss(result, __FILE__, __LINE__); \ + } while (false) + #define CHECK_NCCL(expr) \ do { \ ncclResult_t result = (expr); \ @@ -102,6 +112,24 @@ __host__ inline void check_cusparse(cusparseStatus_t status, const char* file, i } } +__host__ inline void check_cudss(cudssStatus_t status, const char* file, int line) +{ + // TODO: Need to get the equivalent error message from cuDSS + if (status != CUDSS_STATUS_SUCCESS) { + fprintf(stderr, + "Internal CUDSS failure with error code %d in file %s at line %d\n", + status, + // TODO + file, + line); +#ifdef DEBUG_LEGATE_SPARSE + assert(false); +#else + exit(status); +#endif + } +} + __host__ inline void check_nccl(ncclResult_t error, const char* file, int line) { if (error != ncclSuccess) { @@ -118,10 +146,9 @@ __host__ inline void check_nccl(ncclResult_t error, const char* file, int line) } } -// Return a cached stream for the current GPU. -legate::cuda::StreamView get_cached_stream(); - // Method to get the CUSPARSE handle associated with the current GPU. cusparseHandle_t get_cusparse(); +cudssHandle_t get_cudss(); + } // namespace sparse diff --git a/src/legate_sparse/util/cudss_utils.h b/src/legate_sparse/util/cudss_utils.h new file mode 100644 index 00000000..e72d466c --- /dev/null +++ b/src/legate_sparse/util/cudss_utils.h @@ -0,0 +1,73 @@ +/* Copyright 2022-2024 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +#pragma once + +#include "legate_sparse/sparse.h" +#include "legate_sparse/util/cuda_help.h" +#include "legate_sparse/util/legate_utils.h" + +namespace sparse { + +using namespace legate; + +// Template dispatch for value type. +// Note: cuDSS only supports floating-point and complex types. +// Integer and boolean types are not supported by cuDSS. +template +cudaDataType_t cudssDataType(); + +template <> +inline cudaDataType_t cudssDataType() +{ + return CUDA_R_32F; +} + +template <> +inline cudaDataType_t cudssDataType() +{ + return CUDA_R_64F; +} + +template <> +inline cudaDataType_t cudssDataType>() +{ + return CUDA_C_32F; +} + +template <> +inline cudaDataType_t cudssDataType>() +{ + return CUDA_C_64F; +} + +// Template dispatch for the index type. 
+template +cudaDataType_t cudssIndexType(); + +template <> +inline cudaDataType_t cudssIndexType() +{ + return CUDA_R_32I; +} + +template <> +inline cudaDataType_t cudssIndexType() +{ + return CUDA_R_64I; +} + +} // namespace sparse diff --git a/src/legate_sparse/util/cusparse_utils.h b/src/legate_sparse/util/cusparse_utils.h index 6d496a3e..3ea1029b 100644 --- a/src/legate_sparse/util/cusparse_utils.h +++ b/src/legate_sparse/util/cusparse_utils.h @@ -14,9 +14,12 @@ * */ +#pragma once + #include "legate_sparse/sparse.h" #include "legate_sparse/util/cuda_help.h" #include "legate_sparse/util/legate_utils.h" +#include namespace sparse { @@ -75,8 +78,7 @@ void* getPtrFromStore(const legate::PhysicalStore& store) } else if (!store.is_writable() && store.is_readable()) { return const_cast(store.read_accessor().ptr(dom.lo())); } else if (store.is_reducible()) { - return store.reduce_accessor, true /* exclusive */, DIM>().ptr( - dom.lo()); + return store.reduce_accessor, true, DIM>().ptr(dom.lo()); } else { assert(false); return nullptr; @@ -100,13 +102,13 @@ inline cudaDataType cusparseDataType() } template <> -inline cudaDataType cusparseDataType>() +inline cudaDataType cusparseDataType>() { return CUDA_C_32F; } template <> -inline cudaDataType cusparseDataType>() +inline cudaDataType cusparseDataType>() { return CUDA_C_64F; } @@ -133,10 +135,10 @@ template cusparseSpMatDescr_t makeCuSparseCSR(const legate::PhysicalStore& pos, const legate::PhysicalStore& crd, const legate::PhysicalStore& vals, - size_t cols) + size_t cols, + cudaStream_t stream) { cusparseSpMatDescr_t matDescr; - auto stream = get_cached_stream(); auto pos_domain = pos.domain(); auto crd_domain = crd.domain(); @@ -169,10 +171,10 @@ template cusparseSpMatDescr_t makeCuSparseCSC(const legate::PhysicalStore& pos, const legate::PhysicalStore& crd, const legate::PhysicalStore& vals, - size_t rows) + size_t rows, + cudaStream_t stream) { cusparseSpMatDescr_t matDescr; - auto stream = get_cached_stream(); auto 
pos_domain = pos.domain(); auto crd_domain = crd.domain(); @@ -237,7 +239,7 @@ cusparseDnMatDescr_t makeCuSparseDenseMat(const legate::PhysicalStore& mat) valsPtr = const_cast(acc.ptr(d.lo())); ld = acc.accessor.strides[0] / sizeof(VAL_TY); } else if (mat.is_reducible()) { - auto acc = mat.reduce_accessor, true /* exclusive */, 2>(); + auto acc = mat.reduce_accessor, true, 2>(); valsPtr = acc.ptr(d.lo()); ld = acc.accessor.strides[0] / sizeof(VAL_TY); } else { diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index a8b2f17e..0629e734 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -42,7 +42,9 @@ def _create_mask(rows, density=0.3): row_idx = numpy.random.randint(0, rows, size=nnz) col_idx = numpy.random.randint(0, cols, size=nnz) data = numpy.ones(nnz, dtype=bool) - A_scipy = scipy_sparse.csr_array((data, (row_idx, col_idx)), shape=(rows, cols)) + A_scipy = scipy_sparse.csr_array( + (data, (row_idx, col_idx)), shape=(rows, cols) + ) # Sparse A_sparse = sparse.csr_array(A_scipy.todense()) @@ -90,7 +92,9 @@ def create_matrix(): """ def _create_matrix(N, tol=0.5): - _, A_scipy, _ = simple_system_gen(N, N, scipy_sparse.csr_array, tol=tol) + _, A_scipy, _ = simple_system_gen( + N, N, scipy_sparse.csr_array, tol=tol + ) A_sparse = sparse.csr_array(A_scipy) # Verify matrices are equivalent @@ -103,3 +107,318 @@ def _create_matrix(N, tol=0.5): return A_scipy, A_sparse return _create_matrix + + +@pytest.fixture +def create_tridiagonal_complex_hermitian_matrix(): + """Create a tridiagonal complex Hermitian sparse matrix. + + This fixture creates a tridiagonal complex Hermitian sparse matrix suitable + for eigenvalue computations. The matrix has a real main diagonal and complex + conjugate off-diagonals. + + Parameters + ---------- + N : int + Number of rows (and columns) in the square matrix. + + Returns + ------- + scipy.sparse.csr_array + A tridiagonal complex Hermitian sparse matrix in SciPy CSR format. 
+ + Notes + ----- + The matrix is constructed with: + - Main diagonal: 4.0 + - Upper diagonal: -(1.0 + 1.0j) + - Lower diagonal: -(1.0 - 1.0j) (complex conjugate) + + """ + + def _create_tridiagonal_complex_hermitian_matrix(N: int): + """Returns a scipy.sparse csr_array that is tridiagonal Hermitian""" + main_diag_val = 4.0 + off_diag_val = -(1.0 + 1.0j) + + main_diag = numpy.full(N, main_diag_val) + upper_diag = numpy.full(N - 1, off_diag_val) + lower_diag = numpy.full(N - 1, numpy.conjugate(off_diag_val)) + + diagonals = [lower_diag, main_diag, upper_diag] + offsets = [-1, 0, 1] + + A = scipy_sparse.diags( + diagonals, + offsets, + shape=(N, N), + format="csr", + dtype=numpy.complex128, + ) + + return A + + return _create_tridiagonal_complex_hermitian_matrix + + +@pytest.fixture +def create_tridiagonal_real_symmetric_matrix(): + """Create a tridiagonal real symmetric sparse matrix. + + This fixture creates a tridiagonal real symmetric sparse matrix suitable + for eigenvalue computations. The matrix has a constant main diagonal and + constant off-diagonals. + + Parameters + ---------- + N : int + Number of rows (and columns) in the square matrix. + + Returns + ------- + scipy.sparse.csr_array + A tridiagonal real symmetric sparse matrix in SciPy CSR format. 
+ + Notes + ----- + The matrix is constructed with: + - Main diagonal: 4.0 + - Upper diagonal: -1.0 + - Lower diagonal: -1.0 + + """ + + def _create_tridiagonal_real_symmetric_matrix(N: int): + """Returns a scipy.sparse csr_array that is tridiagonal symmetric""" + main_diag_val = 4.0 + off_diag_val = -1.0 + + main_diag = numpy.full(N, main_diag_val) + upper_diag = numpy.full(N - 1, off_diag_val) + lower_diag = numpy.full(N - 1, numpy.conjugate(off_diag_val)) + + diagonals = [lower_diag, main_diag, upper_diag] + offsets = [-1, 0, 1] + + A = scipy_sparse.diags( + diagonals, offsets, shape=(N, N), format="csr", dtype=numpy.float64 + ) + + return A + + return _create_tridiagonal_real_symmetric_matrix + + +@pytest.fixture +def create_sparse_real_symmetric_matrix(): + """Create a generic real symmetric sparse matrix with random sparsity. + + This fixture creates a real symmetric sparse matrix suitable for eigenvalue + computations. The sparsity pattern changes with N, making it suitable for + testing across different matrix sizes. + + Parameters + ---------- + N : int + Number of rows (and columns) in the square matrix. + density : float, optional + Approximate density of non-zero elements. Default is 0.3. + seed : int, optional + Random seed for reproducibility. Default is 42. + + Returns + ------- + scipy.sparse.csr_array + A real symmetric sparse matrix in SciPy CSR format. + + Notes + ----- + The matrix is constructed by: + 1. Generating a random sparse matrix + 2. Making it symmetric: A = (A + A.T) / 2 + 3. 
Adding a diagonal component to ensure positive definiteness + + """ + + def _create_sparse_real_symmetric_matrix(N: int, density=0.3, seed=42): + """Returns a scipy.sparse csr_array that is symmetric with random sparsity""" + numpy.random.seed(seed) + + # Generate random sparse matrix + nnz = int(N * N * density) + row_idx = numpy.random.randint(0, N, size=nnz) + col_idx = numpy.random.randint(0, N, size=nnz) + data = numpy.random.randn(nnz) + + A = scipy_sparse.csr_array((data, (row_idx, col_idx)), shape=(N, N)) + + # Make it symmetric: A = (A + A.T) / 2 + A = (A + A.T) / 2 + + # Add diagonal dominance to ensure well-conditioned matrix + # This helps with convergence in eigenvalue computations + A = A + scipy_sparse.eye(N, format="csr") * N + + return A + + return _create_sparse_real_symmetric_matrix + + +@pytest.fixture +def create_sparse_complex_hermitian_matrix(): + """Create a generic complex Hermitian sparse matrix with random sparsity. + + This fixture creates a complex Hermitian sparse matrix suitable for + eigenvalue computations. The sparsity pattern changes with N, making it + suitable for testing across different matrix sizes. + + Parameters + ---------- + N : int + Number of rows (and columns) in the square matrix. + density : float, optional + Approximate density of non-zero elements. Default is 0.3. + seed : int, optional + Random seed for reproducibility. Default is 42. + + Returns + ------- + scipy.sparse.csr_array + A complex Hermitian sparse matrix in SciPy CSR format. + + Notes + ----- + The matrix is constructed by: + 1. Generating a random complex sparse matrix + 2. Making it Hermitian: A = (A + A.H) / 2 + 3. 
Adding a diagonal component to ensure positive definiteness + + """ + + def _create_sparse_complex_hermitian_matrix(N: int, density=0.3, seed=42): + """Returns a scipy.sparse csr_array that is Hermitian with random sparsity""" + numpy.random.seed(seed) + + # Generate random complex sparse matrix + nnz = int(N * N * density) + row_idx = numpy.random.randint(0, N, size=nnz) + col_idx = numpy.random.randint(0, N, size=nnz) + data_real = numpy.random.randn(nnz) + data_imag = numpy.random.randn(nnz) + data = data_real + 1j * data_imag + + A = scipy_sparse.csr_array( + (data, (row_idx, col_idx)), shape=(N, N), dtype=numpy.complex128 + ) + + # Make it Hermitian: A = (A + A.H) / 2 + A = (A + A.conjugate().T) / 2 + + # Add diagonal dominance to ensure well-conditioned matrix + # This helps with convergence in eigenvalue computations + A = A + scipy_sparse.eye(N, format="csr", dtype=numpy.complex128) * N + + return A + + return _create_sparse_complex_hermitian_matrix + + +@pytest.fixture +def create_matrix_with_zero_diagonal(): + """Create a symmetric/Hermitian matrix with at least one zero diagonal entry. + + This fixture creates a sparse matrix with a missing diagonal element + to test error handling in eigenvalue computations. + + Parameters + ---------- + N : int + Number of rows (and columns) in the square matrix. + dtype : numpy.dtype + Data type of the matrix (numpy.float64 or numpy.complex128). + zero_index : int, optional + Index of the diagonal element to set to zero. Default is N//2. + density : float, optional + Approximate density of non-zero elements. Default is 0.3. + seed : int, optional + Random seed for reproducibility. Default is 42. + + Returns + ------- + scipy.sparse.csr_array + A sparse matrix with a zero diagonal entry. 
+ + """ + + def _create_matrix_with_zero_diagonal( + N: int, dtype=numpy.float64, zero_index=None, density=0.3, seed=42 + ): + """Returns a scipy.sparse csr_array with a zero diagonal entry""" + if zero_index is None: + zero_index = N // 2 + + numpy.random.seed(seed) + + # Generate random sparse matrix + nnz = int(N * N * density) + row_idx = numpy.random.randint(0, N, size=nnz) + col_idx = numpy.random.randint(0, N, size=nnz) + + if dtype == numpy.complex128: + data_real = numpy.random.randn(nnz) + data_imag = numpy.random.randn(nnz) + data = data_real + 1j * data_imag + A = scipy_sparse.csr_array( + (data, (row_idx, col_idx)), shape=(N, N), dtype=dtype + ) + # Make it Hermitian + A = (A + A.conjugate().T) / 2 + # Add diagonal dominance except for the zero index + diag_vals = numpy.full(N, N, dtype=dtype) + diag_vals[zero_index] = 0.0 + A = A + scipy_sparse.diags(diag_vals, 0, format="csr", dtype=dtype) + else: + data = numpy.random.randn(nnz) + A = scipy_sparse.csr_array( + (data, (row_idx, col_idx)), shape=(N, N) + ) + # Make it symmetric + A = (A + A.T) / 2 + # Add diagonal dominance except for the zero index + diag_vals = numpy.full(N, N, dtype=dtype) + diag_vals[zero_index] = 0.0 + A = A + scipy_sparse.diags(diag_vals, 0, format="csr") + + # Remove the zero from the sparse representation + A.eliminate_zeros() + + return A + + return _create_matrix_with_zero_diagonal + + +@pytest.fixture +def create_non_square_matrix(): + """Create a non-square matrix for testing error handling. + + Parameters + ---------- + rows : int + Number of rows in the matrix. + cols : int + Number of columns in the matrix. + dtype : numpy.dtype + Data type of the matrix. + + Returns + ------- + numpy.ndarray + A non-square dense matrix. 
+ + """ + + def _create_non_square_matrix(rows: int, cols: int, dtype=numpy.float64): + """Returns a non-square matrix""" + return numpy.random.randn(rows, cols).astype(dtype) + + return _create_non_square_matrix diff --git a/tests/integration/test_block_array.py b/tests/integration/test_block_array.py new file mode 100644 index 00000000..c4cbaad9 --- /dev/null +++ b/tests/integration/test_block_array.py @@ -0,0 +1,176 @@ +# Copyright 2024 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tests for block_array construction function.""" + +import sys + +import cupynumeric as np +import pytest +import scipy.sparse as sp + +import legate_sparse as sparse + +# Temporary release unblock for a known cupynumeric runtime issue. 
+pytestmark = pytest.mark.skip( + reason=( + "Temporarily disabled for release unblock: " + "https://github.com/nv-legate/cupynumeric/issues/1224" + ) +) + + +class TestBlockArray: + """Tests for the block_array function.""" + + def test_basic_2x2_blocks(self): + """Test basic 2x2 block assembly.""" + A = sparse.csr_array(np.array([[1, 2], [3, 4]])) + B = sparse.csr_array(np.array([[5, 6], [7, 8]])) + C = sparse.csr_array(np.array([[9, 10], [11, 12]])) + D = sparse.csr_array(np.array([[13, 14], [15, 16]])) + + result = sparse.block_array([[A, B], [C, D]]) + + expected = np.array( + [[1, 2, 5, 6], [3, 4, 7, 8], [9, 10, 13, 14], [11, 12, 15, 16]] + ) + assert np.array_equal(result.todense(), expected) + + def test_with_none_blocks(self): + """Test block assembly with None (zero) blocks.""" + A = sparse.csr_array(np.array([[1, 2], [3, 4]])) + B = sparse.csr_array(np.array([[5, 6], [7, 8]])) + + result = sparse.block_array([[A, None], [None, B]]) + + expected = np.array( + [[1, 2, 0, 0], [3, 4, 0, 0], [0, 0, 5, 6], [0, 0, 7, 8]] + ) + assert np.array_equal(result.todense(), expected) + + def test_rectangular_blocks(self): + """Test with rectangular blocks.""" + A = sparse.csr_array(np.array([[1, 2, 3], [4, 5, 6]])) + B = sparse.csr_array(np.array([[7], [8]])) + + result = sparse.block_array([[A, B]]) + + expected = np.array([[1, 2, 3, 7], [4, 5, 6, 8]]) + assert np.array_equal(result.todense(), expected) + + def test_single_block(self): + """Test with a single block.""" + A = sparse.csr_array(np.array([[1, 2], [3, 4]])) + result = sparse.block_array([[A]]) + assert np.array_equal(result.todense(), A.todense()) + + def test_dtype_inference(self): + """Test that dtype is correctly inferred.""" + A = sparse.csr_array(np.array([[1.5, 2.5]])) + B = sparse.csr_array(np.array([[3, 4]])) + result = sparse.block_array([[A], [B]]) + assert result.dtype == np.float64 + + def test_explicit_dtype(self): + """Test explicit dtype specification.""" + A = sparse.csr_array(np.array([[1, 
2]])) + result = sparse.block_array([[A]], dtype=np.float32) + assert result.dtype == np.float32 + + def test_sparse_blocks(self): + """Test with actual sparse blocks (blocks with zeros).""" + # Create sparse matrices with actual zero patterns + data_A = np.array([1, 0, 0, 2]) + A = sparse.csr_array(data_A.reshape(2, 2)) + + data_B = np.array([0, 3, 4, 0]) + B = sparse.csr_array(data_B.reshape(2, 2)) + + result = sparse.block_array([[A, B]]) + + expected = np.array([[1, 0, 0, 3], [0, 2, 4, 0]]) + assert np.array_equal(result.todense(), expected) + + def test_matches_scipy(self): + """Test that output matches SciPy's block_array.""" + np.random.seed(42) + + # Create random sparse blocks + A_dense = np.random.rand(3, 4) + B_dense = np.random.rand(3, 2) + C_dense = np.random.rand(2, 4) + D_dense = np.random.rand(2, 2) + + # SciPy version + A_sp = sp.csr_array(A_dense) + B_sp = sp.csr_array(B_dense) + C_sp = sp.csr_array(C_dense) + D_sp = sp.csr_array(D_dense) + scipy_result = sp.block_array([[A_sp, B_sp], [C_sp, D_sp]]).todense() + + # Legate version + A_lg = sparse.csr_array(A_dense) + B_lg = sparse.csr_array(B_dense) + C_lg = sparse.csr_array(C_dense) + D_lg = sparse.csr_array(D_dense) + legate_result = sparse.block_array( + [[A_lg, B_lg], [C_lg, D_lg]] + ).todense() + + assert np.allclose(legate_result, scipy_result) + + +class TestBlockArrayErrors: + """Tests for block_array error handling.""" + + def test_empty_blocks_raises(self): + """Test that empty blocks raises ValueError.""" + with pytest.raises(ValueError, match="cannot be empty"): + sparse.block_array([]) + + def test_inconsistent_row_count_raises(self): + """Test that inconsistent row counts raise ValueError.""" + A = sparse.csr_array(np.array([[1, 2], [3, 4]])) + B = sparse.csr_array(np.array([[5, 6, 7]])) # Only 1 row + + with pytest.raises(ValueError, match="rows"): + sparse.block_array([[A, B]]) + + def test_inconsistent_col_count_raises(self): + """Test that inconsistent column counts raise 
ValueError.""" + A = sparse.csr_array(np.array([[1, 2], [3, 4]])) + B = sparse.csr_array(np.array([[5], [6], [7]])) # 3 rows, but 1 col + C = sparse.csr_array( + np.array([[8, 9]]) + ) # 1 row, but needs 3 cols below B + + with pytest.raises(ValueError): + sparse.block_array([[A, B], [C, None]]) + + def test_unsupported_format_raises(self): + """Test that unsupported format raises ValueError.""" + A = sparse.csr_array(np.array([[1, 2]])) + with pytest.raises(ValueError, match="csr"): + sparse.block_array([[A]], format="coo") + + def test_non_csr_block_raises(self): + """Test that non-CSR blocks raise TypeError.""" + A = sparse.csr_array(np.array([[1, 2]])) + with pytest.raises(TypeError, match="csr_array"): + sparse.block_array([[A, np.array([[3, 4]])]]) + + +if __name__ == "__main__": + sys.exit(pytest.main(sys.argv)) diff --git a/tests/integration/test_cg_solve.py b/tests/integration/test_cg_solve.py index d8e046e3..abed202b 100644 --- a/tests/integration/test_cg_solve.py +++ b/tests/integration/test_cg_solve.py @@ -50,7 +50,7 @@ def test_cg_solve(): x = sample_dense_vector(D, 0.1, seed) y = A @ x x_pred, iters = linalg.cg(A, y, tol=1e-8) - assert np.allclose((A @ x_pred), y, rtol=1e-8, atol=0.0) + assert np.allclose((A @ x_pred), y, rtol=1e-8) def test_cg_solve_with_callback(): @@ -92,7 +92,7 @@ def callback(x): residuals.append(y - A @ x) x_pred, iters = linalg.cg(A, y, tol=1e-8, callback=callback) - assert np.allclose((A @ x_pred), y, rtol=1e-8, atol=0.0) + assert np.allclose((A @ x_pred), y, rtol=1e-8) assert len(residuals) > 0 @@ -150,7 +150,7 @@ def matvec(x): x_pred, iters = linalg.cg( linalg.LinearOperator(A.shape, matvec=matvec), y, tol=1e-8 ) - assert np.allclose((A @ x_pred), y, rtol=1e-8, atol=0.0) + assert np.allclose((A @ x_pred), y, rtol=1e-8) def matvec(x, out=None): return A.dot(x, out=out) @@ -158,7 +158,7 @@ def matvec(x, out=None): x_pred, iters = linalg.cg( linalg.LinearOperator(A.shape, matvec=matvec), y, tol=1e-8 ) - assert np.allclose((A @ 
x_pred), y, rtol=1e-8, atol=0.0) + assert np.allclose((A @ x_pred), y, rtol=1e-8) if __name__ == "__main__": diff --git a/tests/integration/test_csr_from_csr.py b/tests/integration/test_csr_from_csr.py index 4dd7b2f8..bdfe57b1 100644 --- a/tests/integration/test_csr_from_csr.py +++ b/tests/integration/test_csr_from_csr.py @@ -31,8 +31,12 @@ def test_csr_from_csr_fixed(): 7 0 0 0 2 1 """ row_offsets = np.array([0, 2, 5, 7, 9, 11, 14], dtype=np.int64) - csr_vals = np.array([2, 1, 5, 8, 2, 3, 4, 6, 1, 9, 4, 7, 2, 1], dtype=np.float64) - col_indices = np.array([0, 4, 0, 1, 5, 2, 3, 1, 3, 0, 4, 0, 4, 5], dtype=np.int64) + csr_vals = np.array( + [2, 1, 5, 8, 2, 3, 4, 6, 1, 9, 4, 7, 2, 1], dtype=np.float64 + ) + col_indices = np.array( + [0, 4, 0, 1, 5, 2, 3, 1, 3, 0, 4, 0, 4, 5], dtype=np.int64 + ) matrix_shape = (6, 6) A = sparse.csr_array( # noqa: F841 diff --git a/tests/integration/test_csr_to_dense.py b/tests/integration/test_csr_to_dense.py index 7a444938..5177efa0 100644 --- a/tests/integration/test_csr_to_dense.py +++ b/tests/integration/test_csr_to_dense.py @@ -22,11 +22,17 @@ def test_csr_to_dense(): row_offsets = np.array([0, 2, 5, 7, 9, 11, 14], dtype=np.int64) - csr_vals = np.array([2, 1, 5, 8, 2, 3, 4, 6, 1, 9, 4, 7, 2, 1], dtype=np.float64) - col_indices = np.array([0, 4, 0, 1, 5, 2, 3, 1, 3, 0, 4, 0, 4, 5], dtype=np.int64) + csr_vals = np.array( + [2, 1, 5, 8, 2, 3, 4, 6, 1, 9, 4, 7, 2, 1], dtype=np.float64 + ) + col_indices = np.array( + [0, 4, 0, 1, 5, 2, 3, 1, 3, 0, 4, 0, 4, 5], dtype=np.int64 + ) matrix_shape = (6, 6) - A = sparse.csr_array((csr_vals, col_indices, row_offsets), shape=matrix_shape) + A = sparse.csr_array( + (csr_vals, col_indices, row_offsets), shape=matrix_shape + ) B = A.todense() expected_B = np.array( diff --git a/tests/integration/test_diags.py b/tests/integration/test_diags.py index 04cf7c56..a8fcc936 100644 --- a/tests/integration/test_diags.py +++ b/tests/integration/test_diags.py @@ -23,7 +23,9 @@ @pytest.mark.parametrize("N", 
[12, 34]) @pytest.mark.parametrize("diagonals", [3, 5]) -@pytest.mark.parametrize("dtype", (np.float32, np.float64, np.complex64, np.complex128)) +@pytest.mark.parametrize( + "dtype", (np.float32, np.float64, np.complex64, np.complex128) +) @pytest.mark.parametrize("fmt", ["csr", "dia"]) def test_diags(N, diagonals, dtype, fmt): A = sparse.diags( diff --git a/tests/integration/test_eigsh.py b/tests/integration/test_eigsh.py new file mode 100644 index 00000000..c3819d57 --- /dev/null +++ b/tests/integration/test_eigsh.py @@ -0,0 +1,392 @@ +# Copyright 2024 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import cupynumeric as cn +import numpy +import pytest + +import legate_sparse.linalg as linalg +from legate_sparse import csr_array + + +@pytest.fixture +def check_eigsh_result(): + """Checks if the Eigenvalues match Ax = wx. 
+ + Parameters + ---------- + A : csr_array + Input sparse matrix + w: numpy.ndarray + Eigen values + x: numpy.ndarray + Eigen vectors + res_tol: float, optional + Acceptable residual + """ + + def _check_eigsh_result(A, w, x, res_tol: float = 1e-3): + """Verify eigsh results by checking residual, Ax - wx""" + for i in range(w.size): + # ||Ax - wx|| / ||w|| + Ax = A @ x[:, i] + wx = w[i] * x[:, i] + res = cn.linalg.norm(Ax - wx) / cn.abs(w[i]) + assert res < res_tol, ( + f"Residual {res} exceeds tol of {res_tol} for {i}th eigen value" + ) + + return _check_eigsh_result + + +class TestEigsh: + """Test eigsh with various parameters following CuPy's testing approach.""" + + # ------ Test arguments: N, k, which + + @pytest.mark.parametrize("N", [10, 16]) + @pytest.mark.parametrize("which", ["LM", "LA", "SA"]) + @pytest.mark.parametrize("k", [1, 3]) + def test_eigsh_real_symmetric( + self, + N, + which, + k, + create_tridiagonal_real_symmetric_matrix, + check_eigsh_result, + ): + """Test eigsh with real symmetric tridiagonal matrices.""" + A_scipy = create_tridiagonal_real_symmetric_matrix(N) + A = csr_array(A_scipy.todense()) + + w, x = linalg.eigsh(A, k=k, which=which, return_eigenvectors=True) + + assert cn.allclose(cn.imag(w), 0, atol=1e-6), ( + "Eigenvalues should be real for real symmetric matrices" + ) + assert w.shape == (k,), f"Expected {k} eigenvalues, found {w.shape}" + + check_eigsh_result(A, w, x) + + @pytest.mark.parametrize("N", [10, 16]) + @pytest.mark.parametrize("which", ["LM", "LA", "SA"]) + @pytest.mark.parametrize("k", [1, 3]) + def test_eigsh_complex_hermitian( + self, + N, + which, + k, + create_tridiagonal_complex_hermitian_matrix, + check_eigsh_result, + ): + """Test eigsh with complex Hermitian tridiagonal matrices.""" + A_scipy = create_tridiagonal_complex_hermitian_matrix(N) + A = csr_array(A_scipy.todense()) + + w, x = linalg.eigsh(A, k=k, which=which, return_eigenvectors=True) + + assert cn.allclose(cn.imag(w), 0, atol=1e-6), ( + 
"Eigenvalues should be real for Hermitian matrices" + ) + assert w.shape == (k,), f"Expected {k} eigenvalues" + + check_eigsh_result(A, w, x) + + # ------ Test argument return_eigenvector + + def test_eigsh_eigenvalues_only_real( + self, create_tridiagonal_real_symmetric_matrix + ): + """Test eigsh with return_eigenvectors=False for real matrices.""" + N, k = 10, 2 + which = "LM" + A_scipy = create_tridiagonal_real_symmetric_matrix(N) + A = csr_array(A_scipy.todense()) + + w = linalg.eigsh(A, k=k, which=which, return_eigenvectors=False) + + assert cn.allclose(cn.imag(w), 0, atol=1e-6), ( + "Eigenvalues should be real" + ) + assert w.shape == (k,), f"Expected {k} eigenvalues" + + def test_eigsh_eigenvalues_only_complex( + self, create_tridiagonal_complex_hermitian_matrix + ): + """Test eigsh with return_eigenvectors=False for complex matrices.""" + N, k = 10, 2 + which = "LM" + A_scipy = create_tridiagonal_complex_hermitian_matrix(N) + A = csr_array(A_scipy.todense()) + + w = linalg.eigsh(A, k=k, which=which, return_eigenvectors=False) + + assert cn.allclose(cn.imag(w), 0, atol=1e-6), ( + "Eigenvalues should be real" + ) + assert w.shape == (k,), f"Expected {k} eigenvalues" + + # ------ Test argument v0 + + def test_eigsh_with_v0_real( + self, create_tridiagonal_real_symmetric_matrix, check_eigsh_result + ): + """Test eigsh with user-provided initial vector v0 for real matrices.""" + N, k = 10, 2 + A_scipy = create_tridiagonal_real_symmetric_matrix(N) + A = csr_array(A_scipy.todense()) + + v0 = numpy.array(cn.random.randn(N), dtype=numpy.float64) + + w, x = linalg.eigsh( + A, k=k, which="LM", v0=v0, return_eigenvectors=True + ) + + assert cn.allclose(cn.imag(w), 0, atol=1e-6), ( + "Eigenvalues should be real" + ) + check_eigsh_result(A, w, x) + + def test_eigsh_with_v0_complex( + self, create_tridiagonal_complex_hermitian_matrix, check_eigsh_result + ): + """Test eigsh with user-provided initial vector v0 for complex matrices.""" + N, k = 10, 2 + A_scipy = 
create_tridiagonal_complex_hermitian_matrix(N) + A = csr_array(A_scipy.todense()) + + v0 = cn.array( + numpy.random.randn(N) + 1j * numpy.random.randn(N), + dtype=numpy.complex128, + ) + + w, x = linalg.eigsh( + A, k=k, which="LM", v0=v0, return_eigenvectors=True + ) + + assert cn.allclose(cn.imag(w), 0, atol=1e-10), ( + "Eigenvalues should be real" + ) + check_eigsh_result(A, w, x) + + # ------ Test output sortedness + + def test_eigsh_sorted_eigenvalues( + self, create_tridiagonal_real_symmetric_matrix + ): + """Test that eigenvalues are returned sorted.""" + N, k = 20, 6 + A_scipy = create_tridiagonal_real_symmetric_matrix(N) + A = csr_array(A_scipy.todense()) + + w, _ = linalg.eigsh(A, k=k, which="LM", return_eigenvectors=True) + + # Eigenvalues should be sorted in ascending order + w_sorted = cn.sort(w) + assert cn.allclose(w, w_sorted), "Eigenvalues should be sorted" + + +class TestEigshLargeProblems: + """Test eigsh with larger problem sizes.""" + + @pytest.mark.parametrize("N", [15, 30]) + @pytest.mark.parametrize("k", [3, 6]) + @pytest.mark.parametrize("which", ["LM", "SA"]) + def test_eigsh_large_real_symmetric( + self, + N, + k, + which, + create_tridiagonal_real_symmetric_matrix, + check_eigsh_result, + ): + """Test eigsh with large real symmetric tridiagonal matrices.""" + A_scipy = create_tridiagonal_real_symmetric_matrix(N) + A = csr_array(A_scipy.todense()) + + w, x = linalg.eigsh(A, k=k, which=which, return_eigenvectors=True) + + assert cn.allclose(cn.imag(w), 0, atol=1e-6), ( + "Eigenvalues should be real" + ) + assert w.shape == (k,), f"Expected {k} eigenvalues" + check_eigsh_result(A, w, x) + + @pytest.mark.parametrize("N", [15, 30]) + @pytest.mark.parametrize("k", [3, 6]) + @pytest.mark.parametrize("which", ["LM", "SA"]) + def test_eigsh_large_complex_hermitian( + self, + N, + k, + which, + create_tridiagonal_complex_hermitian_matrix, + check_eigsh_result, + ): + """Test eigsh with large complex Hermitian tridiagonal matrices.""" + A_scipy = 
create_tridiagonal_complex_hermitian_matrix(N) + A = csr_array(A_scipy.todense()) + + w, x = linalg.eigsh(A, k=k, which=which, return_eigenvectors=True) + + assert cn.allclose(cn.imag(w), 0, atol=1e-6), ( + "Eigenvalues should be real" + ) + assert w.shape == (k,), f"Expected {k} eigenvalues" + check_eigsh_result(A, w, x) + + +class TestEigshRandomSparse: + """Test eigsh with random sparse symmetric/Hermitian matrices.""" + + @pytest.mark.parametrize("N", [15, 30]) + @pytest.mark.parametrize("k", [1, 3]) + @pytest.mark.parametrize("which", ["LM", "SA"]) + def test_eigsh_random_real_symmetric( + self, + N, + k, + which, + create_sparse_real_symmetric_matrix, + check_eigsh_result, + ): + """Test eigsh with random sparse real symmetric matrices.""" + A_scipy = create_sparse_real_symmetric_matrix(N, density=0.3, seed=42) + A = csr_array(A_scipy.todense()) + + w, x = linalg.eigsh(A, k=k, which=which, return_eigenvectors=True) + + assert cn.allclose(cn.imag(w), 0, atol=1e-6), ( + "Eigenvalues should be real" + ) + assert w.shape == (k,), f"Expected {k} eigenvalues" + check_eigsh_result(A, w, x) + + @pytest.mark.parametrize("N", [15, 30]) + @pytest.mark.parametrize("k", [1, 3]) + @pytest.mark.parametrize("which", ["LM", "SA"]) + def test_eigsh_random_complex_hermitian( + self, + N, + k, + which, + create_sparse_complex_hermitian_matrix, + check_eigsh_result, + ): + """Test eigsh with random sparse complex Hermitian matrices.""" + A_scipy = create_sparse_complex_hermitian_matrix( + N, density=0.3, seed=42 + ) + A = csr_array(A_scipy.todense()) + + w, x = linalg.eigsh(A, k=k, which=which, return_eigenvectors=True) + + assert cn.allclose(cn.imag(w), 0, atol=1e-6), ( + "Eigenvalues should be real" + ) + assert w.shape == (k,), f"Expected {k} eigenvalues" + check_eigsh_result(A, w, x) + + +class TestEigshLinearOperator: + """Test eigsh with LinearOperator input.""" + + @pytest.mark.parametrize("N", [10, 20]) + @pytest.mark.parametrize("k", [1, 3]) + 
@pytest.mark.parametrize("which", ["LM", "SA"]) + def test_eigsh_linear_operator_real( + self, + N, + k, + which, + create_tridiagonal_real_symmetric_matrix, + check_eigsh_result, + ): + """Test eigsh with LinearOperator wrapping a real symmetric matrix.""" + A_scipy = create_tridiagonal_real_symmetric_matrix(N) + A_dense = cn.array(A_scipy.todense()) + + A_op = linalg.LinearOperator( + shape=(N, N), matvec=lambda v: A_dense @ v, dtype=A_dense.dtype + ) + + w, x = linalg.eigsh(A_op, k=k, which=which, return_eigenvectors=True) + + assert cn.allclose(cn.imag(w), 0.0, atol=1e-6), ( + "Eigenvalues should be real" + ) + assert w.shape == (k,), f"Expected {k} eigenvalues" + check_eigsh_result(A_dense, w, x) + + @pytest.mark.parametrize("N", [10, 20]) + @pytest.mark.parametrize("k", [1, 3]) + @pytest.mark.parametrize("which", ["LM", "SA"]) + def test_eigsh_linear_operator_complex( + self, + N, + k, + which, + create_tridiagonal_complex_hermitian_matrix, + check_eigsh_result, + ): + """Test eigsh with LinearOperator wrapping a complex Hermitian matrix.""" + A_scipy = create_tridiagonal_complex_hermitian_matrix(N) + A_dense = cn.array(A_scipy.todense()) + + A_op = linalg.LinearOperator( + shape=(N, N), matvec=lambda v: A_dense @ v, dtype=A_dense.dtype + ) + + w, x = linalg.eigsh(A_op, k=k, which=which, return_eigenvectors=True) + + assert cn.allclose(cn.imag(w), 0.0, atol=1e-6), ( + "Eigenvalues should be real" + ) + assert w.shape == (k,), f"Expected {k} eigenvalues" + check_eigsh_result(A_dense, w, x) + + +class TestEigshErrors: + """Test eigsh error handling.""" + + def test_non_square_matrix(self): + """Test that non-square matrix raises ValueError.""" + A_rect = csr_array(numpy.random.randn(10, 15)) + with pytest.raises(ValueError, match="expected square matrix"): + linalg.eigsh(A_rect, k=1) + + def test_k_too_large(self): + """Test that k >= n raises ValueError.""" + n = 10 + A = csr_array(numpy.eye(n)) + with pytest.raises(ValueError, match="k must be smaller than 
n"): + linalg.eigsh(A, k=n) + + def test_k_zero_or_negative(self): + """Test that k <= 0 raises ValueError.""" + A = csr_array(numpy.eye(10)) + with pytest.raises(ValueError, match="k must be greater than 0"): + linalg.eigsh(A, k=0) + + def test_invalid_which(self): + """Test that invalid which raises ValueError.""" + A = csr_array(numpy.eye(10)) + with pytest.raises(ValueError, match="which must be"): + linalg.eigsh(A, k=1, which="INVALID") + + +if __name__ == "__main__": + import sys + + pytest.main(sys.argv) diff --git a/tests/integration/test_geam.py b/tests/integration/test_geam.py new file mode 100644 index 00000000..665c5092 --- /dev/null +++ b/tests/integration/test_geam.py @@ -0,0 +1,269 @@ +# Copyright 2024 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Tests for GEAM API and sparse matrix arithmetic operations.""" + +import sys + +import cupynumeric as np +import pytest +from utils.banded_matrix import banded_matrix +from utils.sample import simple_system_gen + +import legate_sparse as sparse +from legate_sparse.csr import geam + + +# ============================================================================= +# GEAM API Tests - Error Cases +# ============================================================================= + + +def test_geam_sparse_dense_mismatch_A_dense(): + """Test that geam raises error when only one of the arrays is sparse.""" + N = 5 + np.random.seed(42) + A_dense = np.random.rand(N, N) + B_sparse = banded_matrix(N, 3) + + with pytest.raises((TypeError, AttributeError)): + geam(A_dense, B_sparse, 1.0, 2.0) + + with pytest.raises((TypeError, AttributeError)): + geam(B_sparse, A_dense, 1.0, 2.0) + + +def test_geam_wrong_sparsity_pattern_for_C(): + """Providing C with incompatible sparsity pattern leads to incorrect results.""" + N = 5 + np.random.seed(42) + + A = banded_matrix(N, 3) # tri-diagonal + B = banded_matrix(N, 5) # penta-diagonal + + C_correct = geam(A, B, 2.0, 3.0) + C_wrong = banded_matrix(N, 3) # wrong pattern - too few non-zeros + C_result = geam(A, B, 2.0, 3.0, C=C_wrong) + + # Results should NOT match due to incompatible sparsity + assert not np.allclose(C_correct.todense(), C_result.todense()) + + +# ============================================================================= +# GEAM API Tests - Success Cases +# ============================================================================= + + +@pytest.mark.parametrize("N", [5, 15, 30]) +def test_geam_basic_without_C(N): + """Test geam without providing C.""" + np.random.seed(42) + A_dense, A_sparse, _ = simple_system_gen(N, N, sparse.csr_array, tol=0.3) + B_dense, B_sparse, _ = simple_system_gen(N, N, sparse.csr_array, tol=0.3) + + C_sparse = geam(A_sparse, B_sparse, 2.5, -1.5) + C_expected = 2.5 * A_dense + (-1.5) * 
B_dense + + assert np.allclose(C_sparse.todense(), C_expected, rtol=1e-10, atol=1e-12) + + +@pytest.mark.parametrize("N", [5, 15, 30]) +def test_geam_basic_with_C(N): + """Test geam with pre-allocated C, then reuse it.""" + np.random.seed(42) + A_dense, A_sparse, _ = simple_system_gen(N, N, sparse.csr_array, tol=0.3) + B_dense, B_sparse, _ = simple_system_gen(N, N, sparse.csr_array, tol=0.3) + + C_sparse = geam(A_sparse, B_sparse, 2.0, 3.0) + assert np.allclose(C_sparse.todense(), 2.0 * A_dense + 3.0 * B_dense) + + C_sparse = geam(A_sparse, B_sparse, -1.0, 0.5, C=C_sparse) + assert np.allclose(C_sparse.todense(), -1.0 * A_dense + 0.5 * B_dense) + + +@pytest.mark.parametrize( + "alpha,beta", [(1.0, 1.0), (1.0, -1.0), (2.0, 0.0), (0.0, 3.0)] +) +def test_geam_various_scalars(alpha, beta): + """Test geam with various scalar combinations.""" + N = 15 + np.random.seed(42) + A_dense, A_sparse, _ = simple_system_gen(N, N, sparse.csr_array, tol=0.3) + B_dense, B_sparse, _ = simple_system_gen(N, N, sparse.csr_array, tol=0.3) + + C_sparse = geam(A_sparse, B_sparse, alpha, beta) + assert np.allclose(C_sparse.todense(), alpha * A_dense + beta * B_dense) + + +def test_geam_loop_with_C_reuse(): + """Test geam in a loop where C is reused across iterations.""" + N = 15 + np.random.seed(42) + + A_sparse = banded_matrix(N, 3) + B_sparse = banded_matrix(N, 3) + C_sparse = geam(A_sparse, B_sparse, 1.0, 1.0) + + for i in range(1, 5): + A_new = banded_matrix(N, 3, init_with_ones=False) + B_new = banded_matrix(N, 3, init_with_ones=False) + scale_A, scale_B = float(i + 1), float(i + 2) + + C_sparse = geam(A_new, B_new, scale_A, scale_B, C=C_sparse) + C_expected = scale_A * A_new.todense() + scale_B * B_new.todense() + + assert np.allclose(C_sparse.todense(), C_expected) + + +def test_geam_identical_matrices(): + """Test geam when A and B are identical.""" + N = 15 + np.random.seed(42) + A_dense, A_sparse, _ = simple_system_gen(N, N, sparse.csr_array, tol=0.3) + + C_sparse = geam(A_sparse, 
A_sparse, 2.0, 3.0) + assert np.allclose(C_sparse.todense(), 5.0 * A_dense) + + +def test_geam_disjoint_sparsity_patterns(): + """Test geam when A and B have disjoint sparsity patterns.""" + N = 15 + np.random.seed(42) + + A_dense = np.triu(np.random.rand(N, N)) + A_sparse = sparse.csr_array(A_dense) + B_dense = np.tril(np.random.rand(N, N), k=-1) + B_sparse = sparse.csr_array(B_dense) + + C_sparse = geam(A_sparse, B_sparse, 1.5, 2.5) + assert np.allclose(C_sparse.todense(), 1.5 * A_dense + 2.5 * B_dense) + + +# ============================================================================= +# Dunder Method Tests (__add__, __sub__, __radd__, __rsub__) +# ============================================================================= + + +class TestCSRArithmetic: + """Tests for CSR matrix arithmetic dunder methods.""" + + @pytest.fixture + def matrices(self): + """Create test matrices.""" + N = 15 + np.random.seed(42) + A_dense, A_sparse, _ = simple_system_gen( + N, N, sparse.csr_array, tol=0.3 + ) + B_dense, B_sparse, _ = simple_system_gen( + N, N, sparse.csr_array, tol=0.3 + ) + return A_dense, A_sparse, B_dense, B_sparse + + # ------------------------------------------------------------------------- + # Sparse + Sparse, Sparse - Sparse + # ------------------------------------------------------------------------- + + def test_add_sparse_sparse(self, matrices): + """A + B where both are sparse.""" + A_dense, A_sparse, B_dense, B_sparse = matrices + C = A_sparse + B_sparse + assert np.allclose(C.todense(), A_dense + B_dense) + + C = A_sparse - B_sparse + assert np.allclose(C.todense(), A_dense - B_dense) + + # ------------------------------------------------------------------------- + # Sparse + Dense, Dense + Sparse + # ------------------------------------------------------------------------- + + def test_add_sparse_dense(self, matrices): + """sparse + dense returns dense.""" + A_dense, A_sparse, B_dense, _ = matrices + C = A_sparse + B_dense + assert np.allclose(C, 
A_dense + B_dense) + + @pytest.mark.skip( + reason="cupynumeric intercepts dense+sparse before __radd__ is called" + ) + def test_add_dense_sparse(self, matrices): + """dense + sparse should return dense (currently broken in cupynumeric).""" + A_dense, _, B_dense, B_sparse = matrices + C = A_dense + B_sparse + assert np.allclose(C, A_dense + B_dense) + + # ------------------------------------------------------------------------- + # Sparse - Dense, Dense - Sparse + # ------------------------------------------------------------------------- + + def test_sub_sparse_dense(self, matrices): + """sparse - dense returns dense.""" + A_dense, A_sparse, B_dense, _ = matrices + C = A_sparse - B_dense + assert np.allclose(C, A_dense - B_dense) + + @pytest.mark.skip( + reason="cupynumeric intercepts dense-sparse before __rsub__ is called" + ) + def test_sub_dense_sparse(self, matrices): + """dense - sparse should return dense (currently broken in cupynumeric).""" + A_dense, _, B_dense, B_sparse = matrices + C = A_dense - B_sparse + assert np.allclose(C, A_dense - B_dense) + + # ------------------------------------------------------------------------- + # Sparse + Scalar, Scalar + Sparse + # ------------------------------------------------------------------------- + + def test_add_sparse_zero(self, matrices): + """A + 0 should return a copy of A.""" + A_dense, A_sparse, _, _ = matrices + C = A_sparse + 0 + assert np.allclose(C.todense(), A_dense) + + C = 0 + A_sparse + assert np.allclose(C.todense(), A_dense) + + def test_add_sparse_nonzero_scalar_raises(self, matrices): + """A + nonzero scalar should raise NotImplementedError.""" + _, A_sparse, _, _ = matrices + with pytest.raises(NotImplementedError): + _ = A_sparse + 5.0 + with pytest.raises(NotImplementedError): + _ = 5.0 + A_sparse + + # ------------------------------------------------------------------------- + # Sparse - Scalar, Scalar - Sparse + # ------------------------------------------------------------------------- 
+ + def test_sub_sparse_zero(self, matrices): + """A - 0 should return a copy of A.""" + A_dense, A_sparse, _, _ = matrices + C = A_sparse - 0 + assert np.allclose(C.todense(), A_dense) + + C = 0 - A_sparse + assert np.allclose(C.todense(), -A_dense) + + def test_sub_sparse_nonzero_scalar_raises(self, matrices): + """Subtracting a nonzero scalar should raise NotImplementedError.""" + _, A_sparse, _, _ = matrices + with pytest.raises(NotImplementedError): + _ = A_sparse - 5.0 + with pytest.raises(NotImplementedError): + _ = 5.0 - A_sparse + + +if __name__ == "__main__": + sys.exit(pytest.main(sys.argv)) diff --git a/tests/integration/test_indexing.py b/tests/integration/test_indexing.py index 259c7996..01b2dbac 100644 --- a/tests/integration/test_indexing.py +++ b/tests/integration/test_indexing.py @@ -72,7 +72,7 @@ def test_incompatible_mask(self, N, create_matrix, create_mask): # make sure the values are updated correctly A_dense = numpy.asarray(A.todense()) - assert numpy.allclose(A_dense[mask_dense].sum() / num_nonzeros, value) + assert numpy.allclose(A_dense[mask_dense].sum(), value * num_nonzeros) # TODO: Add a check/test for location of nonzeros as well @@ -200,33 +200,11 @@ def test_random_column_order(self): This is important because CSR format requires column indices to be sorted within each row for efficient operations. 
""" - row_indices = cupynumeric.array( - [ - 2, - 4, - 5, - 3, - 5, - 1, - 1, - 5, - 5, - ] + row_indices = cupynumeric.array([2, 4, 5, 3, 5, 1, 1, 5, 5]) + col_indices = cupynumeric.array([3, 1, 2, 2, 5, 1, 4, 1, 3]) + data = cupynumeric.array( + [7.0, 9.0, 3.0, 4.0, 5.0, 19.0, 2.0, 99.0, 109.0] ) - col_indices = cupynumeric.array( - [ - 3, - 1, - 2, - 2, - 5, - 1, - 4, - 1, - 3, - ] - ) - data = cupynumeric.array([7.0, 9.0, 3.0, 4.0, 5.0, 19.0, 2.0, 99.0, 109.0]) # note that the data in row 5 is ordered (2, 5, 1, 3),which will get # sorted to (1, 2, 5, 3) during instantiation, which is needed for indexing diff --git a/tests/integration/test_manual_sorting.py b/tests/integration/test_manual_sorting.py index 7d3ed282..4999c946 100644 --- a/tests/integration/test_manual_sorting.py +++ b/tests/integration/test_manual_sorting.py @@ -15,6 +15,7 @@ import cupynumeric as np import numpy import pytest + from legate_sparse.utils import sort_by_rows_then_cols diff --git a/tests/integration/test_negate.py b/tests/integration/test_negate.py new file mode 100644 index 00000000..ab9a1e21 --- /dev/null +++ b/tests/integration/test_negate.py @@ -0,0 +1,38 @@ +# Copyright 2024 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Tests for sparse matrix negation (__neg__).""" + +import sys + +import cupynumeric as np +import pytest +from utils.sample import simple_system_gen + +import legate_sparse as sparse + + +def test_negate(): + """-A returns a sparse matrix with negated values.""" + N = 15 + np.random.seed(42) + A_dense, A_sparse, _ = simple_system_gen(N, N, sparse.csr_array, tol=0.3) + + C = -A_sparse + + assert np.allclose(C.todense(), -A_dense) + + +if __name__ == "__main__": + sys.exit(pytest.main(sys.argv)) diff --git a/tests/integration/test_spgemm.py b/tests/integration/test_spgemm.py index 5954df79..9e4c725d 100644 --- a/tests/integration/test_spgemm.py +++ b/tests/integration/test_spgemm.py @@ -16,11 +16,11 @@ import cupynumeric as np import pytest -from legate_sparse.runtime import runtime from utils.banded_matrix import banded_matrix from utils.sample import simple_system_gen import legate_sparse as sparse +from legate_sparse.runtime import runtime @pytest.mark.parametrize("N", [5, 29]) diff --git a/tests/integration/test_spmv.py b/tests/integration/test_spmv.py index 0c3590df..1b953150 100644 --- a/tests/integration/test_spmv.py +++ b/tests/integration/test_spmv.py @@ -16,11 +16,11 @@ import cupynumeric as np import pytest -from legate_sparse.runtime import runtime from utils.banded_matrix import banded_matrix from utils.sample import simple_system_gen import legate_sparse as sparse +from legate_sparse.runtime import runtime @pytest.mark.parametrize("N", [5, 29]) @@ -105,5 +105,39 @@ def test_csr_spmv_unsupported_dtype(N, nnz_per_row, unsupported_dtype): y = A.dot(x) # noqa: F841 +@pytest.mark.parametrize("N", [5, 29]) +@pytest.mark.parametrize("M", [7, 17]) +@pytest.mark.parametrize("complex_dtype", [np.complex64, np.complex128]) +def test_csr_spmv_complex(N, M, complex_dtype): + """Test sparse matrix-vector multiplication with complex datatypes. + + This test verifies that sparse matrix-vector multiplication works + correctly for complex64 and complex128 datatypes. 
+ + Parameters + ---------- + N : int + Number of rows in the matrix. + M : int + Number of columns in the matrix. + complex_dtype : dtype + Complex datatype to use (complex64 or complex128). + """ + + # get real and imag parts separately + A_dense_real, _, x_real = simple_system_gen(N, M, sparse.csr_array) + A_dense_imag, _, x_imag = simple_system_gen(N, M, sparse.csr_array) + + A_dense = A_dense_real.astype(complex_dtype) + 1j * A_dense_imag.astype( + complex_dtype + ) + x = x_real.astype(complex_dtype) + 1j * x_imag.astype(complex_dtype) + A = sparse.csr_array(A_dense.astype(complex_dtype)) + + y = A @ x + + assert np.all(np.isclose(y, A_dense @ x)) + + if __name__ == "__main__": sys.exit(pytest.main(sys.argv)) diff --git a/tests/integration/test_spsolve.py b/tests/integration/test_spsolve.py new file mode 100644 index 00000000..f4b03e2e --- /dev/null +++ b/tests/integration/test_spsolve.py @@ -0,0 +1,199 @@ +# Copyright 2024 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import cupynumeric as np +import pytest +import scipy.sparse as scipy_sparse +import scipy.sparse.linalg as scipy_linalg +from utils.sample import sample_dense + +import legate_sparse.linalg as linalg +from legate_sparse import csr_array +from legate_sparse.runtime import runtime + +# Skip all tests in this module if no GPUs are available +# since spsolve is only supported on GPU +pytestmark = pytest.mark.skipif( + runtime.num_gpus == 0, reason="spsolve is only supported on GPU backend" +) + + +@pytest.mark.parametrize("N", [5, 10, 20, 50]) +def test_spsolve_identity_matrix(N): + """Test spsolve with an identity matrix.""" + A = csr_array(np.eye(N)) + b = np.ones(N) + x = linalg.spsolve(A, b) + + # For identity matrix, x should equal b + assert np.allclose(x, b, rtol=1e-10, atol=1e-12), ( + f"Identity matrix solution incorrect: max error = {np.max(np.abs(x - b))}" + ) + + +def test_spsolve_basic_square_matrix(): + """Test spsolve with a basic square matrix.""" + + N = 5 + np.random.seed(42) + A_dense = sample_dense(N, N, 0.3, 42) + A_dense = A_dense + N * np.eye(N) + + A = csr_array(A_dense) + b = np.array([1.0, 2.0, 3.0, 4.0, 5.0]) + x = linalg.spsolve(A, b) + + b_computed = A @ x + assert np.allclose(b_computed, b, rtol=1e-5, atol=1e-6), ( + f"Solution verification failed: max error = {np.max(np.abs(b_computed - b))}" + ) + + A_scipy = scipy_sparse.csr_matrix(np.array(A.todense())) + x_scipy = scipy_linalg.spsolve(A_scipy, np.array(b)) + assert np.allclose(x, x_scipy, rtol=1e-5, atol=1e-6), ( + f"Solution differs from SciPy: max error = {np.max(np.abs(x - x_scipy))}" + ) + + +@pytest.mark.parametrize("N", [5, 10, 20, 50]) +def test_spsolve_diagonal_matrix(N): + """Test spsolve with a diagonal matrix.""" + diag_values = np.arange(1.0, N + 1.0) + A_dense = np.diag(diag_values) + A = csr_array(A_dense) + b = np.ones(N) + x = linalg.spsolve(A, b) + x_expected = b / diag_values + assert np.allclose(x, x_expected, rtol=1e-10, atol=1e-12), ( + f"Diagonal matrix 
solution incorrect: max error = {np.max(np.abs(x - x_expected))}" + ) + + +@pytest.mark.parametrize("N", [5, 10, 20, 50]) +def test_spsolve_tridiagonal_matrix(N): + """Test spsolve with a tridiagonal matrix.""" + main_diag = np.full(N, 4.0) + off_diag = np.full(N - 1, -1.0) + A_dense = np.diag(main_diag) + np.diag(off_diag, 1) + np.diag(off_diag, -1) + A = csr_array(A_dense) + b = np.ones(N) + x = linalg.spsolve(A, b) + + b_computed = A @ x + assert np.allclose(b_computed, b, rtol=1e-5, atol=1e-6), ( + f"Tridiagonal solution verification failed: max error = {np.max(np.abs(b_computed - b))}" + ) + + A_scipy = scipy_sparse.csr_matrix(np.array(A.todense())) + x_scipy = scipy_linalg.spsolve(A_scipy, np.array(b)) + assert np.allclose(x, x_scipy, rtol=1e-5, atol=1e-6), ( + f"Tridiagonal solution differs from SciPy: max error = {np.max(np.abs(x - x_scipy))}" + ) + + +@pytest.mark.parametrize("N", [5, 10, 20, 50]) +def test_spsolve_symmetric_positive_definite(N): + """Test spsolve with a symmetric positive definite matrix. + We create an SPD matrix by A = B^T * B + N * I. + """ + seed = 42 + B_dense = sample_dense(N, N, 0.2, seed) + A_dense = B_dense.T @ B_dense + N * np.eye(N) + A = csr_array(A_dense) + + # make sure it's positive definite + eigenvalues = np.linalg.eigvals(A_dense) + assert np.all(eigenvalues > 0), "Matrix is not positive definite" + + b = np.random.rand(N) + x = linalg.spsolve(A, b) + + b_computed = A @ x + assert np.allclose(b_computed, b, rtol=1e-4, atol=1e-5), ( + f"SPD solution verification failed: max error = {np.max(np.abs(b_computed - b))}" + ) + + +@pytest.mark.parametrize( + "dtype", [np.float32, np.float64, np.complex64, np.complex128] +) +def test_spsolve_all_dtypes(dtype): + """Comprehensive test for spsolve with all cuDSS-supported data types. + + Note: cuDSS only supports floating-point and complex types. 
+ Integer and boolean types are not supported + """ + N = 10 + + # Create a well-conditioned matrix for each dtype + if dtype in [np.complex64, np.complex128]: + # For complex types, create a Hermitian positive definite matrix + seed = 42 + np.random.seed(seed) + B = np.random.randn(N, N) + 1j * np.random.randn(N, N) + A_dense = (B @ B.conj().T + N * np.eye(N)).astype(dtype) + b = np.ones(N, dtype=dtype) + else: + seed = 42 + A_dense = sample_dense(N, N, 0.3, seed).astype(dtype) + A_dense = A_dense + N * np.eye(N, dtype=dtype) + b = np.ones(N, dtype=dtype) + + # Solve the system + A = csr_array(A_dense) + x = linalg.spsolve(A, b) + + b_computed = A @ x + assert np.allclose(b_computed, b, rtol=1e-4, atol=1e-5), ( + f"Solution verification failed for dtype {dtype}: max error = {np.max(np.abs(b_computed - b))}" + ) + + assert x.dtype == b.dtype, ( + f"Output dtype {x.dtype} doesn't match input dtype {b.dtype} for dtype {dtype}" + ) + + +@pytest.mark.parametrize("N", [5, 10, 20, 50]) +def test_spsolve_upper_triangular(N): + """Test spsolve with an upper triangular matrix.""" + A_dense = np.triu(np.random.rand(N, N) + np.eye(N)) + A = csr_array(A_dense) + b = np.ones(N) + x = linalg.spsolve(A, b) + + b_computed = A @ x + assert np.allclose(b_computed, b, rtol=1e-5, atol=1e-6), ( + f"Upper triangular solution verification failed: max error = {np.max(np.abs(b_computed - b))}" + ) + + +@pytest.mark.parametrize("N", [5, 10, 20, 50]) +def test_spsolve_lower_triangular(N): + """Test spsolve with a lower triangular matrix.""" + A_dense = np.tril(np.ones((N, N)) + np.eye(N)) + A = csr_array(A_dense) + b = np.ones(N) + x = linalg.spsolve(A, b) + + b_computed = A @ x + assert np.allclose(b_computed, b, rtol=1e-5, atol=1e-6), ( + f"Lower triangular solution verification failed: max error = {np.max(np.abs(b_computed - b))}" + ) + + +if __name__ == "__main__": + import sys + + pytest.main(sys.argv) + sys.exit(0) diff --git a/tests/integration/test_unary_operation.py 
b/tests/integration/test_unary_operation.py index f1f3c07d..432381d3 100644 --- a/tests/integration/test_unary_operation.py +++ b/tests/integration/test_unary_operation.py @@ -22,11 +22,17 @@ def test_unary_operation(): row_offsets = np.array([0, 2, 5, 7, 9, 11, 14], dtype=np.int64) - csr_vals = np.array([2, 1, 5, 8, 2, 3, 4, 6, 1, 9, 4, 7, 2, 1], dtype=np.float64) - col_indices = np.array([0, 4, 0, 1, 5, 2, 3, 1, 3, 0, 4, 0, 4, 5], dtype=np.int64) + csr_vals = np.array( + [2, 1, 5, 8, 2, 3, 4, 6, 1, 9, 4, 7, 2, 1], dtype=np.float64 + ) + col_indices = np.array( + [0, 4, 0, 1, 5, 2, 3, 1, 3, 0, 4, 0, 4, 5], dtype=np.int64 + ) matrix_shape = (6, 6) - A = sparse.csr_array((csr_vals, col_indices, row_offsets), shape=matrix_shape) + A = sparse.csr_array( + (csr_vals, col_indices, row_offsets), shape=matrix_shape + ) B = A * 2 Bvalues = np.asarray(B.vals) diff --git a/tests/integration/utils/banded_matrix.py b/tests/integration/utils/banded_matrix.py index fda5ef5f..2467f452 100644 --- a/tests/integration/utils/banded_matrix.py +++ b/tests/integration/utils/banded_matrix.py @@ -90,7 +90,9 @@ def banded_matrix( pred = np.arange(nnz_per_row - half_nnz, nnz_per_row + 1) post = np.flip(pred) - nnz_arr = np.concatenate((pred, np.ones(main_rows) * nnz_per_row, post)) + nnz_arr = np.concatenate( + (pred, np.ones(main_rows) * nnz_per_row, post) + ) if sparse.__name__ == "legate_sparse": row_offsets = np.zeros(N + 1).astype(sparse.coord_ty) diff --git a/tests/testdata/GlossGT.mtx b/tests/testdata/GlossGT.mtx index 27869886..b3bbe5d0 100644 --- a/tests/testdata/GlossGT.mtx +++ b/tests/testdata/GlossGT.mtx @@ -14,15 +14,15 @@ %------------------------------------------------------------------------------- % notes: % ------------------------------------------------------------------------------ -% Pajek network converted to sparse adjacency matrix for inclusion in UF sparse +% Pajek network converted to sparse adjacency matrix for inclusion in UF sparse % matrix collection, Tim 
Davis. For Pajek datasets, See V. Batagelj & A. Mrvar, -% http://vlado.fmf.uni-lj.si/pub/networks/data/. +% http://vlado.fmf.uni-lj.si/pub/networks/data/. % ------------------------------------------------------------------------------ -% Bill Cherowitzo: Graph and Digraph Glossary -% http://www-math.cudenver.edu/~wcherowi/courses/m4408/glossary.html -% Pajek's network: Barbara Zemlji"c, 2. nov 2003 -% The original problem had 3D xyz coordinates, but all values of z were equal -% to 0, and have been removed. This graph has 2D coordinates. +% Bill Cherowitzo: Graph and Digraph Glossary +% http://www-math.cudenver.edu/~wcherowi/courses/m4408/glossary.html +% Pajek's network: Barbara Zemlji"c, 2. nov 2003 +% The original problem had 3D xyz coordinates, but all values of z were equal +% to 0, and have been removed. This graph has 2D coordinates. %------------------------------------------------------------------------------- 72 72 122 3 4 diff --git a/tests/testdata/Ragusa18.mtx b/tests/testdata/Ragusa18.mtx index 2e8bd6ce..24eaa03b 100644 --- a/tests/testdata/Ragusa18.mtx +++ b/tests/testdata/Ragusa18.mtx @@ -14,9 +14,9 @@ %------------------------------------------------------------------------------- % notes: % ------------------------------------------------------------------------------ -% Pajek network converted to sparse adjacency matrix for inclusion in UF sparse +% Pajek network converted to sparse adjacency matrix for inclusion in UF sparse % matrix collection, Tim Davis. For Pajek datasets, See V. Batagelj & A. Mrvar, -% http://vlado.fmf.uni-lj.si/pub/networks/data/. +% http://vlado.fmf.uni-lj.si/pub/networks/data/. 
% ------------------------------------------------------------------------------ %------------------------------------------------------------------------------- 23 23 64 diff --git a/tests/testdata/karate.mtx b/tests/testdata/karate.mtx index 59df7607..9ecdff42 100644 --- a/tests/testdata/karate.mtx +++ b/tests/testdata/karate.mtx @@ -12,14 +12,14 @@ % kind: undirected graph %------------------------------------------------------------------------------- % notes: -% Network collection from M. Newman -% http://www-personal.umich.edu/~mejn/netdata/ -% -% The graph "karate" contains the network of friendships between the 34 -% members of a karate club at a US university, as described by Wayne Zachary +% Network collection from M. Newman +% http://www-personal.umich.edu/~mejn/netdata/ +% +% The graph "karate" contains the network of friendships between the 34 +% members of a karate club at a US university, as described by Wayne Zachary % in 1977. If you use these data in your work, please cite W. W. Zachary, An % information flow model for conflict and fission in small groups, Journal of -% Anthropological Research 33, 452-473 (1977). +% Anthropological Research 33, 452-473 (1977). %------------------------------------------------------------------------------- 34 34 78 2 1