diff --git a/.clang-format b/.clang-format index 6d5353f9..4f33a094 100644 --- a/.clang-format +++ b/.clang-format @@ -1,3 +1,4 @@ +--- Language: Cpp # BasedOnStyle: Google AccessModifierOffset: -1 @@ -17,22 +18,22 @@ AllowShortLoopsOnASingleLine: true AlwaysBreakAfterDefinitionReturnType: None AlwaysBreakAfterReturnType: None AlwaysBreakBeforeMultilineStrings: true -AlwaysBreakTemplateDeclarations: Yes -BinPackArguments: false +AlwaysBreakTemplateDeclarations: true +BinPackArguments: false BinPackParameters: false BraceWrapping: - AfterClass: false + AfterClass: false AfterControlStatement: false - AfterEnum: false - AfterFunction: false - AfterNamespace: false - AfterObjCDeclaration: false - AfterStruct: false - AfterUnion: false - AfterExternBlock: false - BeforeCatch: false - BeforeElse: false - IndentBraces: false + AfterEnum: false + AfterFunction: false + AfterNamespace: false + AfterObjCDeclaration: false + AfterStruct: false + AfterUnion: false + AfterExternBlock: false + BeforeCatch: false + BeforeElse: false + IndentBraces: false # disabling the below splits, else, they'll just add to the vertical length of source files! 
SplitEmptyFunction: false SplitEmptyRecord: false @@ -61,23 +62,23 @@ FixNamespaceComments: true ForEachMacros: IncludeBlocks: Preserve IncludeCategories: - - Regex: '^<.*\.h>' - Priority: 1 - - Regex: '^<.*' - Priority: 2 - - Regex: '.*' - Priority: 3 -IncludeIsMainRegex: '([-_](test|unittest))?$' + - Regex: ^<.*\.h> + Priority: 1 + - Regex: ^<.* + Priority: 2 + - Regex: .* + Priority: 3 +IncludeIsMainRegex: ([-_](test|unittest))?$ IndentCaseLabels: true IndentPPDirectives: None -IndentWidth: 2 +IndentWidth: 2 IndentWrappedFunctionNames: false InsertBraces: true JavaScriptQuotes: Leave JavaScriptWrapImports: true KeepEmptyLinesAtTheStartOfBlocks: false MacroBlockBegin: '' -MacroBlockEnd: '' +MacroBlockEnd: '' MaxEmptyLinesToKeep: 1 NamespaceIndentation: None ObjCBinPackProtocolList: Never @@ -95,14 +96,7 @@ PenaltyReturnTypeOnItsOwnLine: 200 PointerAlignment: Left RawStringFormats: - Language: Cpp - Delimiters: - - cc - - CC - - cpp - - Cpp - - CPP - - 'c++' - - 'C++' + Delimiters: [cc, CC, cpp, Cpp, CPP, c++, C++] CanonicalDelimiter: '' # Enabling comment reflow causes doxygen comments to be messed up in their formats! 
ReflowComments: true diff --git a/.flake8 b/.flake8 deleted file mode 100644 index 04eea09b..00000000 --- a/.flake8 +++ /dev/null @@ -1,10 +0,0 @@ -[flake8] -ignore = - # 'foo' is too complex (N) - C901, - # continuation line missing indentation or outdented - E122, - E203, E501, - F403, F821, W503 -max-line-length = 80 -max-complexity = 18 diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index b24026ed..b42e51d7 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,31 +1,44 @@ +--- repos: - - repo: https://github.com/pre-commit/mirrors-mypy - rev: 'v1.5.1' - hooks: - - id: mypy - language: system - pass_filenames: false - args: ['legate_sparse'] - - repo: https://github.com/psf/black - rev: 23.9.1 - hooks: - - id: black - - repo: https://github.com/PyCQA/isort - rev: 5.12.0 - hooks: - - id: isort - args: ["--profile", "black"] - - repo: https://github.com/PyCQA/flake8 - rev: 6.1.0 - hooks: - - id: flake8 - args: [--config=.flake8] - - repo: https://github.com/pre-commit/mirrors-clang-format - rev: 'v16.0.6' # Use the sha / tag you want to point at - hooks: - - id: clang-format - files: \.(cu|cuh|h|cc|inl)$ - types_or: [] - + - repo: https://github.com/adrienverge/yamllint + rev: v1.38.0 + hooks: + - id: yamllint + types: [yaml] + args: [-c, ./scripts/pre-commit/yamllint.yml] + exclude: meta\.yaml$ + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v6.0.0 + hooks: + - id: check-json # checks that all json files have proper syntax + - id: check-toml # checks that all toml files have proper syntax + - id: end-of-file-fixer # check all files end in a newline + # handled by clang-format + exclude_types: [c, c++, cuda] + - id: pretty-format-json + args: [--autofix, --indent=4] + - id: trailing-whitespace # remove trailing whitespace + # don't mess up diff files + exclude: ^src/cmake/patches/.*\.diff$ + # handled by clang-format + exclude_types: [c, c++, cuda] + - id: check-symlinks + - id: check-executables-have-shebangs + - 
id: check-merge-conflict + - id: check-shebang-scripts-are-executable + - repo: https://github.com/astral-sh/ruff-pre-commit + rev: v0.15.9 + hooks: + - id: ruff-format + - id: ruff + args: ["--config=./pyproject.toml", "--fix"] + - repo: https://github.com/pre-commit/mirrors-clang-format + rev: v22.1.2 # Use the sha / tag you want to point at + hooks: + - id: clang-format + files: \.(cu|cuh|h|cc|inl)$ + types_or: [] +ci: + autoupdate_schedule: quarterly default_language_version: - python: python3 + python: python3 diff --git a/.style.yapf b/.style.yapf index 02b96779..df2b8071 100644 --- a/.style.yapf +++ b/.style.yapf @@ -339,4 +339,3 @@ split_penalty_logical_operator=300 # Use the Tab character for indentation. use_tabs=False - diff --git a/CMakeLists.txt b/CMakeLists.txt index c32254c3..d91a3119 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -60,7 +60,7 @@ include(rapids-find) ################################### # Project -set(legate_sparse_version 25.07.00) +set(legate_sparse_version 26.02.00) set(CMAKE_CXX_FLAGS_DEBUG "-O0 -g") set(CMAKE_CUDA_FLAGS_DEBUG "-O0 -g") @@ -110,4 +110,3 @@ if(CMAKE_GENERATOR STREQUAL "Ninja") endfunction() add_touch_legate_sparse_ninja_build_target() endif() - diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 7b663aad..8b55877c 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -55,25 +55,25 @@ git push -u origin : ``` Developer Certificate of Origin Version 1.1 - + Copyright (C) 2004, 2006 The Linux Foundation and its contributors. 1 Letterman Drive Suite D4700 San Francisco, CA, 94129 - + Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed. 
``` ``` Developer's Certificate of Origin 1.1 - + By making a contribution to this project, I certify that: - + (a) The contribution was created in whole or in part by me and I have the right to submit it under the open source license indicated in the file; or - + (b) The contribution is based upon previous work that, to the best of my knowledge, is covered under an appropriate open source license and I have the right under that license to submit that work with modifications, whether created in whole or in part by me, under the same open source license (unless I am permitted to submit under a different license), as indicated in the file; or - + (c) The contribution was provided directly to me by some other person who certified (a), (b) or (c) and I have not modified it. - + (d) I understand and agree that this project and the contribution are public and that a record of the contribution (including all personal information I submit with it, including my sign-off) is maintained indefinitely and may be redistributed consistent with this project or the open source license(s) involved. - ``` \ No newline at end of file + ``` diff --git a/LICENSE b/LICENSE index 4947287f..f433b1a5 100644 --- a/LICENSE +++ b/LICENSE @@ -174,4 +174,4 @@ incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. - END OF TERMS AND CONDITIONS \ No newline at end of file + END OF TERMS AND CONDITIONS diff --git a/README.md b/README.md index b03ea8c6..5cc45556 100644 --- a/README.md +++ b/README.md @@ -21,27 +21,27 @@ limitations under the License. Legate Sparse is a [Legate](https://github.com/nv-legate/legate) library that aims to provide a distributed and accelerated drop-in replacement for the [scipy.sparse](https://docs.scipy.org/doc/scipy/reference/sparse.html) library -on top of the [Legate](https://github.com/nv-legate/legate) runtime. 
-Legate Sparse interoperates with +on top of the [Legate](https://github.com/nv-legate/legate) runtime. +Legate Sparse interoperates with [cuPyNumeric](https://github.com/nv-legate/cupynumeric), -a distributed and accelerated drop-in replacement +a distributed and accelerated drop-in replacement for [NumPy](https://numpy.org/doc/stable/reference/index.html#reference), to enable writing programs that operate on distributed dense and sparse arrays. -Take a look at the `examples` directory for some applications that can +Take a look at the `examples` directory for some applications that can use Legate Sparse. We have implemented an explicit partial-differential equation (PDE) [solver](examples/pde.py). More complex and interesting applications are on the way -- stay tuned! -Legate Sparse is currently in alpha and supports a subset of APIs -and options from scipy.sparse, so if you need an API, please open -an issue and give us a summary of its usage. +Legate Sparse is currently in alpha and supports a subset of APIs +and options from scipy.sparse, so if you need an API, please open +an issue and give us a summary of its usage. # Installation -To use Legate Sparse, `legate` and `cupynumeric` libraries have to be installed. -They can be installed either by pulling the respective conda packages -or by manually building from source. For more information, -see build instructions for [Legate](https://github.com/nv-legate/legate) +To use Legate Sparse, `legate` and `cupynumeric` libraries have to be installed. +They can be installed either by pulling the respective conda packages +or by manually building from source. For more information, +see build instructions for [Legate](https://github.com/nv-legate/legate) and [cuPyNumeric](https://github.com/nv-legate/cupynumeric). Follow the steps in this section. @@ -51,7 +51,7 @@ Follow the steps in this section. 
The `legate-sparse` conda package already depends on `legate` and `cupynumeric`, and it will install these dependencies automatically. -To create a new environment and install: +To create a new environment and install: ``` conda create -n myenv -c legate -c conda-forge legate-sparse ``` @@ -65,9 +65,9 @@ conda install -c legate -c conda-forge legate-sparse To write programs using Legate Sparse, import the `legate_sparse` module, which contains methods and types found in `scipy.sparse`. Note that the module is imported as `legate_sparse` -and not `legate.sparse`. Here is an example program saved as `main.py`. +and not `legate.sparse`. Here is an example program saved as `main.py`. -For more details on how to run legate programs, check +For more details on how to run legate programs, check our [documentation](https://docs.nvidia.com/cupynumeric). To run the application on a single GPU, use this command: @@ -79,10 +79,10 @@ import legate_sparse as sparse import cupynumeric as np # number of diagonals in the matrix (including main diagonal) -n_diagonals = 3 +n_diagonals = 3 # number of rows in the matrix -nrows = 5 +nrows = 5 # generate two tridiaonal matrices (n_diagonals=3) and multiply them A = sparse.diags( @@ -102,13 +102,13 @@ B = sparse.diags( ) # spGEMM operation: multiplication of two sparse matrices -C = A @ B +C = A @ B print(C.todense()) print() # spMV operation: multiplication of a sparse matrix and a dense vector x = np.ones(nrows) -C = A @ x +C = A @ x print(C) assert np.array_equal(A.todense().sum(axis=1), C) diff --git a/cmake/thirdparty/get_cudss.cmake b/cmake/thirdparty/get_cudss.cmake new file mode 100644 index 00000000..0ebfc199 --- /dev/null +++ b/cmake/thirdparty/get_cudss.cmake @@ -0,0 +1,28 @@ +#============================================================================= +# Copyright 2022-2024 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the 
License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +#============================================================================= + +function(find_or_configure_cudss) + + if(TARGET cudss) + return() + endif() + + # cuDSS provides its own CMake config, so we use find_package directly + find_package(cudss REQUIRED) + +endfunction() + +find_or_configure_cudss() diff --git a/cmake/thirdparty/get_legate.cmake b/cmake/thirdparty/get_legate.cmake index 727671fd..142bd22a 100644 --- a/cmake/thirdparty/get_legate.cmake +++ b/cmake/thirdparty/get_legate.cmake @@ -18,56 +18,19 @@ function(find_or_configure_legate) set(oneValueArgs VERSION REPOSITORY BRANCH EXCLUDE_FROM_ALL) cmake_parse_arguments(PKG "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) - include("${rapids-cmake-dir}/export/detail/parse_version.cmake") - rapids_export_parse_version(${PKG_VERSION} legate PKG_VERSION) - include("${rapids-cmake-dir}/cpm/detail/package_details.cmake") rapids_cpm_package_details(legate version git_repo git_branch shallow exclude_from_all) - set(version ${PKG_VERSION}) - set(exclude_from_all ${PKG_EXCLUDE_FROM_ALL}) - if(PKG_BRANCH) - set(git_branch "${PKG_BRANCH}") - endif() - if(PKG_REPOSITORY) - set(git_repo "${PKG_REPOSITORY}") - endif() + # Normalize version to match conda pkg naming (e.g., 26.01.00 -> 26.01.0) + string(REPLACE "00" "0" version "${version}") set(FIND_PKG_ARGS GLOBAL_TARGETS legate::legate BUILD_EXPORT_SET legate-sparse-exports INSTALL_EXPORT_SET legate-sparse-exports) - # First try to find legate via find_package() - # so the `Legion_USE_*` variables are visible - # Use QUIET find by 
default. - set(_find_mode QUIET) - # If legate_DIR/legate_ROOT are defined as something other than empty or NOTFOUND - # use a REQUIRED find so that the build does not silently download legate. - if(legate_DIR OR legate_ROOT) - set(_find_mode REQUIRED) - endif() - rapids_find_package(legate ${version} EXACT CONFIG ${_find_mode} ${FIND_PKG_ARGS}) - - if(legate_FOUND) - message(STATUS "CPM: using local package legate@${version}") - else() - include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/Modules/cpm_helpers.cmake) - get_cpm_git_args(legate_cpm_git_args REPOSITORY ${git_repo} BRANCH ${git_branch}) - - message(VERBOSE "legate.sparse: legate version: ${version}") - message(VERBOSE "legate.sparse: legate git_repo: ${git_repo}") - message(VERBOSE "legate.sparse: legate git_branch: ${git_branch}") - message(VERBOSE "legate.sparse: legate exclude_from_all: ${exclude_from_all}") - message(VERBOSE "legate.sparse: legate legate_cpm_git_args: ${legate_cpm_git_args}") - - rapids_cpm_find(legate ${version} ${FIND_PKG_ARGS} - CPM_ARGS - ${legate_cpm_git_args} - FIND_PACKAGE_ARGUMENTS EXACT - EXCLUDE_FROM_ALL ${exclude_from_all} - ) - endif() + # Require legate to be pre-installed; do not fall back to cloning. 
+ rapids_find_package(legate ${version} EXACT CONFIG REQUIRED ${FIND_PKG_ARGS}) set(Legion_USE_CUDA ${Legion_USE_CUDA} PARENT_SCOPE) set(Legion_USE_OpenMP ${Legion_USE_OpenMP} PARENT_SCOPE) diff --git a/cmake/versions.json b/cmake/versions.json index 6c5440f4..85c7e7ae 100644 --- a/cmake/versions.json +++ b/cmake/versions.json @@ -1,24 +1,24 @@ { - "packages" : { - "legate" : { - "repo": "legate.internal", - "org": "nv-legate", - "version": "25.07.00", - "git_url" : "git@github.com:nv-legate/legate.git", - "git_shallow": false, - "always_download": false, - "git_tag" : "a46dc3d5b176ff9546bc831409c394c1bbc3b936", - "anaconda_label": "main" - }, - "cupynumeric" : { - "repo": "cupynumeric.internal", - "org": "nv-legate", - "version": "25.07.00", - "git_url" : "git@github.com:nv-legate/cupynumeric", - "git_shallow": false, - "always_download": false, - "git_tag" : "6132d8450049a7abd7786fb4d60444eb5b4e25db", - "anaconda_label": "main" + "packages": { + "cupynumeric": { + "always_download": false, + "anaconda_label": "main", + "git_shallow": false, + "git_tag": "ae1c787828a9327ad00a076739706f41d196a043", + "git_url": "git@github.com:nv-legate/cupynumeric.internal", + "org": "nv-legate", + "repo": "cupynumeric.internal", + "version": "26.01.00" + }, + "legate": { + "always_download": false, + "anaconda_label": "main", + "git_shallow": false, + "git_tag": "3ccb639605eecd8e9fee52c2d7d56ea799f4864e", + "git_url": "git@github.com:nv-legate/legate.internal.git", + "org": "nv-legate", + "repo": "legate.internal", + "version": "26.01.00" + } } - } } diff --git a/conda/conda-build/build.sh b/conda/conda-build/build.sh old mode 100644 new mode 100755 diff --git a/conda/conda-build/conda_build_config.yaml b/conda/conda-build/conda_build_config.yaml index ada8dda2..a67aaba9 100644 --- a/conda/conda-build/conda_build_config.yaml +++ b/conda/conda-build/conda_build_config.yaml @@ -10,6 +10,14 @@ python: - 3.12 - 3.13 +# Pin sysroot glibc to match Legate's current baseline and avoid 
newer +# toolchains needing RELR-aware binutils. +c_stdlib: + - sysroot + +c_stdlib_version: + - "2.28" + numpy_version: # Not 2.1.0 which segfaults on asarray() sometimes, see # https://github.com/numpy/numpy/pull/27249 @@ -17,3 +25,6 @@ numpy_version: cmake_version: - ">=3.20.1,!=3.23.0" + +cuda_compiler: + - cuda-nvcc diff --git a/conda/conda-build/meta.yaml b/conda/conda-build/meta.yaml index 9bdad28f..209abd04 100644 --- a/conda/conda-build/meta.yaml +++ b/conda/conda-build/meta.yaml @@ -10,8 +10,11 @@ ## The placeholder version is strictly for making two-pass conda build process. ## It should not be used for any other purpose, and this is not a default version. {% set placeholder_version = '0.0.0.dev' %} -{% set default_cuda_version = '12.2.2' %} -{% set cuda_version='.'.join(environ.get('CUDA', default_cuda_version).split('.')[:2]) %} +{% set legate_cuda_version = environ.get('LEGATE_CUDA_VERSION') %} +{% if not legate_cuda_version %} +invalid_yaml_missing_cuda_version: LEGATE_CUDA_VERSION must be set +{% endif %} +{% set cuda_version='.'.join(legate_cuda_version.split('.')[:2]) %} {% set cuda_major=cuda_version.split('.')[0]|int %} {% set py_version=environ.get('CONDA_PY', '') %} @@ -100,14 +103,16 @@ requirements: - make - ninja - cmake {{ cmake_version }} - - {{ compiler('c') }} =11.2 - - {{ compiler('cxx') }} =11.2 - # the nvcc requirement is necessary because it contains crt/host_config.h used by cuda runtime. This is a packaging bug that has been reported. - - cuda-nvcc - # cudart needed for CPU and GPU builds because of curand - - cuda-cudart-dev + - {{ stdlib("c") }} + - {{ compiler('c') }} =14 + - {{ compiler('cxx') }} =14 + - pkg-config +{% if gpu_enabled_bool %} - cuda-version ={{ cuda_version }} + - {{ compiler('cuda') }} + - cuda-cudart-dev - libcusparse-dev +{% endif %} host: @@ -127,11 +132,12 @@ requirements: # legate, there may not be a cupynumeric package that is compatible. 
So, we # list cupynumeric here to get a pair of legate and cupynumeric that are # compatible. + - cuda-version ={{ cuda_version }} - cuda-cccl - libcusparse - - cuda-version ={{ cuda_version }} + - libcudss-dev + - nccl <2.29 - cuda-cudart - - nccl {% endif %} run: @@ -142,6 +148,9 @@ requirements: {% if gpu_enabled_bool %} - libnvjitlink - libcusparse + # ship the NCCL comm layer so multi-GPU cudss runs can load libcudss_commlayer_nccl.so + - libcudss-commlayer-nccl + - nccl >=2.0,<2.29 # Pin to all minor versions of CUDA newer than the one built against, within the same major version. # cuda-version constrains the CUDA runtime version and ensures a compatible driver is available - {{ pin_compatible('cuda-version', min_pin='x.x', max_pin='x') }} diff --git a/examples/common.py b/examples/common.py index 99174ed6..e7cfb396 100644 --- a/examples/common.py +++ b/examples/common.py @@ -11,15 +11,25 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import annotations import argparse import importlib +from typing import TYPE_CHECKING, Any, Protocol, cast -import numpy -from typing_extensions import Protocol +if TYPE_CHECKING: + from types import ModuleType + import numpy.typing as npt + from legate.timing._lib.timing import PyTime + from legate_sparse import csr_array -def get_arg_number(arg): +np: ModuleType +sparse: ModuleType +linalg: ModuleType + + +def get_arg_number(arg: str) -> int: """Parse a string argument that may contain size suffixes. Parameters @@ -68,11 +78,11 @@ class Timer(Protocol): for measuring execution time in the examples. """ - def start(self): + def start(self) -> None: """Start timing.""" ... - def stop(self): + def stop(self) -> float: """Stop timing and return duration. Blocks execution until everything before it has completed. 
@@ -92,19 +102,21 @@ class LegateTimer(Timer): measurement of GPU operations. """ - def __init__(self): - self._start = None + def __init__(self) -> None: + self._start: PyTime | None = None - def start(self): + def start(self) -> None: """Start timing using Legate's time function.""" from legate.timing import time self._start = time() - def stop(self): + def stop(self) -> float: """Stop timing and return duration in milliseconds.""" from legate.timing import time + assert self._start is not None + _end = time() return (_end - self._start) / 1000.0 @@ -116,24 +128,26 @@ class CuPyTimer(Timer): in CuPy applications. """ - def __init__(self): - self._start_event = None + def __init__(self) -> None: + self._start_event: Any | None = None - def start(self): + def start(self) -> None: """Start timing using CUDA events.""" - from cupy import cuda + from cupy import cuda # type: ignore [import-untyped] self._start_event = cuda.Event() self._start_event.record() - def stop(self): + def stop(self) -> float: """Stop timing and return duration in milliseconds.""" from cupy import cuda + assert self._start_event is not None + end_event = cuda.Event() end_event.record() end_event.synchronize() - return cuda.get_elapsed_time(self._start_event, end_event) + return cast(float, cuda.get_elapsed_time(self._start_event, end_event)) class NumPyTimer(Timer): @@ -143,19 +157,21 @@ class NumPyTimer(Timer): of CPU operations in NumPy/SciPy applications. 
""" - def __init__(self): - self._start_time = None + def __init__(self) -> None: + self._start_time: float | None = None - def start(self): + def start(self) -> None: """Start timing using perf_counter_ns.""" from time import perf_counter_ns self._start_time = perf_counter_ns() / 1000.0 - def stop(self): + def stop(self) -> float: """Stop timing and return duration in milliseconds.""" from time import perf_counter_ns + assert self._start_time is not None + end_time = perf_counter_ns() / 1000.0 return (end_time - self._start_time) / 1000.0 @@ -171,32 +187,31 @@ class DummyScope: that may or may not use resource scoping. """ - def __init__(self): - ... + def __init__(self) -> None: ... - def __enter__(self): + def __enter__(self) -> None: """Enter the context (no-op).""" ... - def __exit__(self, _, __, ___): + def __exit__(self, _: Any, __: Any, ___: Any) -> None: """Exit the context (no-op).""" ... - def __getitem__(self, item): + def __getitem__(self, item: Any) -> DummyScope: """Return self for any indexing (no-op).""" return self - def count(self, _): + def count(self, _: Any) -> int: """Return 1 for any count operation.""" return 1 @property - def preferred_kind(self): + def preferred_kind(self) -> None: """Return None for preferred kind.""" return None -def get_phase_procs(use_legate: bool): +def get_phase_procs(use_legate: bool) -> tuple[Any, Any]: """Get processor configurations for different phases of computation. Parameters @@ -252,7 +267,9 @@ def get_phase_procs(use_legate: bool): return DummyScope(), DummyScope() -def parse_common_args(): +def parse_common_args() -> tuple[ + str, Timer, ModuleType, ModuleType, ModuleType, bool +]: """Parse common command line arguments for example scripts. 
Returns @@ -274,6 +291,8 @@ def parse_common_args(): - "cupy": Uses cupy, cupyx.scipy.sparse, and cupyx.scipy.sparse.linalg - "scipy": Uses numpy, scipy.sparse, and scipy.sparse.linalg """ + global np, sparse, linalg + parser = argparse.ArgumentParser() parser.add_argument( "--package", @@ -283,6 +302,8 @@ def parse_common_args(): ) args, _ = parser.parse_known_args() + timer: Timer + if args.package == "legate": timer = LegateTimer() np_name = "cupynumeric" @@ -306,9 +327,9 @@ def parse_common_args(): use_legate = False - globals()["np"] = importlib.import_module(np_name) - globals()["sparse"] = importlib.import_module(sp_name) - globals()["linalg"] = importlib.import_module(lg_name) + np = importlib.import_module(np_name) + sparse = importlib.import_module(sp_name) + linalg = importlib.import_module(lg_name) return args.package, timer, np, sparse, linalg, use_legate @@ -317,7 +338,9 @@ def parse_common_args(): # # `diags` construct csr from dia array, while when from_diags=False # we construct csr arrya directly - might be slightly faster -def banded_matrix(N, nnz_per_row, from_diags=False): +def banded_matrix( + N: int, nnz_per_row: int, from_diags: bool = False +) -> csr_array: """Construct a banded matrix with 1.0 as values. 
Parameters @@ -375,7 +398,9 @@ def banded_matrix(N, nnz_per_row, from_diags=False): pred = np.arange(nnz_per_row - half_nnz, nnz_per_row + 1) post = np.flip(pred) - nnz_arr = np.concatenate((pred, np.ones(main_rows) * nnz_per_row, post)) + nnz_arr = np.concatenate( + (pred, np.ones(main_rows) * nnz_per_row, post) + ) row_offsets = np.zeros(N + 1).astype(sparse.coord_ty) row_offsets[1 : N + 1] = np.cumsum(nnz_arr) nnz = row_offsets[-1] @@ -399,7 +424,12 @@ def banded_matrix(N, nnz_per_row, from_diags=False): ) -def stencil_grid(S, grid, dtype=None, format=None): +def stencil_grid( + S: Any, + grid: tuple[int, int], + dtype: npt.dtype[Any] | None = None, + format: str | None = None, +) -> csr_array: """Construct a sparse matrix resulting from a stencil discretization on rectilinear grids. @@ -437,6 +467,8 @@ def stencil_grid(S, grid, dtype=None, format=None): >>> A = stencil_grid(S, (3, 3)) >>> print(A.toarray()) """ + import numpy + N_v = int(numpy.prod(grid)) # number of vertices in the mesh N_s = int((S != 0).sum(dtype=int)) # number of nonzero stencil entries @@ -497,7 +529,7 @@ def stencil_grid(S, grid, dtype=None, format=None): return sparse.dia_array((data, diags), shape=(N_v, N_v)).tocsr() -def poisson2D(N): +def poisson2D(N: int) -> csr_array: """Construct the 2D Poisson matrix. Parameters @@ -536,7 +568,9 @@ def poisson2D(N): diag_size = N * N - 1 first = np.full((N - 1), -1.0) chunks = np.concatenate([np.zeros(1), first]) - diag_a = np.concatenate([first, np.tile(chunks, (diag_size - (N - 1)) // N)]) + diag_a = np.concatenate( + [first, np.tile(chunks, (diag_size - (N - 1)) // N)] + ) diag_g = -1.0 * np.ones(N * (N - 1)) diag_c = 4.0 * np.ones(N * N) @@ -549,7 +583,7 @@ def poisson2D(N): return sparse.diags(diagonals, offsets, dtype=np.float64).tocsr() -def diffusion2D(N, epsilon=1.0, theta=0.0): +def diffusion2D(N: int, epsilon: float = 1.0, theta: float = 0.0) -> csr_array: """Construct a 2D diffusion matrix with anisotropy. 
Parameters diff --git a/examples/direct_solve_banded_system.py b/examples/direct_solve_banded_system.py new file mode 100644 index 00000000..1876acfa --- /dev/null +++ b/examples/direct_solve_banded_system.py @@ -0,0 +1,86 @@ +import argparse +from common import get_arg_number, parse_common_args + +"""Sparse Direct Solve Benchmark. + +This script benchmarks sparse direct solve for a banded system of equations + +""" + + +def create_system_of_eqns(nrows, dtype): + """ + Creates a system of equations A*x = b where: + - A has 4 on the main diagonal (k=0), 1 on the first and second upper diagonal (k=1, 2) + - and 1 on the first lower diagonal (k=-1) + - The solution x is [1, 2, 3, ..., nrows] + - b is computed as A @ x + """ + + main_diag = np.full(nrows, 4.0) + upper1_diag = np.ones(nrows - 1) + upper2_diag = np.ones(nrows - 2) + lower1_diag = np.ones(nrows - 1) + + A = sparse.diags( + [lower1_diag, main_diag, upper1_diag, upper2_diag], + offsets=[-1, 0, 1, 2], + shape=(nrows, nrows), + dtype=np.float64, + format="csr", + ) + x_expected = np.arange(1, nrows + 1, dtype=dtype) + b = A @ x_expected + + return (A, b, x_expected) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + + parser.add_argument( + "-n", + "--nrows", + type=str, + default="12", + dest="nrows", + help="Number of rows in the generated matrix (accepts suffixes 'k', 'm', 'g')", + ) + + parser.add_argument( + "--nwarmups", + type=int, + default=2, + dest="nwarmups", + help="Number of warmup iterations before spsolve is timed", + ) + + args, _ = parser.parse_known_args() + package, timer, np, sparse, _, _ = parse_common_args() + + nrows = get_arg_number(args.nrows) + nwarmups = args.nwarmups + + assert nrows > 0, "Matrix must contain atleast one row" + assert nwarmups >= 0, "Warmup iterations must be >= 0" + + timer.start() + A, b, x_expected = create_system_of_eqns(nrows, np.float64) + elapsed_time_setup = timer.stop() + + for _ in range(nwarmups): + x = sparse.linalg.spsolve(A, b) + + 
timer.start() + x = sparse.linalg.spsolve(A, b) + elapsed_time_solve = timer.stop() + + error_l2_norm = np.linalg.norm(x_expected - x) / np.linalg.norm(x_expected) + + print(f"Dimension of A : {A.shape}") + print(f"Dimension of b : {b.shape}") + print(f"Dimension of x : {x.shape}") + print(f"NNZ of A : {A.nnz}") + print(f"Elapsed time for setup (ms) : {elapsed_time_setup}") + print(f"Elapsed time for solve (ms) : {elapsed_time_solve}") + print(f"Error in solution : {error_l2_norm}") diff --git a/examples/gmg.py b/examples/gmg.py deleted file mode 100644 index 6491ef59..00000000 --- a/examples/gmg.py +++ /dev/null @@ -1,492 +0,0 @@ -# Copyright 2022-2024 NVIDIA Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -# Portions of this file are also subject to the following license: -# -# The MIT License (MIT) -# -# Copyright (c) 2008-2015 PyAMG Developers -# -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included in all -# copies or substantial portions of the Software. 
-# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. - -import argparse - -# for some small data manipulations on host -import numpy -from common import diffusion2D, get_phase_procs, parse_common_args, poisson2D - - -def max_eigenvalue(A, iters=15): - # Compute eigenvector associated with maximum eigenvalue via power - # iteration. This is the same as Steven's imp for estimating spectral - # radius. - x1 = np.random.rand(A.shape[1]).reshape(-1, 1) - for _ in range(iters): - x1 = A @ x1 - x1 /= np.linalg.norm(x1) - # Compute and return max eigenvalue via Raleigh quotient. - # This is np.dot(A @ x1, x1) / np.dot(x1, x1) - # but since x1 is a unit vector, we can assume denominator is 1. - return np.dot(x1.T, A @ x1).item() - - -class GMG(object): - """ - Geometric Multigrid solver for the 2D Poisson problem. 
- - - Source on correctness of restriction / prolongation operators: [1] - - Sources on V-cycle algorithm: [1, 2, 3, 4] - - Source on preconditioned conjugate gradient and Gauss-Seidel smoothing: [4] - - [1] https://www.researchgate.net/publication/220690328_A_Multigrid_Tutorial_2nd_Edition - [2] https://github.com/pyamg/pyamg - [3] http://www.cs.columbia.edu/cg/pdfs/28_GPUSim.pdf - [4] https://netlib.org/utk/people/JackDongarra/PAPERS/HPCG-benchmark.pdf - """ # noqa: E501 - - def __init__(self, A, shape, levels, smoother, gridop, machine): - self.A = A - self.shape = shape - self.N = numpy.prod(self.shape) - self.levels = levels - self.restriction_op = { - "injection": injection_operator, - "linear": linear_operator, - }[gridop] - self.smoother = {"jacobi": WeightedJacobi}[smoother]() - self.operators = self.compute_operators(A) - self.temp = None - self.machine = machine - self.proc_kind = machine.preferred_target - - def compute_operators(self, A): - operators = [] - dim = self.N - self.smoother.init_level_params(A, 0) - for level in range(self.levels): - R, dim = self.compute_restriction_level(dim) - P = R.T - # assert sparse.issparse(P) - A = R @ A @ P - # assert sparse.issparse(A) - self.smoother.init_level_params(A, level + 1) - operators.append((R, A, P)) - return operators - - def cycle(self, r): - # Kick off the cycle with the top-level machine. - # TODO (marsaev): there are issues with scoping - # disabling it for now - return self._cycle(self.A, r, 0, self.machine) - - def _cycle(self, A, r, level, machine): - if level == self.levels - 1: - return self.smoother.coarse(A, r, None, level=level) - x = None - # Do one pre-smoothing iteration. - R, coarse_A, P = self.operators[level] - x = self.smoother.pre(A, r, x, level=level) - # Compute the residual. - fine_r = r - A.dot(x) - - # Restrict the residual. 
- if use_legate: - # TODO (marsaev): there col-split splmv optimization - coarse_r = R.dot(fine_r) - else: - coarse_r = R.dot(fine_r) - - # Compute coarse solution using a subset of the machine. - # TODO (marsaev): there are issues with scoping - # disabling it for now - coarse_x = self._cycle(coarse_A, coarse_r, level + 1, self.machine) - - fine_x = P @ coarse_x - x_corrected = x + fine_x - # Do one post-smoothing iteration. - return self.smoother.post(A, r, x_corrected, level=level) - - def compute_restriction_level(self, fine_dim): - return self.restriction_op(fine_dim) - - def linear_operator(self): - return linalg.LinearOperator( - self.A.shape, dtype=float, matvec=lambda r: self.cycle(r) - ) - - -class WeightedJacobi(object): - def __init__(self, omega=4.0 / 3.0): - # Basically, similar solution to PyAMG. - self.level_params = [] - self._init_omega = omega - - def init_level_params(self, A, level): - D_inv = 1.0 / A.diagonal() - # We need to create a new sparse matrix with just this modified - # diagonal of A. sparse.eye doesn't have this nob, but we can take - # the output of sparse.eye and mess with it to get the matrix - # that we want. 
- D_inv_nnz = min(A.shape[0], A.shape[1]) - D_inv_mat = sparse.csr_array( - ( - np.ones(D_inv_nnz).astype(A.dtype), - ( - np.arange(D_inv_nnz).astype(sparse.coord_ty), - np.arange(D_inv_nnz).astype(sparse.coord_ty), - ), - ), - shape=A.shape, - dtype=A.dtype, - copy=False, - ) - """ - sparse.eye( - A.shape[0], n=A.shape[1], dtype=A.dtype, format="csr" - ) - """ - D_inv_mat.data = 1.0 / D_inv - spectral_radius = max_eigenvalue(A @ D_inv_mat, 1) - omega = self._init_omega / spectral_radius - self.level_params.append((omega, D_inv)) - assert len(self.level_params) - 1 == level - - def __call__(self, A, r, x, level): - omega, D_inv = self.level_params[level] - return (1 - omega) * x + omega * (r - A @ x + x / D_inv) * D_inv - - def pre(self, A, r, x, level): - if x is not None: - raise Exception("Expected x is None.") - omega, D_inv = self.level_params[level] - return omega * r * D_inv - - def post(self, A, r, x, level): - omega, D_inv = self.level_params[level] - return x + omega * (r - A @ x) * D_inv - - def coarse(self, A, r, x, level): - return self.pre(A, r, x, level) - # return sparse.linalg.spsolve(A, r) - - -def injection_operator(fine_dim): - fine_shape = (int(np.sqrt(fine_dim)),) * 2 - coarse_shape = fine_shape[0] // 2, fine_shape[1] // 2 - coarse_dim = numpy.prod(coarse_shape) - Rp = np.arange(coarse_dim + 1) - Rx = np.ones((coarse_dim,), dtype=np.float64) - ij = np.arange(coarse_dim, dtype=np.int64) - i = ij % coarse_shape[1] - j = ij // coarse_shape[1] - Rj = 2 * i + 2 * j * coarse_shape[1] - R = sparse.csr_matrix((Rx, Rj, Rp), shape=(coarse_dim, fine_dim), dtype=np.float64) - return R, coarse_dim - - -def linear_operator(fine_dim): - fine_shape = (int(np.sqrt(fine_dim)),) * 2 - coarse_shape = fine_shape[0] // 2, fine_shape[1] // 2 - coarse_dim = np.prod(coarse_shape) - # Construct CSR directly. - Rp = numpy.empty(coarse_dim + 1, dtype=np.int64) - # Get an upper bound on the total number of non-zeroes, and construct Rj - # and Rx based on this bound. 
Computing this value exactly is tedious and - # the extra allocation can be truncated at the end. We won't need more - # than 9*coarse_dim rows. - nnz = 9 * coarse_dim - Rj = numpy.empty((nnz,), dtype=np.int64) - Rx = numpy.empty((nnz,), dtype=np.float64) - p = 0 - - def flatten(i, j): - return i * fine_shape[1] + j - - for ij in range(coarse_dim): - Rp[ij] = p - # For linear interpolation, - # we have 9 points over which to average in the 2d case. - # The coefficient matrix will act as a stencil operator. - i, j = (ij // coarse_shape[1]), (ij % coarse_shape[1]) - # Corners. - # r[2*i-1, 2*j-1] = 1/16 - # r[2*i-1, 2*j+1] = 1/16 - # r[2*i+1, 2*j-1] = 1/16 - # r[2*i+1, 2*j+1] = 1/16 - # Edges. - # r[2*i, 2*j+1] = 2/16 - # r[2*i, 2*j-1] = 2/16 - # r[2*i-1, 2*j] = 2/16 - # r[2*i+1, 2*j] = 2/16 - # Center. - # r[2 * i, 2 * j] = 4/16 - # Ensure indices are constructed in order. - # Assumes row-major ordering. - if 0 <= 2 * i - 1: - if 0 <= 2 * j - 1: - # top-left - Rj[p], Rx[p] = flatten(2 * i - 1, 2 * j - 1), 1 / 16 - p += 1 - # top-middle - Rj[p], Rx[p] = flatten(2 * i - 1, 2 * j), 2 / 16 - p += 1 - if 2 * j + 1 < fine_dim: - # top-right - Rj[p], Rx[p] = flatten(2 * i - 1, 2 * j + 1), 1 / 16 - p += 1 - if 0 <= 2 * j - 1: - # middle-left - Rj[p], Rx[p] = flatten(2 * i, 2 * j - 1), 2 / 16 - p += 1 - # middle-middle - Rj[p], Rx[p] = flatten(2 * i, 2 * j), 4 / 16 - p += 1 - if 2 * j + 1 < fine_dim: - # middle-right - Rj[p], Rx[p] = flatten(2 * i, 2 * j + 1), 2 / 16 - p += 1 - if 2 * i + 1 < fine_dim: - if 0 <= 2 * j - 1: - # bottom-left - Rj[p], Rx[p] = flatten(2 * i + 1, 2 * j - 1), 1 / 16 - p += 1 - # bottom-middle - Rj[p], Rx[p] = flatten(2 * i + 1, 2 * j), 2 / 16 - p += 1 - if 2 * j + 1 < fine_dim: - # bottom-right - Rj[p], Rx[p] = flatten(2 * i + 1, 2 * j + 1), 1 / 16 - p += 1 - - Rp[coarse_dim] = p - Rx, Rj, Rp = np.array(Rx[:p]), np.array(Rj[:p]), np.array(Rp) - R = sparse.csr_matrix((Rx[:p], Rj[:p], Rp), shape=(coarse_dim, fine_dim)) - return R, coarse_dim - - -def 
required_driver_memory(N): - NN = N * N - fine_shape = (int(np.sqrt(NN)),) * 2 - coarse_shape = fine_shape[0] // 2, fine_shape[1] // 2 - coarse_dim = numpy.prod(coarse_shape) - nnz = 9 * coarse_dim - elements = nnz + coarse_dim + 1 - bytes = elements * 8 - mb = bytes / 10**6 - print("Max required driver memory for N=%d is %fMB" % (N, mb)) - - -def print_diagnostics(operators): - """Print basic statistics about the multigrid hierarchy.""" - output = "MultilevelSolver\n" - output += f"Number of Levels: {len(operators)}\n" - # output += f"Operator Complexity: {operator_complexity(levels):6.3f}\n" - # output += f"Grid Complexity: {grid_complexity(levels):6.3f}\n" - - total_nnz = sum(level[1].nnz for level in operators) - - # 123456712345678901 123456789012 123456789 - # 0 10000 49600 [52.88%] - output += " level unknowns nonzeros\n" - for n, level in enumerate(operators): - A = level[1] - ratio = 100 * A.nnz / total_nnz - output += f"{n:>6} {A.shape[1]:>11} {A.nnz:>12} [{ratio:2.2f}%]\n" - - print(output) - - -def execute(N, data, smoother, gridop, levels, maxiter, tol, verbose, warmup, timer): - build, solve = get_phase_procs(use_legate) - - if warmup: - tA = diffusion2D(64, epsilon=0.1, theta=np.pi / 4) - tB = tA.T - tC = tB @ tA # noqa: F841 - - # Generate matrix - timer.start() - if data == "poisson": - A = poisson2D(N) - b = np.random.rand(N**2) - elif data == "diffusion": - A = diffusion2D(N) - b = np.random.rand(N**2) - else: - raise NotImplementedError(data) - print(f"GMG: {A.shape}") - print(f"Data creation time: {timer.stop()} ms") - - assert smoother == "jacobi", "Only Jacobi smoother is currently supported." 
- - if verbose: - - def callback(x): - print(f"Residual: {np.linalg.norm(b - (A @ x))}") - - else: - callback = None - - required_driver_memory(N) - # Setup - timer.start() - mg_solver = GMG( - A=A, - shape=(N, N), - levels=levels, - smoother=smoother, - gridop=gridop, - machine=solve, - ) - M = mg_solver.linear_operator() - print(f"GMG init time: {timer.stop()} ms") - - print_diagnostics(mg_solver.operators) - - # Warm up the runtime. - float( - np.linalg.norm( - A.dot( - np.zeros( - A.shape[1], - ) - ) - ) - ) - float( - np.linalg.norm( - M.matvec( - np.zeros( - M.shape[1], - ) - ) - ) - ) - # Make another call to random here as well. - float(np.linalg.norm(np.random.rand(b.shape[0]))) - - # Solve - timer.start() - x, iters = linalg.cg(A, b, rtol=tol, maxiter=maxiter, M=M, callback=callback) - total = timer.stop() - - norm_ini = np.linalg.norm(b) - norm_res = np.linalg.norm(b - (A @ x)) - - # Check convergence with relative tolerance - convergence_status = True if norm_res <= norm_ini * tol else False - print(f"Dimension of A : {A.shape}") - print(f"Did the solution converge : {convergence_status}") - print(f"Final relative residual norm : {norm_res / norm_ini}") - print(f"Number of iterations : {iters}") - print(f"Total elapsed time (ms) : {total}") - print(f"Time per iteration (ms) : {total / iters}") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "-n", - "--num", - type=int, - default=16, - dest="N", - help="number of elements in one dimension", - ) - parser.add_argument( - "-d", - "--data", - dest="data", - choices=["poisson", "diffusion"], - type=str, - default="poisson", - help="The problem instance to solve.", - ) - parser.add_argument( - "-s", - "--smoother", - dest="smoother", - choices=["jacobi"], - type=str, - default="jacobi", - help="Smoother to use.", - ) - parser.add_argument( - "-g", - "--gridop", - dest="gridop", - choices=["linear", "injection"], - type=str, - default="injection", - help="Intergrid 
transfer operator to use.", - ) - parser.add_argument( - "-l", - "--levels", - dest="levels", - type=int, - default=2, - help="Number of multigrid levels.", - ) - parser.add_argument( - "-m", - "--maxiter", - type=int, - default=200, - dest="maxiter", - help="bound the maximum number of iterations", - ) - parser.add_argument( - "-v", - "--verbose", - dest="verbose", - action="store_true", - help="print verbose output", - ) - parser.add_argument( - "--tol", - type=float, - default=1e-10, - dest="tol", - help="Convergence relative norm check threshold", - ) - - parser.add_argument( - "-w", - "--warmup", - dest="warmup", - action="store_true", - help="Perform some Warmup operations before running timings", - ) - - args, _ = parser.parse_known_args() - _, timer, np, sparse, linalg, use_legate = parse_common_args() - execute(**vars(args), timer=timer) diff --git a/examples/matrix_power.py b/examples/matrix_power.py index cc52c08b..69a807aa 100644 --- a/examples/matrix_power.py +++ b/examples/matrix_power.py @@ -32,11 +32,17 @@ --package: Backend to use (legate, cupy, scipy) """ +from __future__ import annotations + import argparse from functools import reduce +from typing import TYPE_CHECKING + +from common import Timer, get_arg_number, parse_common_args -import numpy.typing as npt -from common import get_arg_number, parse_common_args +if TYPE_CHECKING: + import numpy.typing as npt + from legate_sparse import csr_array # global states random_seed, rng global random_seed, rng @@ -46,7 +52,9 @@ # ---------------------------- -def create_csr_with_nnz_per_row(nrows, nnz_per_row: int, dtype: npt.DTypeLike = None): +def create_csr_with_nnz_per_row( + nrows: int, nnz_per_row: int, dtype: npt.DTypeLike | None = None +) -> csr_array: """Create a CSR matrix with a prescribed number of nonzeros in each row. 
Parameters @@ -84,7 +92,9 @@ def create_csr_with_nnz_per_row(nrows, nnz_per_row: int, dtype: npt.DTypeLike = return matrix -def create_csr_with_nnz_total(nrows, nnz_total, dtype: npt.DTypeLike = None): +def create_csr_with_nnz_total( + nrows: int, nnz_total: int, dtype: npt.DTypeLike | None = None +) -> csr_array: """Create a CSR matrix with a prescribed total number of nonzeros. Parameters @@ -113,7 +123,9 @@ def create_csr_with_nnz_total(nrows, nnz_total, dtype: npt.DTypeLike = None): coo_rows = rng.integers(0, nrows, nnz_total) coo_cols = rng.integers(0, ncols, nnz_total) vals = np.ones(nnz_total, dtype=dtype) - matrix = sparse.csr_matrix((vals, (coo_rows, coo_cols)), shape=(nrows, ncols)) + matrix = sparse.csr_matrix( + (vals, (coo_rows, coo_cols)), shape=(nrows, ncols) + ) return matrix @@ -123,7 +135,9 @@ def create_csr_with_nnz_total(nrows, nnz_total, dtype: npt.DTypeLike = None): # ------------------------ -def compute_A_power_k(A, timer, nwarmups: int = 2, k: int = 4): +def compute_A_power_k( + A: csr_array, timer: Timer, nwarmups: int = 2, k: int = 4 +) -> None: """Compute A^k and measure performance. 
Parameters @@ -180,7 +194,9 @@ def compute_A_power_k(A, timer, nwarmups: int = 2, k: int = 4): print( f"Elapsed time for spgemm for hop {hop} (ms) : {elapsed_time_spgemm[hop]}" ) - print(f"Elapsed time for copy for hop {hop} (ms) : {elapsed_time_copy[hop]}") + print( + f"Elapsed time for copy for hop {hop} (ms) : {elapsed_time_copy[hop]}" + ) if __name__ == "__main__": @@ -243,13 +259,11 @@ def compute_A_power_k(A, timer, nwarmups: int = 2, k: int = 4): ) args, _ = parser.parse_known_args() - _, timer, np, sparse, linalg, use_legate = parse_common_args() + package, timer, np, sparse, linalg, use_legate = parse_common_args() nrows = get_arg_number(args.nrows) nnz_total = get_arg_number(args.nnz_total) - # this is a global variable - global random_seed, rng random_seed = args.random_seed if args.same_sparsity_for_cpu_and_gpu: @@ -277,4 +291,6 @@ def compute_A_power_k(A, timer, nwarmups: int = 2, k: int = 4): compute_A_power_k(A, timer, int(args.nwarmups), int(args.k)) - print(f"Elapsed time in matrix creation (ms) : {elapsed_time_matrix_gen}") + print( + f"Elapsed time in matrix creation (ms) : {elapsed_time_matrix_gen}" + ) diff --git a/examples/pde.py b/examples/pde.py index d9ca0095..6745ee32 100644 --- a/examples/pde.py +++ b/examples/pde.py @@ -38,16 +38,19 @@ --package: Backend to use (legate, cupy, scipy) """ +from __future__ import annotations + # This PDE solving application is derived from # https://aquaulb.github.io/book_solving_pde_mooc/solving_pde_mooc/notebooks/05_IterativeMethods/05_01_Iteration_and_2D.html. 
import argparse import sys +from typing import Any -from common import get_phase_procs, parse_common_args +from common import Timer, get_phase_procs, parse_common_args -def d2_mat_dirichlet_2d(nx, ny, dx, dy): +def d2_mat_dirichlet_2d(nx: int, ny: int, dx: float, dy: float) -> Any: """ Constructs the matrix for the centered second-order accurate second-order derivative for Dirichlet boundary conditions in 2D @@ -114,7 +117,7 @@ def d2_mat_dirichlet_2d(nx, ny, dx, dy): return d2mat -def p_exact_2d(X, Y): +def p_exact_2d(X: Any, Y: Any) -> Any: """Computes the exact solution of the Poisson equation in the domain [0, 1]x[-0.5, 0.5] with rhs: b = (np.sin(np.pi * X) * np.cos(np.pi * Y) + @@ -133,14 +136,26 @@ def p_exact_2d(X, Y): exact solution of the Poisson equation """ - sol = -1.0 / (2.0 * np.pi**2) * np.sin(np.pi * X) * np.cos(np.pi * Y) - 1.0 / ( - 50.0 * np.pi**2 - ) * np.sin(5.0 * np.pi * X) * np.cos(5.0 * np.pi * Y) + sol = -1.0 / (2.0 * np.pi**2) * np.sin(np.pi * X) * np.cos( + np.pi * Y + ) - 1.0 / (50.0 * np.pi**2) * np.sin(5.0 * np.pi * X) * np.cos( + 5.0 * np.pi * Y + ) return sol -def execute(nx, ny, plot, plot_fname, throughput, tol, max_iters, warmup_iters, timer): +def execute( + nx: int, + ny: int, + plot: bool, + plot_fname: str, + throughput: bool, + tol: float, + max_iters: int, + warmup_iters: int, + timer: Timer, +) -> None: # Grid parameters. xmin, xmax = 0.0, 1.0 # limits in the x direction ymin, ymax = -0.5, 0.5 # limits in the y direction @@ -181,9 +196,9 @@ def execute(nx, ny, plot, plot_fname, throughput, tol, max_iters, warmup_iters, # Compute the rhs. Note that we non-dimensionalize the coordinates # x and y with the size of the domain in their respective dire- # ctions. - b = np.sin(np.pi * X) * np.cos(np.pi * Y) + np.sin(5.0 * np.pi * X) * np.cos( - 5.0 * np.pi * Y - ) + b = np.sin(np.pi * X) * np.cos(np.pi * Y) + np.sin( + 5.0 * np.pi * X + ) * np.cos(5.0 * np.pi * Y) # b is currently a 2D array. 
We need to convert it to a column-major # ordered 1D array. This is done with the flatten numpy function. @@ -194,7 +209,7 @@ def execute(nx, ny, plot, plot_fname, throughput, tol, max_iters, warmup_iters, # count combinations as well. Even more annoyingly, doing any sort # of flatten results in some bad assignment of equivalence sets within # Legion's dependence analysis. So if we're just testing solve - # throughput, use an array of all ones. + # throughput, use an array of all ones. if throughput: n = b.shape[0] - 2 bflat = np.ones((n * n,)) @@ -218,7 +233,7 @@ def execute(nx, ny, plot, plot_fname, throughput, tol, max_iters, warmup_iters, # If we're testing throughput, run only the prescribed number of iterations. if throughput: if use_legate: - p_sol, iters = linalg.cg(A, bflat, rtol=tol, maxiter=max_iters, conv_test_iters=max_iters) + p_sol, iters = linalg.cg( + A, + bflat, + rtol=tol, + maxiter=max_iters, + conv_test_iters=max_iters, + ) else: p_sol, iters = linalg.cg(A, bflat, rtol=tol, maxiter=max_iters) else: @@ -242,8 +263,12 @@ def execute(nx, ny, plot, plot_fname, throughput, tol, max_iters, warmup_iters, # Check convergence with relative tolerance convergence_status = True if norm_res <= norm_ini * tol else False - print(f"Did the solution converge : {convergence_status}") - print(f"Final relative residual norm : {norm_res / norm_ini}") + print( + f"Did the solution converge : {convergence_status}" + ) + print( + f"Final relative residual norm : {norm_res / norm_ini}" + ) if iters > 0: print(f"Number of iterations : {iters}") print(f"Time per iteration (ms) : {total / iters}") @@ -321,10 +346,14 @@ def execute(nx, ny, plot, plot_fname, throughput, tol, max_iters, warmup_iters, ) args, _ = parser.parse_known_args() - _, timer, np, sparse, linalg, use_legate = parse_common_args() + package, timer, np, sparse, linalg, use_legate = parse_common_args() - if args.throughput and (args.max_iters is None or args.warmup_iters is None): - print("Must 
provide --max-iters and --warmup-iters when using --throughput.") + if args.throughput and ( + args.max_iters is None or args.warmup_iters is None + ): + print( + "Must provide --max-iters and --warmup-iters when using --throughput." + ) sys.exit(1) execute(**vars(args), timer=timer) diff --git a/examples/poisson_5point_example.py b/examples/poisson_5point_example.py new file mode 100644 index 00000000..e2685a17 --- /dev/null +++ b/examples/poisson_5point_example.py @@ -0,0 +1,193 @@ +# Copyright 2022-2025 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Solve Poisson equation: -∇²u = f(x,y) on domain [0,1]×[0,1] +With Dirichlet boundary conditions: u = 0 on boundary. We use +a manufactured solution approach u(x,y) = sin(2πx) * sin(2πy) +and use that to compute the RHS. +""" + +from __future__ import annotations + +import argparse + +from common import parse_common_args, get_arg_number + + +def create_poisson_mat(n, h): + """ + Create the 2D Poisson equation discretization matrix using 5-point stencil. 
+ + The 5-point stencil for -∇²u at point (i,j) is: + -u_{i,j-1} - u_{i-1,j} + 4*u_{i,j} - u_{i+1,j} - u_{i,j+1} = h²*f_{i,j} + + Parameters + ---------- + n : int + Number of interior grid points in each direction (n×n grid) + h : float + Grid spacing (h = 1/(n+1)) + + Returns + ------- + A : sparse CSR matrix + The discretization matrix of shape (n**2, n**2) + """ + N = n * n # Total number of unknowns + + # stencil: + # -1 + # -1 4 -1 + # -1 + main_diag = 4.0 * np.ones(N) / (h * h) + off_diag1 = -1.0 * np.ones(N - 1) / (h * h) + off_diag2 = -1.0 * np.ones(N - n) / (h * h) + + # cupynumeric doesn't support non-unit strides in indexing, + # so use a mask array to set every "n" elements to zero + zero_out_indices = np.array(range(n - 1, N - 1, n), dtype=int) + off_diag1[zero_out_indices] = 0.0 + + # The offsets : [-n, -1, 0, 1, n ] + # correspond to : [below, left, center, right, above ] + diagonals = [off_diag2, off_diag1, main_diag, off_diag1, off_diag2] + offsets = [-n, -1, 0, 1, n] + + # Create the sparse matrix and convert to CSR format + return sparse.diags( + diagonals, offsets, shape=(N, N), dtype=np.float64, format="csr" + ) + + +def manufactured_solution(x, y): + "u(x,y) = sin(2πx) * sin(2πy) satisfies u=0 on the boundary of [0,1]×[0,1]" + return np.sin(2 * np.pi * x) * np.sin(2 * np.pi * y) + + +def compute_rhs(x, y): + """ + Compute the right-hand side f(x,y) for the manufactured solution. + + For u(x,y) = sin(2πx) * sin(2πy), we have: + -∇²u = 8π² * sin(2πx) * sin(2πy) = f(x,y) + """ + return 8 * np.pi**2 * np.sin(2 * np.pi * x) * np.sin(2 * np.pi * y) + + +def solve_poisson_2d(n, verbose=True) -> float: + """ + Solve the 2D Poisson equation with Dirichlet boundary conditions. 
+ + Parameters + ---------- + n : int + Number of interior grid points in each direction + verbose : bool + Whether to print detailed output + + Returns + ------- + error : float + The L2 error between numerical and analytical solutions + """ + h = 1.0 / (n + 1) + + if verbose: + print(f"Solving 2D Poisson equation on {n}×{n} grid") + print(f"Grid spacing h = {h:.6f}") + print(f"Total unknowns: {n * n}") + + # Create grid points (interior points only) and flatten it + x = np.linspace(h, 1 - h, n) + y = np.linspace(h, 1 - h, n) + X, Y = np.meshgrid(x, y, indexing="ij") + X_flat = X.flatten() + Y_flat = Y.flatten() + + A = create_poisson_mat(n, h) + b = compute_rhs(X_flat, Y_flat) + + if verbose: + print(f"Matrix shape : {A.shape}") + print(f"Matrix non-zeros : {A.nnz}") + print(f"Sparsity : {A.nnz / (n * n) ** 2:.6f}") + print("\nSolving linear system Ax = b using spsolve...") + + x_numerical = linalg.spsolve(A, b) + x_analytical = manufactured_solution(X_flat, Y_flat) + + error_vec = x_numerical - x_analytical + l2_error = np.linalg.norm(error_vec) * h # Scale by h for L2 norm + l_inf_error = np.max(np.abs(error_vec)) + relative_error = l2_error / (np.linalg.norm(x_analytical) * h) + + residual = A @ x_numerical - b + residual_norm = np.linalg.norm(residual) + + if verbose: + print("\nResults:") + print(f"L2 error : {l2_error:.6e}") + print(f"L∞ error : {l_inf_error:.6e}") + print(f"Relative L2 error: {relative_error:.6e}") + print(f"Residual norm ||Ax - b||: {residual_norm:.6e}") + + return l2_error + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Solve 2D Poisson equation with 5-point stencil" + ) + parser.add_argument( + "--size", + "-n", + type=str, + default="32", + help="Number of interior grid points in each direction (default: 32)", + ) + parser.add_argument( + "--verbose", + action="store_true", + help="Use this argument for verbose output", + ) + + args, _ = parser.parse_known_args() + package, timer, np, sparse, linalg, 
use_legate = parse_common_args() + + n_interior = get_arg_number(args.size) + + solve_poisson_2d(n_interior, verbose=args.verbose) + + print("\n" + "=" * 60) + print("Verification: Testing with smaller grid for convergence check") + print("=" * 60) + + # Perform convergence tests + n1, n2 = n_interior, n_interior * 2 + l2_error1 = solve_poisson_2d(n1, verbose=False) + l2_error2 = solve_poisson_2d(n2, verbose=False) + + convergence_rate = np.log2(l2_error1 / l2_error2) + print(f"Grid refinement : {n1}×{n1} → {n2}×{n2}") + print(f"Error reduction factor : {l2_error1 / l2_error2:.3f}") + print(f"Convergence rate : {convergence_rate:.3f}") + print("Expected rate for 5-point stencil: ~2.0") + + if abs(convergence_rate - 2.0) < 0.5: + print( + "\n✓ Solution verified: convergence rate is close to expected value" + ) + else: + print("\n⚠ Warning: convergence rate differs from expected value") diff --git a/examples/spgemm_microbenchmark.py b/examples/spgemm_microbenchmark.py index e30c05dd..0f97be69 100644 --- a/examples/spgemm_microbenchmark.py +++ b/examples/spgemm_microbenchmark.py @@ -32,12 +32,24 @@ --package: Backend to use (legate, cupy, scipy) """ +from __future__ import annotations + import argparse +from typing import TYPE_CHECKING + +from common import ( + Timer, + banded_matrix, + get_arg_number, + get_phase_procs, + parse_common_args, +) -from common import banded_matrix, get_arg_number, get_phase_procs, parse_common_args +if TYPE_CHECKING: + from legate_sparse import csr_array -def spgemm_dispatch(A, B): +def spgemm_dispatch(A: csr_array, B: csr_array) -> csr_array: """Dispatch sparse matrix-matrix multiplication operation. Parameters @@ -61,7 +73,9 @@ def spgemm_dispatch(A, B): return C -def get_matrices(N, nnz_per_row, fname1, fname2): +def get_matrices( + N: int, nnz_per_row: int, fname1: str, fname2: str +) -> tuple[csr_array, csr_array]: """Get matrices for SpGEMM benchmark. 
Parameters @@ -100,7 +114,15 @@ def get_matrices(N, nnz_per_row, fname1, fname2): return A, A.copy() -def run_spgemm(N, nnz_per_row, fname1, fname2, iters, stable, timer): +def run_spgemm( + N: int, + nnz_per_row: int, + fname1: str, + fname2: str, + iters: int, + stable: bool, + timer: Timer, +) -> None: """Run sparse matrix-matrix multiplication benchmark. Parameters @@ -229,7 +251,7 @@ def run_spgemm(N, nnz_per_row, fname1, fname2, iters, stable, timer): ) args, _ = parser.parse_known_args() - _, timer, np, sparse, linalg, use_legate = parse_common_args() + package, timer, np, sparse, linalg, use_legate = parse_common_args() init_procs, bench_procs = get_phase_procs(use_legate) diff --git a/examples/spmv_microbenchmark.py b/examples/spmv_microbenchmark.py index c6f11ff8..b449b026 100644 --- a/examples/spmv_microbenchmark.py +++ b/examples/spmv_microbenchmark.py @@ -34,13 +34,27 @@ --package: Backend to use (legate, cupy, scipy) """ +from __future__ import annotations + import argparse +from typing import TYPE_CHECKING, Any + +from common import ( + Timer, + banded_matrix, + get_arg_number, + get_phase_procs, + parse_common_args, +) -from common import banded_matrix, get_arg_number, get_phase_procs, parse_common_args +if TYPE_CHECKING: + from legate_sparse import csr_array # Writing to pre-allocated array is preferred -def spmv_dispatch(A, x, y, i, repartition): +def spmv_dispatch( + A: csr_array, x: Any, y: Any, i: int, repartition: bool +) -> None: """Dispatch sparse matrix-vector multiplication operation. Parameters @@ -77,7 +91,9 @@ def spmv_dispatch(A, x, y, i, repartition): y = A @ x -def run_spmv(A, iters, repartition, timer): +def run_spmv( + A: csr_array, iters: int, repartition: bool, timer: Timer +) -> None: """Run sparse matrix-vector multiplication benchmark. 
Parameters @@ -105,9 +121,9 @@ def run_spmv(A, iters, repartition, timer): x = np.ones((A.shape[1],)) y = np.zeros((A.shape[0],)) - assert not repartition or ( - A.shape[0] == A.shape[1] - ), "Matrix should be square for switching x and y" + assert not repartition or (A.shape[0] == A.shape[1]), ( + "Matrix should be square for switching x and y" + ) # Warm up runs warmup_iters = 5 @@ -186,7 +202,7 @@ def run_spmv(A, iters, repartition, timer): ) args, _ = parser.parse_known_args() - _, timer, np, sparse, linalg, use_legate = parse_common_args() + package, timer, np, sparse, linalg, use_legate = parse_common_args() init_procs, bench_procs = get_phase_procs(use_legate) diff --git a/install.py b/install.py index c46e03a7..6be4fdae 100755 --- a/install.py +++ b/install.py @@ -109,7 +109,9 @@ def was_previously_built_with_different_build_isolation( legate_sparse_build_dir is not None and os.path.exists(legate_sparse_build_dir) and os.path.exists( - cmake_cache := os.path.join(legate_sparse_build_dir, "CMakeCache.txt") + cmake_cache := os.path.join( + legate_sparse_build_dir, "CMakeCache.txt" + ) ) ): try: @@ -298,9 +300,15 @@ def validate_path(path): cmake_flags += ["--log-level=%s" % ("DEBUG" if debug else "VERBOSE")] cmake_flags += f"""\ --DCMAKE_BUILD_TYPE={( - "Debug" if debug else "RelWithDebInfo" if debug_release else "Release" -)} +-DCMAKE_BUILD_TYPE={ + ( + "Debug" + if debug + else "RelWithDebInfo" + if debug_release + else "Release" + ) + } -DBUILD_SHARED_LIBS=ON -DBUILD_MARCH={str(march)} -DCMAKE_CUDA_ARCHITECTURES={str(arch)} @@ -345,7 +353,9 @@ def validate_path(path): } ) - execute_command(pip_install_cmd, verbose, cwd=legate_sparse_dir, env=cmd_env) + execute_command( + pip_install_cmd, verbose, cwd=legate_sparse_dir, env=cmd_env + ) def driver(): diff --git a/legate_sparse/__init__.py b/legate_sparse/__init__.py index c8f44589..35d65fb9 100644 --- a/legate_sparse/__init__.py +++ b/legate_sparse/__init__.py @@ -17,12 +17,15 @@ """ -import scipy.sparse as 
_sp # type: ignore +from __future__ import annotations + +import scipy.sparse as _sp from .coverage import clone_module # noqa: F401 from .csr import csr_array, csr_matrix # noqa: F401 from .dia import dia_array, dia_matrix # noqa: F401 -from .module import * # noqa: F401 +from .module import * # noqa: F401,F403 +from .construct import block_array # noqa: F401 clone_module(_sp, globals()) diff --git a/legate_sparse/_version.py b/legate_sparse/_version.py index b50be7bd..ff2762af 100644 --- a/legate_sparse/_version.py +++ b/legate_sparse/_version.py @@ -69,7 +69,9 @@ def decorate(f): return decorate -def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, env=None): +def run_command( + commands, args, cwd=None, verbose=False, hide_stderr=False, env=None +): """Call the given command(s).""" assert isinstance(commands, list) process = None @@ -263,7 +265,9 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, runner=run_command): env.pop("GIT_DIR", None) runner = functools.partial(runner, env=env) - _, rc = runner(GITS, ["rev-parse", "--git-dir"], cwd=root, hide_stderr=True) + _, rc = runner( + GITS, ["rev-parse", "--git-dir"], cwd=root, hide_stderr=True + ) if rc != 0: if verbose: print("Directory %s not under git control" % root) @@ -292,7 +296,9 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, runner=run_command): pieces["short"] = full_out[:7] # maybe improved later pieces["error"] = None - branch_name, rc = runner(GITS, ["rev-parse", "--abbrev-ref", "HEAD"], cwd=root) + branch_name, rc = runner( + GITS, ["rev-parse", "--abbrev-ref", "HEAD"], cwd=root + ) # --abbrev-ref was added in git-1.6.3 if rc != 0 or branch_name is None: raise NotThisMethod("'git rev-parse --abbrev-ref' returned error") @@ -341,7 +347,9 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, runner=run_command): mo = re.search(r"^(.+)-(\d+)-g([0-9a-f]+)$", git_describe) if not mo: # unparsable. Maybe git-describe is misbehaving? 
- pieces["error"] = "unable to parse git-describe output: '%s'" % describe_out + pieces["error"] = ( + "unable to parse git-describe output: '%s'" % describe_out + ) return pieces # tag @@ -370,7 +378,9 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, runner=run_command): pieces["distance"] = int(count_out) # total number of commits # commit date: see ISO-8601 comment in git_versions_from_keywords() - date = runner(GITS, ["show", "-s", "--format=%ci", "HEAD"], cwd=root)[0].strip() + date = runner(GITS, ["show", "-s", "--format=%ci", "HEAD"], cwd=root)[ + 0 + ].strip() # Use only the last line. Previous lines may contain GPG signature # information. date = date.splitlines()[-1] @@ -458,7 +468,9 @@ def render_pep440_pre(pieces): if pieces["closest-tag"]: if pieces["distance"]: # update the post release segment - tag_version, post_version = pep440_split_post(pieces["closest-tag"]) + tag_version, post_version = pep440_split_post( + pieces["closest-tag"] + ) rendered = tag_version if post_version is not None: rendered += ".post%d.dev%d" % ( @@ -647,7 +659,9 @@ def get_versions(): verbose = cfg.verbose try: - return git_versions_from_keywords(get_keywords(), cfg.tag_prefix, verbose) + return git_versions_from_keywords( + get_keywords(), cfg.tag_prefix, verbose + ) except NotThisMethod: pass diff --git a/legate_sparse/base.py b/legate_sparse/base.py index c9d99a31..fd01a266 100644 --- a/legate_sparse/base.py +++ b/legate_sparse/base.py @@ -44,8 +44,12 @@ # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+from __future__ import annotations -import cupynumeric +from typing import TYPE_CHECKING + +import cupynumeric as cn +import numpy as np from legate.core import LogicalStore, align from .config import SparseOpCode, rect1 @@ -58,6 +62,13 @@ store_to_cupynumeric_array, ) +if TYPE_CHECKING: + from typing import Any, Callable + + import numpy.typing as npt + + from cupynumeric.types import CastingKind + # CompressedBase is a base class for several different kinds of sparse # matrices, such as CSR, CSC, COO and DIA. @@ -74,8 +85,28 @@ class CompressedBase: Use specific format classes like csr_array instead. """ + shape: tuple[int, ...] + pos: LogicalStore + dtype: npt.dtype[Any] + format: str + crd: LogicalStore + _data: cn.ndarray + + def __init__(self, *args: Any, **kw: Any) -> None: + super().__init__(*args, **kw) + + @property + def data(self) -> cn.ndarray: + return self._data + + @property + def size(self) -> int: + raise NotImplementedError + @classmethod - def nnz_to_pos_cls(cls, q_nnz: LogicalStore): + def nnz_to_pos_cls( + cls, q_nnz: LogicalStore + ) -> tuple[LogicalStore, cn.ndarray]: """Convert non-zero counts to position arrays. This class method converts an array of non-zero counts per row/column @@ -93,15 +124,13 @@ def nnz_to_pos_cls(cls, q_nnz: LogicalStore): is the total number of non-zeros. """ q_nnz_arr = store_to_cupynumeric_array(q_nnz) - cs = cupynumeric.cumsum(q_nnz_arr) + cs = cn.cumsum(q_nnz_arr) cs_shifted = cs - q_nnz_arr cs_store = get_store_from_cupynumeric_array(cs) cs_shifted_store = get_store_from_cupynumeric_array(cs_shifted) # Zip the scan result into a rect1 region for the pos. 
pos = runtime.create_store( - rect1, # type: ignore - shape=(q_nnz.shape[0],), - optimize_scalar=False, + rect1, shape=(q_nnz.shape[0],), optimize_scalar=False ) task = runtime.create_auto_task(SparseOpCode.ZIP_TO_RECT1) pos_var = task.add_output(pos) @@ -113,7 +142,9 @@ def nnz_to_pos_cls(cls, q_nnz: LogicalStore): # Don't convert cs[-1] to an int to avoid blocking. return pos, cs[-1] - def nnz_to_pos(self, q_nnz: LogicalStore): + def nnz_to_pos( + self, q_nnz: LogicalStore + ) -> tuple[LogicalStore, cn.ndarray]: """Convert non-zero counts to position arrays for this instance. Parameters @@ -129,7 +160,12 @@ def nnz_to_pos(self, q_nnz: LogicalStore): """ return CompressedBase.nnz_to_pos_cls(q_nnz) - def asformat(self, format, copy=False): + def copy(self) -> CompressedBase: + raise NotImplementedError() + + def asformat( + self, format: str | None, copy: bool = False + ) -> CompressedBase: """Convert the matrix to a specified format. Parameters @@ -158,7 +194,9 @@ def asformat(self, format, copy=False): return self else: try: - convert_method = getattr(self, "to" + format) + convert_method: Callable[..., CompressedBase] = getattr( + self, "to" + format + ) except AttributeError as e: raise ValueError("Format {} is unknown.".format(format)) from e @@ -169,7 +207,12 @@ def asformat(self, format, copy=False): return convert_method() # The implementation of sum is mostly lifted from scipy.sparse. - def sum(self, axis=None, dtype=None, out=None): + def sum( + self, + axis: int | None = None, + dtype: npt.dtype[Any] | None = None, + out: cn.ndarray | None = None, + ) -> cn.ndarray: """Sum the matrix elements over a given axis. 
Parameters @@ -237,10 +280,10 @@ def sum(self, axis=None, dtype=None, out=None): # TODO: (marsaev) currently not supported as we don't have rmatmul yet # (need CSC to have easier sum over columns) raise NotImplementedError - ret = self.__rmatmul__(cupynumeric.ones((1, m), dtype=res_dtype)) + # ret = self.__rmatmul__(cn.ones((1, m), dtype=res_dtype)) else: # sum over rows - ret = self @ cupynumeric.ones((n, 1), dtype=res_dtype) + ret = self @ cn.ones((n, 1), dtype=res_dtype) if out is not None and out.shape != ret.shape: raise ValueError("dimensions do not match") @@ -248,7 +291,7 @@ def sum(self, axis=None, dtype=None, out=None): return ret.sum(axis=axis, dtype=dtype, out=out) # needed by _data_matrix - def _with_data(self, data, copy=True): + def _with_data(self, data: Any, copy: bool = True) -> CompressedBase: """Returns a matrix object with the same sparsity structure as self, but with different data. @@ -290,8 +333,13 @@ def _with_data(self, data, copy=True): copy=False, ) - def astype(self, dtype, casting="unsafe", copy=True): - dtype = cupynumeric.dtype(dtype) + def astype( + self, + dtype: npt.dtype[Any], + casting: CastingKind = "unsafe", + copy: bool = True, + ) -> CompressedBase: + dtype = np.dtype(dtype) # if type doesn't match, create a matrix copy with casted data array if self.dtype != dtype: return self._with_data( @@ -304,24 +352,24 @@ def astype(self, dtype, casting="unsafe", copy=True): # These univariate ufuncs preserve zeros. 
_ufuncs_with_fixed_point_at_zero = frozenset( [ - cupynumeric.sin, - cupynumeric.tan, - cupynumeric.arcsin, - cupynumeric.arctan, - cupynumeric.sinh, - cupynumeric.tanh, - cupynumeric.arcsinh, - cupynumeric.arctanh, - cupynumeric.rint, - cupynumeric.sign, - cupynumeric.expm1, - cupynumeric.log1p, - cupynumeric.deg2rad, - cupynumeric.rad2deg, - cupynumeric.floor, - cupynumeric.ceil, - cupynumeric.trunc, - cupynumeric.sqrt, + cn.sin, + cn.tan, + cn.arcsin, + cn.arctan, + cn.sinh, + cn.tanh, + cn.arcsinh, + cn.arctanh, + cn.rint, + cn.sign, + cn.expm1, + cn.log1p, + cn.deg2rad, + cn.rad2deg, + cn.floor, + cn.ceil, + cn.trunc, + cn.sqrt, ] ) @@ -329,14 +377,14 @@ def astype(self, dtype, casting="unsafe", copy=True): for npfunc in _ufuncs_with_fixed_point_at_zero: name = npfunc.__name__ - def _create_method(op): - def method(self): + def _create_method(op: Callable[[Any], Any]) -> Callable[[Any], Any]: + def method(self: Any) -> Any: result = op(self.data) return self._with_data(result) - method.__doc__ = "Element-wise %s.\n\nSee `numpy.%s` for more information." % ( - name, - name, + method.__doc__ = ( + "Element-wise %s.\n\nSee `numpy.%s` for more information." + % (name, name) ) method.__name__ = name @@ -345,56 +393,8 @@ def method(self): setattr(CompressedBase, name, _create_method(npfunc)) -# DenseSparseBase is a base class for sparse matrices that have a TACO -# format of {Dense, Sparse}. For our purposes, that means CSC and CSR -# matrices. -class DenseSparseBase: - """Base class for sparse matrices with dense-sparse format. - - This class provides functionality for sparse matrices that have a TACO - format of {Dense, Sparse}, which includes CSR and CSC matrices. - - Notes - ----- - This is an internal base class and should not be instantiated directly. - Use specific format classes like csr_array instead. 
- """ - - def __init__(self): - """Initialize the DenseSparseBase class.""" - self._balanced_pos_partition = None - - # consider using _with_data() here - @classmethod - def make_with_same_nnz_structure(cls, mat, arg, shape=None, dtype=None): - """Create a new matrix with the same non-zero structure as mat. - - Parameters - ---------- - mat : sparse matrix - The reference matrix whose structure to copy. - arg : array_like - The data for the new matrix. - shape : tuple, optional - The shape of the new matrix. If None, uses mat.shape. - dtype : dtype, optional - The data type of the new matrix. If None, uses mat.dtype. - - Returns - ------- - sparse matrix - A new matrix with the same structure as mat but with data from arg. - """ - if shape is None: - shape = mat.shape - if dtype is None: - dtype = mat.dtype - result = cls(arg, shape=shape, dtype=dtype) - return result - - # unpack_rect1_store unpacks a rect1 store into two int64 stores. -def unpack_rect1_store(pos): +def unpack_rect1_store(pos: LogicalStore) -> tuple[LogicalStore, LogicalStore]: """Unpack a rect1 store into two int64 stores. This function unpacks the compressed position array used in CSR/CSC @@ -423,7 +423,9 @@ def unpack_rect1_store(pos): # pack_to_rect1_store packs two int64 stores into a rect1 store. -def pack_to_rect1_store(lo, hi, output=None): +def pack_to_rect1_store( + lo: LogicalStore, hi: LogicalStore, output: LogicalStore | None = None +) -> LogicalStore: """Pack two int64 stores into a rect1 store. This function packs separate start and end position arrays into the diff --git a/legate_sparse/config.py b/legate_sparse/config.py index 8c601981..ab146521 100644 --- a/legate_sparse/config.py +++ b/legate_sparse/config.py @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+from __future__ import annotations import os import platform @@ -29,6 +30,10 @@ class _LegateSparseSharedLib: implements the core sparse matrix operations. """ + LEGATE_SPARSE_LOAD_CUDALIBS: int + LEGATE_SPARSE_UNLOAD_CUDALIBS: int + + LEGATE_SPARSE_CSR_TO_DENSE: int LEGATE_SPARSE_DENSE_TO_CSR: int LEGATE_SPARSE_DENSE_TO_CSR_NNZ: int LEGATE_SPARSE_ZIP_TO_RECT_1: int @@ -49,6 +54,9 @@ class _LegateSparseSharedLib: LEGATE_SPARSE_SPGEMM_CSR_CSR_CSR: int LEGATE_SPARSE_SPGEMM_CSR_CSR_CSR_GPU: int LEGATE_SPARSE_AXPBY: int + LEGATE_SPARSE_SPSOLVE: int + LEGATE_SPARSE_GEAM_CSR_CSR_SYMBOLIC: int + LEGATE_SPARSE_GEAM_CSR_CSR_COMPUTE: int def dlopen_no_autoclose(ffi: Any, lib_path: str) -> Any: @@ -88,7 +96,7 @@ class LegateSparseLib: library with the Legate runtime. """ - def __init__(self, name): + def __init__(self, name: str) -> None: """Initialize the Legate sparse library. Parameters @@ -98,9 +106,6 @@ def __init__(self, name): """ self.name = name self.runtime = None - self.shared_object = None - - self.name = name shared_lib_path = self.get_shared_library() assert shared_lib_path is not None @@ -118,7 +123,9 @@ def __init__(self, name): def register(self) -> None: """Register the library with the Legate runtime.""" - callback = getattr(self.shared_object, "legate_sparse_perform_registration") + callback = getattr( + self.shared_object, "legate_sparse_perform_registration" + ) callback() def get_shared_library(self) -> str: @@ -131,7 +138,9 @@ def get_shared_library(self) -> str: """ from legate_sparse.install_info import libpath - return os.path.join(libpath, "liblegate_sparse" + self.get_library_extension()) + return os.path.join( + libpath, "liblegate_sparse" + self.get_library_extension() + ) def get_legate_library(self) -> Library: """Get the Legate library object. 
@@ -181,7 +190,14 @@ def get_library_extension() -> str: """Name of the Legate sparse library.""" sparse_lib = LegateSparseLib(SPARSE_LIB_NAME) -sparse_lib.register() + +# Guard against double registration (can happen during Sphinx documentation builds) +try: + sparse_lib.register() +except Exception: + # Library may already be registered from a previous import + pass + _sparse = sparse_lib.shared_object # has to be called after register() _library = sparse_lib.get_legate_library() @@ -225,6 +241,10 @@ class SparseOpCode(IntEnum): SPGEMM_CSR_CSR_CSR = _sparse.LEGATE_SPARSE_SPGEMM_CSR_CSR_CSR SPGEMM_CSR_CSR_CSR_GPU = _sparse.LEGATE_SPARSE_SPGEMM_CSR_CSR_CSR_GPU + SPSOLVE = _sparse.LEGATE_SPARSE_SPSOLVE + GEAM_CSR_CSR_SYMBOLIC = _sparse.LEGATE_SPARSE_GEAM_CSR_CSR_SYMBOLIC + GEAM_CSR_CSR_COMPUTE = _sparse.LEGATE_SPARSE_GEAM_CSR_CSR_COMPUTE + # Register some types for us to use. rect1 = types.rect_type(1) diff --git a/legate_sparse/construct.py b/legate_sparse/construct.py new file mode 100644 index 00000000..89a16c6e --- /dev/null +++ b/legate_sparse/construct.py @@ -0,0 +1,260 @@ +# Copyright 2022-2024 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Portions of this file are also subject to the following license: +# +# Copyright (c) 2001-2002 Enthought, Inc. 2003-2022, SciPy Developers. +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# 1. Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials provided +# with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from __future__ import annotations + +import cupynumeric as cn + +from .csr import csr_array + + +def _block(blocks, format="csr", dtype=None): + """Build a sparse CSR array from sparse sub-blocks using COO intermediate. + + 1. Extracts (row, col, data) from each block + 2. Adjusts indices by block offsets + 3. Concatenates all coordinates + 4. 
Builds CSR from COO format + """ + if format != "csr": + raise ValueError("Only 'csr' format is supported for block_array") + + if not isinstance(blocks, (list, tuple)): + blocks = list(blocks) + + blocks = [ + list(row) if isinstance(row, (list, tuple)) else [row] + for row in blocks + ] + + n_block_rows = len(blocks) + if n_block_rows == 0: + raise ValueError("blocks cannot be empty") + + n_block_cols = len(blocks[0]) + if n_block_cols == 0: + raise ValueError("blocks cannot be empty") + + # Row height and col width for a sub-block looks like this. + # +--------------+ + # | ^ | + # | | row height | + # | v | + # +--------------+ + # <- col width -> + + # store row heights and col widths of each sub-block + row_heights = [None] * n_block_rows + col_widths = [None] * n_block_cols + + for i in range(n_block_rows): + for j in range(n_block_cols): + block = blocks[i][j] + if block is None: + continue + + if not isinstance(block, csr_array): + raise TypeError( + f"blocks[{i}][{j}] must be a csr_array or None, " + f"got {type(block).__name__}" + ) + + block_nrows, block_ncols = block.shape + + # Check/set row height for this block row. + # The row heights of all the sub-blocks in a row of the input + # should be the same, else we can't concatenate horizontally + if row_heights[i] is None: + row_heights[i] = block_nrows + elif row_heights[i] != block_nrows: + raise ValueError( + f"blocks[{i}][{j}] has {block_nrows} rows, " + f"expected {row_heights[i]}" + ) + + # Check/set column width for this block column. + # The col widths of all the sub-blocks in a col of the input + # should be the same, else we can't concatenate vertically + if col_widths[j] is None: + col_widths[j] = block_ncols + elif col_widths[j] != block_ncols: + raise ValueError( + f"blocks[{i}][{j}] has {block_ncols} columns, " + f"expected {col_widths[j]}" + ) + + # The input can have None instead of a csr matrix. 
To correctly compute + # the row offsets for those cases, we set the row height to 0 if the + # input is None. + row_heights = cn.array([h if h is not None else 0 for h in row_heights]) + col_widths = cn.array([w if w is not None else 0 for w in col_widths]) + + # Compute the no. or rows and cols in the output matrix. + total_nrows = cn.sum(row_heights).item() + total_ncols = cn.sum(col_widths).item() + + # When the output matrix is empty, we don't need to concatenate. + if total_nrows == 0 or total_ncols == 0: + result_dtype = dtype if dtype is not None else cn.float64 + return csr_array((total_nrows, total_ncols), dtype=result_dtype) + + row_offsets = cn.concatenate([cn.array([0]), cn.cumsum(row_heights)]) + col_offsets = cn.concatenate([cn.array([0]), cn.cumsum(col_widths)]) + + if dtype is None: + dtypes = [b.dtype for row in blocks for b in row if b is not None] + dtype = cn.result_type(*dtypes) if dtypes else cn.float64 + + all_rows = [] + all_cols = [] + all_data = [] + + # Populate the concatenated (rows, cols, data) arrays for the + # output matrix. The outer loop concatenates the sub-blocks vertically + # while the inner loop concatenates them horizontally. This is done + # without creating any intermediate csr representation. + for i in range(n_block_rows): + row_offset = row_offsets[i].item() + + for j in range(n_block_cols): + block = blocks[i][j] + + # If block is empty, the (rows, cols, data) of the output matrix + # doesn't get modified, so we continue with the loop. + if block is None: + continue + + col_offset = col_offsets[j].item() + block_nrows = block.shape[0] + + indptr = block.indptr + indices = block.indices + data = block.data + + # Empty csr matrices don't modify the output matrix either, so we + # continue with the loop. + if data.size == 0: + continue + + # Expand the indptr array to store the row indices. + # For each row r, repeating r by (indptr[r+1] - indptr[r]) times + # the needed storage to store non-zero entries. 
+ nnz_per_row = cn.diff(indptr) + block_rows = cn.repeat(cn.arange(block_nrows), nnz_per_row) + + # After concatenating the matrices, we get one block matrix that + # can be represented by (rows, cols, data) arrays. Note that + # we have to add the offsets for both the row and col indices + # that correspond to the non-zero in the previous sub-block as + # concatenate them horizontally. This is because the output matrix + # is going to be represented as one giant CSR matrix. + all_rows.append(block_rows + row_offset) + all_cols.append(indices + col_offset) + all_data.append(data) + + if not all_data: + result_dtype = dtype if dtype is not None else cn.float64 + return csr_array((total_nrows, total_ncols), dtype=result_dtype) + + concatenated_rows = cn.concatenate(all_rows) + concatenated_cols = cn.concatenate(all_cols) + concatenated_data = cn.concatenate(all_data).astype(dtype) + + return csr_array( + (concatenated_data, (concatenated_rows, concatenated_cols)), + shape=(total_nrows, total_ncols), + dtype=dtype, + ) + + +def block_array(blocks, format="csr", dtype=None): + """Build a sparse array from sparse sub-blocks. + + Parameters + ---------- + blocks : array_like + A 2-D array-like of shape (M, N) where each element is a sparse + CSR array or None. None elements are treated as zero matrices. + format : str, optional + Output format. Currently only 'csr' is supported. Default is 'csr'. + dtype : dtype, optional + Data type of the output array. If None, inferred from the blocks. + + Returns + ------- + csr_array + A sparse CSR array formed by combining the sub-blocks. + + Raises + ------ + ValueError + - If `format` is not 'csr'. + - If `blocks` is empty (has zero rows or zero columns). + - If sub-blocks in the same row have different numbers of rows. + - If sub-blocks in the same column have different numbers of columns. + TypeError + - If any non-None block is not a csr_array. 
+ + Notes + ----- + This function may not be performant when the number of sub-blocks is large, + as it iterates over all blocks sequentially to extract and concatenate their + COO coordinates. + + Examples + -------- + >>> import legate_sparse as sparse + >>> A = sparse.csr_array([[1, 2], [3, 4]]) + >>> B = sparse.csr_array([[5], [6]]) + >>> C = sparse.csr_array([[7, 8, 9]]) + >>> result = sparse.block_array([[A, B], [C, None]]) + >>> result.todense() + array([[1, 2, 5], + [3, 4, 6], + [7, 8, 9]]) + """ + return _block(blocks, format, dtype) diff --git a/legate_sparse/coverage.py b/legate_sparse/coverage.py index 8765044e..a6fa2bae 100644 --- a/legate_sparse/coverage.py +++ b/legate_sparse/coverage.py @@ -16,7 +16,7 @@ from functools import wraps from types import FunctionType, MethodDescriptorType, MethodType, ModuleType -from typing import Any, Container, Mapping, Optional, cast +from typing import Any, Callable, Container, Mapping, TypeVar, cast from legate.core import track_provenance from typing_extensions import Protocol @@ -27,7 +27,7 @@ def filter_namespace( ns: Mapping[str, Any], *, - omit_names: Optional[Container[str]] = None, + omit_names: Container[str] | None = None, omit_types: tuple[type, ...] = (), ) -> dict[str, Any]: omit_names = omit_names or set() @@ -43,8 +43,7 @@ def should_wrap(obj: object) -> bool: class AnyCallable(Protocol): - def __call__(self, *args: Any, **kwargs: Any) -> Any: - ... + def __call__(self, *args: Any, **kwargs: Any) -> Any: ... def wrap(func: AnyCallable) -> Any: @@ -56,7 +55,9 @@ def wrapper(*args: Any, **kwargs: Any) -> Any: return wrapper -def clone_module(origin_module: ModuleType, new_globals: dict[str, Any]) -> None: +def clone_module( + origin_module: ModuleType, new_globals: dict[str, Any] +) -> None: """Copy attributes from one module to another, excluding submodules Function types are wrapped with a decorator to report API calls. 
All @@ -84,7 +85,10 @@ def clone_module(origin_module: ModuleType, new_globals: dict[str, Any]) -> None new_globals[attr] = wrapped -def clone_scipy_arr_kind(origin_class: type) -> Any: +T = TypeVar("T") + + +def clone_scipy_arr_kind(origin_class: type) -> Callable[[T], T]: """Copy attributes from an origin class to the input class. Method types are wrapped with a decorator to report API calls. All @@ -92,7 +96,7 @@ def clone_scipy_arr_kind(origin_class: type) -> Any: """ - def body(cls: type): + def body(cls: T) -> T: for attr, value in cls.__dict__.items(): # Only need to wrap things that are in the origin class to begin # with diff --git a/legate_sparse/csr.py b/legate_sparse/csr.py index 3008356e..051298e9 100644 --- a/legate_sparse/csr.py +++ b/legate_sparse/csr.py @@ -44,12 +44,14 @@ # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+from __future__ import annotations import warnings +from typing import TYPE_CHECKING, cast -import cupynumeric -import numpy -import scipy # type: ignore +import cupynumeric as cn +import numpy as np +import scipy from legate.core import ( ImageComputationHint, Scalar, @@ -60,12 +62,7 @@ types, ) -from .base import ( - CompressedBase, - DenseSparseBase, - pack_to_rect1_store, - unpack_rect1_store, -) +from .base import CompressedBase, pack_to_rect1_store, unpack_rect1_store from .config import SparseOpCode, rect1 from .coverage import clone_scipy_arr_kind from .runtime import runtime @@ -75,22 +72,31 @@ SUPPORTED_DATATYPES, array_from_store_or_array, cast_arr, - cast_to_common_type, cast_to_store, copy_store, + find_common_type, find_last_user_stacklevel, get_storage_type, get_store_from_cupynumeric_array, + is_dense, is_dtype_supported, is_scalar_like, + is_sparse, sort_by_rows_then_cols, store_from_store_or_array, store_to_cupynumeric_array, ) +if TYPE_CHECKING: + from typing import Any, Callable + + import numpy.typing as npt + + from cupynumeric.types import CastingKind + @clone_scipy_arr_kind(scipy.sparse.csr_array) -class csr_array(CompressedBase, DenseSparseBase): +class csr_array(CompressedBase): """Compressed Sparse Row array. This can be instantiated in several ways: @@ -187,7 +193,13 @@ class csr_array(CompressedBase, DenseSparseBase): [4, 5, 6]]) """ - def __init__(self, arg, shape=None, dtype=None, copy=False): + def __init__( + self, + arg: Any, + shape: tuple[int, ...] | None = None, + dtype: npt.dtype[Any] | None = None, + copy: bool = False, + ) -> None: """Initialize a CSR array. 
Parameters @@ -233,11 +245,11 @@ def __init__(self, arg, shape=None, dtype=None, copy=False): # Note that cupynumeric.dtype(None) returns float64, so make # sure dtype is passed to csr_array if it is known apriori, # especially when copying the matrix - dtype = cupynumeric.dtype(dtype) + dtype = np.dtype(dtype) # If from numpy.array - convert to cupynumeric array first - if isinstance(arg, numpy.ndarray): - arg = cupynumeric.array(arg) + if isinstance(arg, np.ndarray): + arg = cn.array(arg) # from scipy.sparse.csr_array if isinstance(arg, scipy.sparse.csr_array) or isinstance( @@ -247,7 +259,7 @@ def __init__(self, arg, shape=None, dtype=None, copy=False): arg = (arg.data, arg.indices, arg.indptr) # from dense cupynumeric array - if isinstance(arg, cupynumeric.ndarray): + if isinstance(arg, cn.ndarray): assert arg.ndim == 2 shape = arg.shape @@ -257,18 +269,18 @@ def __init__(self, arg, shape=None, dtype=None, copy=False): src_store = get_store_from_cupynumeric_array(arg) q_nnz = runtime.create_store(nnz_ty, shape=Shape((shape[0],))) - task = runtime.create_auto_task(SparseOpCode.DENSE_TO_CSR_NNZ) + task1 = runtime.create_auto_task(SparseOpCode.DENSE_TO_CSR_NNZ) promoted_q_nnz = q_nnz.promote(1, shape[1]) - nnz_per_row_part = task.add_output(promoted_q_nnz) - src_part = task.add_input(src_store) - task.add_constraint(broadcast(nnz_per_row_part, (1,))) - task.add_constraint(align(nnz_per_row_part, src_part)) - task.execute() + nnz_per_row_part = task1.add_output(promoted_q_nnz) + src_part = task1.add_input(src_store) + task1.add_constraint(broadcast(nnz_per_row_part, (1,))) + task1.add_constraint(align(nnz_per_row_part, src_part)) + task1.execute() # Assemble the output CSR array using the non-zeros per row. - self.pos, nnz = self.nnz_to_pos(q_nnz) + self.pos, nnz_scalar = self.nnz_to_pos(q_nnz) # Block and convert the nnz future into an int. 
- nnz = int(nnz) + nnz = int(nnz_scalar) self.crd = runtime.create_store(coord_ty, shape=((nnz,))) self.vals = runtime.create_store(arg.dtype, shape=((nnz,))) @@ -276,14 +288,14 @@ def __init__(self, arg, shape=None, dtype=None, copy=False): # and 2-D input array, our only option is launch single process # which will handle all of the data, which makes this funciton not usable # on scale. - task = runtime.create_manual_task(SparseOpCode.DENSE_TO_CSR, (1,)) + task2 = runtime.create_manual_task(SparseOpCode.DENSE_TO_CSR, (1,)) promoted_pos = self.pos.promote(1, shape[1]) - task.add_input(promoted_pos) - src_part = task.add_input(src_store) - task.add_output(self.crd) - task.add_output(self.vals) - task.execute() + task2.add_input(promoted_pos) + task2.add_input(src_store) + task2.add_output(self.crd) + task2.add_output(self.vals) + task2.execute() # we ignore dtype (TODO: is this behaviour matches SciPy?) and use arg.dtype dtype = arg.dtype @@ -298,7 +310,9 @@ def __init__(self, arg, shape=None, dtype=None, copy=False): self.canonical_format = arg.canonical_format elif isinstance(arg, tuple): - dtype, shape = self._init_from_tuple_inputs(arg, dtype, shape, copy) + dtype, shape = self._init_from_tuple_inputs( + arg, dtype, shape, copy + ) else: raise NotImplementedError("Can't convert to CSR from the input") @@ -315,13 +329,19 @@ def __init__(self, arg, shape=None, dtype=None, copy=False): if dtype is None: dtype = temp_vals_type if temp_vals_type is not dtype: - self.data = self.data.astype(dtype) - if not isinstance(dtype, numpy.dtype): - dtype = numpy.dtype(dtype) + self._data = self._data.astype(dtype) + if not isinstance(dtype, np.dtype): + dtype = np.dtype(dtype) # Saving the type self._dtype = dtype - def _init_from_tuple_inputs(self, arg, dtype, shape, copy): + def _init_from_tuple_inputs( + self, + arg: tuple[Any, ...], + dtype: npt.dtype[Any] | None, + shape: tuple[int, ...] 
| None, + copy: bool, + ) -> tuple[npt.dtype[Any], tuple[int, ...]]: """Initialize CSR array from tuple inputs. This internal method handles the various tuple-based constructor formats: @@ -333,9 +353,9 @@ def _init_from_tuple_inputs(self, arg, dtype, shape, copy): ---------- arg : tuple The input tuple in one of the supported formats. - dtype : dtype, optional + dtype : dtype The desired data type. - shape : tuple, optional + shape : tuple The shape of the array. copy : bool Whether to copy the input data. @@ -353,12 +373,14 @@ def _init_from_tuple_inputs(self, arg, dtype, shape, copy): If the tuple format is not supported. """ - def _get_empty_csr(dtype, nrows_plus_one): + def _get_empty_csr( + dtype: npt.dtype[Any] | None, nrows_plus_one: int + ) -> tuple[cn.ndarray, cn.ndarray, cn.ndarray]: """Helper function to create empty CSR arrays.""" return ( - cupynumeric.zeros(0, dtype=dtype), - cupynumeric.zeros(0, dtype=coord_ty), - cupynumeric.zeros(nrows_plus_one, dtype=coord_ty), + cn.zeros(0, dtype=dtype), + cn.zeros(0, dtype=coord_ty), + cn.zeros(nrows_plus_one, dtype=coord_ty), ) # Couple of options here @@ -367,16 +389,14 @@ def _get_empty_csr(dtype, nrows_plus_one): # csr_array((M, N), [dtype]) if not isinstance(arg[1], tuple): (M, N) = arg - if not isinstance(M, (int, numpy.integer)) or not isinstance( - N, (int, numpy.integer) + if not isinstance(M, (int, np.integer)) or not isinstance( + N, (int, np.integer) ): NotImplementedError( "Input tuple for empty CSR ctor should be it's shape" ) shape = arg - dtype = ( - cupynumeric.float64 if dtype is None else cupynumeric.dtype(dtype) - ) + dtype = np.float64 if dtype is None else np.dtype(dtype) # and pass this to next ctor arg = _get_empty_csr(dtype, M + 1) @@ -394,12 +414,12 @@ def _get_empty_csr(dtype, nrows_plus_one): copy = False else: # if passed numpy arrays - convert them - if isinstance(st_row, numpy.ndarray): - st_row = cupynumeric.array(st_row) - if isinstance(st_col, numpy.ndarray): - st_col = 
cupynumeric.array(st_col) - if isinstance(st_data, numpy.ndarray): - st_data = cupynumeric.array(st_data) + if isinstance(st_row, np.ndarray): + st_row = cn.array(st_row) + if isinstance(st_col, np.ndarray): + st_col = cn.array(st_col) + if isinstance(st_data, np.ndarray): + st_data = cn.array(st_data) if not self.indices_sorted: # NOTE that CSR format does not require sorting the data @@ -407,9 +427,15 @@ def _get_empty_csr(dtype, nrows_plus_one): # sorted by rows and then by columns, so we sort the data # by columns as well - row_array = array_from_store_or_array(st_row, copy=copy) - col_array = array_from_store_or_array(st_col, copy=copy) - new_data = array_from_store_or_array(st_data, copy=copy) + row_array = array_from_store_or_array( + st_row, copy=copy + ) + col_array = array_from_store_or_array( + st_col, copy=copy + ) + new_data = array_from_store_or_array( + st_data, copy=copy + ) indices = sort_by_rows_then_cols(row_array, col_array) @@ -417,10 +443,10 @@ def _get_empty_csr(dtype, nrows_plus_one): row_array = row_array[indices] col_array = col_array[indices] - row_offsets = cupynumeric.append( - cupynumeric.array([0]), - cupynumeric.cumsum( - cupynumeric.bincount(row_array, minlength=shape[0]) + row_offsets = cn.append( + cn.array([0]), + cn.cumsum( + cn.bincount(row_array, minlength=shape[0]) ), ) @@ -432,10 +458,10 @@ def _get_empty_csr(dtype, nrows_plus_one): else: # we need to convert row indices to row offsets/indptr row_array = array_from_store_or_array(st_row) - row_offsets = cupynumeric.append( - cupynumeric.array([0]), - cupynumeric.cumsum( - cupynumeric.bincount(row_array, minlength=shape[0]) + row_offsets = cn.append( + cn.array([0]), + cn.cumsum( + cn.bincount(row_array, minlength=shape[0]) ), ) if copy: @@ -452,12 +478,12 @@ def _get_empty_csr(dtype, nrows_plus_one): (data, indices, indptr) = arg # if passed numpy arrays - convert them - if isinstance(data, numpy.ndarray): - data = cupynumeric.array(data) - if isinstance(indices, 
numpy.ndarray): - indices = cupynumeric.array(indices).astype(coord_ty) - if isinstance(indptr, numpy.ndarray): - indptr = cupynumeric.array(indptr).astype(coord_ty) + if isinstance(data, np.ndarray): + data = cn.array(data) + if isinstance(indices, np.ndarray): + indices = cn.array(indices).astype(coord_ty) + if isinstance(indptr, np.ndarray): + indptr = cn.array(indptr).astype(coord_ty) # checking that shape matches with expectations for row_offsets if indptr.shape[0] == shape[0] + 1: @@ -470,8 +496,12 @@ def _get_empty_csr(dtype, nrows_plus_one): ) # copy explicitly, just in case (there are paths that won't create temp object) # For crd we enforce our internal type - self.crd = store_from_store_or_array(cast_arr(indices, coord_ty), copy) - self.vals = store_from_store_or_array(cast_to_store(data), copy) + self.crd = store_from_store_or_array( + cast_arr(indices, coord_ty), copy + ) + self.vals = store_from_store_or_array( + cast_to_store(data), copy + ) # Otherwise we assume that we are passing pos store from existing csr_array # This is internal only functionality, and we assume here only Store or cupynumeric.array @@ -487,15 +517,22 @@ def _get_empty_csr(dtype, nrows_plus_one): dtype = get_storage_type(data) + assert dtype is not None + assert shape is not None + return dtype, shape + # correct return type value on this subclass + def _with_data(self, data: Any, copy: bool = True) -> csr_array: + return cast(csr_array, super()._with_data(data, copy)) + @property - def dim(self): + def dim(self) -> int: """Number of dimensions (always 2 for CSR arrays).""" return self.ndim @property - def nnz(self): + def nnz(self) -> int: """Number of stored values, including explicit zeros. Returns @@ -506,7 +543,12 @@ def nnz(self): return self.vals.shape[0] @property - def dtype(self): + def size(self) -> int: + """Number of stored values""" + return self.nnz + + @property + def dtype(self) -> npt.dtype[Any]: """Data type of the array. 
Returns @@ -518,7 +560,7 @@ def dtype(self): return self._dtype # Enable direct operation on the values array. - def get_data(self): + def get_data(self) -> cn.ndarray: """Get the data array of the CSR matrix. Returns @@ -529,7 +571,7 @@ def get_data(self): return store_to_cupynumeric_array(self.vals) # From array, - def set_data(self, data): + def set_data(self, data: cn.ndarray) -> None: """Set the data array of the CSR matrix. Parameters @@ -542,9 +584,9 @@ def set_data(self, data): AssertionError If data is not a cupynumeric.ndarray. """ - if isinstance(data, numpy.ndarray): - data = cupynumeric.array(data) - assert isinstance(data, cupynumeric.ndarray) + if isinstance(data, np.ndarray): + data = cn.array(data) + assert isinstance(data, cn.ndarray) self.vals = get_store_from_cupynumeric_array(data) self._dtype = data.dtype @@ -553,7 +595,7 @@ def set_data(self, data): ) # Enable direct operation on the indices array. - def get_indices(self): + def get_indices(self) -> cn.ndarray: """Get the column indices array of the CSR matrix. Returns @@ -563,7 +605,7 @@ def get_indices(self): """ return store_to_cupynumeric_array(self.crd) - def set_indices(self, indices): + def set_indices(self, indices: cn.ndarray) -> None: """Set the column indices array of the CSR matrix. Parameters @@ -581,19 +623,21 @@ def set_indices(self, indices): Setting new indices will mark the matrix as not having sorted indices and not being in canonical format. 
""" - if isinstance(indices, numpy.ndarray): - indices = cupynumeric.array(indices) - assert isinstance(indices, cupynumeric.ndarray) + if isinstance(indices, np.ndarray): + indices = cn.array(indices) + assert isinstance(indices, cn.ndarray) self.crd = get_store_from_cupynumeric_array(indices) # we can't guarantee new indices are sorted self.canonical_format = False self.indices_sorted = False indices = property( - fget=get_indices, fset=set_indices, doc="CSR format index array of the matrix" + fget=get_indices, + fset=set_indices, + doc="CSR format index array of the matrix", ) - def get_indptr(self): + def get_indptr(self) -> cn.ndarray: """Get the index pointer array of the CSR matrix. Returns @@ -605,14 +649,14 @@ def get_indptr(self): """ row_start_st, row_end_st = unpack_rect1_store(self.pos) row_start = store_to_cupynumeric_array(row_start_st) - return cupynumeric.append(row_start, [self.nnz]) + return cn.append(row_start, [self.nnz]) # Disallow changing intptrs directly indptr = property( fget=get_indptr, doc="CSR format index pointer array of the matrix" ) - def _get_row_indices(self): + def _get_row_indices(self) -> cn.ndarray: """Helper routine that converts pos to row indices. This internal method expands the compressed row storage format's position @@ -638,7 +682,7 @@ def _get_row_indices(self): task.execute() return store_to_cupynumeric_array(row_indices) - def has_sorted_indices(self): + def has_sorted_indices(self) -> bool: """Determine whether the matrix has sorted indices. Returns @@ -648,7 +692,7 @@ def has_sorted_indices(self): """ return self.indices_sorted - def has_canonical_format(self): + def has_canonical_format(self) -> bool: """Determine whether the matrix is in canonical format. Returns @@ -665,7 +709,7 @@ def has_canonical_format(self): return self.canonical_format # The rest of the methods - def diagonal(self, k=0): + def diagonal(self, k: int = 0) -> cn.ndarray: """Return the k-th diagonal of the matrix. 
Parameters @@ -691,7 +735,7 @@ def diagonal(self, k=0): """ rows, cols = self.shape if k <= -rows or k >= cols: - return cupynumeric.empty(0, dtype=self.dtype) + return cn.empty(0, dtype=self.dtype) output = runtime.create_store( self.dtype, shape=Shape((min(rows + min(k, 0), cols - max(k, 0)),)) ) @@ -713,7 +757,9 @@ def diagonal(self, k=0): task.execute() return store_to_cupynumeric_array(output) - def todense(self, order=None, out=None): + def todense( + self, order: str | None = None, out: cn.ndarray | None = None + ) -> cn.ndarray: """Return a dense matrix representation of this matrix. Parameters @@ -744,25 +790,25 @@ def todense(self, order=None, out=None): if order is not None: raise NotImplementedError if out is not None: - out = cupynumeric.array(out) + out = cn.array(out) if out.dtype != self.dtype: raise ValueError( f"Output type {out.dtype} is not consistent with dtype {self.dtype}" ) - out = get_store_from_cupynumeric_array(out) + out_store = get_store_from_cupynumeric_array(out) elif out is None: - out = runtime.create_store(self.dtype, shape=self.shape) + out_store = runtime.create_store(self.dtype, shape=self.shape) task = runtime.create_manual_task(SparseOpCode.CSR_TO_DENSE, (1,)) self.pos.promote(1, self.shape[1]) - task.add_output(out) + task.add_output(out_store) task.add_input(self.pos) task.add_input(self.crd) task.add_input(self.vals) task.execute() - return store_to_cupynumeric_array(out) + return store_to_cupynumeric_array(out_store) - def multiply(self, other): + def multiply(self, other: Any) -> csr_array: """Point-wise multiplication by another matrix, vector, or scalar. Parameters @@ -779,9 +825,9 @@ def multiply(self, other): ----- This is equivalent to the * operator. """ - return self * other + return cast(csr_array, self * other) - def __rmul__(self, other): + def __rmul__(self, other: Any) -> csr_array: """Right multiplication by a scalar. 
Parameters @@ -794,10 +840,10 @@ def __rmul__(self, other): csr_array The result of the multiplication. """ - return self * other + return cast(csr_array, self * other) # This is an element-wise operation now. - def __mul__(self, other): + def __mul__(self, other: Any) -> csr_array: """Element-wise multiplication. Parameters @@ -820,10 +866,10 @@ def __mul__(self, other): Currently only supports scalar multiplication. Array multiplication is not implemented. """ - if isinstance(other, numpy.ndarray): - other = cupynumeric.array(other) + if isinstance(other, np.ndarray): + other = cn.array(other) - if cupynumeric.ndim(other) == 0: + if cn.ndim(other) == 0: # If we have a scalar, then do an element-wise multiply on the # values array. new_vals = store_to_cupynumeric_array(self.vals) * other @@ -832,7 +878,7 @@ def __mul__(self, other): raise NotImplementedError # rmatmul represents the operation other @ self. - def __rmatmul__(self, other): + def __rmatmul__(self, other: Any) -> cn.ndarray | csr_array: """Right matrix multiplication (other @ self). Parameters @@ -858,7 +904,7 @@ def __rmatmul__(self, other): # Handle dense @ CSR raise NotImplementedError - def __matmul__(self, other): + def __matmul__(self, other: Any) -> cn.ndarray | csr_array: """Matrix multiplication (self @ other). Parameters @@ -877,7 +923,9 @@ def __matmul__(self, other): """ return self.dot(other) - def _compare_scalar(self, other, op): + def _compare_scalar( + self, other: object, op: Callable[..., cn.ndarray] + ) -> csr_array: """Helper method for element-wise comparison operations with scalars. This methods returns a boolean CSR array with True values where the comparison for op returns True. 
@@ -898,7 +946,7 @@ def _compare_scalar(self, other, op): mask = op(store_to_cupynumeric_array(self.vals), other) col_indices = store_to_cupynumeric_array(self.crd)[mask] row_indices = self._get_row_indices()[mask] - vals = cupynumeric.ones(row_indices.size, dtype=bool) + vals = cn.ones(row_indices.size, dtype=bool) # NOTE: # If the data was already sorted by rows and cols in self, @@ -906,12 +954,10 @@ def _compare_scalar(self, other, op): # but there's no clean way to pass to the class that the data # is already sorted return csr_array( - (vals, (row_indices, col_indices)), - shape=self.shape, - dtype=bool, + (vals, (row_indices, col_indices)), shape=self.shape, dtype=bool ) - def __gt__(self, other): + def __gt__(self, other: object) -> csr_array: """Element-wise greater than comparison with a scalar value. This operates only on the existing non-zero elements of the matrix. @@ -936,9 +982,9 @@ def __gt__(self, other): >>> A = csr_array(...) >>> mask = A > 0.5 # Returns boolean CSR array """ - return self._compare_scalar(other, cupynumeric.greater) + return self._compare_scalar(other, cn.greater) - def __lt__(self, other): + def __lt__(self, other: object) -> csr_array: """Element-wise less than comparison with a scalar value. This operates only on the existing non-zero elements of the matrix. @@ -963,9 +1009,9 @@ def __lt__(self, other): >>> A = csr_array(...) >>> mask = A < 0.5 # Returns boolean CSR array """ - return self._compare_scalar(other, cupynumeric.less) + return self._compare_scalar(other, cn.less) - def __ge__(self, other): + def __ge__(self, other: object) -> csr_array: """Element-wise greater than or equal comparison with a scalar value. This operates only on the existing non-zero elements of the matrix. @@ -990,9 +1036,9 @@ def __ge__(self, other): >>> A = csr_array(...) 
>>> mask = A >= 0.5 # Returns boolean CSR array """ - return self._compare_scalar(other, cupynumeric.greater_equal) + return self._compare_scalar(other, cn.greater_equal) - def __le__(self, other): + def __le__(self, other: object) -> csr_array: """Element-wise less than or equal comparison with a scalar value. This operates only on the existing non-zero elements of the matrix. @@ -1017,9 +1063,9 @@ def __le__(self, other): >>> A = csr_array(...) >>> mask = A <= 0.5 # Returns boolean CSR array """ - return self._compare_scalar(other, cupynumeric.less_equal) + return self._compare_scalar(other, cn.less_equal) - def __eq__(self, other): + def __eq__(self, other: object) -> csr_array: # type: ignore [override] """Element-wise equality comparison with a scalar value. This operates only on the existing non-zero elements of the matrix. @@ -1044,9 +1090,9 @@ def __eq__(self, other): >>> A = csr_array(...) >>> mask = A == 0.5 # Returns boolean CSR array """ - return self._compare_scalar(other, cupynumeric.equal) + return self._compare_scalar(other, cn.equal) - def __ne__(self, other): + def __ne__(self, other: object) -> csr_array: # type: ignore [override] """Element-wise not equal comparison with a scalar value. This operates only on the existing non-zero elements of the matrix. @@ -1071,9 +1117,11 @@ def __ne__(self, other): >>> A = csr_array(...) >>> mask = A != 0.5 # Returns boolean CSR array """ - return self._compare_scalar(other, cupynumeric.not_equal) + return self._compare_scalar(other, cn.not_equal) - def __setitem__(self, key, value): + def __setitem__( + self, key: csr_array | csr_matrix, value: Any + ) -> csr_array: """Set values in the matrix using a boolean CSR mask. 
Parameters @@ -1118,7 +1166,9 @@ def __setitem__(self, key, value): assert key.shape == self.shape assert key.dtype == bool - value_store = runtime.legate_runtime.create_store_from_scalar(Scalar(value)) + value_store = runtime.legate_runtime.create_store_from_scalar( + Scalar(value) + ) # launch c++ task task = runtime.create_auto_task(SparseOpCode.CSR_INDEXING_CSR) @@ -1144,7 +1194,76 @@ def __setitem__(self, key, value): return self - def dot(self, other, out=None): + def __neg__(self) -> csr_array: + """Return -self (negation of all values).""" + return self._with_data( + -store_to_cupynumeric_array(self.vals), copy=True + ) + + # self - other + def __sub__(self, other) -> csr_array: + if is_scalar_like(other): + if other == 0: + return self.copy() + raise NotImplementedError( + "Subtraction of a scalar from a Legate Sparse array " + "will break sparsity and is not supported." + "Use the method data() to manipulate only the nonzeros." + ) + elif is_sparse(other): + if other.shape != self.shape: + raise ValueError( + "Inconsistent shapes: ({self.shape}, {other.shape})" + ) + return geam(self, other, 1.0, -1.0, None) + elif is_dense(other): + return self.todense() - other + + return NotImplemented + + # other - self + def __rsub__(self, other: csr_array) -> csr_array: + if is_scalar_like(other): + if other == 0: + return -self.copy() + raise NotImplementedError( + "Subtraction of a scalar from a Legate Sparse array " + "will break sparsity and is not supported." + "Use the method data() to manipulate only the nonzeros." + ) + elif is_dense(other): + return other - self.todense() + + return NotImplemented + + # self + other + def __add__(self, other): + if is_scalar_like(other): + if other == 0: + return self.copy() + raise NotImplementedError( + "Addition of a scalar to a Legate Sparse array " + "will break sparsity and is not supported." + "Use the method data() to manipulate only the nonzeros." 
+ ) + elif is_sparse(other): + if other.shape != self.shape: + raise ValueError( + "Inconsistent shapes: ({self.shape}, {other.shape})" + ) + return geam(self, other, 1.0, 1.0, None) + elif is_dense(other): + return self.todense() + other + + return NotImplemented + + # other + self + def __radd__(self, other): + return self.__add__(other) + + def dot( + self, other: cn.ndarray | csr_array, out: cn.ndarray | None = None + ) -> cn.ndarray | csr_array: """Ordinary dot product. Parameters @@ -1200,7 +1319,7 @@ def dot(self, other, out=None): """ # If output specified - it should be cupynumeric array if out is not None: - assert isinstance(out, cupynumeric.ndarray) + assert isinstance(out, cn.ndarray) # only floating point operations are supported by cusparse at the moment if runtime.num_gpus > 0: @@ -1214,10 +1333,12 @@ def dot(self, other, out=None): raise NotImplementedError(msg) # If other.shape = (M,) then it's SpMV - if len(other.shape) == 1 or (len(other.shape) == 2 and other.shape[1] == 1): + if len(other.shape) == 1 or ( + len(other.shape) == 2 and other.shape[1] == 1 + ): # convert X to the cupynumeric array if needed - if not isinstance(other, cupynumeric.ndarray): - other = cupynumeric.array(other) + if not isinstance(other, cn.ndarray): + other = cn.array(other) assert self.shape[1] == other.shape[0] # for the case of X shape == (M, 1) other_originally_2d = False @@ -1233,11 +1354,13 @@ def dot(self, other, out=None): category=RuntimeWarning, stacklevel=level, ) - other = cupynumeric.array(other) + other = cn.array(other) # Coerce A and x into a common type. Use that coerced type # to find the type of the output. 
- A, x = cast_to_common_type(self, other) + common_dtype = find_common_type(self, other) + A = self.astype(common_dtype, copy=False) + x = other.astype(common_dtype, copy=False) if out is None: y = store_to_cupynumeric_array( runtime.create_store(A.dtype, shape=(self.shape[0],)) @@ -1272,12 +1395,16 @@ def dot(self, other, out=None): if out is not None: raise ValueError("Cannot provide out for CSRxCSR matmul.") assert self.shape[1] == other.shape[0] - return spgemm_csr_csr_csr(*cast_to_common_type(self, other)) + common_dtype = find_common_type(self, other) + return spgemm_csr_csr_csr( + self.astype(common_dtype, copy=False), + other.astype(common_dtype, copy=False), + ) else: raise NotImplementedError # Misc - def _getpos(self): + def _getpos(self) -> list[tuple[int, int]]: """Helper method to get row start and end positions. This internal method unpacks the compressed row storage format's position array @@ -1295,7 +1422,7 @@ def _getpos(self): row_end = store_to_cupynumeric_array(row_end_st) return [(i, j) for (i, j) in zip(row_start, row_end)] - def copy(self): + def copy(self) -> csr_array: """Returns a copy of this matrix. Returns @@ -1305,7 +1432,7 @@ def copy(self): """ return csr_array(self, dtype=self.dtype) - def conj(self, copy=True): + def conj(self, copy: bool = True) -> csr_array: """Element-wise complex conjugate. Parameters @@ -1329,7 +1456,9 @@ def conj(self, copy=True): get_store_from_cupynumeric_array(self.data.conj()), copy=False ) - def transpose(self, axes=None, copy=False): + def transpose( + self, axes: Any | None = None, copy: bool = False + ) -> csr_array: """Reverses the dimensions of the sparse matrix. 
Parameters @@ -1373,7 +1502,9 @@ def transpose(self, axes=None, copy=False): task.execute() # sort - sort_mask = cupynumeric.argsort(self.crd, kind="stable") + sort_mask = cn.argsort( + store_to_cupynumeric_array(self.crd), kind="stable" + ) new_rows = self.get_indices()[sort_mask] new_ci = store_to_cupynumeric_array(rows_expanded)[sort_mask] new_data = self.get_data()[sort_mask] @@ -1388,7 +1519,7 @@ def transpose(self, axes=None, copy=False): T = property(transpose, doc="Transpose of the matrix") - def asformat(self, format, copy=False): + def asformat(self, format: str | None, copy: bool = False) -> csr_array: """Convert this matrix to a specified format. Parameters @@ -1417,7 +1548,16 @@ def asformat(self, format, copy=False): else: raise NotImplementedError("Only CSR format is supported right now") - def tocsr(self, copy=False): + # correct return type value on this subclass + def astype( + self, + dtype: npt.dtype[Any], + casting: CastingKind = "unsafe", + copy: bool = True, + ) -> csr_array: + return cast(csr_array, super().astype(dtype, casting, copy)) + + def tocsr(self, copy: bool = False) -> csr_array: """Convert this matrix to a CSR matrix. Parameters @@ -1439,7 +1579,7 @@ def tocsr(self, copy=False): return self.copy().tocsr(copy=False) return self - def nonzero(self): + def nonzero(self) -> tuple[cn.ndarray, cn.ndarray]: """Return the indices of the non-zero elements. 
Returns @@ -1455,13 +1595,15 @@ def nonzero(self): """ task = runtime.create_auto_task(SparseOpCode.EXPAND_POS_TO_COORDINATES) - row_indices = runtime.create_store(coord_ty, shape=self.crd.shape) - row_indices_part = task.add_output(row_indices) + row_indices_store = runtime.create_store( + coord_ty, shape=self.crd.shape + ) + row_indices_part = task.add_output(row_indices_store) pos_part = task.add_input(self.pos) task.add_constraint(image(pos_part, row_indices_part)) task.execute() - row_indices = store_to_cupynumeric_array(row_indices) + row_indices = store_to_cupynumeric_array(row_indices_store) col_indices = store_to_cupynumeric_array(self.crd) vals_array = store_to_cupynumeric_array(self.vals) mask = vals_array != 0.0 @@ -1474,7 +1616,7 @@ def nonzero(self): # spmv computes y = A @ x. -def spmv(A: csr_array, x: cupynumeric.ndarray, y: cupynumeric.ndarray): +def spmv(A: csr_array, x: cn.ndarray, y: cn.ndarray) -> None: """Perform sparse matrix vector product y = A @ x. Parameters @@ -1506,10 +1648,16 @@ def spmv(A: csr_array, x: cupynumeric.ndarray, y: cupynumeric.ndarray): x_var = task.add_input(x_store) task.add_constraint(align(y_var, pos_var)) - task.add_constraint(image(pos_var, crd_var, hint=ImageComputationHint.FIRST_LAST)) - task.add_constraint(image(pos_var, vals_var, hint=ImageComputationHint.FIRST_LAST)) + task.add_constraint( + image(pos_var, crd_var, hint=ImageComputationHint.FIRST_LAST) + ) + task.add_constraint( + image(pos_var, vals_var, hint=ImageComputationHint.FIRST_LAST) + ) # exact or approximate image to X - task.add_constraint(image(crd_var, x_var, hint=ImageComputationHint.MIN_MAX)) + task.add_constraint( + image(crd_var, x_var, hint=ImageComputationHint.MIN_MAX) + ) task.execute() @@ -1553,7 +1701,7 @@ def spgemm_csr_csr_csr(A: csr_array, B: csr_array) -> csr_array: if runtime.num_gpus > 0: # replacement for the ImagePartition functor to get dense image # for rows of B, run separate task for this - pos_rect = 
runtime.create_store(rect1, shape=(A.shape[0],)) # type: ignore + pos_rect = runtime.create_store(rect1, shape=(A.shape[0],)) task = runtime.create_auto_task(SparseOpCode.FAST_IMAGE_RANGE) A_pos_part = task.add_input(A.pos) A_crd_part = task.add_input(A.crd) @@ -1566,7 +1714,7 @@ def spgemm_csr_csr_csr(A: csr_array, B: csr_array) -> csr_array: task.execute() - pos = runtime.create_store(rect1, shape=(A.shape[0],)) # type: ignore + pos = runtime.create_store(rect1, shape=(A.shape[0],)) crd = runtime.create_store(coord_ty, ndim=1) vals = runtime.create_store(A.dtype, ndim=1) @@ -1605,7 +1753,9 @@ def spgemm_csr_csr_csr(A: csr_array, B: csr_array) -> csr_array: # Array class should provide this functionality task.add_constraint(align(A_pos_part, B_pos_image_part)) task.add_constraint( - image(B_pos_image_part, B_pos_part, hint=ImageComputationHint.MIN_MAX) + image( + B_pos_image_part, B_pos_part, hint=ImageComputationHint.MIN_MAX + ) ) task.add_constraint( @@ -1659,11 +1809,11 @@ def spgemm_csr_csr_csr(A: csr_array, B: csr_array) -> csr_array: task.execute() - pos, nnz = CompressedBase.nnz_to_pos_cls(q_nnz) + pos, nnz_value = CompressedBase.nnz_to_pos_cls(q_nnz) # Block and convert the nnz future into an int. - nnz = int(nnz) - crd = runtime.create_store(coord_ty, shape=(nnz,)) - vals = runtime.create_store(A.dtype, shape=(nnz,)) + nnz = int(nnz_value) + crd = runtime.create_store(coord_ty, shape=Shape((nnz,))) + vals = runtime.create_store(A.dtype, shape=Shape((nnz,))) task = runtime.create_auto_task(SparseOpCode.SPGEMM_CSR_CSR_CSR) C_pos_part_out = task.add_output(pos) @@ -1692,7 +1842,124 @@ def spgemm_csr_csr_csr(A: csr_array, B: csr_array) -> csr_array: task.add_constraint(image(B_pos_part, B_vals_part)) task.execute() + return csr_array((vals, crd, pos), shape=(A.shape[0], B.shape[1])) + + +def geam(A: csr_array, B: csr_array, alpha: Any, beta: Any, C=None): + """Compute C = alpha * A + beta * B for CSR matrices. 
+ + Parameters + ---------- + A : csr_array + First input sparse matrix. + B : csr_array + Second input sparse matrix. Must have same shape as A. + alpha : scalar-like + Scalar multiplier for A. Will be cast to A.dtype. + beta : scalar-like + Scalar multiplier for B. Will be cast to A.dtype. + C : csr_array, optional + Output sparse matrix. If provided, must have the correct sparsity + pattern to hold the result. If None, a new matrix is allocated. + + Returns + ------- + csr_array + The result C = alpha * A + beta * B. + + Notes + ----- + If C is provided, it is the user's responsibility to ensure the sparsity + pattern matches the result. Behavior is undefined otherwise. + + alpha and beta may be integers, floats, or complex values. They are + converted to A.dtype before computation. For complex inputs, A.dtype + should be a complex dtype to preserve the imaginary component. + """ + + if C is None: + perform_symbolic_phase = True + else: + # If C is provided, assume it has the correct sparsity pattern + assert isinstance(C, csr_array), "C must be a Legate Sparse CSR array" + perform_symbolic_phase = False + + # Symbolic phase: compute the sparsity pattern of the result + if perform_symbolic_phase: + nnz_per_row = runtime.create_store(nnz_ty, A.pos.shape) + task = runtime.create_auto_task(SparseOpCode.GEAM_CSR_CSR_SYMBOLIC) + A_pos_part = task.add_input(A.pos) + A_crd_part = task.add_input(A.crd) + B_pos_part = task.add_input(B.pos) + B_crd_part = task.add_input(B.crd) + nnz_per_row_part = task.add_output(nnz_per_row) + + task.add_constraint(image(A_pos_part, A_crd_part)) + task.add_constraint(image(B_pos_part, B_crd_part)) + task.add_constraint(align(A_pos_part, B_pos_part)) + task.add_constraint(align(A_pos_part, nnz_per_row_part)) + + task.execute() + + # Compute C_pos from nnz_per_row using the helper from CompressedBase + C_pos, nnz_scalar = CompressedBase.nnz_to_pos_cls(nnz_per_row) + nnz_total = int(nnz_scalar) + + # Allocate output arrays if needed + if 
perform_symbolic_phase: + C_vals = runtime.create_store(A.dtype, shape=(nnz_total,)) + C_crd = runtime.create_store(coord_ty, shape=(nnz_total,)) + else: + C_vals = C.vals + C_crd = C.crd + C_pos = C.pos + + # Create scalar stores for alpha and beta + alpha_store = runtime.legate_runtime.create_store_from_scalar( + Scalar(A.dtype.type(alpha)) + ) + beta_store = runtime.legate_runtime.create_store_from_scalar( + Scalar(A.dtype.type(beta)) + ) + + # Compute phase: C = alpha * A + beta * B + task = runtime.create_auto_task(SparseOpCode.GEAM_CSR_CSR_COMPUTE) + + # Inputs (order must match C++ template expectations) + A_pos_part = task.add_input(A.pos) + A_crd_part = task.add_input(A.crd) + A_vals_part = task.add_input(A.vals) + B_pos_part = task.add_input(B.pos) + B_crd_part = task.add_input(B.crd) + B_vals_part = task.add_input(B.vals) + + # C_pos is an INPUT (already computed in symbolic phase) + C_pos_part = task.add_input(C_pos) + + # C_crd and C_vals are outputs + C_crd_part = task.add_output(C_crd) + C_vals_part = task.add_output(C_vals) + + # Scalar inputs (alpha and beta) + task.add_input(alpha_store) + task.add_input(beta_store) + + # Align row partitions: A, B, C all partitioned by the same rows + task.add_constraint(align(A_pos_part, B_pos_part)) + task.add_constraint(align(A_pos_part, C_pos_part)) + + # Image constraints: crd and vals are partitioned via pos + task.add_constraint(image(A_pos_part, A_crd_part)) + task.add_constraint(image(A_pos_part, A_vals_part)) + task.add_constraint(image(B_pos_part, B_crd_part)) + task.add_constraint(image(B_pos_part, B_vals_part)) + task.add_constraint(image(C_pos_part, C_crd_part)) + task.add_constraint(image(C_pos_part, C_vals_part)) + + task.execute() + + if perform_symbolic_phase: return csr_array( - (vals, crd, pos), - shape=Shape((A.shape[0], B.shape[1])), + (C_vals, C_crd, C_pos), shape=A.shape, dtype=A.dtype, copy=False ) + return C diff --git a/legate_sparse/dia.py b/legate_sparse/dia.py index 
20f2dc5c..4fc035ca 100644 --- a/legate_sparse/dia.py +++ b/legate_sparse/dia.py @@ -44,10 +44,13 @@ # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +from __future__ import annotations -import cupynumeric -import numpy -import scipy # type: ignore +from typing import TYPE_CHECKING + +import cupynumeric as cn +import numpy as np +import scipy from .base import CompressedBase from .coverage import clone_scipy_arr_kind @@ -59,6 +62,11 @@ store_to_cupynumeric_array, ) +if TYPE_CHECKING: + from typing import Any + + import numpy.typing as npt + # Temporary implementation for matrix generation in examples @clone_scipy_arr_kind(scipy.sparse.dia_array) @@ -128,7 +136,13 @@ class dia_array(CompressedBase): [0, 7, 9]]) """ - def __init__(self, arg, shape=None, dtype=None, copy=False): + def __init__( + self, + arg: tuple[cn.ndarray, cn.ndarray], + shape: tuple[int, ...] | None = None, + dtype: npt.dtype[Any] | None = None, + copy: bool = False, + ) -> None: """Initialize a DIA array. Parameters @@ -169,14 +183,14 @@ def __init__(self, arg, shape=None, dtype=None, copy=False): assert isinstance(arg, tuple) data, offsets = arg if isinstance(offsets, int): - offsets = cupynumeric.full((1,), offsets) + offsets = cn.full((1,), offsets) data, offsets = cast_arr(data), cast_arr(offsets) if dtype is not None: data = data.astype(dtype) dtype = data.dtype assert dtype is not None - if not isinstance(dtype, numpy.dtype): - dtype = numpy.dtype(dtype) + if not isinstance(dtype, np.dtype): + dtype = np.dtype(dtype) self.dtype = dtype # Ensure that we don't accidentally include ndarray @@ -185,10 +199,10 @@ def __init__(self, arg, shape=None, dtype=None, copy=False): # legate under the hood. 
self.shape = tuple(int(i) for i in shape) self._offsets = get_store_from_cupynumeric_array(offsets, copy=copy) - self._data = get_store_from_cupynumeric_array(data, copy=copy) + self._store = get_store_from_cupynumeric_array(data, copy=copy) @property - def nnz(self): + def nnz(self) -> int: """Number of stored values, including explicit zeros. Returns @@ -211,7 +225,7 @@ def nnz(self): return int(nnz) @property - def data(self): + def data(self) -> cn.ndarray: """Get the data array of the DIA matrix. Returns @@ -220,10 +234,10 @@ def data(self): The data array containing the diagonal values. Each row represents a diagonal, with shape (n_diagonals, max_diagonal_length). """ - return store_to_cupynumeric_array(self._data) + return store_to_cupynumeric_array(self._store) @property - def offsets(self): + def offsets(self) -> cn.ndarray: """Get the offsets array of the DIA matrix. Returns @@ -235,7 +249,7 @@ def offsets(self): """ return store_to_cupynumeric_array(self._offsets) - def copy(self): + def copy(self) -> dia_array: """Returns a copy of this matrix. Returns @@ -243,11 +257,13 @@ def copy(self): dia_array A copy of the matrix with the same data and structure. """ - data = cupynumeric.array(self.data) - offsets = cupynumeric.array(self.offsets) + data = cn.array(self.data) + offsets = cn.array(self.offsets) return dia_array((data, offsets), shape=self.shape, dtype=self.dtype) - def transpose(self, axes=None, copy=False): + def transpose( + self, axes: tuple[int, ...] | None = None, copy: bool = False + ) -> dia_array: """Reverses the dimensions of the sparse matrix. 
Parameters @@ -295,13 +311,13 @@ def transpose(self, axes=None, copy=False): offsets = -self.offsets # re-align the data matrix - r = cupynumeric.arange(len(offsets), dtype=coord_ty)[:, None] - c = cupynumeric.arange(num_rows, dtype=coord_ty) - (offsets % max_dim)[:, None] + r = cn.arange(len(offsets), dtype=coord_ty)[:, None] + c = cn.arange(num_rows, dtype=coord_ty) - (offsets % max_dim)[:, None] pad_amount = max(0, max_dim - self.data.shape[1]) - data = cupynumeric.hstack( + data = cn.hstack( ( self.data, - cupynumeric.zeros( + cn.zeros( (self.data.shape[0], pad_amount), dtype=self.data.dtype ), ) @@ -316,7 +332,7 @@ def transpose(self, axes=None, copy=False): T = property(transpose, doc="Transpose of the matrix") - def tocsr(self, copy=False): + def tocsr(self, copy: bool = False) -> csr_array: """Convert this matrix to a CSR matrix. Parameters @@ -341,7 +357,7 @@ def tocsr(self, copy=False): return self.transpose(copy=copy)._tocsr_transposed(copy=False) # This routine is lifted from scipy.sparse's converter. - def _tocsr_transposed(self, copy=False): + def _tocsr_transposed(self, copy: bool = False) -> csr_array: """Convert the transposed DIA matrix to CSR format. This internal method converts a transposed DIA matrix to CSR format. 
@@ -374,7 +390,7 @@ def _tocsr_transposed(self, copy=False): num_rows, num_cols = self.shape num_offsets, offset_len = self.data.shape - offset_inds = cupynumeric.arange(offset_len) + offset_inds = cn.arange(offset_len) row = offset_inds - self.offsets[:, None] mask = row >= 0 @@ -383,14 +399,14 @@ def _tocsr_transposed(self, copy=False): mask &= self.data != 0 idx_dtype = coord_ty - indptr = cupynumeric.zeros(num_cols + 1, dtype=idx_dtype) + indptr = cn.zeros(num_cols + 1, dtype=idx_dtype) # note that the output dtype in a reduction (e.g, sum) determines # the dtype of the accumulator that is used in the reduction # in cupynumeric, it looks like the output dtype is set to the src # dtype if unspecified and that results in the output not performing # an integer sum. But we want the integer sum, so specify # dtype as idx_dtype to mask.sum() - indptr[1 : offset_len + 1] = cupynumeric.cumsum( + indptr[1 : offset_len + 1] = cn.cumsum( mask.sum(axis=0, dtype=idx_dtype)[:num_cols] ) if offset_len < num_cols: @@ -398,7 +414,10 @@ def _tocsr_transposed(self, copy=False): indices = row.T[mask.T].astype(idx_dtype, copy=False) data = self.data.T[mask.T] return csr_array( - (data, indices, indptr), shape=self.shape, dtype=self.dtype, copy=False + (data, indices, indptr), + shape=self.shape, + dtype=self.dtype, + copy=False, ) diff --git a/legate_sparse/gallery.py b/legate_sparse/gallery.py index 371a4c44..e5583e6e 100644 --- a/legate_sparse/gallery.py +++ b/legate_sparse/gallery.py @@ -66,15 +66,29 @@ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN # THE SOFTWARE. 
+from __future__ import annotations +from typing import TYPE_CHECKING -import cupynumeric -import numpy +import cupynumeric as cn +import numpy as np +from .csr import csr_array from .dia import dia_array +if TYPE_CHECKING: + from typing import Any, Sequence -def diags(diagonals, offsets=0, shape=None, format=None, dtype=None): + import numpy.typing as npt + + +def diags( + diagonals: Sequence[cn.ndarray], + offsets: Sequence[int] | int = 0, + shape: tuple[int, ...] | None = None, + format: str | None = None, + dtype: npt.dtype[Any] | None = None, +) -> csr_array | dia_array: """Construct a sparse matrix from diagonals. Parameters @@ -159,22 +173,25 @@ def diags(diagonals, offsets=0, shape=None, format=None, dtype=None): [ 0., 0., 0., 0.]]) """ # if offsets is not a sequence, assume that there's only one diagonal - if numpy.isscalar(offsets): + diags: list[cn.ndarray] + if np.isscalar(offsets): # now check that there's actually only one diagonal - if len(diagonals) == 0 or numpy.isscalar(diagonals[0]): - diagonals = [cupynumeric.atleast_1d(diagonals)] + if len(diagonals) == 0 or np.isscalar(diagonals[0]): + diags = [cn.atleast_1d(diagonals)] # type: ignore [list-item, arg-type] else: raise ValueError("Different number of diagonals and offsets.") else: - diagonals = list(map(cupynumeric.atleast_1d, diagonals)) + diags = cn.atleast_1d(*diagonals) # type: ignore [assignment] + + assert not isinstance(offsets, int) # Basic check - if len(diagonals) != len(offsets): + if len(diags) != len(offsets): raise ValueError("Different number of diagonals and offsets.") # Determine shape, if omitted if shape is None: - m = len(diagonals[0]) + abs(int(offsets[0])) + m = len(diags[0]) + abs(int(offsets[0])) shape = (m, m) # Determine data type, if omitted @@ -187,34 +204,38 @@ def diags(diagonals, offsets=0, shape=None, format=None, dtype=None): # Construct data array m, n = shape - M = max([min(m + offset, n - offset) + max(0, offset) for offset in offsets]) + M = max( + [min(m + 
offset, n - offset) + max(0, offset) for offset in offsets] + ) M = max(0, M) - data_arr = cupynumeric.zeros((len(offsets), M), dtype=dtype) + data_arr = cn.zeros((len(offsets), M), dtype=dtype) K = min(m, n) - for j, diagonal in enumerate(diagonals): + for j, diag in enumerate(diags): offset = int(offsets[j]) k = max(0, offset) length = min(m + offset, n - offset, K) if length < 0: - raise ValueError("Offset %d (index %d) out of bounds" % (offset, j)) + raise ValueError( + "Offset %d (index %d) out of bounds" % (offset, j) + ) try: - data_arr[j, k : k + length] = diagonal[..., :length] + data_arr[j, k : k + length] = diag[..., :length] except ValueError as e: - if len(diagonal) != length and len(diagonal) != 1: + if len(diag) != length and len(diag) != 1: raise ValueError( "Diagonal length (index %d: %d at offset %d) does not " "agree with matrix size (%d, %d)." - % (j, len(diagonal), offset, m, n) + % (j, len(diag), offset, m, n) ) from e raise # We importantly don't perform this conversion to cupynumeric (involving # an attach operation) until we're done indexing into the list. This # avoid a cupynumeric crash involving restrictions in attach in pde.py. 
- offsets = cupynumeric.atleast_1d(offsets) - dia = dia_array((data_arr, offsets), shape=(m, n), dtype=dtype) + offsets_array: cn.ndarray = cn.atleast_1d(offsets) # type: ignore [arg-type, assignment] + dia = dia_array((data_arr, offsets_array), shape=(m, n), dtype=dtype) if format == "csr": return dia.tocsr() return dia diff --git a/legate_sparse/install_info.py.in b/legate_sparse/install_info.py.in index 84799ee4..3ad3ecd1 100644 --- a/legate_sparse/install_info.py.in +++ b/legate_sparse/install_info.py.in @@ -11,9 +11,13 @@ # IMPORTANT: # * install_info.py is a generated file and should not be modified by hand +from __future__ import annotations + def get_libpath(): - import os, sys, platform + import os + import platform + import sys join = os.path.join exists = os.path.exists dirname = os.path.dirname @@ -32,10 +36,10 @@ def get_libpath(): return None return ( - find_liblegate_sparse(join(cn_path, "build", "lib")) or - find_liblegate_sparse(join(dirname(dirname(dirname(cn_path))), "lib")) or - find_liblegate_sparse(join(dirname(dirname(sys.executable)), "lib")) or - "" + find_liblegate_sparse(join(cn_path, "build", "lib")) + or find_liblegate_sparse(join(dirname(dirname(dirname(cn_path))), "lib")) + or find_liblegate_sparse(join(dirname(dirname(sys.executable)), "lib")) + or "" ) diff --git a/legate_sparse/io.py b/legate_sparse/io.py index ecaf8e3c..6ce90ba5 100644 --- a/legate_sparse/io.py +++ b/legate_sparse/io.py @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import annotations import numpy as np from legate.core import track_provenance, types @@ -22,8 +23,8 @@ from .utils import store_to_cupynumeric_array -@track_provenance(runtime.sparse_library) -def mmread(source): +@track_provenance() +def mmread(source: str) -> csr_array: """Read a sparse matrix from a Matrix Market (.mtx) file. 
Parameters @@ -59,28 +60,34 @@ def mmread(source): # TODO (rohany): We'll assume for now that all of the nodes in the system # can access the file passed in, so we don't need to worry about where this # task gets mapped to. - rows = runtime.create_store(coord_ty, ndim=1) - cols = runtime.create_store(coord_ty, ndim=1) - vals = runtime.create_store(float64, ndim=1) - m = runtime.create_store(coord_ty, optimize_scalar=True, shape=(1,)) - n = runtime.create_store(coord_ty, optimize_scalar=True, shape=(1,)) - nnz = runtime.create_store(nnz_ty, optimize_scalar=True, shape=(1,)) + rows_store = runtime.create_store(coord_ty, ndim=1) + cols_store = runtime.create_store(coord_ty, ndim=1) + vals_store = runtime.create_store(float64, ndim=1) + m_store = runtime.create_store(coord_ty, optimize_scalar=True, shape=(1,)) + n_store = runtime.create_store(coord_ty, optimize_scalar=True, shape=(1,)) + nnz_store = runtime.create_store(nnz_ty, optimize_scalar=True, shape=(1,)) task = runtime.create_auto_task(SparseOpCode.READ_MTX_TO_COO) - task.add_output(m) - task.add_output(n) - task.add_output(nnz) - task.add_output(rows) - task.add_output(cols) - task.add_output(vals) + task.add_output(m_store) + task.add_output(n_store) + task.add_output(nnz_store) + task.add_output(rows_store) + task.add_output(cols_store) + task.add_output(vals_store) task.add_scalar_arg(source, types.string_type) task.execute() - m = int(np.asarray(m.get_physical_store().get_inline_allocation())[0]) - n = int(np.asarray(n.get_physical_store().get_inline_allocation())[0]) - nnz = int(np.asarray(nnz.get_physical_store().get_inline_allocation())[0]) + m = int( + np.asarray(m_store.get_physical_store().get_inline_allocation())[0] + ) + n = int( + np.asarray(n_store.get_physical_store().get_inline_allocation())[0] + ) + nnz = int( + np.asarray(nnz_store.get_physical_store().get_inline_allocation())[0] + ) # Slice down each store from the resulting size into the actual size. 
sl = slice(0, nnz) - rows = store_to_cupynumeric_array(rows.slice(0, sl)) - cols = store_to_cupynumeric_array(cols.slice(0, sl)) - vals = store_to_cupynumeric_array(vals.slice(0, sl)) + rows = store_to_cupynumeric_array(rows_store.slice(0, sl)) + cols = store_to_cupynumeric_array(cols_store.slice(0, sl)) + vals = store_to_cupynumeric_array(vals_store.slice(0, sl)) return csr_array((vals, (rows, cols)), shape=(m, n)) diff --git a/legate_sparse/linalg.py b/legate_sparse/linalg.py index 82aa0edb..789cd2a9 100644 --- a/legate_sparse/linalg.py +++ b/legate_sparse/linalg.py @@ -93,15 +93,30 @@ """ +from __future__ import annotations + import inspect import warnings +from typing import TYPE_CHECKING, Protocol -import cupynumeric as np -from legate.core import track_provenance, types +import cupynumeric as cn +import numpy as np +from legate.core import align, image, track_provenance, types from .config import SparseOpCode from .runtime import runtime -from .utils import get_store_from_cupynumeric_array +from .utils import get_store_from_cupynumeric_array, store_to_cupynumeric_array + +if TYPE_CHECKING: + from typing import Any + + import numpy.typing as npt + + +class LOCallable(Protocol): + def __call__( + self, x: cn.ndarray, out: cn.ndarray | None = None + ) -> cn.ndarray: ... # We have to implement our own / copy the LinearOperator class from @@ -196,7 +211,7 @@ class LinearOperator: ndim = 2 - def __new__(cls, *args, **kwargs): + def __new__(cls, *args: Any, **kwargs: Any) -> LinearOperator: if cls is LinearOperator: # Operate as _CustomLinearOperator factory. return super(LinearOperator, cls).__new__(_CustomLinearOperator) @@ -216,7 +231,9 @@ def __new__(cls, *args, **kwargs): return obj - def __init__(self, dtype, shape): + def __init__( + self, dtype: npt.dtype[Any] | None, shape: tuple[int, ...] + ) -> None: """Initialize this LinearOperator. To be called by subclasses. 
``dtype`` may be None; ``shape`` should @@ -229,13 +246,21 @@ def __init__(self, dtype, shape): self.dtype = dtype self.shape = shape - def _init_dtype(self): + def _init_dtype(self) -> None: """Called from subclasses at the end of the __init__ routine.""" if self.dtype is None: - v = np.zeros(self.shape[-1]) - self.dtype = np.asarray(self.matvec(v)).dtype + v = cn.zeros(self.shape[-1]) + self.dtype = cn.asarray(self.matvec(v)).dtype + + def _matmat( + self, x: cn.ndarray, out: cn.ndarray | None = None + ) -> cn.ndarray: + """Default matrix-matrix multiplication handler.""" + raise NotImplementedError - def _matvec(self, x, out=None): + def _matvec( + self, x: cn.ndarray, out: cn.ndarray | None = None + ) -> cn.ndarray: """Default matrix-vector multiplication handler. If self is a linear operator of shape (M, N), then this method will @@ -247,7 +272,9 @@ def _matvec(self, x, out=None): """ raise NotImplementedError - def matvec(self, x, out=None): + def matvec( + self, x: cn.ndarray, out: cn.ndarray | None = None + ) -> cn.ndarray: """Matrix-vector multiplication. Performs the operation y=A*x where A is an MxN linear @@ -275,7 +302,7 @@ def matvec(self, x, out=None): if x.shape != (N,) and x.shape != (N, 1): raise ValueError("dimension mismatch") - y = np.asarray(self._matvec(x, out=out)) + y = cn.asarray(self._matvec(x, out=out)) if x.ndim == 1: # TODO (hme): This is a cuPyNumeric bug, reshape should accept an @@ -288,11 +315,15 @@ def matvec(self, x, out=None): return y - def _rmatvec(self, x, out=None): + def _rmatvec( + self, x: cn.ndarray, out: cn.ndarray | None = None + ) -> cn.ndarray: """Default implementation of _rmatvec; defers to adjoint.""" raise NotImplementedError - def rmatvec(self, x, out=None): + def rmatvec( + self, x: cn.ndarray, out: cn.ndarray | None = None + ) -> cn.ndarray: """Adjoint matrix-vector multiplication. 
Performs the operation y = A^H * x where A is an MxN linear @@ -320,14 +351,16 @@ def rmatvec(self, x, out=None): if x.shape != (M,) and x.shape != (M, 1): raise ValueError("dimension mismatch") - y = np.asarray(self._rmatvec(x, out=out)) + y = cn.asarray(self._rmatvec(x, out=out)) if x.ndim == 1: y = y.reshape(N) elif x.ndim == 2: y = y.reshape(N, 1) else: - raise ValueError("invalid shape returned by user-defined rmatvec()") + raise ValueError( + "invalid shape returned by user-defined rmatvec()" + ) return y @@ -337,88 +370,104 @@ def rmatvec(self, x, out=None): class _CustomLinearOperator(LinearOperator): """Linear operator defined in terms of user-specified operations.""" + _matvec_impl: LOCallable + _rmatvec_impl: LOCallable | None + def __init__( self, - shape, - matvec, - rmatvec=None, - matmat=None, - dtype=None, - rmatmat=None, - ): + shape: tuple[int, ...], + matvec: LOCallable, + rmatvec: LOCallable | None = None, + matmat: LOCallable | None = None, + dtype: npt.dtype[Any] | None = None, + rmatmat: LOCallable | None = None, + ) -> None: super().__init__(dtype, shape) self.args = () - self.__matvec_impl = matvec - self.__rmatvec_impl = rmatvec + self._matvec_impl = matvec + self._rmatvec_impl = rmatvec # Check if the implementations of matvec and rmatvec have the out= # parameter. 
- self._matvec_has_out = self._has_out(self.__matvec_impl) - self._rmatvec_has_out = self._has_out(self.__rmatvec_impl) + self._matvec_has_out = self._has_out(self._matvec_impl) + self._rmatvec_has_out = self._has_out(self._rmatvec_impl) self._init_dtype() - def _matvec(self, x, out=None): + def _matvec( + self, x: cn.ndarray, out: cn.ndarray | None = None + ) -> cn.ndarray: if self._matvec_has_out: - return self.__matvec_impl(x, out=out) + return self._matvec_impl(x, out=out) else: if out is None: - return self.__matvec_impl(x) + return self._matvec_impl(x) else: - out[:] = self.__matvec_impl(x) + out[:] = self._matvec_impl(x) return out - def _rmatvec(self, x, out=None): - func = self.__rmatvec_impl + def _rmatvec( + self, x: cn.ndarray, out: cn.ndarray | None = None + ) -> cn.ndarray: + assert self._rmatvec_impl is not None + func = self._rmatvec_impl if func is None: raise NotImplementedError("rmatvec is not defined") if self._rmatvec_has_out: - return self.__rmatvec_impl(x, out=out) + return self._rmatvec_impl(x, out=out) else: if out is None: - return self.__rmatvec_impl(x) + return self._rmatvec_impl(x) else: - result = self.__rmatvec_impl(x) + result = self._rmatvec_impl(x) out[:] = result return out - def _has_out(self, o): + def _has_out(self, o: LOCallable | None) -> bool: if o is None: return False sig = inspect.signature(o) - for key, param in sig.parameters.items(): - if key == "out": - return True - return False + return "out" in sig.parameters # _SparseMatrixLinearOperator is an overload of LinearOperator to wrap # sparse matrices as a linear operator. It caches the conjugate transpose # of the sparse matrices to avoid repeat conversions. 
class _SparseMatrixLinearOperator(LinearOperator): - def __init__(self, A): + AH: cn.ndarray | None + + def __init__(self, A: cn.ndarray) -> None: self.A = A self.AH = None super().__init__(A.dtype, A.shape) - def _matvec(self, x, out=None): + def _matvec( + self, x: cn.ndarray, out: cn.ndarray | None = None + ) -> cn.ndarray: return self.A.dot(x, out=out) - def _rmatvec(self, x, out=None): + def _rmatvec( + self, x: cn.ndarray, out: cn.ndarray | None = None + ) -> cn.ndarray: if self.AH is None: - self.AH = self.A.T.conj(copy=False) + self.AH = self.A.T.conj() + assert self.AH is not None return self.AH.dot(x, out=out) # IdentityOperator is a no-op linear operator, and is lifted from # scipy.sparse. class IdentityOperator(LinearOperator): - def __init__(self, shape, dtype=None): + def __init__( + self, shape: tuple[int, ...], dtype: npt.dtype[Any] | None = None + ) -> None: super().__init__(dtype, shape) - def _matvec(self, x, out=None): + def _matvec( + self, x: cn.ndarray, out: cn.ndarray | None = None + ) -> cn.ndarray: # If out is specified, copy the input into the output. if out is not None: out[:] = x @@ -428,7 +477,9 @@ def _matvec(self, x, out=None): # the input to avoid silently aliasing the input array. return x.copy() - def _rmatvec(self, x, out=None): + def _rmatvec( + self, x: cn.ndarray, out: cn.ndarray | None = None + ) -> cn.ndarray: # If out is specified, copy the input into the output. if out is not None: out[:] = x @@ -439,7 +490,7 @@ def _rmatvec(self, x, out=None): return x.copy() -def make_linear_operator(A): +def make_linear_operator(A: Any | LinearOperator) -> LinearOperator: """Convert a matrix to a LinearOperator. Parameters @@ -473,7 +524,14 @@ def make_linear_operator(A): # future operations to compute new futures, and avoids # allocating unnecessary futures. 
@track_provenance(nested=True) -def cg_axpby(y, x, a, b, isalpha=True, negate=False): +def cg_axpby( + y: cn.ndarray, + x: cn.ndarray, + a: cn.ndarray, + b: cn.ndarray, + isalpha: bool = True, + negate: bool = False, +) -> cn.ndarray: """Perform fused vector operation for CG solvers. This function performs the operation y = alpha * x + beta * y where @@ -526,7 +584,12 @@ def cg_axpby(y, x, a, b, isalpha=True, negate=False): return y -def _get_atol_rtol(b_norm, tol=None, atol=0.0, rtol=1e-5): +def _get_atol_rtol( + b_norm: float | cn.ndarray, + tol: float | None = None, + atol: float = 0.0, + rtol: float = 1e-5, +) -> tuple[float, float]: """Compute absolute and relative tolerances for convergence. Parameters @@ -561,17 +624,17 @@ def _get_atol_rtol(b_norm, tol=None, atol=0.0, rtol=1e-5): def cg( - A, - b, - x0=None, - tol=None, - maxiter=None, - M=None, - callback=None, - atol=0.0, - rtol=1e-5, - conv_test_iters=25, -): + A: Any | LinearOperator, + b: cn.ndarray, + x0: cn.ndarray | None = None, + tol: float | None = None, + maxiter: int | None = None, + M: Any | LinearOperator | None = None, + callback: Any | None = None, + atol: float = 0.0, + rtol: float = 1e-5, + conv_test_iters: int = 25, +) -> tuple[cn.ndarray, int]: """Solve a linear system using the Conjugate Gradient method. 
Parameters @@ -631,8 +694,8 @@ def cg( assert len(b.shape) == 1 or (len(b.shape) == 2 and b.shape[1] == 1) assert len(A.shape) == 2 and A.shape[0] == A.shape[1] - bnrm2 = np.linalg.norm(b) - atol, _ = _get_atol_rtol(bnrm2, tol, atol, rtol) + b_norm = cn.linalg.norm(b) + atol, _ = _get_atol_rtol(b_norm, tol, atol, rtol) n = b.shape[0] if maxiter is None: @@ -644,15 +707,15 @@ def cg( if M is None else make_linear_operator(M) ) - x = np.zeros(n) if x0 is None else x0.copy() - p = np.zeros(n) + x = cn.zeros(n) if x0 is None else x0.copy() + p = cn.zeros(n) # This implementation is adapted from CuPy's CG solve: # https://github.com/cupy/cupy/blob/master/cupyx/scipy/sparse/linalg/_iterative.py. # # Hold onto several temps to store allocations used in each iteration. r = b - A.matvec(x) iters = 0 - rho = 0 + rho: int | cn.ndarray = 0 z = None q = None @@ -679,9 +742,9 @@ def cg( iters += 1 if callback is not None: callback(x) - if (iters % conv_test_iters == 0 or iters == (maxiter - 1)) and np.linalg.norm( - r - ) < atol: + if ( + iters % conv_test_iters == 0 or iters == (maxiter - 1) + ) and cn.linalg.norm(r) < atol: converged = True # Test convergence every conv_test_iters iterations. break @@ -696,19 +759,19 @@ def cg( # This implementation of GMRES is lifted from the cupy implementation: # https://github.com/cupy/cupy/blob/9d2e2381ae7f33a42291d1bf8271484c9d2a55ac/cupyx/scipy/sparse/linalg/_iterative.py#L94. 
def gmres( - A, - b, - x0=None, - tol=None, - restart=None, - maxiter=None, - M=None, - callback=None, - restrt=None, - atol=0.0, - callback_type=None, - rtol=1e-5, -): + A: Any | LinearOperator, + b: cn.ndarray, + x0: cn.ndarray | None = None, + tol: float | None = None, + restart: int | None = None, + maxiter: int | None = None, + M: Any | LinearOperator | None = None, + callback: Any = None, + restrt: int | None = None, + atol: float = 0.0, + callback_type: str | None = None, + rtol: float = 1e-5, +) -> tuple[cn.ndarray, int]: """Solve a linear system using the Generalized Minimal Residual method. Parameters @@ -796,10 +859,10 @@ def gmres( if M is None else make_linear_operator(M) ) - x = np.zeros(n) if x0 is None else x0.copy() + x = cn.zeros(n) if x0 is None else x0.copy() - bnrm2 = np.linalg.norm(b) - atol, _ = _get_atol_rtol(bnrm2, tol, atol, rtol) + b_norm = cn.linalg.norm(b) + atol, _ = _get_atol_rtol(b_norm, tol, atol, rtol) if maxiter is None: maxiter = n * 10 @@ -813,11 +876,11 @@ def gmres( if callback is None: callback_type = None - V = np.empty((n, restart), dtype=A.dtype) - H = np.zeros((restart + 1, restart), dtype=A.dtype) - e = np.zeros((restart + 1,), dtype=A.dtype) + V = cn.empty((n, restart), dtype=A.dtype) + H: Any = cn.zeros((restart + 1, restart), dtype=A.dtype) + e: Any = cn.zeros((restart + 1,), dtype=A.dtype) - def compute_hu(u, j): + def compute_hu(u: cn.ndarray, j: int) -> tuple[cn.ndarray, cn.ndarray]: """Compute Householder transformation for Arnoldi iteration. 
Parameters @@ -847,7 +910,7 @@ def compute_hu(u, j): while True: mx = M.matvec(x) r = b - A.matvec(mx) - r_norm = np.linalg.norm(r) + r_norm = cn.linalg.norm(r) if callback_type == "x": callback(mx) elif callback_type == "pr_norm" and iters > 0: @@ -863,14 +926,14 @@ def compute_hu(u, j): z = M.matvec(v) u = A.matvec(z) H[: j + 1, j], u = compute_hu(u, j) - H[j + 1, j] = np.linalg.norm(u) + H[j + 1, j] = cn.linalg.norm(u) if j + 1 < restart: v = u / H[j + 1, j] V[:, j + 1] = v # Note: The least-square solution to equation Hy = e is computed on CPU # because it is faster if tha matrix size is small. - ret = np.linalg.lstsq(H, e) + ret = cn.linalg.lstsq(H, e) # type: ignore [attr-defined] y = ret[0] x += V @ y iters += restart @@ -879,3 +942,258 @@ def compute_hu(u, j): if iters == maxiter and not (r_norm <= atol): info = iters return mx, info + + +def spsolve(A: Any, b: np.ndarray) -> np.ndarray: + """ + Solve a linear system of equation Ax=b by factorizing A + + Parameters + ---------- + A : csr_array + Input sparse matrix of shape (N, N). + b : cupynumeric.ndarray + Dense vector of shape (N,). + + Returns + ------- + x : cupynumeric.ndarray + Dense vector of shape (N,), that solves A x = b. + + Raises + ------ + RuntimeError + If attempted to solve on any configuration other than one GPU + ValueError + If the RHS is not one-dimensional + + Notes + ----- + This function uses cuDSS to perform the sparse direct solve, which + computes the reordering on Host. + + """ + + # TODO: + # Support multi-dimensional RHS. Note that cuDSS only supports + # column-major order for x and b, so we need to update the + # mapper for those stores. Partitioning constraints will also need to + # be changed since alignment constraints will need both stores + # to be of the same dimension (e.g., we cannot align pos (1D) + # and b (say, 2D) without manipulating the stores + + # NOTE: multi-gpu runs might hang with cuda < 13.0.0. 
+ # For multi-gpu runs, the user is expected to set the path to + # libcudss_comm_nccl.so in the env CUDSS_COMM_LIB + if runtime.num_gpus == 0: + raise RuntimeError("spsolve is currently supported only for GPU(s)") + + if b.ndim != 1: + raise ValueError(f"RHS must be 1D. Dimension of b is: {b.ndim}") + + b_store = get_store_from_cupynumeric_array(b) + x_store = runtime.create_store(b.dtype, shape=(A.shape[1],)) + + task = runtime.create_auto_task(SparseOpCode.SPSOLVE) + + pos_part = task.add_input(A.pos) + crd_part = task.add_input(A.crd) + vals_part = task.add_input(A.vals) + b_part = task.add_input(b_store) + x_part = task.add_output(x_store) + task.add_scalar_arg(A.shape[0], types.uint64) # global nrows + task.add_scalar_arg(A.vals.size, types.uint64) # global nnz + + # Add communicator + task.add_communicator("nccl") + + # Since we don't support multi-gpu or multi-cpu runs, these constraints + # are not particularly relevant right now, but they enable + # debugging the multi-gpu hang. The matrix and the vectors are + # partitioned row-wise without any sparsity-dependent constraints + # that is typical in other API implementations in legate-sparse + # that use mathlibs (e.g., cuSparse). This passes on the responsibility + # of inserting appropriate communication primitives to the + # underlying math library, cuDSS. 
This is why we don't constrain the
+    # partition of x to the image of crd (e.g., like in SpMv in csr.py)
+    task.add_constraint(image(pos_part, crd_part))
+    task.add_constraint(image(pos_part, vals_part))
+    task.add_constraint(align(x_part, pos_part))
+    task.add_constraint(align(b_part, pos_part))
+
+    task.execute()
+
+    return store_to_cupynumeric_array(x_store)
+
+
+# this function has been adapted from cupy's implementation of `eigsh`:
+# https://github.com/cupy/cupy/blob/v13.6.0/cupyx/scipy/sparse/linalg/_eigen.py
+def eigsh(
+    a,
+    k=6,
+    *,
+    which="LM",
+    v0=None,
+    ncv=None,
+    maxiter=None,
+    tol=0,
+    return_eigenvectors=True,
+):
+    def _lanczos(a, V, u, alpha, beta, i_start, i_end):
+        for i in range(i_start, i_end):
+            u[...] = a.matvec(V[i])
+            alpha[i] = cn.dot(V[i].conj(), u)
+
+            # Full reorthogonalization with "twice is enough" strategy
+            # for improved numerical stability. This matches the approach
+            # used in robust Lanczos implementations.
+            # First pass
+            coeffs = V[: i + 1].conj() @ u
+            u -= coeffs @ V[: i + 1]
+            # Second pass for numerical stability
+            coeffs2 = V[: i + 1].conj() @ u
+            u -= coeffs2 @ V[: i + 1]
+
+            beta[i] = cn.linalg.norm(u)
+            if i >= i_end - 1:
+                break
+            V[i + 1] = u / beta[i]
+
+    def _eigsh_solve_ritz(alpha, beta, beta_k, k, which):
+        # Note: This is done on the CPU using numpy, following CuPy's approach.
+        # This avoids numerical issues that can occur with GPU-based eigh
+        # on small tridiagonal matrices from the thick-restart Lanczos.
+ alpha_np = np.array(alpha) + beta_np = np.array(beta) + t = np.diag(alpha_np) + t = t + np.diag(beta_np[:-1], k=1) + t = t + np.diag(beta_np[:-1], k=-1) + if beta_k is not None: + beta_k_np = np.array(beta_k) + t[k, :k] = beta_k_np + t[:k, k] = beta_k_np + w, s = np.linalg.eigh(t) + + # Pick-up k ritz-values and ritz-vectors + if which == "LA": + idx = np.argsort(w) + wk = w[idx[-k:]] + sk = s[:, idx[-k:]] + elif which == "LM": + idx = np.argsort(np.absolute(w)) + wk = w[idx[-k:]] + sk = s[:, idx[-k:]] + elif which == "SA": + idx = np.argsort(w) + wk = w[idx[:k]] + sk = s[:, idx[:k]] + # Convert back to cupynumeric arrays + return cn.array(wk), cn.array(sk) + + # Convert to LinearOperator for uniform matvec interface + a = make_linear_operator(a) + n = a.shape[0] + if a.ndim != 2 or a.shape[0] != a.shape[1]: + raise ValueError("expected square matrix (shape: {})".format(a.shape)) + if a.dtype.char not in "fdFD": + raise TypeError("unsupprted dtype (actual: {})".format(a.dtype)) + if k <= 0: + raise ValueError("k must be greater than 0 (actual: {})".format(k)) + if k >= n: + raise ValueError("k must be smaller than n (actual: {})".format(k)) + if which not in ("LM", "LA", "SA"): + raise ValueError( + "which must be 'LM','LA'or'SA' (actual: {})".format(which) + ) + if ncv is None: + ncv = min(max(2 * k, k + 32), n - 1) + else: + ncv = min(max(ncv, k + 2), n - 1) + if maxiter is None: + maxiter = 10 * n + if tol == 0: + tol = cn.finfo(a.dtype).eps + + if k + 1 == ncv: + raise ValueError( + f"k must be smaller than ncv - 1 (k + 1 < ncv < n)." 
+ f" ncv: {ncv}, k: {k}, n: {n}" + ) + + alpha = cn.zeros((ncv,), dtype=a.dtype) + beta = cn.zeros((ncv,), dtype=a.dtype.char.lower()) + V = cn.empty((ncv, n), dtype=a.dtype) + + if v0 is None: + u = cn.random.random((n,)).astype(a.dtype) + V[0] = u / cn.linalg.norm(u) + else: + u = v0 + V[0] = v0 / cn.linalg.norm(v0) + + _lanczos(a, V, u, alpha, beta, 0, ncv) + + iter_current = ncv + w, s = _eigsh_solve_ritz(alpha, beta, None, k, which) + x = V.T @ s + + beta_k = beta[-1] * s[-1, :] + res = cn.linalg.norm(beta_k) + + iter_increment = ncv - k + # Track initial beta scale for detecting relative breakdown + # When beta[k] is too small relative to the typical beta values, + # the thick restart becomes numerically unstable + initial_beta_scale = cn.max(cn.abs(beta[:-1])) + + while res > tol and iter_current < maxiter: + beta[:k] = 0 + alpha[:k] = w + V[:k] = x.T + + # Full reorthogonalization with "twice is enough" (same as in _lanczos) + coeffs = V[:k].conj() @ u + u = u - coeffs @ V[:k] + coeffs2 = V[:k].conj() @ u + u = u - coeffs2 @ V[:k] + + u_norm = cn.linalg.norm(u) + # Check for numerical breakdown: if u_norm is too small relative + # to initial scale, the thick restart becomes numerically unstable. + # A ratio < 0.1 indicates potential numerical issues. + if u_norm < 0.1 * initial_beta_scale: + # Accept current eigenvalues as converged + break + + V[k] = u / u_norm + u[...] 
= a.matvec(V[k]) + alpha[k] = cn.dot(V[k].conj(), u) + u -= alpha[k] * V[k] + u -= V[:k].T @ beta_k + beta[k] = cn.linalg.norm(u) + + # Check for numerical breakdown after computing beta[k] + # If beta[k] is very small relative to initial scale, + # continuing will cause numerical instability + if beta[k] < 0.1 * initial_beta_scale: + # Accept current eigenvalues as converged + break + + # note that this can run into Out of bounds error + # in legate if `k` is not properly constrained + # in the initial part of the algorithm + V[k + 1] = u / beta[k] + + _lanczos(a, V, u, alpha, beta, k + 1, ncv) + w, s = _eigsh_solve_ritz(alpha, beta, beta_k, k, which) + x = V.T @ s + beta_k = beta[-1] * s[-1, :] + res = cn.linalg.norm(beta_k) + + iter_current += iter_increment + + if return_eigenvectors: + idx = cn.argsort(w) + return w[idx], x[:, idx] + else: + return cn.sort(w) diff --git a/legate_sparse/module.py b/legate_sparse/module.py index 56f22fa1..2fe4c5dd 100644 --- a/legate_sparse/module.py +++ b/legate_sparse/module.py @@ -44,7 +44,9 @@ # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +from __future__ import annotations +from typing import Any from .csr import csr_array # noqa: F401 from .dia import dia_array # noqa: F401 @@ -56,10 +58,11 @@ # returns whether or not an object is a legate sparse created sparse matrix. -def _is_sparse_matrix(obj) -> bool: +def _is_sparse_matrix(obj: Any) -> bool: return any((isinstance(obj, csr_array), isinstance(obj, dia_array))) -def isspmatrix(obj) -> bool: + +def isspmatrix(obj: Any) -> bool: """Check if an object is a legate sparse matrix. Parameters @@ -81,7 +84,7 @@ def isspmatrix(obj) -> bool: return _is_sparse_matrix(obj) -def issparse(obj) -> bool: +def issparse(obj: Any) -> bool: """Check if an object is a legate sparse matrix. 
Parameters @@ -104,7 +107,7 @@ def issparse(obj) -> bool: # Variants for each particular format type. -def isspmatrix_csr(obj): +def isspmatrix_csr(obj: Any) -> bool: """Check if an object is a CSR sparse matrix. Parameters diff --git a/legate_sparse/runtime.py b/legate_sparse/runtime.py index e7a3dc41..84e12d3e 100644 --- a/legate_sparse/runtime.py +++ b/legate_sparse/runtime.py @@ -30,11 +30,12 @@ from .config import SparseOpCode, _library if TYPE_CHECKING: - from typing import Optional, Union + from typing import Any import numpy.typing as npt + from legate.core import Library -TO_CORE_DTYPES = { +TO_CORE_DTYPES: dict[npt.DTypeLike, types.Type] = { np.dtype(np.bool_): types.bool_, np.dtype(np.int8): types.int8, np.dtype(np.int16): types.int16, @@ -54,7 +55,7 @@ # TODO (marsaev): rename to SparseRuntime to avoid confusion? class Runtime: - def __init__(self, sparse_library): + def __init__(self, sparse_library: Library) -> None: self.sparse_library = sparse_library self.legate_runtime = get_legate_runtime() self.legate_machine = get_machine() @@ -66,25 +67,25 @@ def __init__(self, sparse_library): task = self.legate_runtime.create_manual_task( self.sparse_library, SparseOpCode.LOAD_CUDALIBS, - launch_shape=Shape((self.num_gpus,)), + launch_shape=(self.num_gpus,), ) task.execute() self.legate_runtime.issue_execution_fence(block=True) @property - def num_procs(self): + def num_procs(self) -> int: return self.legate_machine.count(self.legate_machine.preferred_target) @property - def num_gpus(self): + def num_gpus(self) -> int: return self.legate_machine.count(TaskTarget.GPU) def create_store( self, - ty: Union[npt.DTypeLike], - shape: Optional[Union[tuple[int, ...], Shape]] = None, + ty: npt.dtype[Any] | types.Type, + shape: Shape | tuple[int, ...] 
| None = None, optimize_scalar: bool = False, - ndim: Optional[int] = None, + ndim: int | None = None, ) -> LogicalStore: core_ty = TO_CORE_DTYPES[ty] if isinstance(ty, np.dtype) else ty return self.legate_runtime.create_store( @@ -92,11 +93,13 @@ def create_store( ) # only OpCode - def create_auto_task(self, OpCode) -> AutoTask: - return self.legate_runtime.create_auto_task(self.sparse_library, OpCode) + def create_auto_task(self, OpCode: int) -> AutoTask: + return self.legate_runtime.create_auto_task( + self.sparse_library, OpCode + ) # OpCode and launch domains - def create_manual_task(self, OpCode, *args) -> ManualTask: + def create_manual_task(self, OpCode: int, *args: Any) -> ManualTask: return self.legate_runtime.create_manual_task( self.sparse_library, OpCode, *args ) diff --git a/legate_sparse/settings.py b/legate_sparse/settings.py index 31e48a0c..7d518777 100644 --- a/legate_sparse/settings.py +++ b/legate_sparse/settings.py @@ -14,7 +14,12 @@ # from __future__ import annotations -from legate.util.settings import PrioritizedSetting, Settings, convert_bool +from legate.util.settings import ( + PrioritizedSetting, + Settings, + convert_bool, + convert_str, +) __all__ = ("settings",) @@ -32,5 +37,15 @@ class SparseRuntimeSettings(Settings): """, ) + cudss_commnccl_loc: PrioritizedSetting[bool] = PrioritizedSetting( + "cudss-comm-lib", + "CUDSS_COMM_LIB", + default="", + convert=convert_str, + help=""" + For multi-gpu runs, set CUDSS_COMM_LIB env to /path/to/libcudss_commlayer_nccl.so + """, + ) + settings = SparseRuntimeSettings() diff --git a/legate_sparse/types.py b/legate_sparse/types.py index 923767f2..a566f617 100644 --- a/legate_sparse/types.py +++ b/legate_sparse/types.py @@ -11,26 +11,27 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+from __future__ import annotations -import numpy +import numpy as np # Define some common types. Hopefully as we make more # progress in generalizing the compute kernels, we can # remove this code. -coord_ty = numpy.dtype(numpy.int64) +coord_ty = np.dtype(np.int64) """Data type for coordinate indices in sparse matrices (int64).""" -nnz_ty = numpy.dtype(numpy.uint64) +nnz_ty = np.dtype(np.uint64) """Data type for non-zero counts in sparse matrices (uint64).""" -float64 = numpy.dtype(numpy.float64) +float64 = np.dtype(np.float64) """64-bit floating point data type.""" -int32 = numpy.dtype(numpy.int32) +int32 = np.dtype(np.int32) """32-bit integer data type.""" -int64 = numpy.dtype(numpy.int64) +int64 = np.dtype(np.int64) """64-bit integer data type.""" -uint64 = numpy.dtype(numpy.uint64) +uint64 = np.dtype(np.uint64) """64-bit unsigned integer data type.""" diff --git a/legate_sparse/utils.py b/legate_sparse/utils.py index 2c072f2b..31b28a28 100644 --- a/legate_sparse/utils.py +++ b/legate_sparse/utils.py @@ -11,26 +11,29 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+from __future__ import annotations import math import traceback -from typing import Any +from typing import TYPE_CHECKING, cast -import cupynumeric -import numpy +import cupynumeric as cn +import numpy as np from legate.core import LogicalStore import legate_sparse from .runtime import runtime +if TYPE_CHECKING: + from typing import Any + + import numpy.typing as npt + + from .csr import csr_array + # Datatypes that spmv and spgemm operations are supported for -SUPPORTED_DATATYPES = ( - numpy.float32, - numpy.float64, - numpy.complex64, - numpy.complex128, -) +SUPPORTED_DATATYPES = (np.float32, np.float64, np.complex64, np.complex128) """Supported datatypes for sparse matrix operations (SpMV and SpGEMM).""" @@ -59,7 +62,7 @@ def find_last_user_stacklevel() -> int: # store_to_cupynumeric_array converts a store to a cuPyNumeric array. -def store_to_cupynumeric_array(store: LogicalStore): +def store_to_cupynumeric_array(store: LogicalStore) -> cn.ndarray: """Convert a LogicalStore to a cupynumeric array. Parameters @@ -72,13 +75,12 @@ def store_to_cupynumeric_array(store: LogicalStore): cupynumeric.ndarray The cupynumeric array representation of the store. """ - return cupynumeric.asarray(store) + return cn.asarray(store) # get_store_from_cupynumeric_array extracts a store from a cuPyNumeric array. def get_store_from_cupynumeric_array( - arr: cupynumeric.ndarray, - copy=False, + arr: cn.ndarray, copy: bool = False ) -> LogicalStore: """Extract a LogicalStore from a cupynumeric array. @@ -96,17 +98,17 @@ def get_store_from_cupynumeric_array( """ if copy: # If requested to make a copy, do so. - arr = cupynumeric.array(arr) + arr = cn.array(arr) data = arr.__legate_data_interface__["data"] array = data[next(iter(data))] store = array.data - return store + return cast(LogicalStore, store) # cast_to_store attempts to cast an arbitrary object into a store. 
-def cast_to_store(arr): +def cast_to_store(arr: cn.ndarray | LogicalStore) -> LogicalStore: """Cast an arbitrary object to a LogicalStore. Parameters @@ -126,16 +128,18 @@ def cast_to_store(arr): """ if isinstance(arr, LogicalStore): return arr - if isinstance(arr, numpy.ndarray): - arr = cupynumeric.array(arr) - if isinstance(arr, cupynumeric.ndarray): + if isinstance(arr, np.ndarray): + arr = cn.array(arr) + if isinstance(arr, cn.ndarray): return get_store_from_cupynumeric_array(arr) raise NotImplementedError # cast_arr attempts to cast an arbitrary object into a cupynumeric # ndarray, with an optional desired type. -def cast_arr(arr, dtype=None): +def cast_arr( + arr: cn.ndarray | LogicalStore, dtype: npt.dtype[Any] | None = None +) -> cn.ndarray: """Cast an arbitrary object to a cupynumeric array. Parameters @@ -152,14 +156,16 @@ def cast_arr(arr, dtype=None): """ if isinstance(arr, LogicalStore): arr = store_to_cupynumeric_array(arr) - elif not isinstance(arr, cupynumeric.ndarray): - arr = cupynumeric.array(arr) + elif not isinstance(arr, cn.ndarray): + arr = cn.array(arr) if dtype is not None: arr = arr.astype(dtype) return arr -def find_common_type(*args): +def find_common_type( + *args: cn.ndarray | csr_array | np.ndarray, +) -> npt.dtype[Any]: """Find the common data type for a set of arrays. This function performs a similar analysis to cupynumeric.ndarray.find_common_type @@ -190,33 +196,10 @@ def find_common_type(*args): scalar_types.append(array.dtype) else: array_types.append(array.dtype) - return numpy.result_type(*array_types, *scalar_types) - - -def cast_to_common_type(*args): - """Cast all arguments to the same common data type. - - Parameters - ---------- - *args : array_like - Arrays to cast to a common type. - - Returns - ------- - tuple - Tuple of arrays, all cast to the same common data type. 
+ return np.result_type(*array_types, *scalar_types) - Notes - ----- - This function first finds the common type using find_common_type, - then casts each input to that type. If all arguments are already - the common type, this will be a no-op. - """ - common_type = find_common_type(*args) - return tuple(arg.astype(common_type, copy=False) for arg in args) - -def factor_int(n): +def factor_int(n: int) -> tuple[int, int]: """Split an integer into two close factors. Parameters @@ -242,7 +225,9 @@ def factor_int(n): return val, val2 -def broadcast_store(store: LogicalStore, shape: Any) -> LogicalStore: +def broadcast_store( + store: LogicalStore, shape: tuple[int, ...] +) -> LogicalStore: """Broadcast a LogicalStore to the desired shape. Parameters @@ -294,12 +279,14 @@ def copy_store(store: LogicalStore) -> LogicalStore: LogicalStore A new LogicalStore with the same data as the input. """ - res = runtime.create_store(store.type, store.shape) # type: ignore + res = runtime.create_store(store.type, store.shape) runtime.legate_runtime.issue_copy(res, store) return res -def store_from_store_or_array(src, copy=False) -> LogicalStore: # type: ignore +def store_from_store_or_array( + src: LogicalStore | cn.ndarray, copy: bool = False +) -> LogicalStore: """Get LogicalStore from a LogicalStore or array, potentially creating a copy. Parameters @@ -319,15 +306,19 @@ def store_from_store_or_array(src, copy=False) -> LogicalStore: # type: ignore AssertionError If the input type is not supported. 
""" - if isinstance(src, cupynumeric.ndarray): + if isinstance(src, cn.ndarray): return get_store_from_cupynumeric_array(src, copy) elif isinstance(src, LogicalStore): return copy_store(src) if copy else src else: - AssertionError("Wrong type for 'store_from_store_or_array()' utility") + raise AssertionError( + "Wrong type for 'store_from_store_or_array()' utility" + ) -def array_from_store_or_array(src, copy=False) -> cupynumeric.ndarray: # type: ignore +def array_from_store_or_array( + src: LogicalStore | cn.ndarray, copy: bool = False +) -> cn.ndarray: """Get array from a LogicalStore or array, potentially creating a copy. Parameters @@ -347,7 +338,7 @@ def array_from_store_or_array(src, copy=False) -> cupynumeric.ndarray: # type: AssertionError If the input type is not supported. """ - if isinstance(src, cupynumeric.ndarray): + if isinstance(src, cn.ndarray): return src.copy() if copy else src elif isinstance(src, LogicalStore): return ( @@ -356,11 +347,12 @@ def array_from_store_or_array(src, copy=False) -> cupynumeric.ndarray: # type: else store_to_cupynumeric_array(src) ) else: - AssertionError("Wrong type for 'array_from_store_or_array()' utility") - # type: ignore + raise AssertionError( + "Wrong type for 'array_from_store_or_array()' utility" + ) -def get_storage_type(src): +def get_storage_type(src: LogicalStore | cn.ndarray) -> npt.dtype[Any]: """Get the storage type of an object. Parameters @@ -378,18 +370,17 @@ def get_storage_type(src): AssertionError If the input type is not supported. 
""" - if isinstance(src, cupynumeric.ndarray): + if isinstance(src, cn.ndarray): return src.dtype elif isinstance(src, LogicalStore): # there is legate.core to_core_dtype(), but here we need the opposite # doing via array now return cast_arr(src).dtype else: - AssertionError("Wrong type for 'get_storage_type()' utility") - # type: ignore + raise AssertionError("Wrong type for 'get_storage_type()' utility") -def is_dtype_supported(dtype: numpy.dtype) -> bool: +def is_dtype_supported(dtype: npt.dtype[Any]) -> bool: """Check if a datatype supports SpMV and SpGEMM operations. Parameters @@ -409,7 +400,7 @@ def is_dtype_supported(dtype: numpy.dtype) -> bool: return dtype in SUPPORTED_DATATYPES -def is_dense(x) -> bool: +def is_dense(x: Any) -> bool: """Check if an object is a dense cupynumeric array. Parameters @@ -422,10 +413,10 @@ def is_dense(x) -> bool: bool True if x is a cupynumeric.ndarray, False otherwise. """ - return isinstance(x, cupynumeric.ndarray) + return isinstance(x, cn.ndarray) -def is_scalar_like(x) -> bool: +def is_scalar_like(x: Any) -> bool: """Check if an object is a scalar-like type. Parameters @@ -445,10 +436,10 @@ def is_scalar_like(x) -> bool: """ if isinstance(x, str): return False - return cupynumeric.isscalar(x) or (is_dense(x) and x.ndim == 0) + return cn.isscalar(x) or (is_dense(x) and x.ndim == 0) -def is_sparse(x) -> bool: +def is_sparse(x: Any) -> bool: """Check if an object is a legate sparse matrix. Parameters @@ -464,7 +455,7 @@ def is_sparse(x) -> bool: return legate_sparse.isspmatrix(x) -def sort_by_rows_then_cols(rows: cupynumeric.ndarray, cols: cupynumeric.ndarray): +def sort_by_rows_then_cols(rows: cn.ndarray, cols: cn.ndarray) -> cn.ndarray: """Sort indices by rows first, then by columns. 
This function is a quick and dirty hack that does what np.lexsort does @@ -501,7 +492,7 @@ def sort_by_rows_then_cols(rows: cupynumeric.ndarray, cols: cupynumeric.ndarray) # note that the lexsort reverses the order of key, # so this would be equivalent to np.lexsort((cols, rows)) - indices = cupynumeric.argsort(cols, kind="stable") - order = cupynumeric.argsort(rows[indices], kind="stable") + indices = cn.argsort(cols, kind="stable") + order = cn.argsort(rows[indices], kind="stable") return indices[order] diff --git a/legate_sparse_cpp.cmake b/legate_sparse_cpp.cmake index 6a90e3b3..2f37b63d 100644 --- a/legate_sparse_cpp.cmake +++ b/legate_sparse_cpp.cmake @@ -105,6 +105,7 @@ if(Legion_USE_CUDA) ) include(cmake/thirdparty/get_nccl.cmake) + include(cmake/thirdparty/get_cudss.cmake) endif() # End From cupynumeric @@ -134,7 +135,7 @@ list(APPEND legate_sparse_SOURCES src/legate_sparse/array/csr/spmv.cc src/legate_sparse/array/csr/spgemm_csr_csr_csr.cc src/legate_sparse/array/csr/indexing.cc - + src/legate_sparse/array/util/unzip_rect.cc src/legate_sparse/array/util/zip_to_rect.cc @@ -142,6 +143,8 @@ list(APPEND legate_sparse_SOURCES src/legate_sparse/io/mtx_to_coo.cc src/legate_sparse/linalg/axpby.cc + src/legate_sparse/linalg/spsolve.cc + src/legate_sparse/array/csr/geam.cc ) if(Legion_USE_OpenMP) @@ -154,6 +157,7 @@ if(Legion_USE_OpenMP) src/legate_sparse/array/csr/spmv_omp.cc src/legate_sparse/array/csr/spgemm_csr_csr_csr_omp.cc src/legate_sparse/array/csr/indexing_omp.cc + src/legate_sparse/array/csr/geam_omp.cc src/legate_sparse/array/util/unzip_rect_omp.cc src/legate_sparse/array/util/zip_to_rect_omp.cc @@ -164,7 +168,7 @@ endif() if(Legion_USE_CUDA) list(APPEND legate_sparse_SOURCES - src/legate_sparse/cudalibs.cu + src/legate_sparse/cudalibs.cu src/legate_sparse/array/conv/dense_to_csr.cu src/legate_sparse/array/conv/csr_to_dense.cu @@ -174,19 +178,21 @@ if(Legion_USE_CUDA) src/legate_sparse/array/csr/spmv.cu src/legate_sparse/array/csr/spgemm_csr_csr_csr.cu 
src/legate_sparse/array/csr/indexing.cu + src/legate_sparse/array/csr/geam.cu src/legate_sparse/array/util/unzip_rect.cu src/legate_sparse/array/util/zip_to_rect.cu - + src/legate_sparse/partition/fast_image_partition.cu src/legate_sparse/linalg/axpby.cu + src/legate_sparse/linalg/spsolve.cu ) endif() list(APPEND legate_sparse_SOURCES - + # This must always be the last file! # It guarantees we do our registration callback # only after all task variants are recorded @@ -237,17 +243,21 @@ set_target_properties(legate_sparse CUDA_STANDARD_REQUIRED ON LIBRARY_OUTPUT_DIRECTORY lib) +# NOTE: For multi-GPU runs, the env CUDSS_COMM_LIB must be set to path to libcudss_commlayer_nccl.so +# conda install -c conda-forge libcudss libcudss-dev libcudss-commlayer-nccl +# should install it in ${CONDA_PREFIX}/lib/ target_link_libraries(legate_sparse PUBLIC legate::legate $ # do we need to put this dependency here? # what is the correct target? # cupynumeric::cupynumeric - PRIVATE + PRIVATE # Add Conda library and include paths $ $ $ + $ $) diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 00000000..93344517 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,118 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +[build-system] +requires = [ + "wheel", + "ninja", + "setuptools", + "scikit-build>=0.13.1", + "cmake>=3.30.4", +] +build-backend = "setuptools.build_meta" + +[tool.pytest.ini_options] +addopts = "--capture=sys" +cache_dir = "./.cache/pytest" + +[tool.mypy] +python_version = "3.11" +cache_dir = "./.cache/mypy" + +pretty = true +show_error_codes = true +show_error_context = true +show_column_numbers = true + +namespace_packages = true +ignore_missing_imports = false + +disallow_any_unimported = true +disallow_any_expr = false +disallow_any_decorated = false +disallow_any_explicit = false +disallow_any_generics = true +disallow_subclassing_any = true + +disallow_untyped_calls = true +disallow_untyped_defs = true +disallow_incomplete_defs = true +check_untyped_defs = true +disallow_untyped_decorators = true +no_implicit_optional = true +strict_optional = true + +warn_redundant_casts = true +warn_unused_ignores = false +warn_no_return = true +warn_return_any = true +warn_unreachable = true + +ignore_errors = false + +allow_untyped_globals = false +allow_redefinition = false +implicit_reexport = true +strict_equality = true + +warn_unused_configs = true + +[[tool.mypy.overrides]] +# ignore auto-generated files +# or files depending on auto-generated field +module = [ + "legate_sparse.install_info", + "legate_sparse._version", + "legate._version", + "legate.__main__", + "legate.install_info", +] +ignore_errors = true + +[tool.ruff] +cache-dir = "./.cache/ruff" +extend-exclude = [ + "arch-*", + "*-arch", + "venv", + "*venv", + "build", +] +line-length = 79 +src = [".", "legate_sparse"] + +[tool.ruff.format] +skip-magic-trailing-comma = true + +[tool.ruff.lint.isort.sections] +legion = ["legion_cffi", "legion_top"] +legate = ["legate"] +testing = ["pytest", "pytest_mock"] + +[tool.ruff.lint.isort] +known-third-party = ["numpy", "scipy"] +known-first-party = ["legate_sparse"] +length-sort-straight = true +combine-as-imports = true +split-on-trailing-comma = false 
+required-imports = ["from __future__ import annotations"] +section-order = [ + "future", + "standard-library", + "third-party", + "legion", + "legate", + "first-party", + "local-folder", +] diff --git a/scripts/memlog_analysis.py b/scripts/memlog_analysis.py old mode 100644 new mode 100755 index ee8bd3c6..e16a5369 --- a/scripts/memlog_analysis.py +++ b/scripts/memlog_analysis.py @@ -16,10 +16,10 @@ # Parse the log file allocations = parse_memlog('memlog.txt') - + # Export to CSV export_to_csv(allocations, 'memory_analysis.csv') - + # Create visualizations (requires pandas, matplotlib, seaborn) visualize_allocations(allocations) """ # noqa: W293 @@ -116,7 +116,9 @@ def export_to_csv( # If unique_mb_only is enabled, check for similar memory sizes if unique_mb_only: is_similar = any( - are_similar_sizes(mb_size, seen_size, threshold_percent) + are_similar_sizes( + mb_size, seen_size, threshold_percent + ) for seen_size in seen_mb_sizes ) if is_similar: @@ -145,7 +147,9 @@ def export_to_csv( ) -def export_to_excel(allocations: List[BufferAllocation], output_file: str) -> bool: +def export_to_excel( + allocations: List[BufferAllocation], output_file: str +) -> bool: """ Export memory allocation data to formatted Excel file. 
@@ -299,7 +303,9 @@ def visualize_allocations( """ if not all([PANDAS_AVAILABLE, MATPLOTLIB_AVAILABLE, SEABORN_AVAILABLE]): print("Error: Visualization requires pandas, matplotlib, and seaborn.") - print("Please install them with: pip install pandas matplotlib seaborn") + print( + "Please install them with: pip install pandas matplotlib seaborn" + ) return False # Convert to DataFrame @@ -360,7 +366,9 @@ def visualize_allocations( else: # Memory usage by description (top 10) plt.subplot(2, 2, 1) - top_descriptions = df.groupby("Description")["Size_MB"].sum().nlargest(10) + top_descriptions = ( + df.groupby("Description")["Size_MB"].sum().nlargest(10) + ) sns.barplot(x=top_descriptions.values, y=top_descriptions.index) plt.title("Top 10 Memory Usage by Description") plt.xlabel("Memory (MB)") @@ -372,7 +380,9 @@ def visualize_allocations( plt.title("Memory Distribution by Type") plt.tight_layout() - plt.savefig(f"{output_dir}/memory_analysis.png", dpi=300, bbox_inches="tight") + plt.savefig( + f"{output_dir}/memory_analysis.png", dpi=300, bbox_inches="tight" + ) plt.close() return True @@ -383,7 +393,9 @@ def main(): from memlog_parser import parse_memlog - parser = argparse.ArgumentParser(description="Analyze memory allocation logs") + parser = argparse.ArgumentParser( + description="Analyze memory allocation logs" + ) parser.add_argument("file", help="Path to the memory log file") parser.add_argument( "--output-dir", default=".", help="Directory to save output files" diff --git a/scripts/memlog_cli.py b/scripts/memlog_cli.py old mode 100644 new mode 100755 index ef45a129..94cf430c --- a/scripts/memlog_cli.py +++ b/scripts/memlog_cli.py @@ -14,7 +14,11 @@ import os import sys -from memlog_analysis import export_to_csv, export_to_excel, visualize_allocations +from memlog_analysis import ( + export_to_csv, + export_to_excel, + visualize_allocations, +) from memlog_parser import ( filter_allocations, parse_memlog, @@ -49,8 +53,12 @@ def check_dependencies(format: str) 
-> bool: import pandas # noqa: F401 import seaborn # noqa: F401 except ImportError: - print("Error: Visualization requires pandas, matplotlib, and seaborn.") - print("Please install them with: pip install pandas matplotlib seaborn") + print( + "Error: Visualization requires pandas, matplotlib, and seaborn." + ) + print( + "Please install them with: pip install pandas matplotlib seaborn" + ) return False return True diff --git a/scripts/memlog_parser.py b/scripts/memlog_parser.py old mode 100644 new mode 100755 index 0024b41d..854b3a72 --- a/scripts/memlog_parser.py +++ b/scripts/memlog_parser.py @@ -60,7 +60,9 @@ class BufferAllocation: def total_bytes(self) -> int: """Calculate total bytes allocated including data type size.""" - type_size = TYPE_SIZES.get(self.type, 1) # Default to 1 byte if type not found + type_size = TYPE_SIZES.get( + self.type, 1 + ) # Default to 1 byte if type not found return self.size * type_size def total_mb(self) -> float: @@ -68,7 +70,9 @@ def total_mb(self) -> float: return self.total_bytes() / (1024 * 1024) -def are_similar_sizes(size1: float, size2: float, threshold_percent: float) -> bool: +def are_similar_sizes( + size1: float, size2: float, threshold_percent: float +) -> bool: """ Check if two sizes are similar within the given percentage threshold. 
@@ -190,7 +194,10 @@ def filter_allocations( filtered = [] for alloc in allocations: - if alloc.description not in ignore_descriptions and alloc.total_mb() >= min_mb: + if ( + alloc.description not in ignore_descriptions + and alloc.total_mb() >= min_mb + ): filtered.append(alloc) return filtered @@ -215,9 +222,9 @@ def print_description_group( max_bytes = max(alloc.total_bytes() for alloc in allocs) print(f"\n{desc}:") print( - f" Total bytes (includes non-unique allocs): {desc_total_bytes / (1024*1024):.2f} MB" + f" Total bytes (includes non-unique allocs): {desc_total_bytes / (1024 * 1024):.2f} MB" ) - print(f" Max bytes : {max_bytes / (1024*1024):.2f} MB") + print(f" Max bytes : {max_bytes / (1024 * 1024):.2f} MB") # Track seen entries for this description seen_entries = set() @@ -275,9 +282,9 @@ def print_size_group( print(f"\nSize: {size} elements:") print( - f" Total bytes (includes non-unique allocs): {size_total_bytes / (1024*1024):.2f} MB" + f" Total bytes (includes non-unique allocs): {size_total_bytes / (1024 * 1024):.2f} MB" ) - print(f" Max bytes : {max_bytes / (1024*1024):.2f} MB") + print(f" Max bytes : {max_bytes / (1024 * 1024):.2f} MB") for alloc in allocs: mb_size = alloc.total_mb() diff --git a/scripts/pre-commit/yamllint.yml b/scripts/pre-commit/yamllint.yml new file mode 100644 index 00000000..2017e01d --- /dev/null +++ b/scripts/pre-commit/yamllint.yml @@ -0,0 +1,6 @@ +--- +extends: default +rules: + truthy: + ignore: ".github/workflows/*.yml" + line-length: disable diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 00000000..eed372ce --- /dev/null +++ b/setup.cfg @@ -0,0 +1,7 @@ +[mypy] +python_version = 3.11 +strict = True +implicit_reexport = true + +[mypy-legate_sparse._version] +ignore_errors = True diff --git a/setup.py b/setup.py old mode 100644 new mode 100755 index 68efb75c..c358d32e --- a/setup.py +++ b/setup.py @@ -38,7 +38,7 @@ setup( name="legate-sparse", - version="25.07.00", + version="26.02.00", description="An 
Aspiring Drop-In Replacement for SciPy Sparse module at Scale", author="NVIDIA Corporation", license="Apache 2.0", @@ -52,10 +52,7 @@ "Programming Language :: Python :: 3.12", "Programming Language :: Python :: 3.13", ], - packages=find_packages( - where=".", - include=["legate_sparse*"], - ), + packages=find_packages(where=".", include=["legate_sparse*"]), include_package_data=True, zip_safe=False, ) diff --git a/src/legate_sparse/array/conv/csr_to_dense.cc b/src/legate_sparse/array/conv/csr_to_dense.cc index de9a8958..5ff66d63 100644 --- a/src/legate_sparse/array/conv/csr_to_dense.cc +++ b/src/legate_sparse/array/conv/csr_to_dense.cc @@ -23,6 +23,9 @@ using namespace legate; template struct CSRToDenseImplBody { + TaskContext context; + explicit CSRToDenseImplBody(TaskContext context) : context(context) {} + using INDEX_TY = type_of; using VAL_TY = type_of; diff --git a/src/legate_sparse/array/conv/csr_to_dense.cu b/src/legate_sparse/array/conv/csr_to_dense.cu index 2d3c159f..de98b015 100644 --- a/src/legate_sparse/array/conv/csr_to_dense.cu +++ b/src/legate_sparse/array/conv/csr_to_dense.cu @@ -48,6 +48,9 @@ __global__ void CSRtoDenseKernel(size_t rows, template <> struct CSRToDenseImpl { + TaskContext context; + explicit CSRToDenseImpl(TaskContext context) : context(context) {} + template void operator()(CSRToDenseArgs& args) const { @@ -64,7 +67,7 @@ struct CSRToDenseImpl { return; } - auto stream = get_cached_stream(); + auto stream = context.get_task_stream(); auto B_domain = B_pos.domain(); auto rows = B_domain.hi()[0] - B_domain.lo()[0] + 1; diff --git a/src/legate_sparse/array/conv/csr_to_dense_omp.cc b/src/legate_sparse/array/conv/csr_to_dense_omp.cc index ec5da532..d048e0d6 100644 --- a/src/legate_sparse/array/conv/csr_to_dense_omp.cc +++ b/src/legate_sparse/array/conv/csr_to_dense_omp.cc @@ -23,6 +23,9 @@ using namespace legate; template struct CSRToDenseImplBody { + TaskContext context; + explicit CSRToDenseImplBody(TaskContext context) : 
context(context) {} + using INDEX_TY = type_of; using VAL_TY = type_of; diff --git a/src/legate_sparse/array/conv/csr_to_dense_template.inl b/src/legate_sparse/array/conv/csr_to_dense_template.inl index 58529312..9fb8d4dd 100644 --- a/src/legate_sparse/array/conv/csr_to_dense_template.inl +++ b/src/legate_sparse/array/conv/csr_to_dense_template.inl @@ -31,6 +31,9 @@ struct CSRToDenseImplBody; template struct CSRToDenseImpl { + TaskContext context; + explicit CSRToDenseImpl(TaskContext context) : context(context) {} + template void operator()(CSRToDenseArgs& args) const { @@ -45,7 +48,7 @@ struct CSRToDenseImpl { if (args.A_vals.domain().empty()) { return; } - CSRToDenseImplBody()( + CSRToDenseImplBody{context}( A_vals, B_pos, B_crd, B_vals, args.A_vals.shape<2>()); } }; @@ -61,7 +64,7 @@ static void csr_to_dense_template(TaskContext context) CSRToDenseArgs args{outputs[0], context.inputs()[0], context.inputs()[1], context.inputs()[2]}; index_type_value_type_dispatch( - args.B_crd.code(), args.A_vals.code(), CSRToDenseImpl{}, args); + args.B_crd.code(), args.A_vals.code(), CSRToDenseImpl{context}, args); } } // namespace sparse diff --git a/src/legate_sparse/array/conv/dense_to_csr.cc b/src/legate_sparse/array/conv/dense_to_csr.cc index 3304b558..410b37d8 100644 --- a/src/legate_sparse/array/conv/dense_to_csr.cc +++ b/src/legate_sparse/array/conv/dense_to_csr.cc @@ -23,6 +23,9 @@ using namespace legate; template struct DenseToCSRNNZImplBody { + TaskContext context; + explicit DenseToCSRNNZImplBody(TaskContext context) : context(context) {} + using VAL_TY = type_of; void operator()(const AccessorWO& nnz, @@ -43,6 +46,9 @@ struct DenseToCSRNNZImplBody { template struct DenseToCSRImplBody { + TaskContext context; + explicit DenseToCSRImplBody(TaskContext context) : context(context) {} + using INDEX_TY = type_of; using VAL_TY = type_of; diff --git a/src/legate_sparse/array/conv/dense_to_csr.cu b/src/legate_sparse/array/conv/dense_to_csr.cu index e38d906b..34698a4d 
100644 --- a/src/legate_sparse/array/conv/dense_to_csr.cu +++ b/src/legate_sparse/array/conv/dense_to_csr.cu @@ -44,6 +44,9 @@ __global__ void denseToCSRNNZKernel(size_t rows, template <> struct DenseToCSRNNZImpl { + TaskContext context; + explicit DenseToCSRNNZImpl(TaskContext context) : context(context) {} + template void operator()(DenseToCSRNNZArgs& args) const { @@ -57,7 +60,7 @@ struct DenseToCSRNNZImpl { return; } - auto stream = get_cached_stream(); + auto stream = context.get_task_stream(); // #if (CUSPARSE_VER_MAJOR < 11 || (CUSPARSE_VER_MAJOR == 11 && CUSPARSE_VER_MINOR < 2)) #if 1 @@ -149,6 +152,9 @@ __global__ void denseToCSRKernel(size_t rows, template <> struct DenseToCSRImpl { + TaskContext context; + explicit DenseToCSRImpl(TaskContext context) : context(context) {} + template void operator()(DenseToCSRArgs& args) const { @@ -166,7 +172,7 @@ struct DenseToCSRImpl { } // Get context sensitive objects. - auto stream = get_cached_stream(); + auto stream = context.get_task_stream(); auto B_domain = B_vals.domain(); auto rows = B_domain.hi()[0] - B_domain.lo()[0] + 1; diff --git a/src/legate_sparse/array/conv/dense_to_csr_omp.cc b/src/legate_sparse/array/conv/dense_to_csr_omp.cc index 78e060de..7de5334d 100644 --- a/src/legate_sparse/array/conv/dense_to_csr_omp.cc +++ b/src/legate_sparse/array/conv/dense_to_csr_omp.cc @@ -23,6 +23,9 @@ using namespace legate; template struct DenseToCSRNNZImplBody { + TaskContext context; + explicit DenseToCSRNNZImplBody(TaskContext context) : context(context) {} + using VAL_TY = type_of; void operator()(const AccessorWO& nnz, @@ -44,6 +47,9 @@ struct DenseToCSRNNZImplBody { template struct DenseToCSRImplBody { + TaskContext context; + explicit DenseToCSRImplBody(TaskContext context) : context(context) {} + using INDEX_TY = type_of; using VAL_TY = type_of; diff --git a/src/legate_sparse/array/conv/dense_to_csr_template.inl b/src/legate_sparse/array/conv/dense_to_csr_template.inl index 31c81686..bbf98cb4 100644 --- 
a/src/legate_sparse/array/conv/dense_to_csr_template.inl +++ b/src/legate_sparse/array/conv/dense_to_csr_template.inl @@ -32,6 +32,9 @@ struct DenseToCSRNNZImplBody; template struct DenseToCSRNNZImpl { + TaskContext context; + explicit DenseToCSRNNZImpl(TaskContext context) : context(context) {} + template void operator()(DenseToCSRNNZArgs& args) const { @@ -43,7 +46,7 @@ struct DenseToCSRNNZImpl { if (args.nnz.domain().empty()) { return; } - DenseToCSRNNZImplBody()(nnz, B_vals, args.B_vals.shape<2>()); + DenseToCSRNNZImplBody{context}(nnz, B_vals, args.B_vals.shape<2>()); } }; @@ -52,6 +55,9 @@ struct DenseToCSRImplBody; template struct DenseToCSRImpl { + TaskContext context; + explicit DenseToCSRImpl(TaskContext context) : context(context) {} + template void operator()(DenseToCSRArgs& args) const { @@ -66,7 +72,7 @@ struct DenseToCSRImpl { if (args.A_pos.domain().empty()) { return; } - DenseToCSRImplBody()( + DenseToCSRImplBody{context}( A_pos, A_crd, A_vals, B_vals, args.B_vals.shape<2>()); } }; @@ -78,7 +84,7 @@ static void dense_to_csr_nnz_template(TaskContext context) context.output(0), // nnz_per_row context.input(0) // B_vals }; - value_type_dispatch(args.B_vals.code(), DenseToCSRNNZImpl{}, args); + value_type_dispatch(args.B_vals.code(), DenseToCSRNNZImpl{context}, args); } template @@ -92,7 +98,7 @@ static void dense_to_csr_template(TaskContext context) }; index_type_value_type_dispatch( - args.A_crd.code(), args.A_vals.code(), DenseToCSRImpl{}, args); + args.A_crd.code(), args.A_vals.code(), DenseToCSRImpl{context}, args); } } // namespace sparse diff --git a/src/legate_sparse/array/conv/pos_to_coordinates.cc b/src/legate_sparse/array/conv/pos_to_coordinates.cc index 7cadb10e..20773a22 100644 --- a/src/legate_sparse/array/conv/pos_to_coordinates.cc +++ b/src/legate_sparse/array/conv/pos_to_coordinates.cc @@ -23,6 +23,9 @@ using namespace legate; template struct ExpandPosToCoordinatesImplBody { + TaskContext context; + explicit 
ExpandPosToCoordinatesImplBody(TaskContext context) : context(context) {} + using INDEX_TY = type_of; void operator()(const AccessorRO, 1>& pos, diff --git a/src/legate_sparse/array/conv/pos_to_coordinates.cu b/src/legate_sparse/array/conv/pos_to_coordinates.cu index c74a5c3f..ced2335d 100644 --- a/src/legate_sparse/array/conv/pos_to_coordinates.cu +++ b/src/legate_sparse/array/conv/pos_to_coordinates.cu @@ -44,13 +44,16 @@ __global__ void fill_row_indices(size_t rows, template struct ExpandPosToCoordinatesImplBody { + TaskContext context; + explicit ExpandPosToCoordinatesImplBody(TaskContext context) : context(context) {} + using INDEX_TY = type_of; void operator()(const AccessorRO, 1>& pos, const AccessorWO& row_indices, const Rect<1>& rect) { - auto stream = get_cached_stream(); + auto stream = context.get_task_stream(); auto blocks = get_num_blocks_1d(rect.volume()); size_t rows = rect.volume(); diff --git a/src/legate_sparse/array/conv/pos_to_coordinates_omp.cc b/src/legate_sparse/array/conv/pos_to_coordinates_omp.cc index 80da99a5..01d51002 100644 --- a/src/legate_sparse/array/conv/pos_to_coordinates_omp.cc +++ b/src/legate_sparse/array/conv/pos_to_coordinates_omp.cc @@ -23,6 +23,9 @@ using namespace legate; template struct ExpandPosToCoordinatesImplBody { + TaskContext context; + explicit ExpandPosToCoordinatesImplBody(TaskContext context) : context(context) {} + using INDEX_TY = type_of; void operator()(const AccessorRO, 1>& pos, diff --git a/src/legate_sparse/array/conv/pos_to_coordinates_template.inl b/src/legate_sparse/array/conv/pos_to_coordinates_template.inl index 39142e70..160bc53f 100644 --- a/src/legate_sparse/array/conv/pos_to_coordinates_template.inl +++ b/src/legate_sparse/array/conv/pos_to_coordinates_template.inl @@ -28,6 +28,9 @@ struct ExpandPosToCoordinatesImplBody; template struct ExpandPosToCoordinatesImpl { + TaskContext context; + explicit ExpandPosToCoordinatesImpl(TaskContext context) : context(context) {} + template void 
operator()(ExpandPosToCoordinatesArgs& args) const { @@ -41,7 +44,8 @@ struct ExpandPosToCoordinatesImpl { if (pos_domain.empty() || row_indices_domain.empty()) { return; } - ExpandPosToCoordinatesImplBody()(pos, row_indices, args.pos.shape<1>()); + ExpandPosToCoordinatesImplBody{context}( + pos, row_indices, args.pos.shape<1>()); } }; @@ -52,7 +56,7 @@ static void pos_to_coordinates_template(TaskContext context) context.outputs()[0], context.inputs()[0], }; - index_type_dispatch(args.row_indices.code(), ExpandPosToCoordinatesImpl(), args); + index_type_dispatch(args.row_indices.code(), ExpandPosToCoordinatesImpl{context}, args); } } // namespace sparse diff --git a/src/legate_sparse/array/csr/geam.cc b/src/legate_sparse/array/csr/geam.cc new file mode 100644 index 00000000..ced2d73b --- /dev/null +++ b/src/legate_sparse/array/csr/geam.cc @@ -0,0 +1,79 @@ +#include "legate_sparse/array/csr/geam.h" +#include "legate_sparse/array/csr/geam_template.inl" +#include "legate_sparse/array/csr/geam_kernels.h" + +namespace sparse { +using namespace legate; + +template +struct GeamComputeImplBody { + TaskContext context; + explicit GeamComputeImplBody(TaskContext context) : context(context) {} + + using INDEX_TY = type_of; + using VAL_TY = type_of; + + void operator()(const AccessorRO, 1>& A_pos, + const AccessorRO& A_crd, + const AccessorRO& A_vals, + const AccessorRO, 1>& B_pos, + const AccessorRO& B_crd, + const AccessorRO& B_vals, + const AccessorRO, 1>& C_pos, + const AccessorWO& C_crd, + const AccessorWO& C_vals, + const AccessorRO& alpha, + const AccessorRO& beta, + const Rect<1>& rect) + { + VAL_TY alpha_val = alpha[0]; + VAL_TY beta_val = beta[0]; + + for (size_t row = rect.lo[0]; row < rect.hi[0] + 1; row++) { + geam_compute_row( + row, A_pos, A_crd, A_vals, B_pos, B_crd, B_vals, C_pos, C_crd, C_vals, alpha_val, beta_val); + } + } +}; + +template +struct GeamSymbolicImplBody { + TaskContext context; + explicit GeamSymbolicImplBody(TaskContext context) : 
context(context) {} + + using INDEX_TY = type_of; + + void operator()(const AccessorRO, 1>& A_pos, + const AccessorRO& A_crd, + const AccessorRO, 1>& B_pos, + const AccessorRO& B_crd, + const AccessorRW& nnz_per_row, + const Rect<1>& rect) + { + for (size_t row = rect.lo[0]; row < rect.hi[0] + 1; row++) { + nnz_per_row[row] = geam_symbolic_row(row, A_pos, A_crd, B_pos, B_crd); + } + } +}; + +/* static */ void GeamCSRCSRSymbolic::cpu_variant(legate::TaskContext context) +{ + geam_csr_csr_symbolic_template(context); +} + +/* static */ void GeamCSRCSRCompute::cpu_variant(legate::TaskContext context) +{ + geam_csr_csr_compute_template(context); +} + +namespace // unnamed +{ +static const auto sparse_reg_task_ = []() -> char { + GeamCSRCSRSymbolic::register_variants(); + GeamCSRCSRCompute::register_variants(); + return 0; +}(); + +} // namespace + +} // namespace sparse diff --git a/src/legate_sparse/array/csr/geam.cu b/src/legate_sparse/array/csr/geam.cu new file mode 100644 index 00000000..fdf467d3 --- /dev/null +++ b/src/legate_sparse/array/csr/geam.cu @@ -0,0 +1,144 @@ +/* Copyright 2022-2024 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + */ + +#include "legate_sparse/array/csr/geam.h" +#include "legate_sparse/array/csr/geam_template.inl" +#include "legate_sparse/array/csr/geam_kernels.h" +#include "legate_sparse/util/cuda_help.h" + +namespace sparse { +using namespace legate; + +// GPU kernel for symbolic phase: compute nnz_per_row +template +__global__ void geam_symbolic_kernel(const size_t nrows, + const AccessorRO, 1> A_pos, + const AccessorRO A_crd, + const AccessorRO, 1> B_pos, + const AccessorRO B_crd, + const AccessorRW nnz_per_row) +{ + const size_t row = global_tid_1d(); + if (row >= nrows) { + return; + } + + nnz_per_row[row] = geam_symbolic_row(row, A_pos, A_crd, B_pos, B_crd); +} + +// GPU kernel for compute phase: C = alpha * A + beta * B +template +__global__ void geam_compute_kernel(const size_t nrows, + const AccessorRO, 1> A_pos, + const AccessorRO A_crd, + const AccessorRO A_vals, + const AccessorRO, 1> B_pos, + const AccessorRO B_crd, + const AccessorRO B_vals, + const AccessorRO, 1> C_pos, + const AccessorWO C_crd, + const AccessorWO C_vals, + const AccessorRO alpha_acc, + const AccessorRO beta_acc) +{ + const size_t row = global_tid_1d(); + if (row >= nrows) { + return; + } + + VAL_TY alpha = alpha_acc[0]; + VAL_TY beta = beta_acc[0]; + + geam_compute_row( + row, A_pos, A_crd, A_vals, B_pos, B_crd, B_vals, C_pos, C_crd, C_vals, alpha, beta); +} + +// GPU implementation of the symbolic phase +template +struct GeamSymbolicImplBody { + TaskContext context; + explicit GeamSymbolicImplBody(TaskContext context) : context(context) {} + + using INDEX_TY = type_of; + + void operator()(const AccessorRO, 1>& A_pos, + const AccessorRO& A_crd, + const AccessorRO, 1>& B_pos, + const AccessorRO& B_crd, + const AccessorRW& nnz_per_row, + const Rect<1>& rect) + { + auto stream = context.get_task_stream(); + auto nrows = rect.hi[0] - rect.lo[0] + 1; + auto num_blocks = get_num_blocks_1d(nrows); + + if (nrows == 0) { + return; + } + + geam_symbolic_kernel<<>>( + nrows, A_pos, A_crd, B_pos, 
B_crd, nnz_per_row); + LEGATE_SPARSE_CHECK_CUDA_STREAM(stream); + } +}; + +/*static*/ void GeamCSRCSRSymbolic::gpu_variant(TaskContext context) +{ + geam_csr_csr_symbolic_template(context); +} + +// GPU implementation of the compute phase +template +struct GeamComputeImplBody { + TaskContext context; + explicit GeamComputeImplBody(TaskContext context) : context(context) {} + + using INDEX_TY = type_of; + using VAL_TY = type_of; + + void operator()(const AccessorRO, 1>& A_pos, + const AccessorRO& A_crd, + const AccessorRO& A_vals, + const AccessorRO, 1>& B_pos, + const AccessorRO& B_crd, + const AccessorRO& B_vals, + const AccessorRO, 1>& C_pos, + const AccessorWO& C_crd, + const AccessorWO& C_vals, + const AccessorRO& alpha, + const AccessorRO& beta, + const Rect<1>& rect) + { + auto stream = context.get_task_stream(); + auto nrows = rect.hi[0] - rect.lo[0] + 1; + auto num_blocks = get_num_blocks_1d(nrows); + + if (nrows == 0) { + return; + } + + geam_compute_kernel<<>>( + nrows, A_pos, A_crd, A_vals, B_pos, B_crd, B_vals, C_pos, C_crd, C_vals, alpha, beta); + LEGATE_SPARSE_CHECK_CUDA_STREAM(stream); + } +}; + +/*static*/ void GeamCSRCSRCompute::gpu_variant(TaskContext context) +{ + geam_csr_csr_compute_template(context); +} + +} // namespace sparse diff --git a/src/legate_sparse/array/csr/geam.h b/src/legate_sparse/array/csr/geam.h new file mode 100644 index 00000000..6329f307 --- /dev/null +++ b/src/legate_sparse/array/csr/geam.h @@ -0,0 +1,91 @@ +/* Copyright 2022-2024 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +#pragma once + +#include "legate_sparse/sparse.h" +#include "legate_sparse/sparse_c.h" +#include "legate.h" + +namespace sparse { + +struct GeamCSRCSRSymbolicArgs { + // Symbolic phase: compute the sparsity pattern of C = alpha * A + beta * B + // This phase only needs the positions and coordinates, not the values or scalars + const legate::PhysicalStore& A_pos; + const legate::PhysicalStore& A_crd; + const legate::PhysicalStore& B_pos; + const legate::PhysicalStore& B_crd; + const legate::PhysicalStore& nnz_per_row; // output: number of non-zeros per row +}; + +struct GeamCSRCSRComputeArgs { + // Compute phase: compute the output C where C = alpha * A + beta * B + // Inputs + const legate::PhysicalStore& A_pos; + const legate::PhysicalStore& A_crd; + const legate::PhysicalStore& A_vals; + const legate::PhysicalStore& B_pos; + const legate::PhysicalStore& B_crd; + const legate::PhysicalStore& B_vals; + + // C_pos is an INPUT (computed in symbolic phase, read-only here) + const legate::PhysicalStore& C_pos; + + // C_crd and C_vals are outputs + const legate::PhysicalStore& C_crd; + const legate::PhysicalStore& C_vals; + + // Scalar constants + const legate::PhysicalStore& alpha; + const legate::PhysicalStore& beta; +}; + +class GeamCSRCSRCompute : public SparseTask { + public: + static inline const auto TASK_CONFIG = + legate::TaskConfig{legate::LocalTaskID{LEGATE_SPARSE_GEAM_CSR_CSR_COMPUTE}}; + + public: + static void cpu_variant(legate::TaskContext context); + +#ifdef LEGATE_USE_OPENMP + static void omp_variant(legate::TaskContext context); +#endif + +#ifdef LEGATE_USE_CUDA + static void gpu_variant(legate::TaskContext context); +#endif +}; + +class GeamCSRCSRSymbolic : public SparseTask { + public: + static inline const auto TASK_CONFIG = + legate::TaskConfig{legate::LocalTaskID{LEGATE_SPARSE_GEAM_CSR_CSR_SYMBOLIC}}; + + public: + static void 
cpu_variant(legate::TaskContext context); + +#ifdef LEGATE_USE_OPENMP + static void omp_variant(legate::TaskContext context); +#endif + +#ifdef LEGATE_USE_CUDA + static void gpu_variant(legate::TaskContext context); +#endif +}; + +} // namespace sparse diff --git a/src/legate_sparse/array/csr/geam_kernels.h b/src/legate_sparse/array/csr/geam_kernels.h new file mode 100644 index 00000000..6ae7c6f0 --- /dev/null +++ b/src/legate_sparse/array/csr/geam_kernels.h @@ -0,0 +1,129 @@ +/* Copyright 2022-2024 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + */ + +#pragma once + +#include "legate_sparse/util/typedefs.h" +#include "legate.h" + +namespace sparse { +using namespace legate; + +// ============================================================================= +// Symbolic Phase: Compute nnz per row for C = A + B +// ============================================================================= + +// Computes the number of non-zeros in a single row of C = A + B +template +LEGATE_HOST_DEVICE inline nnz_ty geam_symbolic_row(size_t row, + const AccessorRO, 1> A_pos, + const AccessorRO A_crd, + const AccessorRO, 1> B_pos, + const AccessorRO B_crd) +{ + size_t A_pos_start = A_pos[row].lo; + size_t A_pos_end = A_pos[row].hi + 1; + size_t B_pos_start = B_pos[row].lo; + size_t B_pos_end = B_pos[row].hi + 1; + + size_t a_pos = A_pos_start; + size_t b_pos = B_pos_start; + nnz_ty count = 0; + + // Merge sorted column indices and count unique entries + while (a_pos < A_pos_end && b_pos < B_pos_end) { + if (A_crd[a_pos] < B_crd[b_pos]) { + a_pos++; + } else if (A_crd[a_pos] > B_crd[b_pos]) { + b_pos++; + } else { + a_pos++; + b_pos++; + } + count++; + } + + // Add remaining elements + count += (A_pos_end - a_pos) + (B_pos_end - b_pos); + return count; +} + +// ============================================================================= +// Compute Phase: Compute C = alpha * A + beta * B for a single row +// ============================================================================= + +// Computes a single row of C = alpha * A + beta * B +template +LEGATE_HOST_DEVICE inline void geam_compute_row(size_t row, + const AccessorRO, 1> A_pos, + const AccessorRO A_crd, + const AccessorRO A_vals, + const AccessorRO, 1> B_pos, + const AccessorRO B_crd, + const AccessorRO B_vals, + const AccessorRO, 1> C_pos, + const AccessorWO C_crd, + const AccessorWO C_vals, + VAL_TY alpha, + VAL_TY beta) +{ + size_t A_pos_start = A_pos[row].lo; + size_t A_pos_end = A_pos[row].hi + 1; + size_t B_pos_start = B_pos[row].lo; + size_t 
B_pos_end = B_pos[row].hi + 1; + size_t C_pos_start = C_pos[row].lo; + + size_t a_pos = A_pos_start; + size_t b_pos = B_pos_start; + size_t c_pos = C_pos_start; + + // Merge sorted column indices and compute values + while (a_pos < A_pos_end && b_pos < B_pos_end) { + if (A_crd[a_pos] < B_crd[b_pos]) { + C_crd[c_pos] = A_crd[a_pos]; + C_vals[c_pos] = alpha * A_vals[a_pos]; + a_pos++; + } else if (A_crd[a_pos] > B_crd[b_pos]) { + C_crd[c_pos] = B_crd[b_pos]; + C_vals[c_pos] = beta * B_vals[b_pos]; + b_pos++; + } else { + C_crd[c_pos] = A_crd[a_pos]; + C_vals[c_pos] = alpha * A_vals[a_pos] + beta * B_vals[b_pos]; + a_pos++; + b_pos++; + } + c_pos++; + } + + // Add remaining elements from A + while (a_pos < A_pos_end) { + C_crd[c_pos] = A_crd[a_pos]; + C_vals[c_pos] = alpha * A_vals[a_pos]; + a_pos++; + c_pos++; + } + + // Add remaining elements from B + while (b_pos < B_pos_end) { + C_crd[c_pos] = B_crd[b_pos]; + C_vals[c_pos] = beta * B_vals[b_pos]; + b_pos++; + c_pos++; + } +} + +} // namespace sparse diff --git a/src/legate_sparse/array/csr/geam_omp.cc b/src/legate_sparse/array/csr/geam_omp.cc new file mode 100644 index 00000000..26c52361 --- /dev/null +++ b/src/legate_sparse/array/csr/geam_omp.cc @@ -0,0 +1,87 @@ +/* Copyright 2022-2024 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + */ + +#include "legate_sparse/array/csr/geam.h" +#include "legate_sparse/array/csr/geam_template.inl" +#include "legate_sparse/array/csr/geam_kernels.h" + +namespace sparse { +using namespace legate; + +template +struct GeamSymbolicImplBody { + TaskContext context; + explicit GeamSymbolicImplBody(TaskContext context) : context(context) {} + + using INDEX_TY = type_of; + + void operator()(const AccessorRO, 1>& A_pos, + const AccessorRO& A_crd, + const AccessorRO, 1>& B_pos, + const AccessorRO& B_crd, + const AccessorRW& nnz_per_row, + const Rect<1>& rect) + { +#pragma omp parallel for + for (size_t row = rect.lo[0]; row < rect.hi[0] + 1; row++) { + nnz_per_row[row] = geam_symbolic_row(row, A_pos, A_crd, B_pos, B_crd); + } + } +}; + +/* static */ void GeamCSRCSRSymbolic::omp_variant(TaskContext context) +{ + geam_csr_csr_symbolic_template(context); +} + +template +struct GeamComputeImplBody { + TaskContext context; + explicit GeamComputeImplBody(TaskContext context) : context(context) {} + + using INDEX_TY = type_of; + using VAL_TY = type_of; + + void operator()(const AccessorRO, 1>& A_pos, + const AccessorRO& A_crd, + const AccessorRO& A_vals, + const AccessorRO, 1>& B_pos, + const AccessorRO& B_crd, + const AccessorRO& B_vals, + const AccessorRO, 1>& C_pos, + const AccessorWO& C_crd, + const AccessorWO& C_vals, + const AccessorRO& alpha, + const AccessorRO& beta, + const Rect<1>& rect) + { + VAL_TY alpha_val = alpha[0]; + VAL_TY beta_val = beta[0]; + +#pragma omp parallel for + for (size_t row = rect.lo[0]; row < rect.hi[0] + 1; row++) { + geam_compute_row( + row, A_pos, A_crd, A_vals, B_pos, B_crd, B_vals, C_pos, C_crd, C_vals, alpha_val, beta_val); + } + } +}; + +/* static */ void GeamCSRCSRCompute::omp_variant(TaskContext context) +{ + geam_csr_csr_compute_template(context); +} + +} // namespace sparse diff --git a/src/legate_sparse/array/csr/geam_template.inl b/src/legate_sparse/array/csr/geam_template.inl new file mode 100644 index 00000000..0dc13513 --- 
/dev/null +++ b/src/legate_sparse/array/csr/geam_template.inl @@ -0,0 +1,139 @@ +/* Copyright 2022-2024 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +#pragma once + +#include "legate_sparse/array/csr/geam.h" +#include "legate_sparse/util/dispatch.h" +#include "legate_sparse/util/typedefs.h" + +namespace sparse { +using namespace legate; + +// ============================================================================ +// Symbolic phase templates +// ============================================================================ + +template +struct GeamSymbolicImplBody; + +template +struct GeamSymbolicImpl { + TaskContext context; + explicit GeamSymbolicImpl(TaskContext context) : context(context) {} + + template + void operator()(const GeamCSRCSRSymbolicArgs& args) + { + using INDEX_TY = type_of; + + auto A_pos = args.A_pos.read_accessor, 1>(); + auto A_crd = args.A_crd.read_accessor(); + auto B_pos = args.B_pos.read_accessor, 1>(); + auto B_crd = args.B_crd.read_accessor(); + + auto nnz_per_row = args.nnz_per_row.read_write_accessor(); + + GeamSymbolicImplBody{context}( + A_pos, A_crd, B_pos, B_crd, nnz_per_row, args.A_pos.shape<1>()); + } +}; + +template +static void geam_csr_csr_symbolic_template(TaskContext context) +{ + GeamCSRCSRSymbolicArgs args{ + context.inputs()[0], // A_pos + context.inputs()[1], // A_crd + context.inputs()[2], // B_pos + context.inputs()[3], // B_crd + context.outputs()[0], // nnz_per_row + }; + + 
index_type_dispatch(args.A_crd.code(), GeamSymbolicImpl{context}, args); +} + +// ============================================================================ +// Compute phase templates +// ============================================================================ + +template +struct GeamComputeImplBody; + +template +struct GeamComputeImpl { + TaskContext context; + explicit GeamComputeImpl(TaskContext context) : context(context) {} + + template + void operator()(const GeamCSRCSRComputeArgs& args) + { + using INDEX_TY = type_of; + using VAL_TY = type_of; + + auto A_pos = args.A_pos.read_accessor, 1>(); + auto A_crd = args.A_crd.read_accessor(); + auto A_vals = args.A_vals.read_accessor(); + auto B_pos = args.B_pos.read_accessor, 1>(); + auto B_crd = args.B_crd.read_accessor(); + auto B_vals = args.B_vals.read_accessor(); + + // C_pos is read-only (computed in symbolic phase) + auto C_pos = args.C_pos.read_accessor, 1>(); + auto C_crd = args.C_crd.write_accessor(); + auto C_vals = args.C_vals.write_accessor(); + + // Read scalar values + auto alpha = args.alpha.read_accessor(); + auto beta = args.beta.read_accessor(); + + GeamComputeImplBody{context}(A_pos, + A_crd, + A_vals, + B_pos, + B_crd, + B_vals, + C_pos, + C_crd, + C_vals, + alpha, + beta, + args.A_pos.shape<1>()); + } +}; + +template +static void geam_csr_csr_compute_template(TaskContext context) +{ + GeamCSRCSRComputeArgs args{ + context.inputs()[0], // A_pos + context.inputs()[1], // A_crd + context.inputs()[2], // A_vals + context.inputs()[3], // B_pos + context.inputs()[4], // B_crd + context.inputs()[5], // B_vals + context.inputs()[6], // C_pos (read-only, computed in symbolic phase) + context.outputs()[0], // C_crd + context.outputs()[1], // C_vals + context.inputs()[7], // alpha + context.inputs()[8], // beta + }; + + index_type_value_type_dispatch( + args.A_crd.code(), args.A_vals.code(), GeamComputeImpl{context}, args); +} + +} // namespace sparse diff --git 
a/src/legate_sparse/array/csr/get_diagonal.cc b/src/legate_sparse/array/csr/get_diagonal.cc index cace6438..47a8c7d1 100644 --- a/src/legate_sparse/array/csr/get_diagonal.cc +++ b/src/legate_sparse/array/csr/get_diagonal.cc @@ -23,6 +23,9 @@ using namespace legate; template struct GetCSRDiagonalImplBody { + TaskContext context; + explicit GetCSRDiagonalImplBody(TaskContext context) : context(context) {} + using INDEX_TY = type_of; using VAL_TY = type_of; diff --git a/src/legate_sparse/array/csr/get_diagonal.cu b/src/legate_sparse/array/csr/get_diagonal.cu index 15e5e8a4..f0a32dfd 100644 --- a/src/legate_sparse/array/csr/get_diagonal.cu +++ b/src/legate_sparse/array/csr/get_diagonal.cu @@ -45,6 +45,9 @@ __global__ void compute_diag_kernel(size_t rows, template struct GetCSRDiagonalImplBody { + TaskContext context; + explicit GetCSRDiagonalImplBody(TaskContext context) : context(context) {} + using INDEX_TY = type_of; using VAL_TY = type_of; @@ -54,7 +57,7 @@ struct GetCSRDiagonalImplBody { const AccessorRO& vals, const Rect<1>& rect) { - auto stream = get_cached_stream(); + auto stream = context.get_task_stream(); auto blocks = get_num_blocks_1d(rect.volume()); compute_diag_kernel <<>>(rect.volume(), rect.lo[0], diag, pos, crd, vals); diff --git a/src/legate_sparse/array/csr/get_diagonal_omp.cc b/src/legate_sparse/array/csr/get_diagonal_omp.cc index ad698eed..c3d114ef 100644 --- a/src/legate_sparse/array/csr/get_diagonal_omp.cc +++ b/src/legate_sparse/array/csr/get_diagonal_omp.cc @@ -23,6 +23,9 @@ using namespace legate; template struct GetCSRDiagonalImplBody { + TaskContext context; + explicit GetCSRDiagonalImplBody(TaskContext context) : context(context) {} + using INDEX_TY = type_of; using VAL_TY = type_of; diff --git a/src/legate_sparse/array/csr/get_diagonal_template.inl b/src/legate_sparse/array/csr/get_diagonal_template.inl index 74ac61bb..0cee5e9a 100644 --- a/src/legate_sparse/array/csr/get_diagonal_template.inl +++ 
b/src/legate_sparse/array/csr/get_diagonal_template.inl @@ -29,6 +29,9 @@ struct GetCSRDiagonalImplBody; template struct GetCSRDiagonalImpl { + TaskContext context; + explicit GetCSRDiagonalImpl(TaskContext context) : context(context) {} + template void operator()(GetCSRDiagonalArgs& args) const { @@ -45,7 +48,7 @@ struct GetCSRDiagonalImpl { return; } - GetCSRDiagonalImplBody()( + GetCSRDiagonalImplBody{context}( diag, pos, crd, vals, args.diag.shape<1>()); } }; @@ -56,6 +59,6 @@ static void get_csr_diagonal_template(TaskContext context) auto inputs = context.inputs(); GetCSRDiagonalArgs args{context.outputs()[0], inputs[0], inputs[1], inputs[2]}; index_type_value_type_dispatch( - args.crd.code(), args.diag.code(), GetCSRDiagonalImpl{}, args); + args.crd.code(), args.diag.code(), GetCSRDiagonalImpl{context}, args); } } // namespace sparse diff --git a/src/legate_sparse/array/csr/indexing.cc b/src/legate_sparse/array/csr/indexing.cc index f40c901b..1ec4f57a 100644 --- a/src/legate_sparse/array/csr/indexing.cc +++ b/src/legate_sparse/array/csr/indexing.cc @@ -23,6 +23,9 @@ using namespace legate; template struct CSRIndexingCSRImplBody { + TaskContext context; + explicit CSRIndexingCSRImplBody(TaskContext context) : context(context) {} + using INDEX_TY = type_of; using VAL_TY = type_of; diff --git a/src/legate_sparse/array/csr/indexing.cu b/src/legate_sparse/array/csr/indexing.cu index 25e96097..68d03ffd 100644 --- a/src/legate_sparse/array/csr/indexing.cu +++ b/src/legate_sparse/array/csr/indexing.cu @@ -84,6 +84,9 @@ __global__ void csr_indexing_csr_kernel(const size_t num_rows, template struct CSRIndexingCSRImplBody { + TaskContext context; + explicit CSRIndexingCSRImplBody(TaskContext context) : context(context) {} + using INDEX_TY = type_of; using VAL_TY = type_of; @@ -98,9 +101,7 @@ struct CSRIndexingCSRImplBody { // Get the number of rows in the matrix size_t num_rows = rect.hi[0] - rect.lo[0] + 1; - std::cout << "GPU variant" << std::endl; - - auto stream = 
get_cached_stream(); + auto stream = context.get_task_stream(); auto blocks = get_num_blocks_1d(rect.volume()); csr_indexing_csr_kernel<<>>( num_rows, A_pos, A_crd, A_vals, mask_pos, mask_crd, value); diff --git a/src/legate_sparse/array/csr/indexing_omp.cc b/src/legate_sparse/array/csr/indexing_omp.cc index c429481f..a96fc270 100644 --- a/src/legate_sparse/array/csr/indexing_omp.cc +++ b/src/legate_sparse/array/csr/indexing_omp.cc @@ -23,6 +23,9 @@ using namespace legate; template struct CSRIndexingCSRImplBody { + TaskContext context; + explicit CSRIndexingCSRImplBody(TaskContext context) : context(context) {} + using INDEX_TY = type_of; using VAL_TY = type_of; @@ -34,7 +37,6 @@ struct CSRIndexingCSRImplBody { const AccessorRO& value, const Rect<1>& rect) { - std::cout << "OMP variant" << std::endl; #pragma omp parallel for for (size_t row = rect.lo[0]; row < rect.hi[0] + 1; row++) { size_t j_pos_start = A_pos[row].lo; diff --git a/src/legate_sparse/array/csr/indexing_template.inl b/src/legate_sparse/array/csr/indexing_template.inl index 381ca45b..e73efa60 100644 --- a/src/legate_sparse/array/csr/indexing_template.inl +++ b/src/legate_sparse/array/csr/indexing_template.inl @@ -28,6 +28,9 @@ struct CSRIndexingCSRImplBody; template struct CSRIndexingCSRImpl { + TaskContext context; + explicit CSRIndexingCSRImpl(TaskContext context) : context(context) {} + template void operator()(const CSRIndexingCSRArgs& args) { @@ -44,7 +47,7 @@ struct CSRIndexingCSRImpl { auto value = args.value.read_accessor(); // TODO: Rect is based on A_pos.shape, is that correct? 
- CSRIndexingCSRImplBody()( + CSRIndexingCSRImplBody{context}( A_pos, A_crd, A_vals, key_pos, key_crd, value, args.A_pos.shape<1>()); } }; @@ -62,7 +65,7 @@ static void csr_indexing_csr_template(TaskContext context) }; index_type_value_type_dispatch( - args.A_crd.code(), args.A_vals.code(), CSRIndexingCSRImpl(), args); + args.A_crd.code(), args.A_vals.code(), CSRIndexingCSRImpl{context}, args); } } // namespace sparse diff --git a/src/legate_sparse/array/csr/spgemm_csr_csr_csr.cc b/src/legate_sparse/array/csr/spgemm_csr_csr_csr.cc index 6c4945de..71728397 100644 --- a/src/legate_sparse/array/csr/spgemm_csr_csr_csr.cc +++ b/src/legate_sparse/array/csr/spgemm_csr_csr_csr.cc @@ -27,6 +27,9 @@ using namespace legate; template struct SpGEMMCSRxCSRxCSRNNZImplBody { + TaskContext context; + explicit SpGEMMCSRxCSRxCSRNNZImplBody(TaskContext context) : context(context) {} + using INDEX_TY = type_of; void operator()(const AccessorWO& nnz, @@ -94,6 +97,9 @@ struct SpGEMMCSRxCSRxCSRNNZImplBody { template struct SpGEMMCSRxCSRxCSRImplBody { + TaskContext context; + explicit SpGEMMCSRxCSRxCSRImplBody(TaskContext context) : context(context) {} + using INDEX_TY = type_of; using VAL_TY = type_of; diff --git a/src/legate_sparse/array/csr/spgemm_csr_csr_csr.cu b/src/legate_sparse/array/csr/spgemm_csr_csr_csr.cu index 66827db6..3aa14a26 100644 --- a/src/legate_sparse/array/csr/spgemm_csr_csr_csr.cu +++ b/src/legate_sparse/array/csr/spgemm_csr_csr_csr.cu @@ -41,10 +41,10 @@ __global__ void cast_and_offset(size_t elems, DST* dst, const SRC* src, int64_t dst[idx] = static_cast(src[idx] - offset); } -int64_t local_offset_from_nnz(ncclComm_t comm, coord_t task_id, coord_t task_num, int64_t A_nnz) +int64_t local_offset_from_nnz( + ncclComm_t comm, coord_t task_id, coord_t task_num, int64_t A_nnz, cudaStream_t stream) { ThrustAllocator alloc(Memory::GPU_FB_MEM); - auto stream = get_cached_stream(); auto policy = thrust::cuda::par(alloc).on(stream); auto buf = CREATE_BUFFER(int64_t, task_num, 
Memory::GPU_FB_MEM, "nnz_reduce_buf"); auto nnz_reduce_buf = buf.ptr(0); @@ -67,6 +67,9 @@ int64_t local_offset_from_nnz(ncclComm_t comm, coord_t task_id, coord_t task_num } struct SpGEMMCSRxCSRxCSRGPUImpl { + TaskContext context; + explicit SpGEMMCSRxCSRxCSRGPUImpl(TaskContext context) : context(context) {} + template void operator()(SpGEMMCSRxCSRxCSRGPUArgs& args, coord_t task_id, coord_t task_size) const { @@ -106,7 +109,7 @@ struct SpGEMMCSRxCSRxCSRGPUImpl { // Get context sensitive objects. auto handle = get_cusparse(); - auto stream = get_cached_stream(); + auto stream = context.get_task_stream(); CHECK_CUSPARSE(cusparseSetStream(handle, stream)); auto B_rows = B_pos.domain().get_volume(); @@ -331,7 +334,7 @@ struct SpGEMMCSRxCSRxCSRGPUImpl { //@TODO (marsaev): we don't really need nccl comm here // latency for 1 int and host comm should be much better ncclComm_t* comm = args.comms[0].get(); - offset_nnz = local_offset_from_nnz(*comm, task_id, task_num, A_nnz); + offset_nnz = local_offset_from_nnz(*comm, task_id, task_num, A_nnz, stream); } // Convert the A_indptr array into a pos array. @@ -472,7 +475,7 @@ struct SpGEMMCSRxCSRxCSRGPUImpl { //@TODO (marsaev): we don't really need nccl comm here // latency for 1 int and host comm should be much better ncclComm_t* comm = args.comms[0].get(); - offset_nnz = local_offset_from_nnz(*comm, task_id, task_num, A_nnz); + offset_nnz = local_offset_from_nnz(*comm, task_id, task_num, A_nnz, stream); } // Convert the A_indptr array into a pos array. 
@@ -524,7 +527,7 @@ struct SpGEMMCSRxCSRxCSRGPUImpl { context.communicators()}; index_type_floating_point_value_type_dispatch(args.A_crd.code(), args.A_vals.code(), - SpGEMMCSRxCSRxCSRGPUImpl{}, + SpGEMMCSRxCSRxCSRGPUImpl{context}, args, context.get_task_index()[0], context.get_launch_domain().hi()[0]); diff --git a/src/legate_sparse/array/csr/spgemm_csr_csr_csr_omp.cc b/src/legate_sparse/array/csr/spgemm_csr_csr_csr_omp.cc index e6ac4ef6..addf5d59 100644 --- a/src/legate_sparse/array/csr/spgemm_csr_csr_csr_omp.cc +++ b/src/legate_sparse/array/csr/spgemm_csr_csr_csr_omp.cc @@ -28,6 +28,9 @@ using namespace legate; template struct SpGEMMCSRxCSRxCSRNNZImplBody { + TaskContext context; + explicit SpGEMMCSRxCSRxCSRNNZImplBody(TaskContext context) : context(context) {} + using INDEX_TY = type_of; void operator()(const AccessorWO& nnz, @@ -96,6 +99,9 @@ struct SpGEMMCSRxCSRxCSRNNZImplBody { template struct SpGEMMCSRxCSRxCSRImplBody { + TaskContext context; + explicit SpGEMMCSRxCSRxCSRImplBody(TaskContext context) : context(context) {} + using INDEX_TY = type_of; using VAL_TY = type_of; diff --git a/src/legate_sparse/array/csr/spgemm_csr_csr_csr_template.inl b/src/legate_sparse/array/csr/spgemm_csr_csr_csr_template.inl index c958752a..0b99743a 100644 --- a/src/legate_sparse/array/csr/spgemm_csr_csr_csr_template.inl +++ b/src/legate_sparse/array/csr/spgemm_csr_csr_csr_template.inl @@ -32,6 +32,9 @@ struct SpGEMMCSRxCSRxCSRNNZImplBody; template struct SpGEMMCSRxCSRxCSRNNZImpl { + TaskContext context; + explicit SpGEMMCSRxCSRxCSRNNZImpl(TaskContext context) : context(context) {} + template void operator()(SpGEMMCSRxCSRxCSRNNZArgs& args) const { @@ -43,7 +46,7 @@ struct SpGEMMCSRxCSRxCSRNNZImpl { auto C_pos = args.C_pos.read_accessor, 1>(); auto C_crd = args.C_crd.read_accessor(); - SpGEMMCSRxCSRxCSRNNZImplBody()( + SpGEMMCSRxCSRxCSRNNZImplBody{context}( nnz, B_pos, B_crd, C_pos, C_crd, args.B_pos.shape<1>(), args.C_crd.shape<1>()); } }; @@ -53,6 +56,9 @@ struct 
SpGEMMCSRxCSRxCSRImplBody; template struct SpGEMMCSRxCSRxCSRImpl { + TaskContext context; + explicit SpGEMMCSRxCSRxCSRImpl(TaskContext context) : context(context) {} + template void operator()(SpGEMMCSRxCSRxCSRArgs& args) const { @@ -69,17 +75,17 @@ struct SpGEMMCSRxCSRxCSRImpl { auto C_crd = args.C_crd.read_accessor(); auto C_vals = args.C_vals.read_accessor(); - SpGEMMCSRxCSRxCSRImplBody()(A_pos, - A_crd, - A_vals, - B_pos, - B_crd, - B_vals, - C_pos, - C_crd, - C_vals, - args.B_pos.shape<1>(), - args.C_crd.shape<1>()); + SpGEMMCSRxCSRxCSRImplBody{context}(A_pos, + A_crd, + A_vals, + B_pos, + B_crd, + B_vals, + C_pos, + C_crd, + C_vals, + args.B_pos.shape<1>(), + args.C_crd.shape<1>()); } }; @@ -95,7 +101,7 @@ static void spgemm_csr_csr_csr_nnz_template(TaskContext context) inputs[3], }; - index_type_dispatch(args.B_crd.code(), SpGEMMCSRxCSRxCSRNNZImpl{}, args); + index_type_dispatch(args.B_crd.code(), SpGEMMCSRxCSRxCSRNNZImpl{context}, args); } template @@ -115,7 +121,7 @@ static void spgemm_csr_csr_csr_template(TaskContext context) inputs[5], }; index_type_floating_point_value_type_dispatch( - args.A_crd.code(), args.A_vals.code(), SpGEMMCSRxCSRxCSRImpl{}, args); + args.A_crd.code(), args.A_vals.code(), SpGEMMCSRxCSRxCSRImpl{context}, args); } } // namespace sparse diff --git a/src/legate_sparse/array/csr/spmv.cc b/src/legate_sparse/array/csr/spmv.cc index d9efa4fd..42d2576c 100644 --- a/src/legate_sparse/array/csr/spmv.cc +++ b/src/legate_sparse/array/csr/spmv.cc @@ -23,6 +23,9 @@ using namespace legate; template struct CSRSpMVRowSplitImplBody { + TaskContext context; + explicit CSRSpMVRowSplitImplBody(TaskContext context) : context(context) {} + using INDEX_TY = type_of; using VAL_TY = type_of; diff --git a/src/legate_sparse/array/csr/spmv.cu b/src/legate_sparse/array/csr/spmv.cu index f2c5f1a1..536a18d8 100644 --- a/src/legate_sparse/array/csr/spmv.cu +++ b/src/legate_sparse/array/csr/spmv.cu @@ -29,6 +29,9 @@ namespace sparse { template <> struct 
CSRSpMVRowSplitImpl { + TaskContext context; + explicit CSRSpMVRowSplitImpl(TaskContext context) : context(context) {} + template void operator()(CSRSpMVRowSplitArgs& args) const { @@ -48,7 +51,7 @@ struct CSRSpMVRowSplitImpl { // Get context sensitive objects. auto handle = get_cusparse(); - auto stream = get_cached_stream(); + auto stream = context.get_task_stream(); CHECK_CUSPARSE(cusparseSetStream(handle, stream)); // Older cusparse has bug when output vector is not aligned to 16 bytes @@ -109,7 +112,7 @@ struct CSRSpMVRowSplitImpl { CHECK_CUSPARSE(cusparseCreateDnVec( &cusparse_y, y_domain_size /* size */, output_ptr, cusparseDataType())); - auto cusparse_A = makeCuSparseCSR(A_pos, A_crd, A_vals, cols); + auto cusparse_A = makeCuSparseCSR(A_pos, A_crd, A_vals, cols, stream); // Make the CUSPARSE calls. VAL_TY alpha = 1.0; @@ -153,7 +156,7 @@ struct CSRSpMVRowSplitImpl { workspacePtr)); // if we used temporary buffer, copy result to output if (y_aligned) { - LEGATE_CHECK_CUDA(cudaMemcpyAsync( + LEGATE_SPARSE_CHECK_CUDA(cudaMemcpyAsync( y_raw_ptr, output_ptr, y_domain_size * sizeof(VAL_TY), cudaMemcpyDeviceToDevice, stream)); } // Destroy the created objects. 
@@ -170,7 +173,7 @@ struct CSRSpMVRowSplitImpl { CSRSpMVRowSplitArgs args{context.outputs()[0], inputs[0], inputs[1], inputs[2], inputs[3]}; index_type_floating_point_value_type_dispatch( - args.A_crd.code(), args.y.code(), CSRSpMVRowSplitImpl{}, args); + args.A_crd.code(), args.y.code(), CSRSpMVRowSplitImpl{context}, args); } } // namespace sparse diff --git a/src/legate_sparse/array/csr/spmv_omp.cc b/src/legate_sparse/array/csr/spmv_omp.cc index 40b84e83..2937d848 100644 --- a/src/legate_sparse/array/csr/spmv_omp.cc +++ b/src/legate_sparse/array/csr/spmv_omp.cc @@ -23,6 +23,9 @@ using namespace legate; template struct CSRSpMVRowSplitImplBody { + TaskContext context; + explicit CSRSpMVRowSplitImplBody(TaskContext context) : context(context) {} + using INDEX_TY = type_of; using VAL_TY = type_of; diff --git a/src/legate_sparse/array/csr/spmv_template.inl b/src/legate_sparse/array/csr/spmv_template.inl index f339e2d1..ba55490a 100644 --- a/src/legate_sparse/array/csr/spmv_template.inl +++ b/src/legate_sparse/array/csr/spmv_template.inl @@ -31,6 +31,9 @@ struct CSRSpMVRowSplitImplBody; template struct CSRSpMVRowSplitImpl { + TaskContext context; + explicit CSRSpMVRowSplitImpl(TaskContext context) : context(context) {} + template void operator()(CSRSpMVRowSplitArgs& args) const { @@ -48,7 +51,7 @@ struct CSRSpMVRowSplitImpl { return; } - CSRSpMVRowSplitImplBody()( + CSRSpMVRowSplitImplBody{context}( y, A_pos, A_crd, A_vals, x, args.y.shape<1>()); } }; @@ -60,7 +63,7 @@ static void csr_spmv_row_split_template(TaskContext context) CSRSpMVRowSplitArgs args{context.outputs()[0], inputs[0], inputs[1], inputs[2], inputs[3]}; index_type_value_type_dispatch( - args.A_crd.code(), args.y.code(), CSRSpMVRowSplitImpl{}, args); + args.A_crd.code(), args.y.code(), CSRSpMVRowSplitImpl{context}, args); } } // namespace sparse diff --git a/src/legate_sparse/array/util/scale_rect.cc b/src/legate_sparse/array/util/scale_rect.cc index c2d2df90..50bbedd8 100644 --- 
a/src/legate_sparse/array/util/scale_rect.cc +++ b/src/legate_sparse/array/util/scale_rect.cc @@ -23,6 +23,9 @@ using namespace legate; template <> struct ScaleRect1ImplBody { + TaskContext context; + explicit ScaleRect1ImplBody(TaskContext context) : context(context) {} + void operator()(const AccessorRW, 1>& output, const int64_t scale, const Rect<1>& rect) { for (coord_t i = rect.lo[0]; i < rect.hi[0] + 1; i++) { diff --git a/src/legate_sparse/array/util/scale_rect.cu b/src/legate_sparse/array/util/scale_rect.cu index 22132340..fc07bcd6 100644 --- a/src/legate_sparse/array/util/scale_rect.cu +++ b/src/legate_sparse/array/util/scale_rect.cu @@ -38,11 +38,14 @@ __global__ void scale_rect1_kernel(size_t elems, template <> struct ScaleRect1ImplBody { + TaskContext context; + explicit ScaleRect1ImplBody(TaskContext context) : context(context) {} + void operator()(const AccessorRW, 1>& output, const int64_t scale, const Rect<1>& rect) { auto elems = rect.volume(); auto blocks = get_num_blocks_1d(elems); - auto stream = get_cached_stream(); + auto stream = context.get_task_stream(); scale_rect1_kernel<<>>(elems, rect.lo, output, scale); LEGATE_SPARSE_CHECK_CUDA_STREAM(stream); } diff --git a/src/legate_sparse/array/util/scale_rect_omp.cc b/src/legate_sparse/array/util/scale_rect_omp.cc index fc850bf3..1783d335 100644 --- a/src/legate_sparse/array/util/scale_rect_omp.cc +++ b/src/legate_sparse/array/util/scale_rect_omp.cc @@ -23,6 +23,9 @@ using namespace legate; template <> struct ScaleRect1ImplBody { + TaskContext context; + explicit ScaleRect1ImplBody(TaskContext context) : context(context) {} + void operator()(const AccessorRW, 1>& output, const int64_t scale, const Rect<1>& rect) { #pragma omp parallel for schedule(static) diff --git a/src/legate_sparse/array/util/scale_rect_template.inl b/src/legate_sparse/array/util/scale_rect_template.inl index 11724c24..512dc2c8 100644 --- a/src/legate_sparse/array/util/scale_rect_template.inl +++ 
b/src/legate_sparse/array/util/scale_rect_template.inl @@ -29,13 +29,16 @@ struct ScaleRect1ImplBody; template struct ScaleRect1Impl { + TaskContext context; + explicit ScaleRect1Impl(TaskContext context) : context(context) {} + void operator()(ScaleRect1Args& args) const { auto output = args.out.read_write_accessor, 1>(); if (args.out.domain().empty()) { return; } - ScaleRect1ImplBody()(output, args.scale, args.out.shape<1>()); + ScaleRect1ImplBody{context}(output, args.scale, args.out.shape<1>()); } }; @@ -45,7 +48,7 @@ static void scale_rect_1_template(TaskContext context) auto task = context.task_; auto scale = task->futures[0].get_result(); ScaleRect1Args args{context.outputs()[0], scale}; - ScaleRect1Impl{}(args); + ScaleRect1Impl{context}(args); } } // namespace sparse diff --git a/src/legate_sparse/array/util/unzip_rect.cc b/src/legate_sparse/array/util/unzip_rect.cc index 1272e9cc..08170da7 100644 --- a/src/legate_sparse/array/util/unzip_rect.cc +++ b/src/legate_sparse/array/util/unzip_rect.cc @@ -23,6 +23,9 @@ using namespace legate; template <> struct UnZipRect1ImplBody { + TaskContext context; + explicit UnZipRect1ImplBody(TaskContext context) : context(context) {} + void operator()(const AccessorWO& out1, const AccessorWO& out2, const AccessorRO, 1>& in, diff --git a/src/legate_sparse/array/util/unzip_rect.cu b/src/legate_sparse/array/util/unzip_rect.cu index 28067190..d7964c01 100644 --- a/src/legate_sparse/array/util/unzip_rect.cu +++ b/src/legate_sparse/array/util/unzip_rect.cu @@ -39,6 +39,9 @@ __global__ void unzip_rect1_kernel(size_t elems, template <> struct UnZipRect1ImplBody { + TaskContext context; + explicit UnZipRect1ImplBody(TaskContext context) : context(context) {} + void operator()(const AccessorWO& out1, const AccessorWO& out2, const AccessorRO, 1>& in, @@ -46,7 +49,7 @@ struct UnZipRect1ImplBody { { auto elems = rect.volume(); auto blocks = get_num_blocks_1d(elems); - auto stream = get_cached_stream(); + auto stream = 
context.get_task_stream(); unzip_rect1_kernel<<>>(elems, rect.lo, out1, out2, in); LEGATE_SPARSE_CHECK_CUDA_STREAM(stream); } diff --git a/src/legate_sparse/array/util/unzip_rect_omp.cc b/src/legate_sparse/array/util/unzip_rect_omp.cc index e57c43cd..b0345795 100644 --- a/src/legate_sparse/array/util/unzip_rect_omp.cc +++ b/src/legate_sparse/array/util/unzip_rect_omp.cc @@ -23,6 +23,9 @@ using namespace legate; template <> struct UnZipRect1ImplBody { + TaskContext context; + explicit UnZipRect1ImplBody(TaskContext context) : context(context) {} + void operator()(const AccessorWO& out1, const AccessorWO& out2, const AccessorRO, 1>& in, diff --git a/src/legate_sparse/array/util/unzip_rect_template.inl b/src/legate_sparse/array/util/unzip_rect_template.inl index d8bd9d2e..2c97d28d 100644 --- a/src/legate_sparse/array/util/unzip_rect_template.inl +++ b/src/legate_sparse/array/util/unzip_rect_template.inl @@ -29,6 +29,9 @@ struct UnZipRect1ImplBody; template struct UnZipRect1Impl { + TaskContext context; + explicit UnZipRect1Impl(TaskContext context) : context(context) {} + void operator()(UnZipRect1Args& args) const { auto out1 = args.out1.write_accessor(); @@ -37,7 +40,7 @@ struct UnZipRect1Impl { if (args.in.domain().empty()) { return; } - UnZipRect1ImplBody()(out1, out2, in, args.in.shape<1>()); + UnZipRect1ImplBody{context}(out1, out2, in, args.in.shape<1>()); } }; @@ -46,7 +49,7 @@ static void unzip_rect_1_template(TaskContext context) { auto outputs = context.outputs(); UnZipRect1Args args{outputs[0], outputs[1], context.inputs()[0]}; - UnZipRect1Impl{}(args); + UnZipRect1Impl{context}(args); } } // namespace sparse diff --git a/src/legate_sparse/array/util/zip_to_rect.cc b/src/legate_sparse/array/util/zip_to_rect.cc index c8871583..dcbd8dfb 100644 --- a/src/legate_sparse/array/util/zip_to_rect.cc +++ b/src/legate_sparse/array/util/zip_to_rect.cc @@ -23,6 +23,9 @@ using namespace legate; template struct ZipToRect1ImplBody { + TaskContext context; + explicit 
ZipToRect1ImplBody(TaskContext context) : context(context) {} + void operator()(const AccessorWO, 1>& output, const AccessorRO& lo, const AccessorRO& hi, diff --git a/src/legate_sparse/array/util/zip_to_rect.cu b/src/legate_sparse/array/util/zip_to_rect.cu index 697422e0..393c5860 100644 --- a/src/legate_sparse/array/util/zip_to_rect.cu +++ b/src/legate_sparse/array/util/zip_to_rect.cu @@ -39,12 +39,15 @@ __global__ void zip_rect1_kernel(size_t elems, template struct ZipToRect1ImplBody { + TaskContext context; + explicit ZipToRect1ImplBody(TaskContext context) : context(context) {} + void operator()(const AccessorWO, 1>& output, const AccessorRO& lo, const AccessorRO& hi, const Rect<1>& rect) { - auto stream = get_cached_stream(); + auto stream = context.get_task_stream(); auto elems = rect.volume(); auto blocks = get_num_blocks_1d(elems); zip_rect1_kernel<<>>(elems, rect.lo, output, lo, hi); diff --git a/src/legate_sparse/array/util/zip_to_rect_omp.cc b/src/legate_sparse/array/util/zip_to_rect_omp.cc index 03738d36..0b72d8d3 100644 --- a/src/legate_sparse/array/util/zip_to_rect_omp.cc +++ b/src/legate_sparse/array/util/zip_to_rect_omp.cc @@ -23,6 +23,9 @@ using namespace legate; template struct ZipToRect1ImplBody { + TaskContext context; + explicit ZipToRect1ImplBody(TaskContext context) : context(context) {} + void operator()(const AccessorWO, 1>& output, const AccessorRO& lo, const AccessorRO& hi, diff --git a/src/legate_sparse/array/util/zip_to_rect_template.inl b/src/legate_sparse/array/util/zip_to_rect_template.inl index c53411e8..7ebad169 100644 --- a/src/legate_sparse/array/util/zip_to_rect_template.inl +++ b/src/legate_sparse/array/util/zip_to_rect_template.inl @@ -29,6 +29,9 @@ struct ZipToRect1ImplBody; template struct ZipToRect1Impl { + TaskContext context; + explicit ZipToRect1Impl(TaskContext context) : context(context) {} + void operator()(ZipToRect1Args& args) const { auto output = args.out.write_accessor, 1>(); @@ -37,7 +40,7 @@ struct 
ZipToRect1Impl { if (args.out.domain().empty()) { return; } - ZipToRect1ImplBody()(output, lo, hi, args.out.shape<1>()); + ZipToRect1ImplBody{context}(output, lo, hi, args.out.shape<1>()); } }; @@ -47,10 +50,10 @@ static void zip_to_rect_1_template(TaskContext context) auto inputs = context.inputs(); ZipToRect1Args args{context.outputs()[0], inputs[0], inputs[1]}; if (inputs[0].data().type().code() == legate::Type::Code::INT64) { - ZipToRect1Impl{}(args); + ZipToRect1Impl{context}(args); } else { assert(inputs[0].data().type().code() == legate::Type::Code::UINT64); - ZipToRect1Impl{}(args); + ZipToRect1Impl{context}(args); } } diff --git a/src/legate_sparse/cffi.h b/src/legate_sparse/cffi.h index cdd53926..584311aa 100644 --- a/src/legate_sparse/cffi.h +++ b/src/legate_sparse/cffi.h @@ -45,6 +45,13 @@ enum LegateSparseOpCode { // like iterative linear solvers. LEGATE_SPARSE_AXPBY, + // Sparse direct linear solve + LEGATE_SPARSE_SPSOLVE, + + // Computes GEAM: alpha * A + beta * B = C + LEGATE_SPARSE_GEAM_CSR_CSR_SYMBOLIC, + LEGATE_SPARSE_GEAM_CSR_CSR_COMPUTE, + // nonzero API LEGATE_SPARSE_NONZERO, diff --git a/src/legate_sparse/cudalibs.cu b/src/legate_sparse/cudalibs.cu index 6ec45bd5..d2607fb2 100644 --- a/src/legate_sparse/cudalibs.cu +++ b/src/legate_sparse/cudalibs.cu @@ -22,7 +22,7 @@ namespace sparse { -CUDALibraries::CUDALibraries() : finalized_(false), cusparse_(nullptr) {} +CUDALibraries::CUDALibraries() : finalized_(false), cusparse_(nullptr), cudss_(nullptr) {} CUDALibraries::~CUDALibraries() { finalize(); } @@ -34,6 +34,9 @@ void CUDALibraries::finalize() if (cusparse_ != nullptr) { finalize_cusparse(); } + if (cudss_ != nullptr) { + finalize_cudss(); + } finalized_ = true; } @@ -51,6 +54,20 @@ cusparseHandle_t CUDALibraries::get_cusparse() return this->cusparse_; } +void CUDALibraries::finalize_cudss() +{ + CHECK_CUDSS(cudssDestroy(cudss_)); + cudss_ = nullptr; +} + +cudssHandle_t CUDALibraries::get_cudss() +{ + if (this->cudss_ == nullptr) { + 
CHECK_CUDSS(cudssCreate(&this->cudss_)); + } + return this->cudss_; +} + static CUDALibraries& get_cuda_libraries(legate::Processor proc) { if (proc.kind() != legate::Processor::TOC_PROC) { @@ -63,16 +80,18 @@ static CUDALibraries& get_cuda_libraries(legate::Processor proc) return cuda_libraries[proc_id]; } -legate::cuda::StreamView get_cached_stream() +cusparseHandle_t get_cusparse() { - return legate::cuda::StreamPool::get_stream_pool().get_stream(); + const auto proc = legate::Processor::get_executing_processor(); + auto& lib = get_cuda_libraries(proc); + return lib.get_cusparse(); } -cusparseHandle_t get_cusparse() +cudssHandle_t get_cudss() { const auto proc = legate::Processor::get_executing_processor(); auto& lib = get_cuda_libraries(proc); - return lib.get_cusparse(); + return lib.get_cudss(); } class LoadCUDALibsTask : public SparseTask { @@ -86,6 +105,7 @@ class LoadCUDALibsTask : public SparseTask { const auto proc = legate::Processor::get_executing_processor(); auto& lib = get_cuda_libraries(proc); lib.get_cusparse(); + lib.get_cudss(); } }; diff --git a/src/legate_sparse/cudalibs.h b/src/legate_sparse/cudalibs.h index 5a387200..47596a79 100644 --- a/src/legate_sparse/cudalibs.h +++ b/src/legate_sparse/cudalibs.h @@ -33,13 +33,16 @@ struct CUDALibraries { public: void finalize(); cusparseHandle_t get_cusparse(); + cudssHandle_t get_cudss(); private: void finalize_cusparse(); + void finalize_cudss(); private: bool finalized_; cusparseHandle_t cusparse_; + cudssHandle_t cudss_; }; } // namespace sparse diff --git a/src/legate_sparse/linalg/axpby.cc b/src/legate_sparse/linalg/axpby.cc index 547ad927..43e99520 100644 --- a/src/legate_sparse/linalg/axpby.cc +++ b/src/legate_sparse/linalg/axpby.cc @@ -23,6 +23,9 @@ using namespace legate; template struct AXPBYImplBody { + TaskContext context; + explicit AXPBYImplBody(TaskContext context) : context(context) {} + using VAL_TY = type_of; void operator()(const AccessorRW& y, diff --git 
a/src/legate_sparse/linalg/axpby.cu b/src/legate_sparse/linalg/axpby.cu index 784e77a3..f7ee1feb 100644 --- a/src/legate_sparse/linalg/axpby.cu +++ b/src/legate_sparse/linalg/axpby.cu @@ -48,6 +48,9 @@ __global__ void axpby_kernel(size_t elems, template struct AXPBYImplBody { + TaskContext context; + explicit AXPBYImplBody(TaskContext context) : context(context) {} + using VAL_TY = type_of; void operator()(const AccessorRW& y, @@ -58,7 +61,7 @@ struct AXPBYImplBody { { auto elems = rect.volume(); auto blocks = get_num_blocks_1d(elems); - auto stream = get_cached_stream(); + auto stream = context.get_task_stream(); axpby_kernel <<>>(elems, rect.lo[0], y, x, a, b); LEGATE_SPARSE_CHECK_CUDA_STREAM(stream); diff --git a/src/legate_sparse/linalg/axpby_omp.cc b/src/legate_sparse/linalg/axpby_omp.cc index eb019b5c..c5569fba 100644 --- a/src/legate_sparse/linalg/axpby_omp.cc +++ b/src/legate_sparse/linalg/axpby_omp.cc @@ -23,6 +23,9 @@ using namespace legate; template struct AXPBYImplBody { + TaskContext context; + explicit AXPBYImplBody(TaskContext context) : context(context) {} + using VAL_TY = type_of; void operator()(const AccessorRW& y, diff --git a/src/legate_sparse/linalg/axpby_template.inl b/src/legate_sparse/linalg/axpby_template.inl index 8651672d..d110f6de 100644 --- a/src/legate_sparse/linalg/axpby_template.inl +++ b/src/legate_sparse/linalg/axpby_template.inl @@ -29,6 +29,9 @@ struct AXPBYImplBody; template struct AXPBYImpl { + TaskContext context; + explicit AXPBYImpl(TaskContext context) : context(context) {} + template void operator()(AXPBYArgs& args) const { @@ -42,15 +45,15 @@ struct AXPBYImpl { } if (args.isalpha) { if (args.negate) { - AXPBYImplBody()(y, x, a, b, args.y.shape<1>()); + AXPBYImplBody{context}(y, x, a, b, args.y.shape<1>()); } else { - AXPBYImplBody()(y, x, a, b, args.y.shape<1>()); + AXPBYImplBody{context}(y, x, a, b, args.y.shape<1>()); } } else { if (args.negate) { - AXPBYImplBody()(y, x, a, b, args.y.shape<1>()); + 
AXPBYImplBody{context}(y, x, a, b, args.y.shape<1>()); } else { - AXPBYImplBody()(y, x, a, b, args.y.shape<1>()); + AXPBYImplBody{context}(y, x, a, b, args.y.shape<1>()); } } } @@ -67,7 +70,7 @@ static void axpby_template(TaskContext context) context.scalars()[0].value(), context.scalars()[1].value(), }; - value_type_dispatch(args.y.code(), AXPBYImpl{}, args); + value_type_dispatch(args.y.code(), AXPBYImpl{context}, args); } } // namespace sparse diff --git a/src/legate_sparse/linalg/spsolve.cc b/src/legate_sparse/linalg/spsolve.cc new file mode 100644 index 00000000..446d9e04 --- /dev/null +++ b/src/legate_sparse/linalg/spsolve.cc @@ -0,0 +1,34 @@ +/* Copyright 2022-2024 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + */ + +#include "legate_sparse/linalg/spsolve.h" +#include "legate_sparse/util/dispatch.h" +#include "legate_sparse/util/legate_utils.h" + +namespace sparse { + +using namespace legate; + +namespace // unnamed +{ +static const auto sparse_reg_task_ = []() -> char { + SpSolve::register_variants(); + return 0; +}(); + +} // namespace + +} // namespace sparse diff --git a/src/legate_sparse/linalg/spsolve.cu b/src/legate_sparse/linalg/spsolve.cu new file mode 100644 index 00000000..ace6fe5e --- /dev/null +++ b/src/legate_sparse/linalg/spsolve.cu @@ -0,0 +1,184 @@ +/* Copyright 2022-2024 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + */ + +#include "legate_sparse/linalg/spsolve.h" +#include "legate_sparse/util/cusparse_utils.h" +#include "legate_sparse/util/cudss_utils.h" +#include "legate_sparse/util/dispatch.h" +#include "legate_sparse/util/legate_utils.h" + +namespace sparse { + +struct SpSolveImpl { + TaskContext context; + explicit SpSolveImpl(TaskContext context) : context(context) {} + + template + void operator()(SpSolveArgs& args, int num_gpus) const + { + using INDEX_TY = type_of; + using VAL_TY = type_of; + + auto& A_pos = args.A_pos; + auto& A_crd = args.A_crd; + auto& A_vals = args.A_vals; + auto& b = args.b; + auto& x = args.x; // output + auto comms = args.comms; + uint64_t nrows_g = args.nrows_g; + uint64_t nnz_g = args.nnz_g; + uint64_t ncols_g = nrows_g; + + int hybrid_mode = 0; // 0 = GPU-only execution in cuDSS + + // cuDSS handle and stream set + auto handle = get_cudss(); + auto stream = context.get_task_stream(); + CHECK_CUDSS(cudssSetStream(handle, stream)); + + // create configuration and data objects + cudssConfig_t config; + cudssData_t solverData; + + CHECK_CUDSS(cudssConfigCreate(&config)); + CHECK_CUDSS(cudssConfigSet(config, CUDSS_CONFIG_HYBRID_MODE, &hybrid_mode, sizeof(int))); + CHECK_CUDSS(cudssDataCreate(handle, &solverData)); + + // A x = b + // (m, n) (n, 1) = (m, 1); m = nrows, n = ncols + // _l: local (e.g., shape of the partitioned array) + // _g: global (e.g., global shape of the array) + + int64_t nrows_l = A_pos.domain().get_volume(); + int64_t ncols_l = x.domain().get_volume(); + int64_t nnz_l = A_vals.domain().get_volume(); + + int64_t nrhs = 1; // Number of right-hand side + int64_t ldb = nrows_g; // leading dimension of b + int64_t ldx = ncols_g; // leading dimension of x + + auto A_indptr = CREATE_BUFFER(int64_t, nrows_l + 1, Memory::GPU_FB_MEM, "A_indptr"); + { + auto blocks = get_num_blocks_1d(nrows_l); + convertGlobalPosToLocalIndPtr<<>>( + nrows_l, A_pos.read_accessor, 1>().ptr(A_pos.domain().lo()), A_indptr.ptr(0)); + } + + 
CHECK_CUDSS(cudssSetStream(handle, stream)); + + cudssMatrix_t mat_A, vec_b, vec_x; + CHECK_CUDSS(cudssMatrixCreateCsr(&mat_A, // pointer to the matrix + nrows_g, // number of rows + ncols_g, // number of columns + nnz_g, // number of non-zeros + (void*)A_indptr.ptr(0), // offsets, + nullptr, // end index if start index was used + getPtrFromStore(A_crd), // column indices + getPtrFromStore(A_vals), // values + cudssIndexType(), // indexType + cudssDataType(), // valueType + CUDSS_MTYPE_GENERAL, // matrix type + CUDSS_MVIEW_FULL, // matrix view + CUDSS_BASE_ZERO // indexBase + )); + + // NOTE: + // nrhs should be derived from b (b.shape[1]) and MUST be 1 right now. + // When we support multi-dimensional right-hand sides, we need to + // make sure that a column major order is chosen in the mapper + + auto x_ptr = getPtrFromStore(x); + + // Create dense output vector, x, of shape (ncol_g, nrhs) + CHECK_CUDSS(cudssMatrixCreateDn(&vec_x, + ncols_g, // number of rows + nrhs, // number of RHS, set to 1 + ldx, // Leading dimension of x + (void*)x_ptr, // Values of the dense matrix + cudssDataType(), // Data type of the dense vector + CUDSS_LAYOUT_COL_MAJOR) // Layout + ); + + auto b_ptr = getPtrFromStore(b); + + // Create dense RHS vector, b, of shape (nrows_g, nrhs) + CHECK_CUDSS(cudssMatrixCreateDn(&vec_b, + nrows_g, // number of rows + nrhs, // number of RHS, set to 1 + ldb, // Leading dimension of b + (void*)b_ptr, // Values of the dense matrix + cudssDataType(), // Data type of the dense vector + CUDSS_LAYOUT_COL_MAJOR) // Layout + ); + + // Matrix and Vectors are partitioned row-wise + if (num_gpus > 1) { + ncclComm_t* comm = comms[0].get(); + cudssMatrixSetDistributionRow1d(mat_A, + static_cast(A_pos.domain().lo()[0]), + static_cast(A_pos.domain().hi()[0])); + cudssMatrixSetDistributionRow1d( + vec_b, static_cast(b.domain().lo()[0]), static_cast(b.domain().hi()[0])); + cudssMatrixSetDistributionRow1d( + vec_x, static_cast(x.domain().lo()[0]), 
static_cast(x.domain().hi()[0])); + + // path to libcudss_commlayer_nccl.so is obtained from the env CUDSS_COMM_LIB + CHECK_CUDSS(cudssSetCommLayer(handle, nullptr)); + CHECK_CUDSS(cudssDataSet(handle, solverData, CUDSS_DATA_COMM, comm, sizeof(ncclComm_t*))); + } + + // Solve + CHECK_CUDSS( + cudssExecute(handle, CUDSS_PHASE_ANALYSIS, config, solverData, mat_A, vec_x, vec_b)); + + CHECK_CUDSS( + cudssExecute(handle, CUDSS_PHASE_FACTORIZATION, config, solverData, mat_A, vec_x, vec_b)); + + CHECK_CUDSS(cudssExecute(handle, CUDSS_PHASE_SOLVE, config, solverData, mat_A, vec_x, vec_b)); + + // Destroy matrix, vectors, and setup + CHECK_CUDSS(cudssMatrixDestroy(mat_A)); + CHECK_CUDSS(cudssMatrixDestroy(vec_x)); + CHECK_CUDSS(cudssMatrixDestroy(vec_b)); + CHECK_CUDSS(cudssDataDestroy(handle, solverData)); + CHECK_CUDSS(cudssConfigDestroy(config)); + + LEGATE_SPARSE_CHECK_CUDA(cudaStreamSynchronize(stream)); + } +}; + +/* static */ void SpSolve::gpu_variant(TaskContext context) +{ + auto inputs = context.inputs(); + auto outputs = context.outputs(); + auto comms = context.communicators(); + + SpSolveArgs args{inputs[0], // A_pos + inputs[1], // A_crd + inputs[2], // A_vals + inputs[3], // b + outputs[0], // x + context.scalars()[0].value(), // nrows_g + context.scalars()[1].value(), // nnz_g + comms}; + int num_gpus = static_cast(context.get_launch_domain().hi()[0]) + 1; + index_type_floating_point_value_type_dispatch( + args.A_crd.code(), args.A_vals.code(), SpSolveImpl{context}, args, num_gpus); +} + +using namespace legate; + +} // namespace sparse diff --git a/src/legate_sparse/linalg/spsolve.h b/src/legate_sparse/linalg/spsolve.h new file mode 100644 index 00000000..68908f3e --- /dev/null +++ b/src/legate_sparse/linalg/spsolve.h @@ -0,0 +1,48 @@ +/* Copyright 2022-2024 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +#pragma once + +#include "legate_sparse/sparse.h" +#include "legate_sparse/sparse_c.h" +#include "legate.h" + +namespace sparse { + +struct SpSolveArgs { + const legate::PhysicalStore& A_pos; + const legate::PhysicalStore& A_crd; + const legate::PhysicalStore& A_vals; + const legate::PhysicalStore& b; + const legate::PhysicalStore& x; // output + const uint64_t nrows_g; // global number of rows + const uint64_t nnz_g; // global number of nonzeros + std::vector comms; +}; + +class SpSolve : public SparseTask { + public: + static inline const auto TASK_CONFIG = + legate::TaskConfig{legate::LocalTaskID{LEGATE_SPARSE_SPSOLVE}}; + static constexpr legate::VariantOptions GPU_VARIANT_OPTIONS = + legate::VariantOptions{}.with_has_allocations(true); + +#ifdef LEGATE_USE_CUDA + static void gpu_variant(legate::TaskContext ctx); +#endif +}; + +} // namespace sparse diff --git a/src/legate_sparse/mapper/mapper.cc b/src/legate_sparse/mapper/mapper.cc index 6357d898..8c330b9d 100644 --- a/src/legate_sparse/mapper/mapper.cc +++ b/src/legate_sparse/mapper/mapper.cc @@ -126,6 +126,10 @@ std::optional LegateSparseMapper::allocation_pool_size(const Task& return std::nullopt; } + case LEGATE_SPARSE_SPSOLVE: { + return std::nullopt; + } + default: { // Handle any unhandled enum values LEGATE_ABORT("Unsupported Legate Sparse task_id: " + std::to_string(task_id)); diff --git a/src/legate_sparse/partition/fast_image_partition.cu b/src/legate_sparse/partition/fast_image_partition.cu index 47a79606..1825bce0 100644 --- 
a/src/legate_sparse/partition/fast_image_partition.cu +++ b/src/legate_sparse/partition/fast_image_partition.cu @@ -28,6 +28,9 @@ using namespace legate; template struct FastImageRangeImplBody { + TaskContext context; + explicit FastImageRangeImplBody(TaskContext context) : context(context) {} + using INDEX_TY = type_of; void operator()(const AccessorWO, 1>& out_pos, @@ -37,7 +40,7 @@ struct FastImageRangeImplBody { const Rect<1>& bounds) { ThrustAllocator alloc(Memory::GPU_FB_MEM); - auto stream = get_cached_stream(); + auto stream = context.get_task_stream(); auto thrust_exec_policy = thrust::cuda::par(alloc).on(stream); thrust::pair result = thrust::minmax_element( diff --git a/src/legate_sparse/partition/fast_image_partition_template.inl b/src/legate_sparse/partition/fast_image_partition_template.inl index 4e74731e..2b7cb3e6 100644 --- a/src/legate_sparse/partition/fast_image_partition_template.inl +++ b/src/legate_sparse/partition/fast_image_partition_template.inl @@ -30,6 +30,9 @@ struct FastImageRangeImplBody; template struct FastImageRangeImpl { + TaskContext context; + explicit FastImageRangeImpl(TaskContext context) : context(context) {} + template void operator()(FastImageRangeArgs& args) const { @@ -43,7 +46,7 @@ struct FastImageRangeImpl { if (args.input_crd.domain().empty()) { return; } - FastImageRangeImplBody()( + FastImageRangeImplBody{context}( output_pos, input_pos, input_crd, args.input_pos.shape<1>(), args.input_crd.shape<1>()); } }; @@ -52,7 +55,7 @@ template static void fast_image_range_template(TaskContext context) { FastImageRangeArgs args{context.output(0), context.input(0), context.input(1)}; - index_type_dispatch(args.input_crd.code(), FastImageRangeImpl{}, args); + index_type_dispatch(args.input_crd.code(), FastImageRangeImpl{context}, args); } } // namespace sparse diff --git a/src/legate_sparse/util/cuda_help.h b/src/legate_sparse/util/cuda_help.h index d009f9d6..61e83da0 100644 --- a/src/legate_sparse/util/cuda_help.h +++ 
b/src/legate_sparse/util/cuda_help.h @@ -18,9 +18,13 @@ #include #include "legate.h" -#include "legate/cuda/cuda.h" -#include "legate/cuda/stream_pool.h" + +// For sparse matrix ops like spGEMM and spMv #include + +// For direct solvers +#include + #include #define THREADS_PER_BLOCK 128 @@ -31,6 +35,12 @@ check_cusparse(result, __FILE__, __LINE__); \ } while (false) +#define CHECK_CUDSS(expr) \ + do { \ + cudssStatus_t result = (expr); \ + check_cudss(result, __FILE__, __LINE__); \ + } while (false) + #define CHECK_NCCL(expr) \ do { \ ncclResult_t result = (expr); \ @@ -102,6 +112,24 @@ __host__ inline void check_cusparse(cusparseStatus_t status, const char* file, i } } +__host__ inline void check_cudss(cudssStatus_t status, const char* file, int line) +{ + // TODO: Need to get the equivalent error message from cuDSS + if (status != CUDSS_STATUS_SUCCESS) { + fprintf(stderr, + "Internal CUDSS failure with error code %d in file %s at line %d\n", + status, + // TODO + file, + line); +#ifdef DEBUG_LEGATE_SPARSE + assert(false); +#else + exit(status); +#endif + } +} + __host__ inline void check_nccl(ncclResult_t error, const char* file, int line) { if (error != ncclSuccess) { @@ -118,10 +146,9 @@ __host__ inline void check_nccl(ncclResult_t error, const char* file, int line) } } -// Return a cached stream for the current GPU. -legate::cuda::StreamView get_cached_stream(); - // Method to get the CUSPARSE handle associated with the current GPU. cusparseHandle_t get_cusparse(); +cudssHandle_t get_cudss(); + } // namespace sparse diff --git a/src/legate_sparse/util/cudss_utils.h b/src/legate_sparse/util/cudss_utils.h new file mode 100644 index 00000000..e72d466c --- /dev/null +++ b/src/legate_sparse/util/cudss_utils.h @@ -0,0 +1,73 @@ +/* Copyright 2022-2024 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +#pragma once + +#include "legate_sparse/sparse.h" +#include "legate_sparse/util/cuda_help.h" +#include "legate_sparse/util/legate_utils.h" + +namespace sparse { + +using namespace legate; + +// Template dispatch for value type. +// Note: cuDSS only supports floating-point and complex types. +// Integer and boolean types are not supported by cuDSS. +template +cudaDataType_t cudssDataType(); + +template <> +inline cudaDataType_t cudssDataType() +{ + return CUDA_R_32F; +} + +template <> +inline cudaDataType_t cudssDataType() +{ + return CUDA_R_64F; +} + +template <> +inline cudaDataType_t cudssDataType>() +{ + return CUDA_C_32F; +} + +template <> +inline cudaDataType_t cudssDataType>() +{ + return CUDA_C_64F; +} + +// Template dispatch for the index type. 
+template +cudaDataType_t cudssIndexType(); + +template <> +inline cudaDataType_t cudssIndexType() +{ + return CUDA_R_32I; +} + +template <> +inline cudaDataType_t cudssIndexType() +{ + return CUDA_R_64I; +} + +} // namespace sparse diff --git a/src/legate_sparse/util/cusparse_utils.h b/src/legate_sparse/util/cusparse_utils.h index 6d496a3e..3ea1029b 100644 --- a/src/legate_sparse/util/cusparse_utils.h +++ b/src/legate_sparse/util/cusparse_utils.h @@ -14,9 +14,12 @@ * */ +#pragma once + #include "legate_sparse/sparse.h" #include "legate_sparse/util/cuda_help.h" #include "legate_sparse/util/legate_utils.h" +#include namespace sparse { @@ -75,8 +78,7 @@ void* getPtrFromStore(const legate::PhysicalStore& store) } else if (!store.is_writable() && store.is_readable()) { return const_cast(store.read_accessor().ptr(dom.lo())); } else if (store.is_reducible()) { - return store.reduce_accessor, true /* exclusive */, DIM>().ptr( - dom.lo()); + return store.reduce_accessor, true, DIM>().ptr(dom.lo()); } else { assert(false); return nullptr; @@ -100,13 +102,13 @@ inline cudaDataType cusparseDataType() } template <> -inline cudaDataType cusparseDataType>() +inline cudaDataType cusparseDataType>() { return CUDA_C_32F; } template <> -inline cudaDataType cusparseDataType>() +inline cudaDataType cusparseDataType>() { return CUDA_C_64F; } @@ -133,10 +135,10 @@ template cusparseSpMatDescr_t makeCuSparseCSR(const legate::PhysicalStore& pos, const legate::PhysicalStore& crd, const legate::PhysicalStore& vals, - size_t cols) + size_t cols, + cudaStream_t stream) { cusparseSpMatDescr_t matDescr; - auto stream = get_cached_stream(); auto pos_domain = pos.domain(); auto crd_domain = crd.domain(); @@ -169,10 +171,10 @@ template cusparseSpMatDescr_t makeCuSparseCSC(const legate::PhysicalStore& pos, const legate::PhysicalStore& crd, const legate::PhysicalStore& vals, - size_t rows) + size_t rows, + cudaStream_t stream) { cusparseSpMatDescr_t matDescr; - auto stream = get_cached_stream(); auto 
pos_domain = pos.domain(); auto crd_domain = crd.domain(); @@ -237,7 +239,7 @@ cusparseDnMatDescr_t makeCuSparseDenseMat(const legate::PhysicalStore& mat) valsPtr = const_cast(acc.ptr(d.lo())); ld = acc.accessor.strides[0] / sizeof(VAL_TY); } else if (mat.is_reducible()) { - auto acc = mat.reduce_accessor, true /* exclusive */, 2>(); + auto acc = mat.reduce_accessor, true, 2>(); valsPtr = acc.ptr(d.lo()); ld = acc.accessor.strides[0] / sizeof(VAL_TY); } else { diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index a8b2f17e..0629e734 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -42,7 +42,9 @@ def _create_mask(rows, density=0.3): row_idx = numpy.random.randint(0, rows, size=nnz) col_idx = numpy.random.randint(0, cols, size=nnz) data = numpy.ones(nnz, dtype=bool) - A_scipy = scipy_sparse.csr_array((data, (row_idx, col_idx)), shape=(rows, cols)) + A_scipy = scipy_sparse.csr_array( + (data, (row_idx, col_idx)), shape=(rows, cols) + ) # Sparse A_sparse = sparse.csr_array(A_scipy.todense()) @@ -90,7 +92,9 @@ def create_matrix(): """ def _create_matrix(N, tol=0.5): - _, A_scipy, _ = simple_system_gen(N, N, scipy_sparse.csr_array, tol=tol) + _, A_scipy, _ = simple_system_gen( + N, N, scipy_sparse.csr_array, tol=tol + ) A_sparse = sparse.csr_array(A_scipy) # Verify matrices are equivalent @@ -103,3 +107,318 @@ def _create_matrix(N, tol=0.5): return A_scipy, A_sparse return _create_matrix + + +@pytest.fixture +def create_tridiagonal_complex_hermitian_matrix(): + """Create a tridiagonal complex Hermitian sparse matrix. + + This fixture creates a tridiagonal complex Hermitian sparse matrix suitable + for eigenvalue computations. The matrix has a real main diagonal and complex + conjugate off-diagonals. + + Parameters + ---------- + N : int + Number of rows (and columns) in the square matrix. + + Returns + ------- + scipy.sparse.csr_array + A tridiagonal complex Hermitian sparse matrix in SciPy CSR format. 
+ + Notes + ----- + The matrix is constructed with: + - Main diagonal: 4.0 + - Upper diagonal: -(1.0 + 1.0j) + - Lower diagonal: -(1.0 - 1.0j) (complex conjugate) + + """ + + def _create_tridiagonal_complex_hermitian_matrix(N: int): + """Returns a scipy.sparse csr_array that is tridiagonal Hermitian""" + main_diag_val = 4.0 + off_diag_val = -(1.0 + 1.0j) + + main_diag = numpy.full(N, main_diag_val) + upper_diag = numpy.full(N - 1, off_diag_val) + lower_diag = numpy.full(N - 1, numpy.conjugate(off_diag_val)) + + diagonals = [lower_diag, main_diag, upper_diag] + offsets = [-1, 0, 1] + + A = scipy_sparse.diags( + diagonals, + offsets, + shape=(N, N), + format="csr", + dtype=numpy.complex128, + ) + + return A + + return _create_tridiagonal_complex_hermitian_matrix + + +@pytest.fixture +def create_tridiagonal_real_symmetric_matrix(): + """Create a tridiagonal real symmetric sparse matrix. + + This fixture creates a tridiagonal real symmetric sparse matrix suitable + for eigenvalue computations. The matrix has a constant main diagonal and + constant off-diagonals. + + Parameters + ---------- + N : int + Number of rows (and columns) in the square matrix. + + Returns + ------- + scipy.sparse.csr_array + A tridiagonal real symmetric sparse matrix in SciPy CSR format. 
+ + Notes + ----- + The matrix is constructed with: + - Main diagonal: 4.0 + - Upper diagonal: -1.0 + - Lower diagonal: -1.0 + + """ + + def _create_tridiagonal_real_symmetric_matrix(N: int): + """Returns a scipy.sparse csr_array that is tridiagonal symmetric""" + main_diag_val = 4.0 + off_diag_val = -1.0 + + main_diag = numpy.full(N, main_diag_val) + upper_diag = numpy.full(N - 1, off_diag_val) + lower_diag = numpy.full(N - 1, numpy.conjugate(off_diag_val)) + + diagonals = [lower_diag, main_diag, upper_diag] + offsets = [-1, 0, 1] + + A = scipy_sparse.diags( + diagonals, offsets, shape=(N, N), format="csr", dtype=numpy.float64 + ) + + return A + + return _create_tridiagonal_real_symmetric_matrix + + +@pytest.fixture +def create_sparse_real_symmetric_matrix(): + """Create a generic real symmetric sparse matrix with random sparsity. + + This fixture creates a real symmetric sparse matrix suitable for eigenvalue + computations. The sparsity pattern changes with N, making it suitable for + testing across different matrix sizes. + + Parameters + ---------- + N : int + Number of rows (and columns) in the square matrix. + density : float, optional + Approximate density of non-zero elements. Default is 0.3. + seed : int, optional + Random seed for reproducibility. Default is 42. + + Returns + ------- + scipy.sparse.csr_array + A real symmetric sparse matrix in SciPy CSR format. + + Notes + ----- + The matrix is constructed by: + 1. Generating a random sparse matrix + 2. Making it symmetric: A = (A + A.T) / 2 + 3. 
Adding a diagonal component to ensure positive definiteness + + """ + + def _create_sparse_real_symmetric_matrix(N: int, density=0.3, seed=42): + """Returns a scipy.sparse csr_array that is symmetric with random sparsity""" + numpy.random.seed(seed) + + # Generate random sparse matrix + nnz = int(N * N * density) + row_idx = numpy.random.randint(0, N, size=nnz) + col_idx = numpy.random.randint(0, N, size=nnz) + data = numpy.random.randn(nnz) + + A = scipy_sparse.csr_array((data, (row_idx, col_idx)), shape=(N, N)) + + # Make it symmetric: A = (A + A.T) / 2 + A = (A + A.T) / 2 + + # Add diagonal dominance to ensure well-conditioned matrix + # This helps with convergence in eigenvalue computations + A = A + scipy_sparse.eye(N, format="csr") * N + + return A + + return _create_sparse_real_symmetric_matrix + + +@pytest.fixture +def create_sparse_complex_hermitian_matrix(): + """Create a generic complex Hermitian sparse matrix with random sparsity. + + This fixture creates a complex Hermitian sparse matrix suitable for + eigenvalue computations. The sparsity pattern changes with N, making it + suitable for testing across different matrix sizes. + + Parameters + ---------- + N : int + Number of rows (and columns) in the square matrix. + density : float, optional + Approximate density of non-zero elements. Default is 0.3. + seed : int, optional + Random seed for reproducibility. Default is 42. + + Returns + ------- + scipy.sparse.csr_array + A complex Hermitian sparse matrix in SciPy CSR format. + + Notes + ----- + The matrix is constructed by: + 1. Generating a random complex sparse matrix + 2. Making it Hermitian: A = (A + A.H) / 2 + 3. 
Adding a diagonal component to ensure positive definiteness + + """ + + def _create_sparse_complex_hermitian_matrix(N: int, density=0.3, seed=42): + """Returns a scipy.sparse csr_array that is Hermitian with random sparsity""" + numpy.random.seed(seed) + + # Generate random complex sparse matrix + nnz = int(N * N * density) + row_idx = numpy.random.randint(0, N, size=nnz) + col_idx = numpy.random.randint(0, N, size=nnz) + data_real = numpy.random.randn(nnz) + data_imag = numpy.random.randn(nnz) + data = data_real + 1j * data_imag + + A = scipy_sparse.csr_array( + (data, (row_idx, col_idx)), shape=(N, N), dtype=numpy.complex128 + ) + + # Make it Hermitian: A = (A + A.H) / 2 + A = (A + A.conjugate().T) / 2 + + # Add diagonal dominance to ensure well-conditioned matrix + # This helps with convergence in eigenvalue computations + A = A + scipy_sparse.eye(N, format="csr", dtype=numpy.complex128) * N + + return A + + return _create_sparse_complex_hermitian_matrix + + +@pytest.fixture +def create_matrix_with_zero_diagonal(): + """Create a symmetric/Hermitian matrix with at least one zero diagonal entry. + + This fixture creates a sparse matrix with a missing diagonal element + to test error handling in eigenvalue computations. + + Parameters + ---------- + N : int + Number of rows (and columns) in the square matrix. + dtype : numpy.dtype + Data type of the matrix (numpy.float64 or numpy.complex128). + zero_index : int, optional + Index of the diagonal element to set to zero. Default is N//2. + density : float, optional + Approximate density of non-zero elements. Default is 0.3. + seed : int, optional + Random seed for reproducibility. Default is 42. + + Returns + ------- + scipy.sparse.csr_array + A sparse matrix with a zero diagonal entry. 
+ + """ + + def _create_matrix_with_zero_diagonal( + N: int, dtype=numpy.float64, zero_index=None, density=0.3, seed=42 + ): + """Returns a scipy.sparse csr_array with a zero diagonal entry""" + if zero_index is None: + zero_index = N // 2 + + numpy.random.seed(seed) + + # Generate random sparse matrix + nnz = int(N * N * density) + row_idx = numpy.random.randint(0, N, size=nnz) + col_idx = numpy.random.randint(0, N, size=nnz) + + if dtype == numpy.complex128: + data_real = numpy.random.randn(nnz) + data_imag = numpy.random.randn(nnz) + data = data_real + 1j * data_imag + A = scipy_sparse.csr_array( + (data, (row_idx, col_idx)), shape=(N, N), dtype=dtype + ) + # Make it Hermitian + A = (A + A.conjugate().T) / 2 + # Add diagonal dominance except for the zero index + diag_vals = numpy.full(N, N, dtype=dtype) + diag_vals[zero_index] = 0.0 + A = A + scipy_sparse.diags(diag_vals, 0, format="csr", dtype=dtype) + else: + data = numpy.random.randn(nnz) + A = scipy_sparse.csr_array( + (data, (row_idx, col_idx)), shape=(N, N) + ) + # Make it symmetric + A = (A + A.T) / 2 + # Add diagonal dominance except for the zero index + diag_vals = numpy.full(N, N, dtype=dtype) + diag_vals[zero_index] = 0.0 + A = A + scipy_sparse.diags(diag_vals, 0, format="csr") + + # Remove the zero from the sparse representation + A.eliminate_zeros() + + return A + + return _create_matrix_with_zero_diagonal + + +@pytest.fixture +def create_non_square_matrix(): + """Create a non-square matrix for testing error handling. + + Parameters + ---------- + rows : int + Number of rows in the matrix. + cols : int + Number of columns in the matrix. + dtype : numpy.dtype + Data type of the matrix. + + Returns + ------- + numpy.ndarray + A non-square dense matrix. 
+ + """ + + def _create_non_square_matrix(rows: int, cols: int, dtype=numpy.float64): + """Returns a non-square matrix""" + return numpy.random.randn(rows, cols).astype(dtype) + + return _create_non_square_matrix diff --git a/tests/integration/test_block_array.py b/tests/integration/test_block_array.py new file mode 100644 index 00000000..c4cbaad9 --- /dev/null +++ b/tests/integration/test_block_array.py @@ -0,0 +1,176 @@ +# Copyright 2024 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tests for block_array construction function.""" + +import sys + +import cupynumeric as np +import pytest +import scipy.sparse as sp + +import legate_sparse as sparse + +# Temporary release unblock for a known cupynumeric runtime issue. 
+pytestmark = pytest.mark.skip( + reason=( + "Temporarily disabled for release unblock: " + "https://github.com/nv-legate/cupynumeric/issues/1224" + ) +) + + +class TestBlockArray: + """Tests for the block_array function.""" + + def test_basic_2x2_blocks(self): + """Test basic 2x2 block assembly.""" + A = sparse.csr_array(np.array([[1, 2], [3, 4]])) + B = sparse.csr_array(np.array([[5, 6], [7, 8]])) + C = sparse.csr_array(np.array([[9, 10], [11, 12]])) + D = sparse.csr_array(np.array([[13, 14], [15, 16]])) + + result = sparse.block_array([[A, B], [C, D]]) + + expected = np.array( + [[1, 2, 5, 6], [3, 4, 7, 8], [9, 10, 13, 14], [11, 12, 15, 16]] + ) + assert np.array_equal(result.todense(), expected) + + def test_with_none_blocks(self): + """Test block assembly with None (zero) blocks.""" + A = sparse.csr_array(np.array([[1, 2], [3, 4]])) + B = sparse.csr_array(np.array([[5, 6], [7, 8]])) + + result = sparse.block_array([[A, None], [None, B]]) + + expected = np.array( + [[1, 2, 0, 0], [3, 4, 0, 0], [0, 0, 5, 6], [0, 0, 7, 8]] + ) + assert np.array_equal(result.todense(), expected) + + def test_rectangular_blocks(self): + """Test with rectangular blocks.""" + A = sparse.csr_array(np.array([[1, 2, 3], [4, 5, 6]])) + B = sparse.csr_array(np.array([[7], [8]])) + + result = sparse.block_array([[A, B]]) + + expected = np.array([[1, 2, 3, 7], [4, 5, 6, 8]]) + assert np.array_equal(result.todense(), expected) + + def test_single_block(self): + """Test with a single block.""" + A = sparse.csr_array(np.array([[1, 2], [3, 4]])) + result = sparse.block_array([[A]]) + assert np.array_equal(result.todense(), A.todense()) + + def test_dtype_inference(self): + """Test that dtype is correctly inferred.""" + A = sparse.csr_array(np.array([[1.5, 2.5]])) + B = sparse.csr_array(np.array([[3, 4]])) + result = sparse.block_array([[A], [B]]) + assert result.dtype == np.float64 + + def test_explicit_dtype(self): + """Test explicit dtype specification.""" + A = sparse.csr_array(np.array([[1, 
2]])) + result = sparse.block_array([[A]], dtype=np.float32) + assert result.dtype == np.float32 + + def test_sparse_blocks(self): + """Test with actual sparse blocks (blocks with zeros).""" + # Create sparse matrices with actual zero patterns + data_A = np.array([1, 0, 0, 2]) + A = sparse.csr_array(data_A.reshape(2, 2)) + + data_B = np.array([0, 3, 4, 0]) + B = sparse.csr_array(data_B.reshape(2, 2)) + + result = sparse.block_array([[A, B]]) + + expected = np.array([[1, 0, 0, 3], [0, 2, 4, 0]]) + assert np.array_equal(result.todense(), expected) + + def test_matches_scipy(self): + """Test that output matches SciPy's block_array.""" + np.random.seed(42) + + # Create random sparse blocks + A_dense = np.random.rand(3, 4) + B_dense = np.random.rand(3, 2) + C_dense = np.random.rand(2, 4) + D_dense = np.random.rand(2, 2) + + # SciPy version + A_sp = sp.csr_array(A_dense) + B_sp = sp.csr_array(B_dense) + C_sp = sp.csr_array(C_dense) + D_sp = sp.csr_array(D_dense) + scipy_result = sp.block_array([[A_sp, B_sp], [C_sp, D_sp]]).todense() + + # Legate version + A_lg = sparse.csr_array(A_dense) + B_lg = sparse.csr_array(B_dense) + C_lg = sparse.csr_array(C_dense) + D_lg = sparse.csr_array(D_dense) + legate_result = sparse.block_array( + [[A_lg, B_lg], [C_lg, D_lg]] + ).todense() + + assert np.allclose(legate_result, scipy_result) + + +class TestBlockArrayErrors: + """Tests for block_array error handling.""" + + def test_empty_blocks_raises(self): + """Test that empty blocks raises ValueError.""" + with pytest.raises(ValueError, match="cannot be empty"): + sparse.block_array([]) + + def test_inconsistent_row_count_raises(self): + """Test that inconsistent row counts raise ValueError.""" + A = sparse.csr_array(np.array([[1, 2], [3, 4]])) + B = sparse.csr_array(np.array([[5, 6, 7]])) # Only 1 row + + with pytest.raises(ValueError, match="rows"): + sparse.block_array([[A, B]]) + + def test_inconsistent_col_count_raises(self): + """Test that inconsistent column counts raise 
ValueError.""" + A = sparse.csr_array(np.array([[1, 2], [3, 4]])) + B = sparse.csr_array(np.array([[5], [6], [7]])) # 3 rows, but 1 col + C = sparse.csr_array( + np.array([[8, 9]]) + ) # 1 row, but needs 3 cols below B + + with pytest.raises(ValueError): + sparse.block_array([[A, B], [C, None]]) + + def test_unsupported_format_raises(self): + """Test that unsupported format raises ValueError.""" + A = sparse.csr_array(np.array([[1, 2]])) + with pytest.raises(ValueError, match="csr"): + sparse.block_array([[A]], format="coo") + + def test_non_csr_block_raises(self): + """Test that non-CSR blocks raise TypeError.""" + A = sparse.csr_array(np.array([[1, 2]])) + with pytest.raises(TypeError, match="csr_array"): + sparse.block_array([[A, np.array([[3, 4]])]]) + + +if __name__ == "__main__": + sys.exit(pytest.main(sys.argv)) diff --git a/tests/integration/test_cg_solve.py b/tests/integration/test_cg_solve.py index d8e046e3..abed202b 100644 --- a/tests/integration/test_cg_solve.py +++ b/tests/integration/test_cg_solve.py @@ -50,7 +50,7 @@ def test_cg_solve(): x = sample_dense_vector(D, 0.1, seed) y = A @ x x_pred, iters = linalg.cg(A, y, tol=1e-8) - assert np.allclose((A @ x_pred), y, rtol=1e-8, atol=0.0) + assert np.allclose((A @ x_pred), y, rtol=1e-8) def test_cg_solve_with_callback(): @@ -92,7 +92,7 @@ def callback(x): residuals.append(y - A @ x) x_pred, iters = linalg.cg(A, y, tol=1e-8, callback=callback) - assert np.allclose((A @ x_pred), y, rtol=1e-8, atol=0.0) + assert np.allclose((A @ x_pred), y, rtol=1e-8) assert len(residuals) > 0 @@ -150,7 +150,7 @@ def matvec(x): x_pred, iters = linalg.cg( linalg.LinearOperator(A.shape, matvec=matvec), y, tol=1e-8 ) - assert np.allclose((A @ x_pred), y, rtol=1e-8, atol=0.0) + assert np.allclose((A @ x_pred), y, rtol=1e-8) def matvec(x, out=None): return A.dot(x, out=out) @@ -158,7 +158,7 @@ def matvec(x, out=None): x_pred, iters = linalg.cg( linalg.LinearOperator(A.shape, matvec=matvec), y, tol=1e-8 ) - assert np.allclose((A @ 
x_pred), y, rtol=1e-8, atol=0.0) + assert np.allclose((A @ x_pred), y, rtol=1e-8) if __name__ == "__main__": diff --git a/tests/integration/test_csr_from_csr.py b/tests/integration/test_csr_from_csr.py index 4dd7b2f8..bdfe57b1 100644 --- a/tests/integration/test_csr_from_csr.py +++ b/tests/integration/test_csr_from_csr.py @@ -31,8 +31,12 @@ def test_csr_from_csr_fixed(): 7 0 0 0 2 1 """ row_offsets = np.array([0, 2, 5, 7, 9, 11, 14], dtype=np.int64) - csr_vals = np.array([2, 1, 5, 8, 2, 3, 4, 6, 1, 9, 4, 7, 2, 1], dtype=np.float64) - col_indices = np.array([0, 4, 0, 1, 5, 2, 3, 1, 3, 0, 4, 0, 4, 5], dtype=np.int64) + csr_vals = np.array( + [2, 1, 5, 8, 2, 3, 4, 6, 1, 9, 4, 7, 2, 1], dtype=np.float64 + ) + col_indices = np.array( + [0, 4, 0, 1, 5, 2, 3, 1, 3, 0, 4, 0, 4, 5], dtype=np.int64 + ) matrix_shape = (6, 6) A = sparse.csr_array( # noqa: F841 diff --git a/tests/integration/test_csr_to_dense.py b/tests/integration/test_csr_to_dense.py index 7a444938..5177efa0 100644 --- a/tests/integration/test_csr_to_dense.py +++ b/tests/integration/test_csr_to_dense.py @@ -22,11 +22,17 @@ def test_csr_to_dense(): row_offsets = np.array([0, 2, 5, 7, 9, 11, 14], dtype=np.int64) - csr_vals = np.array([2, 1, 5, 8, 2, 3, 4, 6, 1, 9, 4, 7, 2, 1], dtype=np.float64) - col_indices = np.array([0, 4, 0, 1, 5, 2, 3, 1, 3, 0, 4, 0, 4, 5], dtype=np.int64) + csr_vals = np.array( + [2, 1, 5, 8, 2, 3, 4, 6, 1, 9, 4, 7, 2, 1], dtype=np.float64 + ) + col_indices = np.array( + [0, 4, 0, 1, 5, 2, 3, 1, 3, 0, 4, 0, 4, 5], dtype=np.int64 + ) matrix_shape = (6, 6) - A = sparse.csr_array((csr_vals, col_indices, row_offsets), shape=matrix_shape) + A = sparse.csr_array( + (csr_vals, col_indices, row_offsets), shape=matrix_shape + ) B = A.todense() expected_B = np.array( diff --git a/tests/integration/test_diags.py b/tests/integration/test_diags.py index 04cf7c56..a8fcc936 100644 --- a/tests/integration/test_diags.py +++ b/tests/integration/test_diags.py @@ -23,7 +23,9 @@ @pytest.mark.parametrize("N", 
[12, 34]) @pytest.mark.parametrize("diagonals", [3, 5]) -@pytest.mark.parametrize("dtype", (np.float32, np.float64, np.complex64, np.complex128)) +@pytest.mark.parametrize( + "dtype", (np.float32, np.float64, np.complex64, np.complex128) +) @pytest.mark.parametrize("fmt", ["csr", "dia"]) def test_diags(N, diagonals, dtype, fmt): A = sparse.diags( diff --git a/tests/integration/test_eigsh.py b/tests/integration/test_eigsh.py new file mode 100644 index 00000000..c3819d57 --- /dev/null +++ b/tests/integration/test_eigsh.py @@ -0,0 +1,392 @@ +# Copyright 2024 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import cupynumeric as cn +import numpy +import pytest + +import legate_sparse.linalg as linalg +from legate_sparse import csr_array + + +@pytest.fixture +def check_eigsh_result(): + """Checks if the Eigenvalues match Ax = wx. 
+ + Parameters + ---------- + A : csr_array + Input sparse matrix + w: numpy.ndarray + Eigen values + x: numpy.ndarray + Eigen vectors + res_tol: float, optional + Acceptable residual + """ + + def _check_eigsh_result(A, w, x, res_tol: float = 1e-3): + """Verify eigsh results by checking residual, Ax - wx""" + for i in range(w.size): + # ||Ax - wx|| / ||w|| + Ax = A @ x[:, i] + wx = w[i] * x[:, i] + res = cn.linalg.norm(Ax - wx) / cn.abs(w[i]) + assert res < res_tol, ( + f"Residual {res} exceeds tol of {res_tol} for {i}th eigen value" + ) + + return _check_eigsh_result + + +class TestEigsh: + """Test eigsh with various parameters following CuPy's testing approach.""" + + # ------ Test arguments: N, k, which + + @pytest.mark.parametrize("N", [10, 16]) + @pytest.mark.parametrize("which", ["LM", "LA", "SA"]) + @pytest.mark.parametrize("k", [1, 3]) + def test_eigsh_real_symmetric( + self, + N, + which, + k, + create_tridiagonal_real_symmetric_matrix, + check_eigsh_result, + ): + """Test eigsh with real symmetric tridiagonal matrices.""" + A_scipy = create_tridiagonal_real_symmetric_matrix(N) + A = csr_array(A_scipy.todense()) + + w, x = linalg.eigsh(A, k=k, which=which, return_eigenvectors=True) + + assert cn.allclose(cn.imag(w), 0, atol=1e-6), ( + "Eigenvalues should be real for real symmetric matrices" + ) + assert w.shape == (k,), f"Expected {k} eigenvalues, found {w.shape}" + + check_eigsh_result(A, w, x) + + @pytest.mark.parametrize("N", [10, 16]) + @pytest.mark.parametrize("which", ["LM", "LA", "SA"]) + @pytest.mark.parametrize("k", [1, 3]) + def test_eigsh_complex_hermitian( + self, + N, + which, + k, + create_tridiagonal_complex_hermitian_matrix, + check_eigsh_result, + ): + """Test eigsh with complex Hermitian tridiagonal matrices.""" + A_scipy = create_tridiagonal_complex_hermitian_matrix(N) + A = csr_array(A_scipy.todense()) + + w, x = linalg.eigsh(A, k=k, which=which, return_eigenvectors=True) + + assert cn.allclose(cn.imag(w), 0, atol=1e-6), ( + 
"Eigenvalues should be real for Hermitian matrices" + ) + assert w.shape == (k,), f"Expected {k} eigenvalues" + + check_eigsh_result(A, w, x) + + # ------ Test argument return_eigenvector + + def test_eigsh_eigenvalues_only_real( + self, create_tridiagonal_real_symmetric_matrix + ): + """Test eigsh with return_eigenvectors=False for real matrices.""" + N, k = 10, 2 + which = "LM" + A_scipy = create_tridiagonal_real_symmetric_matrix(N) + A = csr_array(A_scipy.todense()) + + w = linalg.eigsh(A, k=k, which=which, return_eigenvectors=False) + + assert cn.allclose(cn.imag(w), 0, atol=1e-6), ( + "Eigenvalues should be real" + ) + assert w.shape == (k,), f"Expected {k} eigenvalues" + + def test_eigsh_eigenvalues_only_complex( + self, create_tridiagonal_complex_hermitian_matrix + ): + """Test eigsh with return_eigenvectors=False for complex matrices.""" + N, k = 10, 2 + which = "LM" + A_scipy = create_tridiagonal_complex_hermitian_matrix(N) + A = csr_array(A_scipy.todense()) + + w = linalg.eigsh(A, k=k, which=which, return_eigenvectors=False) + + assert cn.allclose(cn.imag(w), 0, atol=1e-6), ( + "Eigenvalues should be real" + ) + assert w.shape == (k,), f"Expected {k} eigenvalues" + + # ------ Test argument v0 + + def test_eigsh_with_v0_real( + self, create_tridiagonal_real_symmetric_matrix, check_eigsh_result + ): + """Test eigsh with user-provided initial vector v0 for real matrices.""" + N, k = 10, 2 + A_scipy = create_tridiagonal_real_symmetric_matrix(N) + A = csr_array(A_scipy.todense()) + + v0 = numpy.array(cn.random.randn(N), dtype=numpy.float64) + + w, x = linalg.eigsh( + A, k=k, which="LM", v0=v0, return_eigenvectors=True + ) + + assert cn.allclose(cn.imag(w), 0, atol=1e-6), ( + "Eigenvalues should be real" + ) + check_eigsh_result(A, w, x) + + def test_eigsh_with_v0_complex( + self, create_tridiagonal_complex_hermitian_matrix, check_eigsh_result + ): + """Test eigsh with user-provided initial vector v0 for complex matrices.""" + N, k = 10, 2 + A_scipy = 
create_tridiagonal_complex_hermitian_matrix(N) + A = csr_array(A_scipy.todense()) + + v0 = cn.array( + numpy.random.randn(N) + 1j * numpy.random.randn(N), + dtype=numpy.complex128, + ) + + w, x = linalg.eigsh( + A, k=k, which="LM", v0=v0, return_eigenvectors=True + ) + + assert cn.allclose(cn.imag(w), 0, atol=1e-10), ( + "Eigenvalues should be real" + ) + check_eigsh_result(A, w, x) + + # ------ Test output sortedness + + def test_eigsh_sorted_eigenvalues( + self, create_tridiagonal_real_symmetric_matrix + ): + """Test that eigenvalues are returned sorted.""" + N, k = 20, 6 + A_scipy = create_tridiagonal_real_symmetric_matrix(N) + A = csr_array(A_scipy.todense()) + + w, _ = linalg.eigsh(A, k=k, which="LM", return_eigenvectors=True) + + # Eigenvalues should be sorted in ascending order + w_sorted = cn.sort(w) + assert cn.allclose(w, w_sorted), "Eigenvalues should be sorted" + + +class TestEigshLargeProblems: + """Test eigsh with larger problem sizes.""" + + @pytest.mark.parametrize("N", [15, 30]) + @pytest.mark.parametrize("k", [3, 6]) + @pytest.mark.parametrize("which", ["LM", "SA"]) + def test_eigsh_large_real_symmetric( + self, + N, + k, + which, + create_tridiagonal_real_symmetric_matrix, + check_eigsh_result, + ): + """Test eigsh with large real symmetric tridiagonal matrices.""" + A_scipy = create_tridiagonal_real_symmetric_matrix(N) + A = csr_array(A_scipy.todense()) + + w, x = linalg.eigsh(A, k=k, which=which, return_eigenvectors=True) + + assert cn.allclose(cn.imag(w), 0, atol=1e-6), ( + "Eigenvalues should be real" + ) + assert w.shape == (k,), f"Expected {k} eigenvalues" + check_eigsh_result(A, w, x) + + @pytest.mark.parametrize("N", [15, 30]) + @pytest.mark.parametrize("k", [3, 6]) + @pytest.mark.parametrize("which", ["LM", "SA"]) + def test_eigsh_large_complex_hermitian( + self, + N, + k, + which, + create_tridiagonal_complex_hermitian_matrix, + check_eigsh_result, + ): + """Test eigsh with large complex Hermitian tridiagonal matrices.""" + A_scipy = 
create_tridiagonal_complex_hermitian_matrix(N) + A = csr_array(A_scipy.todense()) + + w, x = linalg.eigsh(A, k=k, which=which, return_eigenvectors=True) + + assert cn.allclose(cn.imag(w), 0, atol=1e-6), ( + "Eigenvalues should be real" + ) + assert w.shape == (k,), f"Expected {k} eigenvalues" + check_eigsh_result(A, w, x) + + +class TestEigshRandomSparse: + """Test eigsh with random sparse symmetric/Hermitian matrices.""" + + @pytest.mark.parametrize("N", [15, 30]) + @pytest.mark.parametrize("k", [1, 3]) + @pytest.mark.parametrize("which", ["LM", "SA"]) + def test_eigsh_random_real_symmetric( + self, + N, + k, + which, + create_sparse_real_symmetric_matrix, + check_eigsh_result, + ): + """Test eigsh with random sparse real symmetric matrices.""" + A_scipy = create_sparse_real_symmetric_matrix(N, density=0.3, seed=42) + A = csr_array(A_scipy.todense()) + + w, x = linalg.eigsh(A, k=k, which=which, return_eigenvectors=True) + + assert cn.allclose(cn.imag(w), 0, atol=1e-6), ( + "Eigenvalues should be real" + ) + assert w.shape == (k,), f"Expected {k} eigenvalues" + check_eigsh_result(A, w, x) + + @pytest.mark.parametrize("N", [15, 30]) + @pytest.mark.parametrize("k", [1, 3]) + @pytest.mark.parametrize("which", ["LM", "SA"]) + def test_eigsh_random_complex_hermitian( + self, + N, + k, + which, + create_sparse_complex_hermitian_matrix, + check_eigsh_result, + ): + """Test eigsh with random sparse complex Hermitian matrices.""" + A_scipy = create_sparse_complex_hermitian_matrix( + N, density=0.3, seed=42 + ) + A = csr_array(A_scipy.todense()) + + w, x = linalg.eigsh(A, k=k, which=which, return_eigenvectors=True) + + assert cn.allclose(cn.imag(w), 0, atol=1e-6), ( + "Eigenvalues should be real" + ) + assert w.shape == (k,), f"Expected {k} eigenvalues" + check_eigsh_result(A, w, x) + + +class TestEigshLinearOperator: + """Test eigsh with LinearOperator input.""" + + @pytest.mark.parametrize("N", [10, 20]) + @pytest.mark.parametrize("k", [1, 3]) + 
@pytest.mark.parametrize("which", ["LM", "SA"]) + def test_eigsh_linear_operator_real( + self, + N, + k, + which, + create_tridiagonal_real_symmetric_matrix, + check_eigsh_result, + ): + """Test eigsh with LinearOperator wrapping a real symmetric matrix.""" + A_scipy = create_tridiagonal_real_symmetric_matrix(N) + A_dense = cn.array(A_scipy.todense()) + + A_op = linalg.LinearOperator( + shape=(N, N), matvec=lambda v: A_dense @ v, dtype=A_dense.dtype + ) + + w, x = linalg.eigsh(A_op, k=k, which=which, return_eigenvectors=True) + + assert cn.allclose(cn.imag(w), 0.0, atol=1e-6), ( + "Eigenvalues should be real" + ) + assert w.shape == (k,), f"Expected {k} eigenvalues" + check_eigsh_result(A_dense, w, x) + + @pytest.mark.parametrize("N", [10, 20]) + @pytest.mark.parametrize("k", [1, 3]) + @pytest.mark.parametrize("which", ["LM", "SA"]) + def test_eigsh_linear_operator_complex( + self, + N, + k, + which, + create_tridiagonal_complex_hermitian_matrix, + check_eigsh_result, + ): + """Test eigsh with LinearOperator wrapping a complex Hermitian matrix.""" + A_scipy = create_tridiagonal_complex_hermitian_matrix(N) + A_dense = cn.array(A_scipy.todense()) + + A_op = linalg.LinearOperator( + shape=(N, N), matvec=lambda v: A_dense @ v, dtype=A_dense.dtype + ) + + w, x = linalg.eigsh(A_op, k=k, which=which, return_eigenvectors=True) + + assert cn.allclose(cn.imag(w), 0.0, atol=1e-6), ( + "Eigenvalues should be real" + ) + assert w.shape == (k,), f"Expected {k} eigenvalues" + check_eigsh_result(A_dense, w, x) + + +class TestEigshErrors: + """Test eigsh error handling.""" + + def test_non_square_matrix(self): + """Test that non-square matrix raises ValueError.""" + A_rect = csr_array(numpy.random.randn(10, 15)) + with pytest.raises(ValueError, match="expected square matrix"): + linalg.eigsh(A_rect, k=1) + + def test_k_too_large(self): + """Test that k >= n raises ValueError.""" + n = 10 + A = csr_array(numpy.eye(n)) + with pytest.raises(ValueError, match="k must be smaller than 
n"): + linalg.eigsh(A, k=n) + + def test_k_zero_or_negative(self): + """Test that k <= 0 raises ValueError.""" + A = csr_array(numpy.eye(10)) + with pytest.raises(ValueError, match="k must be greater than 0"): + linalg.eigsh(A, k=0) + + def test_invalid_which(self): + """Test that invalid which raises ValueError.""" + A = csr_array(numpy.eye(10)) + with pytest.raises(ValueError, match="which must be"): + linalg.eigsh(A, k=1, which="INVALID") + + +if __name__ == "__main__": + import sys + + pytest.main(sys.argv) diff --git a/tests/integration/test_geam.py b/tests/integration/test_geam.py new file mode 100644 index 00000000..665c5092 --- /dev/null +++ b/tests/integration/test_geam.py @@ -0,0 +1,269 @@ +# Copyright 2024 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Tests for GEAM API and sparse matrix arithmetic operations.""" + +import sys + +import cupynumeric as np +import pytest +from utils.banded_matrix import banded_matrix +from utils.sample import simple_system_gen + +import legate_sparse as sparse +from legate_sparse.csr import geam + + +# ============================================================================= +# GEAM API Tests - Error Cases +# ============================================================================= + + +def test_geam_sparse_dense_mismatch_A_dense(): + """Test that geam raises error when only one of the arrays is sparse.""" + N = 5 + np.random.seed(42) + A_dense = np.random.rand(N, N) + B_sparse = banded_matrix(N, 3) + + with pytest.raises((TypeError, AttributeError)): + geam(A_dense, B_sparse, 1.0, 2.0) + + with pytest.raises((TypeError, AttributeError)): + geam(B_sparse, A_dense, 1.0, 2.0) + + +def test_geam_wrong_sparsity_pattern_for_C(): + """Providing C with incompatible sparsity pattern leads to incorrect results.""" + N = 5 + np.random.seed(42) + + A = banded_matrix(N, 3) # tri-diagonal + B = banded_matrix(N, 5) # penta-diagonal + + C_correct = geam(A, B, 2.0, 3.0) + C_wrong = banded_matrix(N, 3) # wrong pattern - too few non-zeros + C_result = geam(A, B, 2.0, 3.0, C=C_wrong) + + # Results should NOT match due to incompatible sparsity + assert not np.allclose(C_correct.todense(), C_result.todense()) + + +# ============================================================================= +# GEAM API Tests - Success Cases +# ============================================================================= + + +@pytest.mark.parametrize("N", [5, 15, 30]) +def test_geam_basic_without_C(N): + """Test geam without providing C.""" + np.random.seed(42) + A_dense, A_sparse, _ = simple_system_gen(N, N, sparse.csr_array, tol=0.3) + B_dense, B_sparse, _ = simple_system_gen(N, N, sparse.csr_array, tol=0.3) + + C_sparse = geam(A_sparse, B_sparse, 2.5, -1.5) + C_expected = 2.5 * A_dense + (-1.5) * 
B_dense + + assert np.allclose(C_sparse.todense(), C_expected, rtol=1e-10, atol=1e-12) + + +@pytest.mark.parametrize("N", [5, 15, 30]) +def test_geam_basic_with_C(N): + """Test geam with pre-allocated C, then reuse it.""" + np.random.seed(42) + A_dense, A_sparse, _ = simple_system_gen(N, N, sparse.csr_array, tol=0.3) + B_dense, B_sparse, _ = simple_system_gen(N, N, sparse.csr_array, tol=0.3) + + C_sparse = geam(A_sparse, B_sparse, 2.0, 3.0) + assert np.allclose(C_sparse.todense(), 2.0 * A_dense + 3.0 * B_dense) + + C_sparse = geam(A_sparse, B_sparse, -1.0, 0.5, C=C_sparse) + assert np.allclose(C_sparse.todense(), -1.0 * A_dense + 0.5 * B_dense) + + +@pytest.mark.parametrize( + "alpha,beta", [(1.0, 1.0), (1.0, -1.0), (2.0, 0.0), (0.0, 3.0)] +) +def test_geam_various_scalars(alpha, beta): + """Test geam with various scalar combinations.""" + N = 15 + np.random.seed(42) + A_dense, A_sparse, _ = simple_system_gen(N, N, sparse.csr_array, tol=0.3) + B_dense, B_sparse, _ = simple_system_gen(N, N, sparse.csr_array, tol=0.3) + + C_sparse = geam(A_sparse, B_sparse, alpha, beta) + assert np.allclose(C_sparse.todense(), alpha * A_dense + beta * B_dense) + + +def test_geam_loop_with_C_reuse(): + """Test geam in a loop where C is reused across iterations.""" + N = 15 + np.random.seed(42) + + A_sparse = banded_matrix(N, 3) + B_sparse = banded_matrix(N, 3) + C_sparse = geam(A_sparse, B_sparse, 1.0, 1.0) + + for i in range(1, 5): + A_new = banded_matrix(N, 3, init_with_ones=False) + B_new = banded_matrix(N, 3, init_with_ones=False) + scale_A, scale_B = float(i + 1), float(i + 2) + + C_sparse = geam(A_new, B_new, scale_A, scale_B, C=C_sparse) + C_expected = scale_A * A_new.todense() + scale_B * B_new.todense() + + assert np.allclose(C_sparse.todense(), C_expected) + + +def test_geam_identical_matrices(): + """Test geam when A and B are identical.""" + N = 15 + np.random.seed(42) + A_dense, A_sparse, _ = simple_system_gen(N, N, sparse.csr_array, tol=0.3) + + C_sparse = geam(A_sparse, 
A_sparse, 2.0, 3.0) + assert np.allclose(C_sparse.todense(), 5.0 * A_dense) + + +def test_geam_disjoint_sparsity_patterns(): + """Test geam when A and B have disjoint sparsity patterns.""" + N = 15 + np.random.seed(42) + + A_dense = np.triu(np.random.rand(N, N)) + A_sparse = sparse.csr_array(A_dense) + B_dense = np.tril(np.random.rand(N, N), k=-1) + B_sparse = sparse.csr_array(B_dense) + + C_sparse = geam(A_sparse, B_sparse, 1.5, 2.5) + assert np.allclose(C_sparse.todense(), 1.5 * A_dense + 2.5 * B_dense) + + +# ============================================================================= +# Dunder Method Tests (__add__, __sub__, __radd__, __rsub__) +# ============================================================================= + + +class TestCSRArithmetic: + """Tests for CSR matrix arithmetic dunder methods.""" + + @pytest.fixture + def matrices(self): + """Create test matrices.""" + N = 15 + np.random.seed(42) + A_dense, A_sparse, _ = simple_system_gen( + N, N, sparse.csr_array, tol=0.3 + ) + B_dense, B_sparse, _ = simple_system_gen( + N, N, sparse.csr_array, tol=0.3 + ) + return A_dense, A_sparse, B_dense, B_sparse + + # ------------------------------------------------------------------------- + # Sparse + Sparse, Sparse - Sparse + # ------------------------------------------------------------------------- + + def test_add_sparse_sparse(self, matrices): + """A + B where both are sparse.""" + A_dense, A_sparse, B_dense, B_sparse = matrices + C = A_sparse + B_sparse + assert np.allclose(C.todense(), A_dense + B_dense) + + C = A_sparse - B_sparse + assert np.allclose(C.todense(), A_dense - B_dense) + + # ------------------------------------------------------------------------- + # Sparse + Dense, Dense + Sparse + # ------------------------------------------------------------------------- + + def test_add_sparse_dense(self, matrices): + """sparse + dense returns dense.""" + A_dense, A_sparse, B_dense, _ = matrices + C = A_sparse + B_dense + assert np.allclose(C, 
A_dense + B_dense) + + @pytest.mark.skip( + reason="cupynumeric intercepts dense+sparse before __radd__ is called" + ) + def test_add_dense_sparse(self, matrices): + """dense + sparse should return dense (currently broken in cupynumeric).""" + A_dense, _, B_dense, B_sparse = matrices + C = A_dense + B_sparse + assert np.allclose(C, A_dense + B_dense) + + # ------------------------------------------------------------------------- + # Sparse - Dense, Dense - Sparse + # ------------------------------------------------------------------------- + + def test_sub_sparse_dense(self, matrices): + """sparse - dense returns dense.""" + A_dense, A_sparse, B_dense, _ = matrices + C = A_sparse - B_dense + assert np.allclose(C, A_dense - B_dense) + + @pytest.mark.skip( + reason="cupynumeric intercepts dense-sparse before __rsub__ is called" + ) + def test_sub_dense_sparse(self, matrices): + """dense - sparse should return dense (currently broken in cupynumeric).""" + A_dense, _, B_dense, B_sparse = matrices + C = A_dense - B_sparse + assert np.allclose(C, A_dense - B_dense) + + # ------------------------------------------------------------------------- + # Sparse + Scalar, Scalar + Sparse + # ------------------------------------------------------------------------- + + def test_add_sparse_zero(self, matrices): + """A + 0 should return a copy of A.""" + A_dense, A_sparse, _, _ = matrices + C = A_sparse + 0 + assert np.allclose(C.todense(), A_dense) + + C = 0 + A_sparse + assert np.allclose(C.todense(), A_dense) + + def test_add_sparse_nonzero_scalar_raises(self, matrices): + """A + nonzero scalar should raise NotImplementedError.""" + _, A_sparse, _, _ = matrices + with pytest.raises(NotImplementedError): + _ = A_sparse + 5.0 + with pytest.raises(NotImplementedError): + _ = 5.0 + A_sparse + + # ------------------------------------------------------------------------- + # Sparse - Scalar, Scalar - Sparse + # ------------------------------------------------------------------------- 
+ + def test_sub_sparse_zero(self, matrices): + """A - 0 should return a copy of A.""" + A_dense, A_sparse, _, _ = matrices + C = A_sparse - 0 + assert np.allclose(C.todense(), A_dense) + + C = 0 - A_sparse + assert np.allclose(C.todense(), -A_dense) + + def test_sub_sparse_nonzero_scalar_raises(self, matrices): + """Subtracting a nonzero scalar should raise NotImplementedError.""" + _, A_sparse, _, _ = matrices + with pytest.raises(NotImplementedError): + _ = A_sparse - 5.0 + with pytest.raises(NotImplementedError): + _ = 5.0 - A_sparse + + +if __name__ == "__main__": + sys.exit(pytest.main(sys.argv)) diff --git a/tests/integration/test_indexing.py b/tests/integration/test_indexing.py index 259c7996..01b2dbac 100644 --- a/tests/integration/test_indexing.py +++ b/tests/integration/test_indexing.py @@ -72,7 +72,7 @@ def test_incompatible_mask(self, N, create_matrix, create_mask): # make sure the values are updated correctly A_dense = numpy.asarray(A.todense()) - assert numpy.allclose(A_dense[mask_dense].sum() / num_nonzeros, value) + assert numpy.allclose(A_dense[mask_dense].sum(), value * num_nonzeros) # TODO: Add a check/test for location of nonzeros as well @@ -200,33 +200,11 @@ def test_random_column_order(self): This is important because CSR format requires column indices to be sorted within each row for efficient operations. 
""" - row_indices = cupynumeric.array( - [ - 2, - 4, - 5, - 3, - 5, - 1, - 1, - 5, - 5, - ] + row_indices = cupynumeric.array([2, 4, 5, 3, 5, 1, 1, 5, 5]) + col_indices = cupynumeric.array([3, 1, 2, 2, 5, 1, 4, 1, 3]) + data = cupynumeric.array( + [7.0, 9.0, 3.0, 4.0, 5.0, 19.0, 2.0, 99.0, 109.0] ) - col_indices = cupynumeric.array( - [ - 3, - 1, - 2, - 2, - 5, - 1, - 4, - 1, - 3, - ] - ) - data = cupynumeric.array([7.0, 9.0, 3.0, 4.0, 5.0, 19.0, 2.0, 99.0, 109.0]) # note that the data in row 5 is ordered (2, 5, 1, 3),which will get # sorted to (1, 2, 5, 3) during instantiation, which is needed for indexing diff --git a/tests/integration/test_manual_sorting.py b/tests/integration/test_manual_sorting.py index 7d3ed282..4999c946 100644 --- a/tests/integration/test_manual_sorting.py +++ b/tests/integration/test_manual_sorting.py @@ -15,6 +15,7 @@ import cupynumeric as np import numpy import pytest + from legate_sparse.utils import sort_by_rows_then_cols diff --git a/tests/integration/test_negate.py b/tests/integration/test_negate.py new file mode 100644 index 00000000..ab9a1e21 --- /dev/null +++ b/tests/integration/test_negate.py @@ -0,0 +1,38 @@ +# Copyright 2024 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Tests for sparse matrix negation (__neg__).""" + +import sys + +import cupynumeric as np +import pytest +from utils.sample import simple_system_gen + +import legate_sparse as sparse + + +def test_negate(): + """-A returns a sparse matrix with negated values.""" + N = 15 + np.random.seed(42) + A_dense, A_sparse, _ = simple_system_gen(N, N, sparse.csr_array, tol=0.3) + + C = -A_sparse + + assert np.allclose(C.todense(), -A_dense) + + +if __name__ == "__main__": + sys.exit(pytest.main(sys.argv)) diff --git a/tests/integration/test_spgemm.py b/tests/integration/test_spgemm.py index 5954df79..9e4c725d 100644 --- a/tests/integration/test_spgemm.py +++ b/tests/integration/test_spgemm.py @@ -16,11 +16,11 @@ import cupynumeric as np import pytest -from legate_sparse.runtime import runtime from utils.banded_matrix import banded_matrix from utils.sample import simple_system_gen import legate_sparse as sparse +from legate_sparse.runtime import runtime @pytest.mark.parametrize("N", [5, 29]) diff --git a/tests/integration/test_spmv.py b/tests/integration/test_spmv.py index 0c3590df..1b953150 100644 --- a/tests/integration/test_spmv.py +++ b/tests/integration/test_spmv.py @@ -16,11 +16,11 @@ import cupynumeric as np import pytest -from legate_sparse.runtime import runtime from utils.banded_matrix import banded_matrix from utils.sample import simple_system_gen import legate_sparse as sparse +from legate_sparse.runtime import runtime @pytest.mark.parametrize("N", [5, 29]) @@ -105,5 +105,39 @@ def test_csr_spmv_unsupported_dtype(N, nnz_per_row, unsupported_dtype): y = A.dot(x) # noqa: F841 +@pytest.mark.parametrize("N", [5, 29]) +@pytest.mark.parametrize("M", [7, 17]) +@pytest.mark.parametrize("complex_dtype", [np.complex64, np.complex128]) +def test_csr_spmv_complex(N, M, complex_dtype): + """Test sparse matrix-vector multiplication with complex datatypes. + + This test verifies that sparse matrix-vector multiplication works + correctly for complex64 and complex128 datatypes. 
+ + Parameters + ---------- + N : int + Number of rows in the matrix. + M : int + Number of columns in the matrix. + complex_dtype : dtype + Complex datatype to use (complex64 or complex128). + """ + + # get real and imag parts separately + A_dense_real, _, x_real = simple_system_gen(N, M, sparse.csr_array) + A_dense_imag, _, x_imag = simple_system_gen(N, M, sparse.csr_array) + + A_dense = A_dense_real.astype(complex_dtype) + 1j * A_dense_imag.astype( + complex_dtype + ) + x = x_real.astype(complex_dtype) + 1j * x_imag.astype(complex_dtype) + A = sparse.csr_array(A_dense.astype(complex_dtype)) + + y = A @ x + + assert np.all(np.isclose(y, A_dense @ x)) + + if __name__ == "__main__": sys.exit(pytest.main(sys.argv)) diff --git a/tests/integration/test_spsolve.py b/tests/integration/test_spsolve.py new file mode 100644 index 00000000..f4b03e2e --- /dev/null +++ b/tests/integration/test_spsolve.py @@ -0,0 +1,199 @@ +# Copyright 2024 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import cupynumeric as np +import pytest +import scipy.sparse as scipy_sparse +import scipy.sparse.linalg as scipy_linalg +from utils.sample import sample_dense + +import legate_sparse.linalg as linalg +from legate_sparse import csr_array +from legate_sparse.runtime import runtime + +# Skip all tests in this module if no GPUs are available +# since spsolve is only supported on GPU +pytestmark = pytest.mark.skipif( + runtime.num_gpus == 0, reason="spsolve is only supported on GPU backend" +) + + +@pytest.mark.parametrize("N", [5, 10, 20, 50]) +def test_spsolve_identity_matrix(N): + """Test spsolve with an identity matrix.""" + A = csr_array(np.eye(N)) + b = np.ones(N) + x = linalg.spsolve(A, b) + + # For identity matrix, x should equal b + assert np.allclose(x, b, rtol=1e-10, atol=1e-12), ( + f"Identity matrix solution incorrect: max error = {np.max(np.abs(x - b))}" + ) + + +def test_spsolve_basic_square_matrix(): + """Test spsolve with a basic square matrix.""" + + N = 5 + np.random.seed(42) + A_dense = sample_dense(N, N, 0.3, 42) + A_dense = A_dense + N * np.eye(N) + + A = csr_array(A_dense) + b = np.array([1.0, 2.0, 3.0, 4.0, 5.0]) + x = linalg.spsolve(A, b) + + b_computed = A @ x + assert np.allclose(b_computed, b, rtol=1e-5, atol=1e-6), ( + f"Solution verification failed: max error = {np.max(np.abs(b_computed - b))}" + ) + + A_scipy = scipy_sparse.csr_matrix(np.array(A.todense())) + x_scipy = scipy_linalg.spsolve(A_scipy, np.array(b)) + assert np.allclose(x, x_scipy, rtol=1e-5, atol=1e-6), ( + f"Solution differs from SciPy: max error = {np.max(np.abs(x - x_scipy))}" + ) + + +@pytest.mark.parametrize("N", [5, 10, 20, 50]) +def test_spsolve_diagonal_matrix(N): + """Test spsolve with a diagonal matrix.""" + diag_values = np.arange(1.0, N + 1.0) + A_dense = np.diag(diag_values) + A = csr_array(A_dense) + b = np.ones(N) + x = linalg.spsolve(A, b) + x_expected = b / diag_values + assert np.allclose(x, x_expected, rtol=1e-10, atol=1e-12), ( + f"Diagonal matrix 
solution incorrect: max error = {np.max(np.abs(x - x_expected))}" + ) + + +@pytest.mark.parametrize("N", [5, 10, 20, 50]) +def test_spsolve_tridiagonal_matrix(N): + """Test spsolve with a tridiagonal matrix.""" + main_diag = np.full(N, 4.0) + off_diag = np.full(N - 1, -1.0) + A_dense = np.diag(main_diag) + np.diag(off_diag, 1) + np.diag(off_diag, -1) + A = csr_array(A_dense) + b = np.ones(N) + x = linalg.spsolve(A, b) + + b_computed = A @ x + assert np.allclose(b_computed, b, rtol=1e-5, atol=1e-6), ( + f"Tridiagonal solution verification failed: max error = {np.max(np.abs(b_computed - b))}" + ) + + A_scipy = scipy_sparse.csr_matrix(np.array(A.todense())) + x_scipy = scipy_linalg.spsolve(A_scipy, np.array(b)) + assert np.allclose(x, x_scipy, rtol=1e-5, atol=1e-6), ( + f"Tridiagonal solution differs from SciPy: max error = {np.max(np.abs(x - x_scipy))}" + ) + + +@pytest.mark.parametrize("N", [5, 10, 20, 50]) +def test_spsolve_symmetric_positive_definite(N): + """Test spsolve with a symmetric positive definite matrix. + We create an SPD matrix by A = B^T * B + N * I. + """ + seed = 42 + B_dense = sample_dense(N, N, 0.2, seed) + A_dense = B_dense.T @ B_dense + N * np.eye(N) + A = csr_array(A_dense) + + # make sure it's positive definite + eigenvalues = np.linalg.eigvals(A_dense) + assert np.all(eigenvalues > 0), "Matrix is not positive definite" + + b = np.random.rand(N) + x = linalg.spsolve(A, b) + + b_computed = A @ x + assert np.allclose(b_computed, b, rtol=1e-4, atol=1e-5), ( + f"SPD solution verification failed: max error = {np.max(np.abs(b_computed - b))}" + ) + + +@pytest.mark.parametrize( + "dtype", [np.float32, np.float64, np.complex64, np.complex128] +) +def test_spsolve_all_dtypes(dtype): + """Comprehensive test for spsolve with all cuDSS-supported data types. + + Note: cuDSS only supports floating-point and complex types. 
+ Integer and boolean types are not supported + """ + N = 10 + + # Create a well-conditioned matrix for each dtype + if dtype in [np.complex64, np.complex128]: + # For complex types, create a Hermitian positive definite matrix + seed = 42 + np.random.seed(seed) + B = np.random.randn(N, N) + 1j * np.random.randn(N, N) + A_dense = (B @ B.conj().T + N * np.eye(N)).astype(dtype) + b = np.ones(N, dtype=dtype) + else: + seed = 42 + A_dense = sample_dense(N, N, 0.3, seed).astype(dtype) + A_dense = A_dense + N * np.eye(N, dtype=dtype) + b = np.ones(N, dtype=dtype) + + # Solve the system + A = csr_array(A_dense) + x = linalg.spsolve(A, b) + + b_computed = A @ x + assert np.allclose(b_computed, b, rtol=1e-4, atol=1e-5), ( + f"Solution verification failed for dtype {dtype}: max error = {np.max(np.abs(b_computed - b))}" + ) + + assert x.dtype == b.dtype, ( + f"Output dtype {x.dtype} doesn't match input dtype {b.dtype} for dtype {dtype}" + ) + + +@pytest.mark.parametrize("N", [5, 10, 20, 50]) +def test_spsolve_upper_triangular(N): + """Test spsolve with an upper triangular matrix.""" + A_dense = np.triu(np.random.rand(N, N) + np.eye(N)) + A = csr_array(A_dense) + b = np.ones(N) + x = linalg.spsolve(A, b) + + b_computed = A @ x + assert np.allclose(b_computed, b, rtol=1e-5, atol=1e-6), ( + f"Upper triangular solution verification failed: max error = {np.max(np.abs(b_computed - b))}" + ) + + +@pytest.mark.parametrize("N", [5, 10, 20, 50]) +def test_spsolve_lower_triangular(N): + """Test spsolve with a lower triangular matrix.""" + A_dense = np.tril(np.ones((N, N)) + np.eye(N)) + A = csr_array(A_dense) + b = np.ones(N) + x = linalg.spsolve(A, b) + + b_computed = A @ x + assert np.allclose(b_computed, b, rtol=1e-5, atol=1e-6), ( + f"Lower triangular solution verification failed: max error = {np.max(np.abs(b_computed - b))}" + ) + + +if __name__ == "__main__": + import sys + + pytest.main(sys.argv) + sys.exit(0) diff --git a/tests/integration/test_unary_operation.py 
b/tests/integration/test_unary_operation.py index f1f3c07d..432381d3 100644 --- a/tests/integration/test_unary_operation.py +++ b/tests/integration/test_unary_operation.py @@ -22,11 +22,17 @@ def test_unary_operation(): row_offsets = np.array([0, 2, 5, 7, 9, 11, 14], dtype=np.int64) - csr_vals = np.array([2, 1, 5, 8, 2, 3, 4, 6, 1, 9, 4, 7, 2, 1], dtype=np.float64) - col_indices = np.array([0, 4, 0, 1, 5, 2, 3, 1, 3, 0, 4, 0, 4, 5], dtype=np.int64) + csr_vals = np.array( + [2, 1, 5, 8, 2, 3, 4, 6, 1, 9, 4, 7, 2, 1], dtype=np.float64 + ) + col_indices = np.array( + [0, 4, 0, 1, 5, 2, 3, 1, 3, 0, 4, 0, 4, 5], dtype=np.int64 + ) matrix_shape = (6, 6) - A = sparse.csr_array((csr_vals, col_indices, row_offsets), shape=matrix_shape) + A = sparse.csr_array( + (csr_vals, col_indices, row_offsets), shape=matrix_shape + ) B = A * 2 Bvalues = np.asarray(B.vals) diff --git a/tests/integration/utils/banded_matrix.py b/tests/integration/utils/banded_matrix.py index fda5ef5f..2467f452 100644 --- a/tests/integration/utils/banded_matrix.py +++ b/tests/integration/utils/banded_matrix.py @@ -90,7 +90,9 @@ def banded_matrix( pred = np.arange(nnz_per_row - half_nnz, nnz_per_row + 1) post = np.flip(pred) - nnz_arr = np.concatenate((pred, np.ones(main_rows) * nnz_per_row, post)) + nnz_arr = np.concatenate( + (pred, np.ones(main_rows) * nnz_per_row, post) + ) if sparse.__name__ == "legate_sparse": row_offsets = np.zeros(N + 1).astype(sparse.coord_ty) diff --git a/tests/testdata/GlossGT.mtx b/tests/testdata/GlossGT.mtx index 27869886..b3bbe5d0 100644 --- a/tests/testdata/GlossGT.mtx +++ b/tests/testdata/GlossGT.mtx @@ -14,15 +14,15 @@ %------------------------------------------------------------------------------- % notes: % ------------------------------------------------------------------------------ -% Pajek network converted to sparse adjacency matrix for inclusion in UF sparse +% Pajek network converted to sparse adjacency matrix for inclusion in UF sparse % matrix collection, Tim 
Davis. For Pajek datasets, See V. Batagelj & A. Mrvar, -% http://vlado.fmf.uni-lj.si/pub/networks/data/. +% http://vlado.fmf.uni-lj.si/pub/networks/data/. % ------------------------------------------------------------------------------ -% Bill Cherowitzo: Graph and Digraph Glossary -% http://www-math.cudenver.edu/~wcherowi/courses/m4408/glossary.html -% Pajek's network: Barbara Zemlji"c, 2. nov 2003 -% The original problem had 3D xyz coordinates, but all values of z were equal -% to 0, and have been removed. This graph has 2D coordinates. +% Bill Cherowitzo: Graph and Digraph Glossary +% http://www-math.cudenver.edu/~wcherowi/courses/m4408/glossary.html +% Pajek's network: Barbara Zemlji"c, 2. nov 2003 +% The original problem had 3D xyz coordinates, but all values of z were equal +% to 0, and have been removed. This graph has 2D coordinates. %------------------------------------------------------------------------------- 72 72 122 3 4 diff --git a/tests/testdata/Ragusa18.mtx b/tests/testdata/Ragusa18.mtx index 2e8bd6ce..24eaa03b 100644 --- a/tests/testdata/Ragusa18.mtx +++ b/tests/testdata/Ragusa18.mtx @@ -14,9 +14,9 @@ %------------------------------------------------------------------------------- % notes: % ------------------------------------------------------------------------------ -% Pajek network converted to sparse adjacency matrix for inclusion in UF sparse +% Pajek network converted to sparse adjacency matrix for inclusion in UF sparse % matrix collection, Tim Davis. For Pajek datasets, See V. Batagelj & A. Mrvar, -% http://vlado.fmf.uni-lj.si/pub/networks/data/. +% http://vlado.fmf.uni-lj.si/pub/networks/data/. 
% ------------------------------------------------------------------------------ %------------------------------------------------------------------------------- 23 23 64 diff --git a/tests/testdata/karate.mtx b/tests/testdata/karate.mtx index 59df7607..9ecdff42 100644 --- a/tests/testdata/karate.mtx +++ b/tests/testdata/karate.mtx @@ -12,14 +12,14 @@ % kind: undirected graph %------------------------------------------------------------------------------- % notes: -% Network collection from M. Newman -% http://www-personal.umich.edu/~mejn/netdata/ -% -% The graph "karate" contains the network of friendships between the 34 -% members of a karate club at a US university, as described by Wayne Zachary +% Network collection from M. Newman +% http://www-personal.umich.edu/~mejn/netdata/ +% +% The graph "karate" contains the network of friendships between the 34 +% members of a karate club at a US university, as described by Wayne Zachary % in 1977. If you use these data in your work, please cite W. W. Zachary, An % information flow model for conflict and fission in small groups, Journal of -% Anthropological Research 33, 452-473 (1977). +% Anthropological Research 33, 452-473 (1977). %------------------------------------------------------------------------------- 34 34 78 2 1