From 886cb7e1bb562299f69f78c0a76e4b97e18fa887 Mon Sep 17 00:00:00 2001 From: Marcin Zalewski Date: Wed, 30 Jul 2025 12:36:51 -0700 Subject: [PATCH 1/3] v25.07.00 --- CMakeLists.txt | 2 +- README.md | 3 +- cmake/thirdparty/get_legate.cmake | 1 + cmake/versions.json | 14 +- conda/conda-build/conda_build_config.yaml | 6 +- conda/conda-build/meta.yaml | 15 +- examples/common.py | 263 +++++++- examples/matrix_power.py | 145 +++-- examples/pde.py | 45 +- examples/spgemm_microbenchmark.py | 94 +++ examples/spmv_microbenchmark.py | 70 ++ legate_sparse/__init__.py | 1 + legate_sparse/base.py | 190 +++++- legate_sparse/config.py | 85 ++- legate_sparse/csr.py | 605 ++++++++++++++++-- legate_sparse/dia.py | 215 ++++++- legate_sparse/gallery.py | 49 +- legate_sparse/install_info.pyi | 16 + legate_sparse/io.py | 32 + legate_sparse/linalg.py | 290 +++++++-- legate_sparse/module.py | 66 +- legate_sparse/types.py | 11 + legate_sparse/utils.py | 330 ++++++++-- setup.py | 5 +- src/legate_sparse/array/conv/csr_to_dense.cc | 6 +- src/legate_sparse/array/conv/csr_to_dense.h | 3 +- src/legate_sparse/array/conv/dense_to_csr.cc | 7 +- src/legate_sparse/array/conv/dense_to_csr.h | 6 +- .../array/conv/pos_to_coordinates.cc | 6 +- .../array/conv/pos_to_coordinates.h | 3 +- src/legate_sparse/array/csr/get_diagonal.cc | 7 +- src/legate_sparse/array/csr/get_diagonal.h | 3 +- src/legate_sparse/array/csr/indexing.cc | 7 +- src/legate_sparse/array/csr/indexing.h | 3 +- .../array/csr/spgemm_csr_csr_csr.cc | 7 +- .../array/csr/spgemm_csr_csr_csr.h | 9 +- src/legate_sparse/array/csr/spmv.cc | 7 +- src/legate_sparse/array/csr/spmv.h | 3 +- src/legate_sparse/array/util/scale_rect.cc | 6 +- src/legate_sparse/array/util/scale_rect.h | 3 +- src/legate_sparse/array/util/unzip_rect.cc | 6 +- src/legate_sparse/array/util/unzip_rect.h | 3 +- src/legate_sparse/array/util/zip_to_rect.cc | 6 +- src/legate_sparse/array/util/zip_to_rect.h | 3 +- src/legate_sparse/cudalibs.cu | 12 +- 
src/legate_sparse/io/mtx_to_coo.cc | 20 +- src/legate_sparse/io/mtx_to_coo.h | 3 +- src/legate_sparse/linalg/axpby.cc | 6 +- src/legate_sparse/linalg/axpby.h | 3 +- src/legate_sparse/mapper/mapper.cc | 20 +- .../partition/fast_image_partition.cc | 8 +- .../partition/fast_image_partition.h | 3 +- src/legate_sparse/util/upcast_future.cc | 8 +- src/legate_sparse/util/upcast_future.h | 3 +- test.py | 3 +- tests/integration/conftest.py | 54 +- tests/integration/test_cg_solve.py | 74 ++- tests/integration/test_comparison.py | 46 +- tests/integration/test_diagonal.py | 30 + tests/integration/test_gmres_solve.py | 22 + tests/integration/test_indexing.py | 107 +++- tests/integration/test_io.py | 48 ++ tests/integration/test_spgemm.py | 42 ++ tests/integration/test_spmv.py | 51 ++ tests/integration/utils/banded_matrix.py | 51 +- tests/integration/utils/sample.py | 128 ++++ 66 files changed, 3035 insertions(+), 364 deletions(-) create mode 100644 legate_sparse/install_info.pyi diff --git a/CMakeLists.txt b/CMakeLists.txt index 82e98886..c32254c3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -60,7 +60,7 @@ include(rapids-find) ################################### # Project -set(legate_sparse_version 25.03.00) +set(legate_sparse_version 25.07.00) set(CMAKE_CXX_FLAGS_DEBUG "-O0 -g") set(CMAKE_CUDA_FLAGS_DEBUG "-O0 -g") diff --git a/README.md b/README.md index 5eb4af78..b03ea8c6 100644 --- a/README.md +++ b/README.md @@ -29,8 +29,7 @@ for [NumPy](https://numpy.org/doc/stable/reference/index.html#reference), to enable writing programs that operate on distributed dense and sparse arrays. Take a look at the `examples` directory for some applications that can use Legate Sparse. We have implemented -an explicit partial-differential equation (PDE) [solver](examples/pde.py) -and [Geometric multi-grid](examples/gmg.py) solver. +an explicit partial-differential equation (PDE) [solver](examples/pde.py). More complex and interesting applications are on the way -- stay tuned! 
Legate Sparse is currently in alpha and supports a subset of APIs diff --git a/cmake/thirdparty/get_legate.cmake b/cmake/thirdparty/get_legate.cmake index 709577a1..727671fd 100644 --- a/cmake/thirdparty/get_legate.cmake +++ b/cmake/thirdparty/get_legate.cmake @@ -24,6 +24,7 @@ function(find_or_configure_legate) include("${rapids-cmake-dir}/cpm/detail/package_details.cmake") rapids_cpm_package_details(legate version git_repo git_branch shallow exclude_from_all) + set(version ${PKG_VERSION}) set(exclude_from_all ${PKG_EXCLUDE_FROM_ALL}) if(PKG_BRANCH) set(git_branch "${PKG_BRANCH}") diff --git a/cmake/versions.json b/cmake/versions.json index 6c8498d4..6c5440f4 100644 --- a/cmake/versions.json +++ b/cmake/versions.json @@ -3,22 +3,22 @@ "legate" : { "repo": "legate.internal", "org": "nv-legate", - "version": "25.03.02", - "git_url" : "git@github.com:nv-legate/legate.internal.git", + "version": "25.07.00", + "git_url" : "git@github.com:nv-legate/legate.git", "git_shallow": false, "always_download": false, - "git_tag" : "75dc0a92bbd2dfb79b6b680a0f37cbd0370d0181", + "git_tag" : "a46dc3d5b176ff9546bc831409c394c1bbc3b936", "anaconda_label": "main" }, "cupynumeric" : { "repo": "cupynumeric.internal", "org": "nv-legate", - "version": "25.03.02", - "git_url" : "git@github.com:nv-legate/cupynumeric.internal", + "version": "25.07.00", + "git_url" : "git@github.com:nv-legate/cupynumeric", "git_shallow": false, "always_download": false, - "git_tag" : "1fa45603c560068508c3be2e0df45aec62359019", - "anaconda_label": "experimental" + "git_tag" : "6132d8450049a7abd7786fb4d60444eb5b4e25db", + "anaconda_label": "main" } } } diff --git a/conda/conda-build/conda_build_config.yaml b/conda/conda-build/conda_build_config.yaml index 79750a86..ada8dda2 100644 --- a/conda/conda-build/conda_build_config.yaml +++ b/conda/conda-build/conda_build_config.yaml @@ -6,12 +6,14 @@ upload_build: - false python: - - 3.10 - 3.11 - 3.12 + - 3.13 numpy_version: - - ">=1.22,<2" + # Not 2.1.0 which segfaults 
on asarray() sometimes, see + # https://github.com/numpy/numpy/pull/27249 + - ">=1.22,!=2.1.0" cmake_version: - ">=3.20.1,!=3.23.0" diff --git a/conda/conda-build/meta.yaml b/conda/conda-build/meta.yaml index 03d1f9d3..9bdad28f 100644 --- a/conda/conda-build/meta.yaml +++ b/conda/conda-build/meta.yaml @@ -7,14 +7,6 @@ {# We need to have a default value for the initial pass over the recipe #} {% set gpu_enabled_bool = false %} {% endif %} -{% if upload_build == "true" %} - {% set upload_build_bool = true %} -{% elif upload_build == "false" %} - {% set upload_build_bool = false %} -{% else %} - {# We need to have a default value for the initial pass over the recipe #} - {% set upload_build_bool = false %} -{% endif %} ## The placeholder version is strictly for making two-pass conda build process. ## It should not be used for any other purpose, and this is not a default version. {% set placeholder_version = '0.0.0.dev' %} @@ -68,9 +60,9 @@ build: ## Create legate/cupynumeric version and build string {% set legate_version = os.environ.get("LEGATE_VERSION", "1.0.0") %} -{% set legate_buildstr = "_".join(["cuda" ~ cuda_major, "py" ~ py_version, os.environ.get("LEGATE_BUILDSTR", ""), cpu_gpu_tag]) %} +{% set legate_buildstr = "_".join(["py" ~ py_version, "*" ~ cpu_gpu_tag.strip('_'), os.environ.get("LEGATE_BUILDSTR", "") ]) %} {% set cupynumeric_version = os.environ.get("CUPYNUMERIC_VERSION", "1.0.0") %} -{% set cupynumeric_buildstr = "_".join(["cuda" ~ cuda_major, "py" ~ py_version, os.environ.get("CUPYNUMERIC_BUILDSTR", ""), cpu_gpu_tag]) %} +{% set cupynumeric_buildstr = "_".join(["cuda" ~ cuda_major, "py" ~ py_version, cpu_gpu_tag, os.environ.get("CUPYNUMERIC_BUILDSTR", "")]) %} {% if use_local_path is not defined %} # use git hash @@ -126,7 +118,7 @@ requirements: #- libcurand-dev - openblas =* =*openmp* - llvm-openmp - - legate ={{ legate_version }}={{ legate_buildstr }} + - legate ={{ legate_version }}=*{{ legate_buildstr }} - cupynumeric ={{ cupynumeric_version 
}}={{ cupynumeric_buildstr }} {% if gpu_enabled_bool %} # cupynumeric could be only in the run section and we could have just legate @@ -146,7 +138,6 @@ requirements: - numpy {{ numpy_version }} - scipy - openblas =* =*openmp* - - legate ={{ legate_version }}={{ legate_buildstr }} - cupynumeric ={{ cupynumeric_version }}={{ cupynumeric_buildstr }} {% if gpu_enabled_bool %} - libnvjitlink diff --git a/examples/common.py b/examples/common.py index b679e5b9..99174ed6 100644 --- a/examples/common.py +++ b/examples/common.py @@ -20,6 +20,30 @@ def get_arg_number(arg): + """Parse a string argument that may contain size suffixes. + + Parameters + ---------- + arg : str + String argument that may end with 'k', 'm', or 'g' for + kilobytes, megabytes, or gigabytes respectively. + + Returns + ------- + int + The parsed number with appropriate multiplier applied. + + Examples + -------- + >>> get_arg_number("1024") + 1024 + >>> get_arg_number("1k") + 1024 + >>> get_arg_number("1m") + 1048576 + >>> get_arg_number("1g") + 1073741824 + """ multiplier = 1 arg = arg.lower() if len(arg) == 0: @@ -38,28 +62,47 @@ def get_arg_number(arg): class Timer(Protocol): + """Protocol for timer implementations. + + This protocol defines the interface that timer classes must implement + for measuring execution time in the examples. + """ + def start(self): + """Start timing.""" ... def stop(self): - """ - Blocks execution until everything before it has completed. Returns the - duration since the last call to start(), in milliseconds. + """Stop timing and return duration. + + Blocks execution until everything before it has completed. + + Returns + ------- + float + Duration since the last call to start(), in milliseconds. """ ... class LegateTimer(Timer): + """Timer implementation using Legate's timing facilities. + + This timer uses Legate's internal timing mechanism for accurate + measurement of GPU operations. 
+ """ + def __init__(self): self._start = None def start(self): + """Start timing using Legate's time function.""" from legate.timing import time self._start = time() - # returns time in milliseconds def stop(self): + """Stop timing and return duration in milliseconds.""" from legate.timing import time _end = time() @@ -67,16 +110,24 @@ def stop(self): class CuPyTimer(Timer): + """Timer implementation using CuPy's CUDA events. + + This timer uses CUDA events for accurate measurement of GPU operations + in CuPy applications. + """ + def __init__(self): self._start_event = None def start(self): + """Start timing using CUDA events.""" from cupy import cuda self._start_event = cuda.Event() self._start_event.record() def stop(self): + """Stop timing and return duration in milliseconds.""" from cupy import cuda end_event = cuda.Event() @@ -86,15 +137,23 @@ def stop(self): class NumPyTimer(Timer): + """Timer implementation using Python's high-resolution timer. + + This timer uses Python's perf_counter_ns for accurate measurement + of CPU operations in NumPy/SciPy applications. + """ + def __init__(self): self._start_time = None def start(self): + """Start timing using perf_counter_ns.""" from time import perf_counter_ns self._start_time = perf_counter_ns() / 1000.0 def stop(self): + """Stop timing and return duration in milliseconds.""" from time import perf_counter_ns end_time = perf_counter_ns() / 1000.0 @@ -105,27 +164,61 @@ def stop(self): # manager so that we can run both CuPy and SciPy # programs with resource scoping. class DummyScope: + """No-op context manager for resource scoping. + + This class provides a dummy context manager that does nothing, + allowing the same code to run with both CuPy and SciPy programs + that may or may not use resource scoping. + """ + def __init__(self): ... def __enter__(self): + """Enter the context (no-op).""" ... def __exit__(self, _, __, ___): + """Exit the context (no-op).""" ... 
def __getitem__(self, item): + """Return self for any indexing (no-op).""" return self def count(self, _): + """Return 1 for any count operation.""" return 1 @property def preferred_kind(self): + """Return None for preferred kind.""" return None def get_phase_procs(use_legate: bool): + """Get processor configurations for different phases of computation. + + Parameters + ---------- + use_legate : bool + Whether to use Legate-specific processor configuration. + + Returns + ------- + tuple + (build_procs, solve_procs) - processor configurations for + build and solve phases respectively. + + Notes + ----- + When use_legate is True, this function queries the available + processors and assigns them to different phases: + - Build phase: Prefers CPUs, then OpenMP processors, then GPUs + - Solve phase: Prefers GPUs, then OpenMP processors, then CPUs + + When use_legate is False, returns DummyScope objects. + """ if use_legate: from legate.core import TaskTarget, get_machine @@ -160,6 +253,27 @@ def get_phase_procs(use_legate: bool): def parse_common_args(): + """Parse common command line arguments for example scripts. + + Returns + ------- + tuple + (package, timer, np, sparse, linalg, use_legate) where: + - package: str - the selected package ("legate", "cupy", or "scipy") + - timer: Timer - appropriate timer implementation + - np: module - numpy/cupy/cupynumeric module + - sparse: module - sparse matrix module + - linalg: module - linear algebra module + - use_legate: bool - whether Legate is being used + + Notes + ----- + This function sets up the global environment with the appropriate + modules based on the --package argument. 
It supports: + - "legate": Uses cupynumeric, legate_sparse, and legate_sparse.linalg + - "cupy": Uses cupy, cupyx.scipy.sparse, and cupyx.scipy.sparse.linalg + - "scipy": Uses numpy, scipy.sparse, and scipy.sparse.linalg + """ parser = argparse.ArgumentParser() parser.add_argument( "--package", @@ -204,6 +318,44 @@ def parse_common_args(): # `diags` construct csr from dia array, while when from_diags=False # we construct csr arrya directly - might be slightly faster def banded_matrix(N, nnz_per_row, from_diags=False): + """Construct a banded matrix with 1.0 as values. + + Parameters + ---------- + N : int + Size of the square matrix (N x N). + nnz_per_row : int + Number of non-zeros per row. Must be odd. + from_diags : bool, optional + If True, construct using sparse.diags then convert to CSR. + If False, construct CSR array directly. Default is False. + + Returns + ------- + sparse matrix + A banded matrix in CSR format with 1.0 values. + + Raises + ------ + AssertionError + If N <= nnz_per_row or nnz_per_row is not odd. + + Notes + ----- + The matrix has a banded structure with nnz_per_row non-zeros per row, + centered around the main diagonal. The direct CSR construction method + (from_diags=False) may be slightly faster than the diags method. + + Examples + -------- + >>> A = banded_matrix(5, 3) + >>> print(A.toarray()) + [[1. 1. 0. 0. 0.] + [1. 1. 1. 0. 0.] + [0. 1. 1. 1. 0.] + [0. 0. 1. 1. 1.] + [0. 0. 0. 1. 1.]] + """ if from_diags: return sparse.diags( [1] * nnz_per_row, @@ -248,6 +400,43 @@ def banded_matrix(N, nnz_per_row, from_diags=False): def stencil_grid(S, grid, dtype=None, format=None): + """Construct a sparse matrix resulting from a stencil + discretization on rectilinear grids. + + Parameters + ---------- + S : array_like + The stencil array defining the pattern of connections. + grid : tuple + Grid dimensions (e.g., (N, N) for 2D grid). + dtype : dtype, optional + Data type of the matrix. If None, uses S.dtype. 
+ format : str, optional + Output format. If None, returns CSR format. + + Returns + ------- + sparse matrix + A sparse matrix in CSR format representing the stencil on the grid. + + Notes + ----- + This function constructs a sparse matrix that represents the application + of a stencil operator on a regular grid. The stencil defines the pattern + of connections between grid points. + + The function handles: + - Boundary conditions by zeroing connections outside the grid + - Duplicate diagonals by summing their contributions + - Conversion to CSR format for efficient operations + + Examples + -------- + >>> # 5-point stencil for 2D grid + >>> S = np.array([[0, 1, 0], [1, -4, 1], [0, 1, 0]]) + >>> A = stencil_grid(S, (3, 3)) + >>> print(A.toarray()) + """ N_v = int(numpy.prod(grid)) # number of vertices in the mesh N_s = int((S != 0).sum(dtype=int)) # number of nonzero stencil entries @@ -309,6 +498,41 @@ def stencil_grid(S, grid, dtype=None, format=None): def poisson2D(N): + """Construct the 2D Poisson matrix. + + Parameters + ---------- + N : int + Grid size (N x N grid). + + Returns + ------- + sparse matrix + The 2D Poisson matrix in CSR format. + + Notes + ----- + This constructs the standard 5-point stencil discretization of + the 2D Poisson equation -u_xx - u_yy = f on an N x N grid. + + The matrix has the following structure: + - Main diagonal: 4.0 + - Off-diagonals: -1.0 for horizontal and vertical connections + + Examples + -------- + >>> A = poisson2D(3) + >>> print(A.toarray()) + [[ 4. -1. 0. -1. 0. 0. 0. 0. 0.] + [-1. 4. -1. 0. -1. 0. 0. 0. 0.] + [ 0. -1. 4. 0. 0. -1. 0. 0. 0.] + [-1. 0. 0. 4. -1. 0. -1. 0. 0.] + [ 0. -1. 0. -1. 4. -1. 0. -1. 0.] + [ 0. 0. -1. 0. -1. 4. 0. 0. -1.] + [ 0. 0. 0. -1. 0. 0. 4. -1. 0.] + [ 0. 0. 0. 0. -1. 0. -1. 4. -1.] + [ 0. 0. 0. 0. 0. -1. 0. -1. 
4.]] + """ diag_size = N * N - 1 first = np.full((N - 1), -1.0) chunks = np.concatenate([np.zeros(1), first]) @@ -326,6 +550,37 @@ def poisson2D(N): def diffusion2D(N, epsilon=1.0, theta=0.0): + """Construct a 2D diffusion matrix with anisotropy. + + Parameters + ---------- + N : int + Grid size (N x N grid). + epsilon : float, optional + Anisotropy parameter. Default is 1.0 (isotropic). + theta : float, optional + Rotation angle in radians. Default is 0.0. + + Returns + ------- + sparse matrix + The 2D diffusion matrix in CSR format. + + Notes + ----- + This constructs a 9-point stencil for the anisotropic diffusion equation: + -div(K * grad(u)) = f + + where K is a diffusion tensor that depends on epsilon and theta. + The stencil coefficients are computed based on the rotated diffusion tensor. + + Examples + -------- + >>> # Isotropic diffusion + >>> A = diffusion2D(3, epsilon=1.0, theta=0.0) + >>> # Anisotropic diffusion + >>> A = diffusion2D(3, epsilon=0.1, theta=np.pi/4) + """ eps = float(epsilon) # for brevity theta = float(theta) diff --git a/examples/matrix_power.py b/examples/matrix_power.py index a43249c0..cc52c08b 100644 --- a/examples/matrix_power.py +++ b/examples/matrix_power.py @@ -12,9 +12,25 @@ # See the License for the specific language governing permissions and # limitations under the License. -# This example performs matrix power by repetitively multiplication. We assume -# that the matrix is square, so the number of cols is same as the number of -# rows in the matrix +"""Sparse Matrix Power Microbenchmark. + +This script benchmarks sparse matrix power computation by performing repeated +matrix multiplication (A^n) and measuring performance at each step. 
It supports: + +- Matrix generation with specified non-zeros per row or total non-zeros +- Configurable number of matrix multiplications (power exponent) +- Multiple backend support (Legate, CuPy, SciPy) + +Command line arguments: +--nrows: Matrix size (supports k, m, g suffixes) +--nnz-per-row: Number of non-zeros per row for generated matrix +--nnz-total: Total number of non-zeros for generated matrix +--k: Number of matrix multiplications to perform +--nwarmups: Number of warmup iterations before timing +--same-sparsity-for-cpu-and-gpu: Use NumPy for consistent sparsity patterns +--random-seed: Random number seed for sparsity pattern generation +--package: Backend to use (legate, cupy, scipy) +""" import argparse from functools import reduce @@ -31,17 +47,28 @@ def create_csr_with_nnz_per_row(nrows, nnz_per_row: int, dtype: npt.DTypeLike = None): - """Return a CSR matrix with a prescribed number of nonzeros in each row. - - Args: - ---- - - nrows: int - Number of rows in the matrix. Number of columns is same as number of rows - nnz_per_row: int - Desired number of nonzero entries in each row - dtype: npt.DTypeLike - Datatype of the values. This should be one of floating point datatypes + """Create a CSR matrix with a prescribed number of nonzeros in each row. + + Parameters + ---------- + nrows : int + Number of rows in the matrix. Number of columns is same as number of rows. + nnz_per_row : int + Desired number of nonzero entries in each row. + dtype : npt.DTypeLike, optional + Datatype of the values. Should be one of floating point datatypes. + Default is np.float32. + + Returns + ------- + sparse matrix + A CSR matrix with the specified sparsity pattern. + + Notes + ----- + This function creates a square matrix where each row has exactly + nnz_per_row non-zero entries. The column indices are randomly + generated and sorted within each row. 
""" dtype = np.float32 if dtype is None else dtype ncols = nrows @@ -58,18 +85,28 @@ def create_csr_with_nnz_per_row(nrows, nnz_per_row: int, dtype: npt.DTypeLike = def create_csr_with_nnz_total(nrows, nnz_total, dtype: npt.DTypeLike = None): - """Return a CSR matrix with a prescribed number of nonzeros in the matrix. - - Args: - ---- - - nrows: int - Number of rows in the matrix. Number of columns is same as number of rows - nnz_total: int - Desired number of nonzero entries in the matrix with no expectation of - nonzeros in each row of the matrix - dtype: npt.DTypeLike - Datatype of the values. This should be one of floating point datatypes + """Create a CSR matrix with a prescribed total number of nonzeros. + + Parameters + ---------- + nrows : int + Number of rows in the matrix. Number of columns is same as number of rows. + nnz_total : int + Desired total number of nonzero entries in the matrix. + dtype : npt.DTypeLike, optional + Datatype of the values. Should be one of floating point datatypes. + Default is np.float32. + + Returns + ------- + sparse matrix + A CSR matrix with the specified total number of non-zeros. + + Notes + ----- + This function creates a square matrix with exactly nnz_total non-zero + entries distributed randomly across the matrix. There is no guarantee + about the number of non-zeros per row. """ dtype = np.float32 if dtype is None else dtype ncols = nrows @@ -86,21 +123,30 @@ def create_csr_with_nnz_total(nrows, nnz_total, dtype: npt.DTypeLike = None): # ------------------------ -def compute_matrix_multiply_ntimes(A, timer, nwarmups: int = 2, ntimes: int = 4): - """Multiply matrix by self ntimes and print the time elapsed. 
- Args: - ---- - - A: csr_matrix - The input matrix - timer: - Instance of the timer class to measure elapsed time - ntimes: - Number of matrix multiplies or the exponent in A^n - nwarmups: - Number of warmup iterations before +def compute_A_power_k(A, timer, nwarmups: int = 2, k: int = 4): + """Compute A^k and measure performance. + + Parameters + ---------- + A : sparse matrix + The input matrix to compute A^k. + timer : Timer + Timer instance to measure elapsed time. + nwarmups : int, optional + Number of warmup iterations before timing. Default is 2. + k : int, optional + Number of matrix multiplies or the exponent in A^k. Default is 4. + + Notes + ----- + This function computes A^k by repeated matrix multiplication + and measures the time for each step. It prints detailed timing + information including: + - Matrix dimensions and sparsity + - Time for each multiplication step + - Time for copying intermediate results + - Overall sparsity of the final result """ - timer.start() B = A.copy() elapsed_time_init_copy = timer.stop() @@ -108,10 +154,10 @@ def compute_matrix_multiply_ntimes(A, timer, nwarmups: int = 2, ntimes: int = 4) for _ in range(nwarmups): output = A @ B - elapsed_time_spgemm = [-1.0] * ntimes - elapsed_time_copy = [-1.0] * ntimes + elapsed_time_spgemm = [-1.0] * k + elapsed_time_copy = [-1.0] * k - for hop in range(ntimes): + for hop in range(k): timer.start() output = A @ B elapsed_time_spgemm[hop] = timer.stop() @@ -128,9 +174,9 @@ def compute_matrix_multiply_ntimes(A, timer, nwarmups: int = 2, ntimes: int = 4) print(f"NNZ of A : {A.nnz}") print(f"NNZ of output : {output.nnz}") print(f"Sparsity of output (%) : {sparsity_output}") - print(f"Total number of hops : {ntimes}") + print(f"Total number of hops : {k}") print(f"Elapsed time for copy in init (ms) : {elapsed_time_init_copy}") - for hop in range(ntimes): + for hop in range(k): print( f"Elapsed time for spgemm for hop {hop} (ms) : {elapsed_time_spgemm[hop]}" ) @@ -168,10 +214,10 @@ def 
compute_matrix_multiply_ntimes(A, timer, nwarmups: int = 2, ntimes: int = 4) ) parser.add_argument( - "--ntimes", + "--k", type=int, default=4, - dest="ntimes", + dest="k", help="Number of times A @ A is performed", ) @@ -203,8 +249,7 @@ def compute_matrix_multiply_ntimes(A, timer, nwarmups: int = 2, ntimes: int = 4) nnz_total = get_arg_number(args.nnz_total) # this is a global variable - global random_seed - global rng + global random_seed, rng random_seed = args.random_seed if args.same_sparsity_for_cpu_and_gpu: @@ -230,6 +275,6 @@ def compute_matrix_multiply_ntimes(A, timer, nwarmups: int = 2, ntimes: int = 4) print("Matrix created with number of nonzeros per row") elapsed_time_matrix_gen = timer.stop() - compute_matrix_multiply_ntimes(A, timer, int(args.nwarmups), int(args.ntimes)) + compute_A_power_k(A, timer, int(args.nwarmups), int(args.k)) print(f"Elapsed time in matrix creation (ms) : {elapsed_time_matrix_gen}") diff --git a/examples/pde.py b/examples/pde.py index 2313f799..d9ca0095 100644 --- a/examples/pde.py +++ b/examples/pde.py @@ -12,6 +12,32 @@ # See the License for the specific language governing permissions and # limitations under the License. +"""Partial Differential Equation (PDE) Solver Microbenchmark. + +This script benchmarks the solution of 2D Poisson equations using sparse +linear algebra operations. It implements a finite difference discretization +with Dirichlet boundary conditions and solves the resulting linear system +using conjugate gradient iteration. 
It supports: + +- 2D Poisson equation with analytical right-hand side +- Configurable mesh resolution (nx, ny grid points) +- Performance measurement of linear solver iterations +- Throughput mode for measuring solve performance only +- Convergence analysis with relative residual norms +- Multiple backend support (Legate, CuPy, SciPy) + +Command line arguments: +--nx: Number of grid points along X axis +--ny: Number of grid points along Y axis +--plot: Enable residual plotting +--plot_filename: Filename for plot output +--throughput: Measure only solve iterations (requires max_iters) +--tol: Convergence tolerance for linear solver +--max-iters: Maximum number of linear solver iterations +--warmup-iters: Number of warmup iterations (for throughput mode) +--package: Backend to use (legate, cupy, scipy) +""" + # This PDE solving application is derived from # https://aquaulb.github.io/book_solving_pde_mooc/solving_pde_mooc/notebooks/05_IterativeMethods/05_01_Iteration_and_2D.html. @@ -184,7 +210,6 @@ def execute(nx, ny, plot, plot_fname, throughput, tol, max_iters, warmup_iters, _ = A.dot(np.ones((A.shape[1],))) if throughput: - assert max_iters > warmup_iters p_sol, iters = linalg.cg(A, bflat, rtol=tol, maxiter=warmup_iters) max_iters = max_iters - warmup_iters print(f"max_iters has been updated to: {max_iters}") @@ -192,7 +217,10 @@ def execute(nx, ny, plot, plot_fname, throughput, tol, max_iters, warmup_iters, timer.start() # If we're testing throughput, run only the prescribed number of iterations. 
if throughput: - p_sol, iters = linalg.cg(A, bflat, rtol=tol, maxiter=max_iters) + if use_legate: + p_sol, iters = linalg.cg(A, bflat, rtol=tol, maxiter=max_iters, conv_test_iters=max_iters) + else: + p_sol, iters = linalg.cg(A, bflat, rtol=tol, maxiter=max_iters) else: p_sol, iters = linalg.cg(A, bflat, rtol=tol) total = timer.stop() @@ -200,9 +228,10 @@ def execute(nx, ny, plot, plot_fname, throughput, tol, max_iters, warmup_iters, print(f"Mesh resolution : ({nx}, {ny})") print(f"Dimension of A : {A.shape}") print(f"Number of rows in A : {A.shape[0]}") + print(f"Total elapsed time (ms) : {total}") if throughput: - print(f"Total elapsed time (ms) : {total}") + print(f"Number of warmup iterations : {warmup_iters}") print(f"Max number of iterations : {max_iters}") print(f"Time per (max-)iteration (ms) : {total / max_iters}") @@ -215,9 +244,9 @@ def execute(nx, ny, plot, plot_fname, throughput, tol, max_iters, warmup_iters, convergence_status = True if norm_res <= norm_ini * tol else False print(f"Did the solution converge : {convergence_status}") print(f"Final relative residual norm : {norm_res / norm_ini}") - print(f"Number of iterations : {iters}") - print(f"Total elapsed time (ms) : {total}") - print(f"Time per iteration (ms) : {total / iters}") + if iters > 0: + print(f"Number of iterations : {iters}") + print(f"Time per iteration (ms) : {total / iters}") if __name__ == "__main__": @@ -294,8 +323,8 @@ def execute(nx, ny, plot, plot_fname, throughput, tol, max_iters, warmup_iters, args, _ = parser.parse_known_args() _, timer, np, sparse, linalg, use_legate = parse_common_args() - if args.throughput and args.max_iters is None: - print("Must provide --max-iters when using -throughput.") + if args.throughput and (args.max_iters is None or args.warmup_iters is None): + print("Must provide --max-iters and --warmup-iters when using --throughput.") sys.exit(1) execute(**vars(args), timer=timer) diff --git a/examples/spgemm_microbenchmark.py 
b/examples/spgemm_microbenchmark.py index 3741f400..e30c05dd 100644 --- a/examples/spgemm_microbenchmark.py +++ b/examples/spgemm_microbenchmark.py @@ -12,17 +12,80 @@ # See the License for the specific language governing permissions and # limitations under the License. +"""Sparse Matrix-Matrix Multiplication Microbenchmark. + +This script benchmarks sparse matrix-matrix multiplication performance +with configurable matrix sizes and generation methods. It supports: + +- Banded matrix generation with specified non-zeros per row +- Loading matrices from Matrix Market files +- Stable mode for partition caching vs. fresh matrix creation +- Multiple backend support (Legate, CuPy, SciPy) + +Command line arguments: +--nrows: Matrix size (supports k, m, g suffixes) +--nnz-per-row: Number of non-zeros per row for banded matrices +--stable: Enable partition caching by reusing matrices +--filename1: Load first matrix from Matrix Market file +--filename2: Load second matrix from Matrix Market file +--iters: Number of benchmark iterations +--package: Backend to use (legate, cupy, scipy) +""" + import argparse from common import banded_matrix, get_arg_number, get_phase_procs, parse_common_args def spgemm_dispatch(A, B): + """Dispatch sparse matrix-matrix multiplication operation. + + Parameters + ---------- + A : sparse matrix + First sparse matrix operand. + B : sparse matrix + Second sparse matrix operand. + + Returns + ------- + sparse matrix + The result of A @ B. + + Notes + ----- + This function performs sparse matrix-matrix multiplication using + the @ operator, which is supported by all backends (Legate, CuPy, SciPy). + """ C = A @ B return C def get_matrices(N, nnz_per_row, fname1, fname2): + """Get matrices for SpGEMM benchmark. + + Parameters + ---------- + N : int + Matrix size (N x N) for generated matrices. + nnz_per_row : int + Number of non-zeros per row for banded matrices. + fname1 : str + Filename for first matrix (empty string to generate). 
+ fname2 : str + Filename for second matrix (empty string to use first matrix). + + Returns + ------- + tuple + (A, B) - two sparse matrices for multiplication. + + Notes + ----- + If fname1 is provided, loads matrices from Matrix Market files. + If fname2 is empty, uses the same matrix for both A and B. + Otherwise, generates banded matrices with specified parameters. + """ if fname1 != "": # Read file from matrix A = sparse.mmread(fname1) @@ -38,6 +101,37 @@ def get_matrices(N, nnz_per_row, fname1, fname2): def run_spgemm(N, nnz_per_row, fname1, fname2, iters, stable, timer): + """Run sparse matrix-matrix multiplication benchmark. + + Parameters + ---------- + N : int + Matrix size for generated matrices. + nnz_per_row : int + Number of non-zeros per row for banded matrices. + fname1 : str + Filename for first matrix. + fname2 : str + Filename for second matrix. + iters : int + Number of benchmark iterations. + stable : bool + Whether to reuse matrices (allows partition caching). + timer : Timer + Timer object for measuring performance. + + Notes + ----- + This function runs a benchmark of sparse matrix-matrix multiplication. + It supports two modes: + - stable=True: Reuses matrices, allowing partition caching + - stable=False: Creates fresh matrices each iteration + + The function prints: + - Matrix dimensions and non-zero counts + - Number of iterations + - Total time and time per iteration + """ warmup_iterations = 5 if stable: diff --git a/examples/spmv_microbenchmark.py b/examples/spmv_microbenchmark.py index 559ccbfc..c6f11ff8 100644 --- a/examples/spmv_microbenchmark.py +++ b/examples/spmv_microbenchmark.py @@ -12,6 +12,28 @@ # See the License for the specific language governing permissions and # limitations under the License. +"""Sparse Matrix-Vector Multiplication Microbenchmark. + +This script benchmarks sparse matrix-vector multiplication performance +across different matrix sizes and configurations. 
It supports: + +- Matrix size sweeps with configurable min/max sizes +- Banded matrix generation with specified non-zeros per row +- Loading matrices from Matrix Market files +- Optional repartitioning to simulate data updates +- Multiple backend support (Legate, CuPy, SciPy) + +Command line arguments: +--nmin: Minimum matrix size (supports k, m, g suffixes) +--nmax: Maximum matrix size (supports k, m, g suffixes) +--nnz-per-row: Number of non-zeros per row for banded matrices +--repartition: Enable alternating x/y vectors +--filename: Load matrix from Matrix Market file +--iters: Number of benchmark iterations +--from-diags: Use sparse.diags for matrix construction +--package: Backend to use (legate, cupy, scipy) +""" + import argparse from common import banded_matrix, get_arg_number, get_phase_procs, parse_common_args @@ -19,6 +41,30 @@ # Writing to pre-allocated array is preferred def spmv_dispatch(A, x, y, i, repartition): + """Dispatch sparse matrix-vector multiplication operation. + + Parameters + ---------- + A : sparse matrix + The sparse matrix to multiply with. + x : array_like + Input vector. + y : array_like + Output vector (pre-allocated). + i : int + Iteration index. + repartition : bool + Whether to alternate between y=A*x and x=A*y. + + Notes + ----- + This function performs sparse matrix-vector multiplication with optional + repartitioning. When repartition is True, it alternates between computing + y = A*x and x = A*y to simulate data updates. + + For Legate, it uses the dot method with pre-allocated output arrays. + For other backends, it uses the @ operator. + """ if use_legate: if repartition and i % 2: A.dot(y, out=x) @@ -32,6 +78,30 @@ def spmv_dispatch(A, x, y, i, repartition): def run_spmv(A, iters, repartition, timer): + """Run sparse matrix-vector multiplication benchmark. + + Parameters + ---------- + A : sparse matrix + The sparse matrix to benchmark. + iters : int + Number of iterations to run. 
+ repartition : bool + Whether to use repartitioning (alternate x and y). + timer : Timer + Timer object for measuring performance. + + Notes + ----- + This function runs a benchmark of sparse matrix-vector multiplication. + It includes warm-up runs and measures the total time and time per iteration. + + The function prints: + - Matrix dimensions and number of non-zeros + - Number of iterations + - Total elapsed time + - Time per iteration + """ x = np.ones((A.shape[1],)) y = np.zeros((A.shape[0],)) diff --git a/legate_sparse/__init__.py b/legate_sparse/__init__.py index 8a6a077a..c8f44589 100644 --- a/legate_sparse/__init__.py +++ b/legate_sparse/__init__.py @@ -21,6 +21,7 @@ from .coverage import clone_module # noqa: F401 from .csr import csr_array, csr_matrix # noqa: F401 +from .dia import dia_array, dia_matrix # noqa: F401 from .module import * # noqa: F401 clone_module(_sp, globals()) diff --git a/legate_sparse/base.py b/legate_sparse/base.py index fa46fc8b..c9d99a31 100644 --- a/legate_sparse/base.py +++ b/legate_sparse/base.py @@ -62,8 +62,36 @@ # CompressedBase is a base class for several different kinds of sparse # matrices, such as CSR, CSC, COO and DIA. class CompressedBase: + """Base class for compressed sparse matrix formats. + + This class provides common functionality for compressed sparse matrix + formats like CSR, CSC, COO, and DIA. It handles the conversion from + non-zero counts to position arrays and provides common operations. + + Notes + ----- + This is an internal base class and should not be instantiated directly. + Use specific format classes like csr_array instead. + """ + @classmethod def nnz_to_pos_cls(cls, q_nnz: LogicalStore): + """Convert non-zero counts to position arrays. + + This class method converts an array of non-zero counts per row/column + into the position array used in compressed sparse formats. + + Parameters + ---------- + q_nnz : LogicalStore + Store containing the number of non-zeros per row/column. 
+ + Returns + ------- + tuple + (pos, total_nnz) where pos is the position array and total_nnz + is the total number of non-zeros. + """ q_nnz_arr = store_to_cupynumeric_array(q_nnz) cs = cupynumeric.cumsum(q_nnz_arr) cs_shifted = cs - q_nnz_arr @@ -86,9 +114,43 @@ def nnz_to_pos_cls(cls, q_nnz: LogicalStore): return pos, cs[-1] def nnz_to_pos(self, q_nnz: LogicalStore): + """Convert non-zero counts to position arrays for this instance. + + Parameters + ---------- + q_nnz : LogicalStore + Store containing the number of non-zeros per row/column. + + Returns + ------- + tuple + (pos, total_nnz) where pos is the position array and total_nnz + is the total number of non-zeros. + """ return CompressedBase.nnz_to_pos_cls(q_nnz) def asformat(self, format, copy=False): + """Convert the matrix to a specified format. + + Parameters + ---------- + format : str + The desired format ('csr', 'csc', 'coo', etc.). + copy : bool, optional + Whether to create a copy. Default is False. + + Returns + ------- + sparse matrix + Matrix in the requested format. + + Raises + ------ + ValueError + If the format is unknown. + NotImplementedError + If conversion to the requested format is not implemented. + """ if format is None or format == self.format: if copy: raise NotImplementedError @@ -108,35 +170,51 @@ def asformat(self, format, copy=False): # The implementation of sum is mostly lifted from scipy.sparse. def sum(self, axis=None, dtype=None, out=None): - """ - Sum the matrix elements over a given axis. + """Sum the matrix elements over a given axis. + Parameters ---------- - axis : {-2, -1, 0, 1, None} optional + axis : {-2, -1, 0, 1, None}, optional Axis along which the sum is computed. The default is to compute the sum of all the matrix elements, returning a scalar (i.e., `axis` = `None`). dtype : dtype, optional The type of the returned matrix and of the accumulator in which - the elements are summed. The dtype of `a` is used by default + the elements are summed. 
The dtype of `a` is used by default unless `a` has an integer dtype of less precision than the default - platform integer. In that case, if `a` is signed then the platform + platform integer. In that case, if `a` is signed then the platform integer is used while if `a` is unsigned then an unsigned integer of the same precision as the platform integer is used. - .. versionadded:: 0.18.0 - out : np.matrix, optional - Alternative output matrix in which to place the result. It must + out : cupynumeric.ndarray, optional + Alternative output array in which to place the result. It must have the same shape as the expected output, but the type of the output values will be cast if necessary. - .. versionadded:: 0.18.0 + Returns ------- - sum_along_axis : np.matrix + sum_along_axis : cupynumeric.ndarray or scalar A matrix with the same shape as `self`, with the specified - axis removed. + axis removed, or a scalar if axis=None. + + Raises + ------ + NotImplementedError + If axis=0 (sum over columns) is requested. + ValueError + If out is provided but has incompatible shape. + + Notes + ----- + The implementation uses multiplication by a matrix of ones to achieve + the sum. For some sparse matrix formats more efficient methods are + possible and should override this function. + + Currently, summing over columns (axis=0) is not implemented due to + the lack of right matrix multiplication support. + See Also -------- - numpy.matrix.sum : NumPy's implementation of 'sum' for matrices + cupynumeric.matrix.sum : NumPy's implementation of 'sum' for matrices """ # We use multiplication by a matrix of ones to achieve this. @@ -171,9 +249,27 @@ def sum(self, axis=None, dtype=None, out=None): # needed by _data_matrix def _with_data(self, data, copy=True): - """Returns a _different_ matrix object with the same sparsity structure as self, - but with different data. By default the structure arrays - (i.e. .indptr and .indices) are copied. 'data' parameter is never copied. 
+ """Returns a matrix object with the same sparsity structure as self, + but with different data. + + Parameters + ---------- + data : array_like + The new data array. This parameter is never copied. + copy : bool, optional + Whether to copy the structure arrays (indptr and indices). + Default is True. + + Returns + ------- + sparse matrix + A new matrix with the same sparsity structure but different data. + + Notes + ----- + This method creates a new matrix object with the same sparsity pattern + but replaces the data array. The structure arrays (indptr and indices) + are copied by default to avoid modifying the original matrix. """ # For CSR and CSC compressed base we can just reuse compressed stores, @@ -253,12 +349,42 @@ def method(self): # format of {Dense, Sparse}. For our purposes, that means CSC and CSR # matrices. class DenseSparseBase: + """Base class for sparse matrices with dense-sparse format. + + This class provides functionality for sparse matrices that have a TACO + format of {Dense, Sparse}, which includes CSR and CSC matrices. + + Notes + ----- + This is an internal base class and should not be instantiated directly. + Use specific format classes like csr_array instead. + """ + def __init__(self): + """Initialize the DenseSparseBase class.""" self._balanced_pos_partition = None # consider using _with_data() here @classmethod def make_with_same_nnz_structure(cls, mat, arg, shape=None, dtype=None): + """Create a new matrix with the same non-zero structure as mat. + + Parameters + ---------- + mat : sparse matrix + The reference matrix whose structure to copy. + arg : array_like + The data for the new matrix. + shape : tuple, optional + The shape of the new matrix. If None, uses mat.shape. + dtype : dtype, optional + The data type of the new matrix. If None, uses mat.dtype. + + Returns + ------- + sparse matrix + A new matrix with the same structure as mat but with data from arg. 
+ """ if shape is None: shape = mat.shape if dtype is None: @@ -269,6 +395,21 @@ def make_with_same_nnz_structure(cls, mat, arg, shape=None, dtype=None): # unpack_rect1_store unpacks a rect1 store into two int64 stores. def unpack_rect1_store(pos): + """Unpack a rect1 store into two int64 stores. + + This function unpacks the compressed position array used in CSR/CSC + formats into separate start and end position arrays. + + Parameters + ---------- + pos : LogicalStore + The rect1 store containing packed position information. + + Returns + ------- + tuple + (lo, hi) where lo contains start positions and hi contains end positions. + """ out1 = runtime.create_store(int64, shape=pos.shape) out2 = runtime.create_store(int64, shape=pos.shape) task = runtime.create_auto_task(SparseOpCode.UNZIP_RECT1) @@ -283,6 +424,25 @@ def unpack_rect1_store(pos): # pack_to_rect1_store packs two int64 stores into a rect1 store. def pack_to_rect1_store(lo, hi, output=None): + """Pack two int64 stores into a rect1 store. + + This function packs separate start and end position arrays into the + compressed rect1 format used in CSR/CSC formats. + + Parameters + ---------- + lo : LogicalStore + Store containing start positions. + hi : LogicalStore + Store containing end positions. + output : LogicalStore, optional + Output store for the packed result. If None, creates a new store. + + Returns + ------- + LogicalStore + The packed rect1 store. + """ if output is None: output = runtime.create_store(rect1, shape=(lo.shape[0],)) task = runtime.create_auto_task(SparseOpCode.ZIP_TO_RECT1) diff --git a/legate_sparse/config.py b/legate_sparse/config.py index 3c9a3780..8c601981 100644 --- a/legate_sparse/config.py +++ b/legate_sparse/config.py @@ -23,6 +23,12 @@ class _LegateSparseSharedLib: + """Internal class representing the shared library interface. + + This class defines the interface to the C++ shared library that + implements the core sparse matrix operations. 
+ """ + LEGATE_SPARSE_DENSE_TO_CSR: int LEGATE_SPARSE_DENSE_TO_CSR_NNZ: int LEGATE_SPARSE_ZIP_TO_RECT_1: int @@ -46,6 +52,26 @@ class _LegateSparseSharedLib: def dlopen_no_autoclose(ffi: Any, lib_path: str) -> Any: + """Load a shared library without automatic closing. + + Parameters + ---------- + ffi : Any + The CFFI interface object. + lib_path : str + Path to the shared library to load. + + Returns + ------- + Any + The loaded library object. + + Notes + ----- + This function loads a shared library using CDLL and converts it to + a CFFI object without automatic closing. This prevents issues with + symbol cleanup during shutdown. + """ # Use an already-opened library handle, which cffi will convert to a # regular FFI object (using the definitions previously added using # ffi.cdef), but will not automatically dlclose() on collection. @@ -55,8 +81,21 @@ def dlopen_no_autoclose(ffi: Any, lib_path: str) -> Any: # Load the LegateSparse library first so we have a shard object that # we can use to initialize all these configuration enumerations -class LegateSparseLib(Library): +class LegateSparseLib: + """Legate sparse matrix library loader. + + This class handles loading and registering the Legate sparse matrix + library with the Legate runtime. + """ + def __init__(self, name): + """Initialize the Legate sparse library. + + Parameters + ---------- + name : str + The name of the library to load. + """ self.name = name self.runtime = None self.shared_object = None @@ -78,24 +117,58 @@ def __init__(self, name): self.shared_object = cast(_LegateSparseSharedLib, shared_lib) def register(self) -> None: + """Register the library with the Legate runtime.""" callback = getattr(self.shared_object, "legate_sparse_perform_registration") callback() def get_shared_library(self) -> str: + """Get the path to the shared library. + + Returns + ------- + str + The full path to the shared library file. 
+ """ from legate_sparse.install_info import libpath return os.path.join(libpath, "liblegate_sparse" + self.get_library_extension()) def get_legate_library(self) -> Library: + """Get the Legate library object. + + Returns + ------- + Library + The Legate library object. + """ return get_legate_runtime().find_library(self.name) def get_c_header(self) -> str: + """Get the C header for the library. + + Returns + ------- + str + The C header content. + """ from legate_sparse.install_info import header return header @staticmethod def get_library_extension() -> str: + """Get the appropriate library extension for the current platform. + + Returns + ------- + str + The library extension ('.so' for Linux, '.dylib' for macOS). + + Raises + ------ + RuntimeError + If the platform is not supported. + """ os_name = platform.system() if os_name == "Linux": return ".so" @@ -105,6 +178,8 @@ def get_library_extension() -> str: SPARSE_LIB_NAME = "legate.sparse" +"""Name of the Legate sparse library.""" + sparse_lib = LegateSparseLib(SPARSE_LIB_NAME) sparse_lib.register() _sparse = sparse_lib.shared_object @@ -115,6 +190,13 @@ def get_library_extension() -> str: # Match these to entries in sparse_c.h @unique class SparseOpCode(IntEnum): + """Enumeration of sparse matrix operation codes. + + These codes correspond to the operations implemented in the C++ + shared library and are used to dispatch tasks to the appropriate + kernels. + """ + LOAD_CUDALIBS = _sparse.LEGATE_SPARSE_LOAD_CUDALIBS UNLOAD_CUDALIBS = _sparse.LEGATE_SPARSE_UNLOAD_CUDALIBS @@ -146,3 +228,4 @@ class SparseOpCode(IntEnum): # Register some types for us to use. 
rect1 = types.rect_type(1) +"""1-dimensional rectangle type used for compressed storage formats.""" diff --git a/legate_sparse/csr.py b/legate_sparse/csr.py index 6b1d69a4..3008356e 100644 --- a/legate_sparse/csr.py +++ b/legate_sparse/csr.py @@ -91,7 +91,140 @@ @clone_scipy_arr_kind(scipy.sparse.csr_array) class csr_array(CompressedBase, DenseSparseBase): + """Compressed Sparse Row array. + + This can be instantiated in several ways: + csr_array(D) + where D is a 2-D ndarray or cupynumeric.ndarray + + csr_array(S) + with another sparse array or matrix S (equivalent to S.tocsr()) + + csr_array((M, N), [dtype]) + to construct an empty array with shape (M, N) + dtype is optional, defaulting to dtype='d'. + + csr_array((data, (row_ind, col_ind)), [shape=(M, N)]) + where ``data``, ``row_ind`` and ``col_ind`` satisfy the + relationship ``a[row_ind[k], col_ind[k]] = data[k]``. + + csr_array((data, indices, indptr), [shape=(M, N)]) + is the standard CSR representation where the column indices for + row i are stored in ``indices[indptr[i]:indptr[i+1]]`` and their + corresponding values are stored in ``data[indptr[i]:indptr[i+1]]``. + If the shape parameter is not supplied, the array dimensions + are inferred from the index arrays. 
+ + Attributes + ---------- + dtype : dtype + Data type of the array + shape : 2-tuple + Shape of the array + ndim : int + Number of dimensions (this is always 2) + nnz : int + Number of stored values, including explicit zeros + data : cupynumeric.ndarray + CSR format data array of the array + indices : cupynumeric.ndarray + CSR format index array of the array + indptr : cupynumeric.ndarray + CSR format index pointer array of the array + has_sorted_indices : bool + Whether the indices are sorted + has_canonical_format : bool + Whether the matrix is in canonical format + T : csr_array + Transpose of the matrix + + Notes + ----- + Sparse arrays can be used in arithmetic operations: they support + addition, subtraction, multiplication, division, and matrix power. + + Advantages of the CSR format: + - fast matrix vector products + + Disadvantages of the CSR format: + - changes to the sparsity structure are expensive (consider LIL or DOK) + + Canonical Format: + - Within each row, indices are sorted by column. + - There are no duplicate entries. 
+ + Differences from SciPy: + - Uses cupynumeric arrays instead of numpy arrays + - GPU acceleration via cuSPARSE when available + - Limited to supported datatypes on GPU: float32, float64, complex64, complex128 + - Some operations may create implicit copies due to transformed arrays + - Element-wise operations with scalars only operate on existing non-zero elements + - Indexing with boolean masks only updates existing non-zero elements + + Examples + -------- + >>> import cupynumeric as np + >>> from legate_sparse import csr_array + >>> csr_array((3, 4), dtype=np.int8).todense() + array([[0, 0, 0, 0], + [0, 0, 0, 0], + [0, 0, 0, 0]], dtype=int8) + + >>> row = np.array([0, 0, 1, 2, 2, 2]) + >>> col = np.array([0, 2, 2, 0, 1, 2]) + >>> data = np.array([1, 2, 3, 4, 5, 6]) + >>> csr_array((data, (row, col)), shape=(3, 3)).todense() + array([[1, 0, 2], + [0, 0, 3], + [4, 5, 6]]) + + >>> indptr = np.array([0, 2, 3, 6]) + >>> indices = np.array([0, 2, 2, 0, 1, 2]) + >>> data = np.array([1, 2, 3, 4, 5, 6]) + >>> csr_array((data, indices, indptr), shape=(3, 3)).todense() + array([[1, 0, 2], + [0, 0, 3], + [4, 5, 6]]) + """ + def __init__(self, arg, shape=None, dtype=None, copy=False): + """Initialize a CSR array. + + Parameters + ---------- + arg : array_like, tuple, or csr_array + The input data. Can be: + - A 2-D dense array (numpy.ndarray or cupynumeric.ndarray) + - A sparse array/matrix to convert to CSR format + - A tuple (M, N) for an empty array of shape (M, N) + - A tuple (data, (row_ind, col_ind)) for COO format data + - A tuple (data, indices, indptr) for CSR format data + shape : tuple, optional + Shape of the array (M, N). Required if not inferrable from input. + dtype : dtype, optional + Data type of the array. If None, inferred from input data. + Defaults to float64 if not specified. + copy : bool, optional + Whether to copy the input data. Default is False. + + Raises + ------ + NotImplementedError + If the input type is not supported for conversion to CSR. 
+ AssertionError + If shape cannot be inferred and is not provided. + ValueError + If input data is inconsistent or invalid. + + Notes + ----- + When converting from dense arrays, the implementation uses a two-pass + algorithm that first counts non-zeros per row, then fills them in. + This may not scale well on distributed systems due to alignment constraints. + + When converting from COO format, the data is automatically sorted by + rows and then by columns to ensure canonical format. + """ self.ndim = 2 self.indices_sorted = False self.canonical_format = False @@ -189,7 +322,39 @@ def __init__(self, arg, shape=None, dtype=None, copy=False): self._dtype = dtype def _init_from_tuple_inputs(self, arg, dtype, shape, copy): + """Initialize CSR array from tuple inputs. + + This internal method handles the various tuple-based constructor formats: + - (M, N) for empty arrays + - (data, (row_ind, col_ind)) for COO format + - (data, indices, indptr) for CSR format + + Parameters + ---------- + arg : tuple + The input tuple in one of the supported formats. + dtype : dtype, optional + The desired data type. + shape : tuple, optional + The shape of the array. + copy : bool + Whether to copy the input data. + + Returns + ------- + tuple + (dtype, shape) for the constructed array. + + Raises + ------ + AssertionError + If shape cannot be inferred or input is invalid. + NotImplementedError + If the tuple format is not supported. + """ + def _get_empty_csr(dtype, nrows_plus_one): + """Helper function to create empty CSR arrays.""" return ( cupynumeric.zeros(0, dtype=dtype), cupynumeric.zeros(0, dtype=coord_ty), @@ -326,36 +491,96 @@ def _get_empty_csr(dtype, nrows_plus_one): @property def dim(self): + """Number of dimensions (always 2 for CSR arrays).""" return self.ndim @property def nnz(self): + """Number of stored values, including explicit zeros. + + Returns + ------- + int + The number of non-zero elements in the matrix. 
+ """ return self.vals.shape[0] @property def dtype(self): + """Data type of the array. + + Returns + ------- + dtype + The data type of the array elements. + """ # We can just return self.vals.type, but bookkeep type separately now return self._dtype # Enable direct operation on the values array. def get_data(self): + """Get the data array of the CSR matrix. + + Returns + ------- + cupynumeric.ndarray + The data array containing the non-zero values. + """ return store_to_cupynumeric_array(self.vals) # From array, def set_data(self, data): + """Set the data array of the CSR matrix. + + Parameters + ---------- + data : cupynumeric.ndarray + The new data array. Must have the same length as the current data array. + + Raises + ------ + AssertionError + If data is not a cupynumeric.ndarray. + """ if isinstance(data, numpy.ndarray): data = cupynumeric.array(data) assert isinstance(data, cupynumeric.ndarray) self.vals = get_store_from_cupynumeric_array(data) self._dtype = data.dtype - data = property(fget=get_data, fset=set_data) + data = property( + fget=get_data, fset=set_data, doc="CSR format data array of the matrix" + ) # Enable direct operation on the indices array. def get_indices(self): + """Get the column indices array of the CSR matrix. + + Returns + ------- + cupynumeric.ndarray + The column indices array. + """ return store_to_cupynumeric_array(self.crd) def set_indices(self, indices): + """Set the column indices array of the CSR matrix. + + Parameters + ---------- + indices : cupynumeric.ndarray + The new column indices array. Must have the same length as the current indices array. + + Raises + ------ + AssertionError + If indices is not a cupynumeric.ndarray. + + Notes + ----- + Setting new indices will mark the matrix as not having sorted indices + and not being in canonical format. 
+ """ if isinstance(indices, numpy.ndarray): indices = cupynumeric.array(indices) assert isinstance(indices, cupynumeric.ndarray) @@ -364,22 +589,46 @@ def set_indices(self, indices): self.canonical_format = False self.indices_sorted = False - indices = property(fget=get_indices, fset=set_indices) + indices = property( + fget=get_indices, fset=set_indices, doc="CSR format index array of the matrix" + ) def get_indptr(self): + """Get the index pointer array of the CSR matrix. + + Returns + ------- + cupynumeric.ndarray + The index pointer array. For row i, the column indices are stored in + indices[indptr[i]:indptr[i+1]] and their corresponding values are + stored in data[indptr[i]:indptr[i+1]]. + """ row_start_st, row_end_st = unpack_rect1_store(self.pos) row_start = store_to_cupynumeric_array(row_start_st) return cupynumeric.append(row_start, [self.nnz]) # Disallow changing intptrs directly - indptr = property(fget=get_indptr) + indptr = property( + fget=get_indptr, doc="CSR format index pointer array of the matrix" + ) def _get_row_indices(self): - """Helper routine that converts pos to row indices""" + """Helper routine that converts pos to row indices. + + This internal method expands the compressed row storage format's position + array into explicit row indices for each non-zero element. - # TODO: Add an option that caches the row_indices so that other binary - # operations don't have to recompute it. + Returns + ------- + cupynumeric.ndarray + Array of row indices corresponding to each non-zero element. + Notes + ----- + This method is used internally by comparison operations and other + methods that need explicit row indices. The result could be cached + for performance, but currently is recomputed each time. 
+ """ row_indices = runtime.create_store(coord_ty, shape=self.crd.shape) task = runtime.create_auto_task(SparseOpCode.EXPAND_POS_TO_COORDINATES) src_part = task.add_input(self.pos) @@ -390,13 +639,56 @@ def _get_row_indices(self): return store_to_cupynumeric_array(row_indices) def has_sorted_indices(self): + """Determine whether the matrix has sorted indices. + + Returns + ------- + bool + True if the indices are sorted, False otherwise. + """ return self.indices_sorted def has_canonical_format(self): + """Determine whether the matrix is in canonical format. + + Returns + ------- + bool + True if the matrix is in canonical format, False otherwise. + + Notes + ----- + A matrix is in canonical format if: + - Within each row, indices are sorted by column + - There are no duplicate entries + """ return self.canonical_format # The rest of the methods def diagonal(self, k=0): + """Return the k-th diagonal of the matrix. + + Parameters + ---------- + k : int, optional + Which diagonal to retrieve. Default is 0 (main diagonal). + k > 0 for upper diagonals, k < 0 for lower diagonals. + + Returns + ------- + cupynumeric.ndarray + The k-th diagonal of the matrix. + + Raises + ------ + NotImplementedError + If k != 0 (only main diagonal is currently supported). + + Notes + ----- + Currently only supports k=0 (main diagonal). Other diagonals + are not implemented. + """ rows, cols = self.shape if k <= -rows or k >= cols: return cupynumeric.empty(0, dtype=self.dtype) @@ -422,6 +714,33 @@ def diagonal(self, k=0): return store_to_cupynumeric_array(output) def todense(self, order=None, out=None): + """Return a dense matrix representation of this matrix. + + Parameters + ---------- + order : str, optional + Not supported. Must be None. + out : cupynumeric.ndarray, optional + Output array for the result. Must have the same shape and dtype + as the expected output. + + Returns + ------- + cupynumeric.ndarray + A dense matrix with the same shape and dtype as this matrix. 
+ + Raises + ------ + NotImplementedError + If order is not None. + ValueError + If out is provided but has incompatible dtype or shape. + + Notes + ----- + The order parameter is not supported and must be None. + If out is provided, it must have the correct shape and dtype. + """ if order is not None: raise NotImplementedError if out is not None: @@ -444,13 +763,63 @@ def todense(self, order=None, out=None): return store_to_cupynumeric_array(out) def multiply(self, other): + """Point-wise multiplication by another matrix, vector, or scalar. + + Parameters + ---------- + other : csr_array, cupynumeric.ndarray, or scalar + The object to multiply with. + + Returns + ------- + csr_array or cupynumeric.ndarray + The result of the multiplication. + + Notes + ----- + This is equivalent to the * operator. + """ return self * other def __rmul__(self, other): + """Right multiplication by a scalar. + + Parameters + ---------- + other : scalar + The scalar to multiply with. + + Returns + ------- + csr_array + The result of the multiplication. + """ return self * other # This is an element-wise operation now. def __mul__(self, other): + """Element-wise multiplication. + + Parameters + ---------- + other : scalar or array_like + The object to multiply with. + + Returns + ------- + csr_array + The result of the multiplication. + + Raises + ------ + NotImplementedError + If other is not a scalar. + + Notes + ----- + Currently only supports scalar multiplication. Array multiplication + is not implemented. + """ if isinstance(other, numpy.ndarray): other = cupynumeric.array(other) @@ -464,10 +833,48 @@ def __mul__(self, other): # rmatmul represents the operation other @ self. def __rmatmul__(self, other): + """Right matrix multiplication (other @ self). + + Parameters + ---------- + other : array_like + The left operand for matrix multiplication. + + Returns + ------- + cupynumeric.ndarray or csr_array + The result of the matrix multiplication. 
+ + Raises + ------ + NotImplementedError + Currently not implemented. + + Notes + ----- + This method handles the case where a dense matrix is multiplied + with a CSR matrix from the left. Currently not implemented. + """ # Handle dense @ CSR raise NotImplementedError def __matmul__(self, other): + """Matrix multiplication (self @ other). + + Parameters + ---------- + other : array_like or csr_array + The right operand for matrix multiplication. + + Returns + ------- + cupynumeric.ndarray or csr_array + The result of the matrix multiplication. + + Notes + ----- + This is equivalent to the dot method. + """ return self.dot(other) def _compare_scalar(self, other, op): @@ -742,15 +1149,54 @@ def dot(self, other, out=None): Parameters ---------- - other : array_like - The object to compute dot product with - out : ndarray, optional - Output array for the result + other : array_like or csr_array + The object to compute dot product with. Can be: + - A dense vector (1-D array) for sparse matrix-vector multiplication (SpMV) + - A dense matrix (2-D array) for sparse matrix-matrix multiplication (SpMM) + - A CSR matrix for sparse matrix-sparse matrix multiplication (SpGEMM) + out : cupynumeric.ndarray, optional + Output array for the result. Only supported for SpMV operations. + Must have the correct shape and dtype. Returns ------- - output : csr_array or cupynumeric.ndarray - Sparse matrix or dense array depending on input + cupynumeric.ndarray or csr_array + The result of the dot product: + - For SpMV: dense vector + - For SpMM: dense matrix + - For SpGEMM: CSR matrix + + Raises + ------ + NotImplementedError + If the operation is not supported or datatypes are not supported on GPU. + ValueError + If out is provided for SpGEMM operations or has incompatible dtype/shape. + RuntimeWarning + If an implicit copy is created due to transformed input arrays. 
+ + Notes + ----- + Supported operations: + - SpMV (sparse matrix-vector): A @ x where x is a dense vector + - SpGEMM (sparse-sparse): A @ B where B is a CSR matrix + + GPU limitations: + - Only floating point datatypes are supported: float32, float64, complex64, complex128 + - Some operations may create implicit copies due to transformed arrays + + The implementation automatically chooses the appropriate algorithm: + - For vectors: uses cuSPARSE SpMV when available + - For CSR matrices: uses cuSPARSE SpGEMM on GPU, custom implementation on CPU + + Examples + -------- + >>> import cupynumeric as np + >>> from legate_sparse import csr_array + >>> A = csr_array([[1, 2, 0], [0, 0, 3], [4, 0, 5]]) + >>> v = np.array([1, 0, -1]) + >>> A.dot(v) + array([ 1, -3, -1]) """ # If output specified - it should be cupynumeric array if out is not None: @@ -840,7 +1286,9 @@ def _getpos(self): Returns ------- list of tuple - List of (start, end) position tuples for each row in the matrix + List of (start, end) position tuples for each row in the matrix. + For row i, the non-zero elements are stored in positions + [start, end) in the data and indices arrays. """ row_start_st, row_end_st = unpack_rect1_store(self.pos) row_start = store_to_cupynumeric_array(row_start_st) @@ -853,7 +1301,7 @@ def copy(self): Returns ------- csr_array - A copy of the matrix + A copy of the matrix with the same data and structure. """ return csr_array(self, dtype=self.dtype) @@ -863,12 +1311,17 @@ def conj(self, copy=True): Parameters ---------- copy : bool, optional - Whether to create a new matrix or modify in-place + Whether to create a new matrix or modify in-place. Default is True. Returns ------- csr_array - The conjugate matrix + The conjugate matrix. + + Notes + ----- + If copy=True, returns a new matrix. If copy=False, modifies the + current matrix in-place. 
""" if copy: return self.copy().conj(copy=False) @@ -882,14 +1335,25 @@ def transpose(self, axes=None, copy=False): Parameters ---------- axes : None, optional - This argument is not supported + This argument is not supported and must be None. copy : bool, optional - Whether to create a copy (ignored - CSR transpose always creates copy) + Whether to create a copy. Ignored - CSR transpose always creates a copy. Returns ------- csr_array - Transposed matrix + Transposed matrix with shape (N, M) where the original shape was (M, N). + + Raises + ------ + AssertionError + If axes is not None. + + Notes + ----- + The axes parameter is not supported and must be None. + CSR transpose always creates a copy due to the format conversion. + The implementation sorts the data by columns to maintain canonical format. """ if axes is not None: raise AssertionError("axes parameter should be None") @@ -922,22 +1386,31 @@ def transpose(self, axes=None, copy=False): copy=False, ) - T = property(transpose) + T = property(transpose, doc="Transpose of the matrix") - def asformat(seld, format, copy=False): + def asformat(self, format, copy=False): """Convert this matrix to a specified format. Parameters ---------- format : str - Desired sparse format ('csr' only) + Desired sparse format. Currently only 'csr' is supported. copy : bool, optional - Whether to create a copy + Whether to create a copy. Default is False. Returns ------- csr_array - Matrix in the requested format + Matrix in the requested format. + + Raises + ------ + NotImplementedError + If format is not 'csr'. + + Notes + ----- + Currently only CSR format is supported. Other formats are not implemented. """ if format == "csr": return self.copy() if copy else self @@ -950,12 +1423,17 @@ def tocsr(self, copy=False): Parameters ---------- copy : bool, optional - Whether to create a copy + Whether to create a copy. Default is False. Returns ------- csr_array - The converted CSR matrix + The converted CSR matrix. 
+ + Notes + ----- + Since this matrix is already in CSR format, this method simply + returns a copy if requested, or the matrix itself otherwise. """ if copy: return self.copy().tocsr(copy=False) @@ -967,7 +1445,13 @@ def nonzero(self): Returns ------- (row, col) : tuple of cupynumeric.ndarrays - Row and column indices of non-zeros + Row and column indices of non-zeros. Only returns indices + where the values are actually non-zero (not just stored). + + Notes + ----- + This method filters out explicit zeros that may be stored in the + sparse matrix structure. """ task = runtime.create_auto_task(SparseOpCode.EXPAND_POS_TO_COORDINATES) @@ -986,23 +1470,30 @@ def nonzero(self): csr_matrix = csr_array +"""Alias for csr_array for backward compatibility with SciPy naming conventions.""" # spmv computes y = A @ x. def spmv(A: csr_array, x: cupynumeric.ndarray, y: cupynumeric.ndarray): + """Perform sparse matrix vector product y = A @ x. + + Parameters + ---------- + A : csr_array + Input sparse matrix of shape (M, N). + x : cupynumeric.ndarray + Dense vector of shape (N,) for the dot product. + y : cupynumeric.ndarray + Output array of shape (M,) to store the result. + + Notes + ----- + This function computes the sparse matrix-vector multiplication y = A @ x. + The implementation uses an auto-parallelized kernel that distributes + the computation across available processors. + + The function modifies y in-place to store the result. """ - Perform sparse matrix vector product y = A @ x - - Parameters: - ----------- - A: csr_array - Input sparse matrix - x: cupynumeric.ndarray - Dense vector for the dot product - y: cupynumeric.ndarray - Output array - """ - x_store = get_store_from_cupynumeric_array(x) y_store = get_store_from_cupynumeric_array(y) @@ -1026,20 +1517,34 @@ def spmv(A: csr_array, x: cupynumeric.ndarray, y: cupynumeric.ndarray): # spgemm_csr_csr_csr computes C = A @ B when A and B and # both csr matrices, and returns the result C as a csr matrix. 
def spgemm_csr_csr_csr(A: csr_array, B: csr_array) -> csr_array: - """ - Perform sparse matrix multiplication C = A @ B + """Perform sparse matrix multiplication C = A @ B. - Parameters: - ----------- - A: csr_array - Input sparse matrix A - B: csr_array - Input sparse matrix B + Parameters + ---------- + A : csr_array + Input sparse matrix A of shape (M, K). + B : csr_array + Input sparse matrix B of shape (K, N). - Returns: - -------- + Returns + ------- csr_array - The result of the sparse matrix multiplication + The result of the sparse matrix multiplication with shape (M, N). + + Notes + ----- + This function computes the sparse matrix-sparse matrix multiplication C = A @ B. + + The implementation differs based on the available hardware: + - On GPU: Uses cuSPARSE SpGEMM with local CSR matrices that are aggregated + - On CPU: Uses a custom implementation with two-pass algorithm + + The GPU implementation creates a set of local CSR matrices that are + aggregated into a global CSR matrix. The CPU implementation uses a + query phase to determine the number of non-zeros per row, followed + by the actual computation phase. + + Both implementations maintain the CSR format throughout the computation. """ # Due to limitations in cuSPARSE, we cannot use a uniform task # implementation for CSRxCSRxCSR SpGEMM across CPUs, OMPs and GPUs. diff --git a/legate_sparse/dia.py b/legate_sparse/dia.py index 0dd93735..20f2dc5c 100644 --- a/legate_sparse/dia.py +++ b/legate_sparse/dia.py @@ -63,7 +63,107 @@ # Temporary implementation for matrix generation in examples @clone_scipy_arr_kind(scipy.sparse.dia_array) class dia_array(CompressedBase): + """Sparse matrix with DIAgonal storage. 
+
+    This can be instantiated in several ways:
+        dia_array(D)
+            where D is a 2-D ndarray or cupynumeric.ndarray
+
+        dia_array((data, offsets), shape=(M, N))
+            where data is a 2-D array and offsets is a 1-D array of diagonal offsets
+
+        dia_array((data, offset), shape=(M, N))
+            where data is a 1-D array and offset is a single integer
+
+    Attributes
+    ----------
+    dtype : dtype
+        Data type of the array
+    shape : 2-tuple
+        Shape of the array
+    ndim : int
+        Number of dimensions (this is always 2)
+    nnz : int
+        Number of stored values, including explicit zeros
+    data : cupynumeric.ndarray
+        DIA format data array of the array
+    offsets : cupynumeric.ndarray
+        DIA format offset array of the array
+    T : dia_array
+        Transpose of the matrix
+
+    Notes
+    -----
+    The DIA (Diagonal) format stores a sparse matrix by diagonals.
+    The data array has shape (n_diagonals, max_diagonal_length) where
+    each row represents a diagonal. The offsets array contains the
+    diagonal offsets (k > 0 for upper diagonals, k < 0 for lower diagonals).
+
+    Advantages of the DIA format:
+        - efficient for matrices with few diagonals
+        - fast matrix-vector products
+        - simple structure
+
+    Disadvantages of the DIA format:
+        - inefficient for irregular sparsity patterns
+        - not suitable for general sparse matrices
+        - limited arithmetic operations
+
+    Differences from SciPy:
+    - Uses cupynumeric arrays instead of numpy arrays
+    - Limited functionality (mainly for matrix generation in examples)
+    - Some operations may not be fully optimized
+    - Primarily used as an intermediate format for conversion to CSR
+
+    Examples
+    --------
+    >>> import cupynumeric as np
+    >>> from legate_sparse import dia_array
+    >>> data = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
+    >>> offsets = np.array([-1, 0, 1])
+    >>> A = dia_array((data, offsets), shape=(3, 3))
+    >>> A.todense()
+    array([[4, 8, 0],
+           [1, 5, 9],
+           [0, 2, 6]])
+    """
+
     def __init__(self, arg, shape=None, dtype=None, copy=False):
+        """Initialize a DIA array.
+ + Parameters + ---------- + arg : tuple + The input data. Must be a tuple (data, offsets) where: + - data is a 2-D array containing the diagonal values + - offsets is a 1-D array or integer specifying diagonal offsets + shape : tuple, optional + Shape of the array (M, N). Required if not inferrable from input. + dtype : dtype, optional + Data type of the array. If None, inferred from input data. + copy : bool, optional + Whether to copy the input data. Default is False. + + Raises + ------ + NotImplementedError + If shape is not provided (shape is required for DIA arrays). + AssertionError + If arg is not a tuple or has invalid format. + ValueError + If input data is inconsistent or invalid. + + Notes + ----- + The DIA format is primarily used for matrix generation in examples + and as an intermediate format for conversion to CSR. The shape + parameter is required as it cannot be inferred from the diagonal data. + + The offsets array specifies which diagonals are stored: + - k > 0: upper diagonal (kth diagonal above main diagonal) + - k = 0: main diagonal + - k < 0: lower diagonal (kth diagonal below main diagonal) + """ if shape is None: raise NotImplementedError assert isinstance(arg, tuple) @@ -89,6 +189,18 @@ def __init__(self, arg, shape=None, dtype=None, copy=False): @property def nnz(self): + """Number of stored values, including explicit zeros. + + Returns + ------- + int + The number of non-zero elements in the matrix. + + Notes + ----- + This property computes the number of non-zeros by iterating through + each diagonal and counting the valid elements within the matrix bounds. + """ M, N = self.shape nnz = 0 for k in self.offsets: @@ -100,18 +212,73 @@ def nnz(self): @property def data(self): + """Get the data array of the DIA matrix. + + Returns + ------- + cupynumeric.ndarray + The data array containing the diagonal values. Each row represents + a diagonal, with shape (n_diagonals, max_diagonal_length). 
+ """ return store_to_cupynumeric_array(self._data) @property def offsets(self): + """Get the offsets array of the DIA matrix. + + Returns + ------- + cupynumeric.ndarray + The offsets array specifying which diagonals are stored. + Positive values indicate upper diagonals, negative values + indicate lower diagonals, and zero indicates the main diagonal. + """ return store_to_cupynumeric_array(self._offsets) def copy(self): + """Returns a copy of this matrix. + + Returns + ------- + dia_array + A copy of the matrix with the same data and structure. + """ data = cupynumeric.array(self.data) offsets = cupynumeric.array(self.offsets) return dia_array((data, offsets), shape=self.shape, dtype=self.dtype) def transpose(self, axes=None, copy=False): + """Reverses the dimensions of the sparse matrix. + + Parameters + ---------- + axes : None, optional + This argument is not supported and must be None. + copy : bool, optional + Whether to create a copy. Not supported - must be False. + + Returns + ------- + dia_array + Transposed matrix with shape (N, M) where the original shape was (M, N). + + Raises + ------ + ValueError + If axes is not None. + AssertionError + If copy is True (not supported). + + Notes + ----- + The axes parameter is not supported and must be None. + The copy parameter is not supported and must be False. + + Transposing a DIA matrix involves: + 1. Flipping the diagonal offsets (negating them) + 2. Re-aligning the data matrix to account for the new offsets + 3. Adjusting the shape from (M, N) to (N, M) + """ if axes is not None: raise ValueError( "Sparse matrices do not support " @@ -147,9 +314,27 @@ def transpose(self, axes=None, copy=False): dtype=self.dtype, ) - T = property(transpose) + T = property(transpose, doc="Transpose of the matrix") def tocsr(self, copy=False): + """Convert this matrix to a CSR matrix. + + Parameters + ---------- + copy : bool, optional + Whether to create a copy. Default is False. 
+ + Returns + ------- + csr_array + The converted CSR matrix. + + Notes + ----- + The conversion to CSR is done by first transposing the matrix + and then converting the transposed matrix to CSR format. + This approach is used to simplify the conversion process. + """ if copy: return self.copy().tocsr(copy=False) # we don't need secondary copy @@ -157,6 +342,33 @@ def tocsr(self, copy=False): # This routine is lifted from scipy.sparse's converter. def _tocsr_transposed(self, copy=False): + """Convert the transposed DIA matrix to CSR format. + + This internal method converts a transposed DIA matrix to CSR format. + It is used by the tocsr method after transposing the original matrix. + + Parameters + ---------- + copy : bool, optional + Whether to create a copy. Default is False. + + Returns + ------- + csr_array + The CSR representation of the transposed matrix. + + Notes + ----- + This method is adapted from SciPy's DIA to CSR converter. + It handles the conversion by: + 1. Creating masks for valid diagonal elements + 2. Computing the indptr array using cumulative sums + 3. Extracting indices and data for non-zero elements + 4. Constructing the CSR matrix + + The method ensures that only elements within the matrix bounds + and with non-zero values are included in the CSR representation. + """ if self.nnz == 0: return csr_array(self.shape, self.dtype) @@ -192,3 +404,4 @@ def _tocsr_transposed(self, copy=False): # Declare an alias for this type. dia_matrix = dia_array +"""Alias for dia_array for backward compatibility with SciPy naming conventions.""" diff --git a/legate_sparse/gallery.py b/legate_sparse/gallery.py index 56e71054..371a4c44 100644 --- a/legate_sparse/gallery.py +++ b/legate_sparse/gallery.py @@ -75,8 +75,8 @@ def diags(diagonals, offsets=0, shape=None, format=None, dtype=None): - """ - Construct a sparse matrix from diagonals. + """Construct a sparse matrix from diagonals. 
+ Parameters ---------- diagonals : sequence of array_like @@ -90,44 +90,69 @@ def diags(diagonals, offsets=0, shape=None, format=None, dtype=None): shape : tuple of int, optional Shape of the result. If omitted, a square matrix large enough to contain the diagonals is returned. - format : {"dia", "csr", "csc", "lil", ...}, optional - Matrix format of the result. By default (format=None) an - appropriate sparse matrix format is returned. This choice is - subject to change. + format : {"dia", "csr"}, optional + Matrix format of the result. By default (format=None) a DIA + matrix is returned. Currently only "dia" and "csr" are supported. dtype : dtype, optional - Data type of the matrix. + Data type of the matrix. Must be specified. + + Returns + ------- + sparse matrix + A sparse matrix in the specified format with the given diagonals. + + Raises + ------ + ValueError + If the number of diagonals and offsets don't match, or if + diagonal lengths don't agree with matrix size. + NotImplementedError + If dtype is not specified or format is not supported. + See Also -------- spdiags : construct matrix from diagonals + Notes ----- This function differs from `spdiags` in the way it handles off-diagonals. + The result from `diags` is the sparse equivalent of:: np.diag(diagonals[0], offsets[0]) + ... + np.diag(diagonals[k], offsets[k]) + Repeated diagonal offsets are disallowed. - .. 
versionadded:: 0.11 + + Differences from SciPy: + - Uses cupynumeric arrays instead of numpy arrays + - dtype parameter is required (cannot be None) + - Limited format support (only "dia" and "csr") + - Primarily used for matrix generation in examples + Examples -------- - >>> from scipy.sparse import diags + >>> import cupynumeric as np + >>> from legate_sparse import diags >>> diagonals = [[1, 2, 3, 4], [1, 2, 3], [1, 2]] - >>> diags(diagonals, [0, -1, 2]).toarray() + >>> diags(diagonals, [0, -1, 2], dtype=np.float64).todense() array([[1, 0, 1, 0], [1, 2, 0, 2], [0, 2, 3, 0], [0, 0, 3, 4]]) + Broadcasting of scalars is supported (but shape needs to be specified): - >>> diags([1, -2, 1], [-1, 0, 1], shape=(4, 4)).toarray() + >>> diags([1, -2, 1], [-1, 0, 1], shape=(4, 4), dtype=np.float64).todense() array([[-2., 1., 0., 0.], [ 1., -2., 1., 0.], [ 0., 1., -2., 1.], [ 0., 0., 1., -2.]]) + If only one diagonal is wanted (as in `numpy.diag`), the following works as well: - >>> diags([1, 2, 3], 1).toarray() + >>> diags([1, 2, 3], 1, dtype=np.float64).todense() array([[ 0., 1., 0., 0.], [ 0., 0., 2., 0.], [ 0., 0., 0., 3.], diff --git a/legate_sparse/install_info.pyi b/legate_sparse/install_info.pyi new file mode 100644 index 00000000..16574ccf --- /dev/null +++ b/legate_sparse/install_info.pyi @@ -0,0 +1,16 @@ +# Copyright (c) 2020-2024, NVIDIA CORPORATION. All rights reserved. +# +# NVIDIA CORPORATION and its licensors retain all intellectual property +# and proprietary rights in and to this software, related documentation +# and any modifications thereto. Any use, reproduction, disclosure or +# distribution of this software and related documentation without an express +# license agreement from NVIDIA CORPORATION is strictly prohibited. +# +# See the LICENSE file for details. 
+# + +# Stub file for install_info module to satisfy mypy +# This module is generated during build time + +libpath: str +header: str diff --git a/legate_sparse/io.py b/legate_sparse/io.py index 2b6a09e3..ecaf8e3c 100644 --- a/legate_sparse/io.py +++ b/legate_sparse/io.py @@ -24,6 +24,38 @@ @track_provenance(runtime.sparse_library) def mmread(source): + """Read a sparse matrix from a Matrix Market (.mtx) file. + + Parameters + ---------- + source : str + The filename or path to the Matrix Market file to read. + + Returns + ------- + csr_array + A sparse matrix in CSR format loaded from the file. + + Notes + ----- + This function reads Matrix Market format files and converts them + to CSR format. The Matrix Market format is a standard format for + storing sparse matrices. For more information on the format, see + https://math.nist.gov/MatrixMarket/formats.html. + + The function assumes that all nodes in the system can access the + file, so no special file distribution is needed. + + The implementation reads the file in COO format and then converts + to CSR format for efficient storage and operations. + + Examples + -------- + >>> from legate_sparse import mmread + >>> A = mmread("matrix.mtx") + >>> print(A.shape) + (1000, 1000) + """ # TODO (rohany): We'll assume for now that all of the nodes in the system # can access the file passed in, so we don't need to worry about where this # task gets mapped to. diff --git a/legate_sparse/linalg.py b/legate_sparse/linalg.py index 6d515502..82aa0edb 100644 --- a/legate_sparse/linalg.py +++ b/legate_sparse/linalg.py @@ -66,6 +66,32 @@ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN # THE SOFTWARE. +""" +Sparse linear algebra (:mod:`legate_sparse.linalg`) +=================================================== + +.. currentmodule:: legate_sparse.linalg + +Abstract linear operators +------------------------- + +.. 
autosummary:: + :toctree: generated/ + + LinearOperator -- abstract representation of a linear operator + +Solving linear problems +----------------------- + +Iterative methods for linear equation systems: + +.. autosummary:: + :toctree: generated/ + + cg -- Use Conjugate Gradient iteration to solve Ax = b + gmres -- Use Generalized Minimal RESidual iteration to solve Ax = b + +""" import inspect import warnings @@ -414,6 +440,23 @@ def _rmatvec(self, x, out=None): def make_linear_operator(A): + """Convert a matrix to a LinearOperator. + + Parameters + ---------- + A : array_like, sparse matrix, or LinearOperator + The matrix to convert. + + Returns + ------- + LinearOperator + A LinearOperator representation of A. + + Notes + ----- + If A is already a LinearOperator, it is returned unchanged. + Otherwise, A is wrapped in a _SparseMatrixLinearOperator. + """ if isinstance(A, LinearOperator): return A else: @@ -431,6 +474,39 @@ def make_linear_operator(A): # allocating unnecessary futures. @track_provenance(nested=True) def cg_axpby(y, x, a, b, isalpha=True, negate=False): + """Perform fused vector operation for CG solvers. + + This function performs the operation y = alpha * x + beta * y where + alpha and beta are computed as a/b within the task. This avoids + unnecessary future operations and memory allocations. + + Parameters + ---------- + y : cupynumeric.ndarray + Output vector that will be modified in-place. + x : cupynumeric.ndarray + Input vector for the operation. + a : cupynumeric.ndarray + Numerator for computing alpha or beta. + b : cupynumeric.ndarray + Denominator for computing alpha or beta. + isalpha : bool, optional + If True, a/b is interpreted as alpha. If False, as beta. + Default is True. + negate : bool, optional + If True, negate the computed coefficient. Default is False. + + Returns + ------- + cupynumeric.ndarray + The modified y vector (same as input y). 
+ + Notes + ----- + This is a specialized implementation for CG solvers that fuses + coefficient computation with vector operations to avoid unnecessary + memory allocations and future operations in the Legion runtime. + """ y_store = get_store_from_cupynumeric_array(y) x_store = get_store_from_cupynumeric_array(x) task = runtime.create_auto_task(SparseOpCode.AXPBY) @@ -451,6 +527,29 @@ def cg_axpby(y, x, a, b, isalpha=True, negate=False): def _get_atol_rtol(b_norm, tol=None, atol=0.0, rtol=1e-5): + """Compute absolute and relative tolerances for convergence. + + Parameters + ---------- + b_norm : float + Norm of the right-hand side vector. + tol : float, optional + Legacy tolerance parameter. If provided, overrides rtol. + atol : float, optional + Absolute tolerance. Default is 0.0. + rtol : float, optional + Relative tolerance. Default is 1e-5. + + Returns + ------- + tuple + (atol, rtol) - computed absolute and relative tolerances. + + Notes + ----- + If atol is None, it is set to rtol. The final atol is the maximum + of the provided atol and rtol * b_norm. + """ rtol = float(tol) if tol is not None else rtol if atol is None: @@ -473,6 +572,60 @@ def cg( rtol=1e-5, conv_test_iters=25, ): + """Solve a linear system using the Conjugate Gradient method. + + Parameters + ---------- + A : sparse matrix or LinearOperator + The coefficient matrix of the linear system. + b : cupynumeric.ndarray + Right-hand side of the linear system. + x0 : cupynumeric.ndarray, optional + Initial guess for the solution. If None, uses zero vector. + tol : float, optional + Legacy tolerance parameter. If provided, overrides rtol. + maxiter : int, optional + Maximum number of iterations. If None, uses 10 * n. + M : sparse matrix or LinearOperator, optional + Preconditioner for A. If None, uses identity. + callback : callable, optional + User-specified function called after each iteration. + atol : float, optional + Absolute tolerance for convergence. Default is 0.0. 
+ rtol : float, optional + Relative tolerance for convergence. Default is 1e-5. + conv_test_iters : int, optional + Number of iterations between convergence tests. Default is 25. + + Returns + ------- + tuple + (x, info) where x is the solution and info is zero if solution is + converged else number of iterations + + Raises + ------ + AssertionError + If b is not 1D or A is not square. + + Notes + ----- + This implementation follows SciPy's CG solver semantics closely. + The method uses fused vector operations to avoid unnecessary + memory allocations and improve performance. + + Convergence is tested every conv_test_iters iterations to avoid + the overhead of computing the residual norm in every iteration. + + Examples + -------- + >>> import cupynumeric as np + >>> from legate_sparse import csr_array, linalg + >>> A = csr_array([[4, 1], [1, 3]]) + >>> b = np.array([1, 2]) + >>> x, iters = linalg.cg(A, b) + >>> print(f"Solution: {x}, Iterations: {iters}") + """ # We keep semantics as close as possible to scipy.cg. # https://github.com/scipy/scipy/blob/v1.9.0/scipy/sparse/linalg/_isolve/iterative.py#L298-L385 assert len(b.shape) == 1 or (len(b.shape) == 2 and b.shape[1] == 1) @@ -503,6 +656,7 @@ def cg( z = None q = None + converged = False while iters < maxiter: z = M.matvec(r, out=z) rho1 = rho @@ -528,10 +682,15 @@ def cg( if (iters % conv_test_iters == 0 or iters == (maxiter - 1)) and np.linalg.norm( r ) < atol: + converged = True # Test convergence every conv_test_iters iterations. break - return x, iters + info = 0 + if iters == maxiter and not converged: + info = iters + + return x, info # This implementation of GMRES is lifted from the cupy implementation: @@ -550,43 +709,77 @@ def gmres( callback_type=None, rtol=1e-5, ): - """Uses Generalized Minimal RESidual iteration to solve ``Ax = b``. - Args: - A (ndarray, spmatrix or LinearOperator): The real or complex - matrix of the linear system with shape ``(n, n)``. 
``A`` must be - :class:`cupy.ndarray`, :class:`cupyx.scipy.sparse.spmatrix` or - :class:`cupyx.scipy.sparse.linalg.LinearOperator`. - b (cupy.ndarray): Right hand side of the linear system with shape - ``(n,)`` or ``(n, 1)``. - x0 (cupy.ndarray): Starting guess for the solution. - tol (float): Tolerance for convergence. This argument is optional, - deprecated in favour of ``rtol``. - restart (int): Number of iterations between restarts. Larger values - increase iteration cost, but may be necessary for convergence. - maxiter (int): Maximum number of iterations. - M (ndarray, spmatrix or LinearOperator): Preconditioner for ``A``. - The preconditioner should approximate the inverse of ``A``. - ``M`` must be :class:`cupy.ndarray`, - :class:`cupyx.scipy.sparse.spmatrix` or - :class:`cupyx.scipy.sparse.linalg.LinearOperator`. - callback (function): User-specified function to call on every restart. - It is called as ``callback(arg)``, where ``arg`` is selected by - ``callback_type``. - callback_type (str): 'x' or 'pr_norm'. If 'x', the current solution - vector is used as an argument of callback function. if 'pr_norm', - relative (preconditioned) residual norm is used as an arugment. - atol, rtol (float): Tolerance for convergence. For convergence, - ``norm(b - A @ x) <= max(rtol*norm(b), atol)`` should be satisfied. - The default is ``atol=0.`` and ``rtol=1e-5``. - Returns: - tuple: - It returns ``x`` (cupy.ndarray) and ``info`` (int) where ``x`` is - the converged solution and ``info`` provides convergence - information. - Reference: - M. Wang, H. Klie, M. Parashar and H. Sudan, "Solving Sparse Linear - Systems on NVIDIA Tesla GPUs", ICCS 2009 (2009). - .. seealso:: :func:`scipy.sparse.linalg.gmres` + """Solve a linear system using the Generalized Minimal Residual method. + + Parameters + ---------- + A : sparse matrix or LinearOperator + The coefficient matrix of the linear system. 
+ b : cupynumeric.ndarray + Right-hand side of the linear system with shape (n,) or (n, 1). + x0 : cupynumeric.ndarray, optional + Starting guess for the solution. If None, uses zero vector. + tol : float, optional + Legacy tolerance parameter. If provided, overrides rtol. + restart : int, optional + Number of iterations between restarts. Larger values increase + iteration cost but may be necessary for convergence. Default is 20. + maxiter : int, optional + Maximum number of iterations. If None, uses 10 * n. + M : sparse matrix or LinearOperator, optional + Preconditioner for A. The preconditioner should approximate + the inverse of A. If None, uses identity. + callback : callable, optional + User-specified function called on every restart. + restrt : int, optional + Deprecated alias for restart parameter. + atol : float, optional + Absolute tolerance for convergence. Default is 0.0. + callback_type : str, optional + Type of callback argument: 'x' for current solution vector, + 'pr_norm' for relative preconditioned residual norm. Default is 'pr_norm'. + rtol : float, optional + Relative tolerance for convergence. Default is 1e-5. + + Returns + ------- + tuple + (x, info) where x is the converged solution and info provides + convergence information. + + Raises + ------ + AssertionError + If b is not 1D or A is not square. + ValueError + If callback_type is not 'x' or 'pr_norm'. + + Notes + ----- + This implementation is adapted from CuPy's GMRES solver. + The method uses Arnoldi iteration to build a Krylov subspace + and solves the least squares problem in that subspace. + + For convergence, the residual must satisfy: + norm(b - A @ x) <= max(rtol * norm(b), atol) + + The restart parameter controls the trade-off between memory usage + and convergence rate. Larger restart values may improve convergence + but require more memory. + + References + ---------- + M. Wang, H. Klie, M. Parashar and H. 
Sudan, "Solving Sparse Linear + Systems on NVIDIA Tesla GPUs", ICCS 2009 (2009). + + Examples + -------- + >>> import cupynumeric as np + >>> from legate_sparse import csr_array, linalg + >>> A = csr_array([[4, 1, 0], [1, 3, 1], [0, 1, 2]]) + >>> b = np.array([1, 2, 3]) + >>> x, info = linalg.gmres(A, b, restart=10) + >>> print(f"Solution: {x}, Info: {info}") """ assert len(b.shape) == 1 or (len(b.shape) == 2 and b.shape[1] == 1) assert len(A.shape) == 2 and A.shape[0] == A.shape[1] @@ -625,6 +818,27 @@ def gmres( e = np.zeros((restart + 1,), dtype=A.dtype) def compute_hu(u, j): + """Compute Householder transformation for Arnoldi iteration. + + Parameters + ---------- + u : cupynumeric.ndarray + Vector to be transformed. + j : int + Current iteration index. + + Returns + ------- + tuple + (h, u) where h contains the Householder coefficients and + u is the transformed vector. + + Notes + ----- + This function computes the Householder transformation that + orthogonalizes the current vector against the previous basis + vectors in the Arnoldi iteration. + """ h = V[:, : j + 1].conj().T @ u u -= V[:, : j + 1] @ h return h, u diff --git a/legate_sparse/module.py b/legate_sparse/module.py index a84abb0f..56f22fa1 100644 --- a/legate_sparse/module.py +++ b/legate_sparse/module.py @@ -55,16 +55,66 @@ from .types import coord_ty, nnz_ty # noqa: F401 -# is_sparse_matrix returns whether or not an object is a legate -# sparse created sparse matrix. -def is_sparse_matrix(o): - return any((isinstance(o, csr_array),)) +# returns whether or not an object is a legate sparse created sparse matrix. +def _is_sparse_matrix(obj) -> bool: + return any((isinstance(obj, csr_array), isinstance(obj, dia_array))) +def isspmatrix(obj) -> bool: + """Check if an object is a legate sparse matrix. -issparse = is_sparse_matrix -isspmatrix = is_sparse_matrix + Parameters + ---------- + obj : object + The object to check. 
+
+    Returns
+    -------
+    bool
+        True if the object is a legate sparse matrix, False otherwise.
+
+    Notes
+    -----
+    This function checks if the object is an instance of any supported
+    sparse matrix format in legate_sparse. Currently, only
+    CSR and DIA formats are supported.
+    """
+    return _is_sparse_matrix(obj)
+
+
+def issparse(obj) -> bool:
+    """Check if an object is a legate sparse matrix.
+
+    Parameters
+    ----------
+    obj : object
+        The object to check.
+
+    Returns
+    -------
+    bool
+        True if the object is a legate sparse matrix, False otherwise.
+
+    Notes
+    -----
+    This function checks if the object is an instance of any supported
+    sparse matrix format in legate_sparse. Currently, only
+    CSR and DIA formats are supported.
+    """
+    return _is_sparse_matrix(obj)
 
 
 # Variants for each particular format type.
-def isspmatrix_csr(o):
-    return isinstance(o, csr_array)
+def isspmatrix_csr(obj):
+    """Check if an object is a CSR sparse matrix.
+
+    Parameters
+    ----------
+    obj : object
+        The object to check.
+
+    Returns
+    -------
+    bool
+        True if the object is a CSR sparse matrix, False otherwise.
+    """
+    return isinstance(obj, csr_array)
diff --git a/legate_sparse/types.py b/legate_sparse/types.py
index 376fe116..923767f2 100644
--- a/legate_sparse/types.py
+++ b/legate_sparse/types.py
@@ -18,8 +18,19 @@
 # progress in generalizing the compute kernels, we can
 # remove this code.
coord_ty = numpy.dtype(numpy.int64) +"""Data type for coordinate indices in sparse matrices (int64).""" + nnz_ty = numpy.dtype(numpy.uint64) +"""Data type for non-zero counts in sparse matrices (uint64).""" + float64 = numpy.dtype(numpy.float64) +"""64-bit floating point data type.""" + int32 = numpy.dtype(numpy.int32) +"""32-bit integer data type.""" + int64 = numpy.dtype(numpy.int64) +"""64-bit integer data type.""" + uint64 = numpy.dtype(numpy.uint64) +"""64-bit unsigned integer data type.""" diff --git a/legate_sparse/utils.py b/legate_sparse/utils.py index 11daf9fd..2c072f2b 100644 --- a/legate_sparse/utils.py +++ b/legate_sparse/utils.py @@ -31,11 +31,25 @@ numpy.complex64, numpy.complex128, ) +"""Supported datatypes for sparse matrix operations (SpMV and SpGEMM).""" # find_last_user_stacklevel gets the last stack frame index # within legate sparse. def find_last_user_stacklevel() -> int: + """Find the last stack frame index within legate sparse. + + Returns + ------- + int + The stack level of the last user code frame. + + Notes + ----- + This function walks the stack to find the first frame that is not + within the legate_sparse module, which is useful for determining + the appropriate stack level for warnings. + """ stacklevel = 1 for frame, _ in traceback.walk_stack(None): if not frame.f_globals["__name__"].startswith("sparse"): @@ -46,6 +60,18 @@ def find_last_user_stacklevel() -> int: # store_to_cupynumeric_array converts a store to a cuPyNumeric array. def store_to_cupynumeric_array(store: LogicalStore): + """Convert a LogicalStore to a cupynumeric array. + + Parameters + ---------- + store : LogicalStore + The store to convert. + + Returns + ------- + cupynumeric.ndarray + The cupynumeric array representation of the store. + """ return cupynumeric.asarray(store) @@ -54,6 +80,20 @@ def get_store_from_cupynumeric_array( arr: cupynumeric.ndarray, copy=False, ) -> LogicalStore: + """Extract a LogicalStore from a cupynumeric array. 
+ + Parameters + ---------- + arr : cupynumeric.ndarray + The cupynumeric array to extract the store from. + copy : bool, optional + Whether to create a copy of the array first. Default is False. + + Returns + ------- + LogicalStore + The LogicalStore representation of the array. + """ if copy: # If requested to make a copy, do so. arr = cupynumeric.array(arr) @@ -67,6 +107,23 @@ def get_store_from_cupynumeric_array( # cast_to_store attempts to cast an arbitrary object into a store. def cast_to_store(arr): + """Cast an arbitrary object to a LogicalStore. + + Parameters + ---------- + arr : array_like or LogicalStore + The object to cast. + + Returns + ------- + LogicalStore + The LogicalStore representation of the input. + + Raises + ------ + NotImplementedError + If the object cannot be cast to a LogicalStore. + """ if isinstance(arr, LogicalStore): return arr if isinstance(arr, numpy.ndarray): @@ -79,6 +136,20 @@ def cast_to_store(arr): # cast_arr attempts to cast an arbitrary object into a cupynumeric # ndarray, with an optional desired type. def cast_arr(arr, dtype=None): + """Cast an arbitrary object to a cupynumeric array. + + Parameters + ---------- + arr : array_like or LogicalStore + The object to cast. + dtype : dtype, optional + The desired data type. If None, preserves the original type. + + Returns + ------- + cupynumeric.ndarray + The cupynumeric array representation of the input. + """ if isinstance(arr, LogicalStore): arr = store_to_cupynumeric_array(arr) elif not isinstance(arr, cupynumeric.ndarray): @@ -88,14 +159,32 @@ def cast_arr(arr, dtype=None): return arr -# find_common_type performs a similar analysis to -# cupynumeric.ndarray.find_common_type to find a common type -# between all of the arguments. def find_common_type(*args): + """Find the common data type for a set of arrays. + + This function performs a similar analysis to cupynumeric.ndarray.find_common_type + to find a common type between all of the arguments. 
+ + Parameters + ---------- + *args : array_like + Arrays to find the common type for. + + Returns + ------- + numpy.dtype + The common data type that can represent all input arrays. + + Notes + ----- + The function handles sparse matrices, dense arrays, and scalars. + For sparse matrices, it uses their dtype. For scalars (size == 1), + they are treated separately from arrays. + """ array_types = list() scalar_types = list() for array in args: - if legate_sparse.is_sparse_matrix(array): + if legate_sparse.isspmatrix(array): array_types.append(array.dtype) elif array.size == 1: scalar_types.append(array.dtype) @@ -104,18 +193,47 @@ def find_common_type(*args): return numpy.result_type(*array_types, *scalar_types) -# cast_to_common_type casts all arguments to the same common dtype. def cast_to_common_type(*args): - # Find a common type for all of the arguments. + """Cast all arguments to the same common data type. + + Parameters + ---------- + *args : array_like + Arrays to cast to a common type. + + Returns + ------- + tuple + Tuple of arrays, all cast to the same common data type. + + Notes + ----- + This function first finds the common type using find_common_type, + then casts each input to that type. If all arguments are already + the common type, this will be a no-op. + """ common_type = find_common_type(*args) - # Cast each input to the common type. Ideally, if all of the - # arguments are already the common type then this will - # be a no-op. return tuple(arg.astype(common_type, copy=False) for arg in args) -# factor_int decomposes an integer into a close to square grid. def factor_int(n): + """Split an integer into two close factors. + + Parameters + ---------- + n : int + The integer to factor. + + Returns + ------- + tuple + (val, val2) where val * val2 = n and val is close to sqrt(n). + + Notes + ----- + This function finds two factors of n such that their product equals n + and the first factor is close to the square root of n. 
+ """ val = math.ceil(math.sqrt(n)) val2 = int(n / val) while val2 * val != float(n): @@ -124,9 +242,31 @@ def factor_int(n): return val, val2 -# broadcast_store broadcasts a store to the desired input shape, -# or throws an error if the broadcast is not possible. def broadcast_store(store: LogicalStore, shape: Any) -> LogicalStore: + """Broadcast a LogicalStore to the desired shape. + + Parameters + ---------- + store : LogicalStore + The store to broadcast. + shape : tuple + The target shape to broadcast to. + + Returns + ------- + LogicalStore + The broadcasted store. + + Raises + ------ + ValueError + If the broadcast is not possible. + + Notes + ----- + This function handles both dimension promotion (adding new dimensions) + and broadcasting (expanding dimensions of size 1). + """ diff = len(shape) - store.ndim for dim in range(diff): store = store.promote(dim, shape[dim]) @@ -142,13 +282,43 @@ def broadcast_store(store: LogicalStore, shape: Any) -> LogicalStore: def copy_store(store: LogicalStore) -> LogicalStore: + """Create a copy of a LogicalStore. + + Parameters + ---------- + store : LogicalStore + The store to copy. + + Returns + ------- + LogicalStore + A new LogicalStore with the same data as the input. + """ res = runtime.create_store(store.type, store.shape) # type: ignore runtime.legate_runtime.issue_copy(res, store) return res def store_from_store_or_array(src, copy=False) -> LogicalStore: # type: ignore - "Get LogicalStore from a LogicalStore or array, potentially creating a copy" + """Get LogicalStore from a LogicalStore or array, potentially creating a copy. + + Parameters + ---------- + src : LogicalStore or cupynumeric.ndarray + The source object to convert. + copy : bool, optional + Whether to create a copy. Default is False. + + Returns + ------- + LogicalStore + The LogicalStore representation of the input. + + Raises + ------ + AssertionError + If the input type is not supported. 
+ """ if isinstance(src, cupynumeric.ndarray): return get_store_from_cupynumeric_array(src, copy) elif isinstance(src, LogicalStore): @@ -158,7 +328,25 @@ def store_from_store_or_array(src, copy=False) -> LogicalStore: # type: ignore def array_from_store_or_array(src, copy=False) -> cupynumeric.ndarray: # type: ignore - "Get array from a LogicalStore or array, potentially creating a copy" + """Get array from a LogicalStore or array, potentially creating a copy. + + Parameters + ---------- + src : LogicalStore or cupynumeric.ndarray + The source object to convert. + copy : bool, optional + Whether to create a copy. Default is False. + + Returns + ------- + cupynumeric.ndarray + The cupynumeric array representation of the input. + + Raises + ------ + AssertionError + If the input type is not supported. + """ if isinstance(src, cupynumeric.ndarray): return src.copy() if copy else src elif isinstance(src, LogicalStore): @@ -173,6 +361,23 @@ def array_from_store_or_array(src, copy=False) -> cupynumeric.ndarray: # type: def get_storage_type(src): + """Get the storage type of an object. + + Parameters + ---------- + src : LogicalStore or cupynumeric.ndarray + The object to get the storage type for. + + Returns + ------- + numpy.dtype + The data type of the object. + + Raises + ------ + AssertionError + If the input type is not supported. + """ if isinstance(src, cupynumeric.ndarray): return src.dtype elif isinstance(src, LogicalStore): @@ -185,33 +390,58 @@ def get_storage_type(src): def is_dtype_supported(dtype: numpy.dtype) -> bool: - """ - Does this datatype support spMV and spGEMM operations + """Check if a datatype supports SpMV and SpGEMM operations. Parameters ---------- - dtype: np.dtype - Input datatype to check if it supports spMV and spGEMM + dtype : numpy.dtype + Input datatype to check if it supports SpMV and SpGEMM. Returns ------- - valid: bool - True if dtype supports spMV and spGEMM - """ + bool + True if dtype supports SpMV and SpGEMM operations. 
+ Notes + ----- + Currently supported datatypes are float32, float64, complex64, and complex128. + """ return dtype in SUPPORTED_DATATYPES def is_dense(x) -> bool: - """ - Is this object a dense cupynumeric array + """Check if an object is a dense cupynumeric array. + + Parameters + ---------- + x : object + The object to check. + + Returns + ------- + bool + True if x is a cupynumeric.ndarray, False otherwise. """ return isinstance(x, cupynumeric.ndarray) def is_scalar_like(x) -> bool: - """ - Is this object a scalar like type + """Check if an object is a scalar-like type. + + Parameters + ---------- + x : object + The object to check. + + Returns + ------- + bool + True if x is a scalar or 0-dimensional array, False otherwise. + + Notes + ----- + This function returns False for strings, even though they are scalar-like + in some contexts, to avoid confusion with numeric scalars. """ if isinstance(x, str): return False @@ -219,32 +449,52 @@ def is_scalar_like(x) -> bool: def is_sparse(x) -> bool: + """Check if an object is a legate sparse matrix. + + Parameters + ---------- + x : object + The object to check. + + Returns + ------- + bool + True if x is a legate sparse matrix, False otherwise. """ - Is this object a legate sparse matrix - """ - return legate_sparse.is_sparse_matrix(x) + return legate_sparse.isspmatrix(x) def sort_by_rows_then_cols(rows: cupynumeric.ndarray, cols: cupynumeric.ndarray): - """ + """Sort indices by rows first, then by columns. + This function is a quick and dirty hack that does what np.lexsort does - using argsort, but only for two keys. - This is primarily used to to get the indices that we can use to sort data - first by rows and then by columns + using argsort, but only for two keys. This is primarily used to get + the indices that we can use to sort data first by rows and then by columns. 
Parameters ---------- - - rows: cupynumeric.ndarray - Indices of rows - - cols: cupynumeric.ndarray - Indices of cols + rows : cupynumeric.ndarray + Indices of rows. + cols : cupynumeric.ndarray + Indices of columns. Returns ------- - sorted_indices:cupynumeric.ndarray - Indices sorted by rows and then by columns, as given by numpy's lexsort + cupynumeric.ndarray + Indices sorted by rows and then by columns, as given by numpy's lexsort. + + Notes + ----- + This function is equivalent to np.lexsort((cols, rows)) but implemented + using stable sorting to ensure consistent results. + + Examples + -------- + >>> import cupynumeric as np + >>> rows = np.array([1, 0, 1, 0]) + >>> cols = np.array([2, 1, 1, 2]) + >>> indices = sort_by_rows_then_cols(rows, cols) + >>> print(indices) # [1, 3, 2, 0] - sorted by (row, col) """ assert rows.size == cols.size diff --git a/setup.py b/setup.py index daa17216..68efb75c 100644 --- a/setup.py +++ b/setup.py @@ -38,7 +38,7 @@ setup( name="legate-sparse", - version="25.03.00", + version="25.07.00", description="An Aspiring Drop-In Replacement for SciPy Sparse module at Scale", author="NVIDIA Corporation", license="Apache 2.0", @@ -48,8 +48,9 @@ "Topic :: Scientific/Engineering", "License :: OSI Approved :: Apache Software License", "Programming Language :: Python", - "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", ], packages=find_packages( where=".", diff --git a/src/legate_sparse/array/conv/csr_to_dense.cc b/src/legate_sparse/array/conv/csr_to_dense.cc index d865a805..de9a8958 100644 --- a/src/legate_sparse/array/conv/csr_to_dense.cc +++ b/src/legate_sparse/array/conv/csr_to_dense.cc @@ -55,7 +55,11 @@ struct CSRToDenseImplBody { namespace // unnamed { -static void __attribute__((constructor)) register_tasks(void) { CSRToDense::register_variants(); } + +static const auto sparse_reg_task_ = []() -> char { + 
CSRToDense::register_variants(); + return 0; +}(); } // namespace diff --git a/src/legate_sparse/array/conv/csr_to_dense.h b/src/legate_sparse/array/conv/csr_to_dense.h index 7b5a947e..58ec4479 100644 --- a/src/legate_sparse/array/conv/csr_to_dense.h +++ b/src/legate_sparse/array/conv/csr_to_dense.h @@ -31,7 +31,8 @@ struct CSRToDenseArgs { class CSRToDense : public SparseTask { public: - static constexpr auto TASK_ID = legate::LocalTaskID{LEGATE_SPARSE_CSR_TO_DENSE}; + static inline const auto TASK_CONFIG = + legate::TaskConfig{legate::LocalTaskID{LEGATE_SPARSE_CSR_TO_DENSE}}; public: static void cpu_variant(legate::TaskContext ctx); diff --git a/src/legate_sparse/array/conv/dense_to_csr.cc b/src/legate_sparse/array/conv/dense_to_csr.cc index 97a86fe7..3304b558 100644 --- a/src/legate_sparse/array/conv/dense_to_csr.cc +++ b/src/legate_sparse/array/conv/dense_to_csr.cc @@ -77,11 +77,12 @@ struct DenseToCSRImplBody { namespace // unnamed { -static void __attribute__((constructor)) register_tasks(void) -{ + +static const auto sparse_reg_task_ = []() -> char { DenseToCSRNNZ::register_variants(); DenseToCSR::register_variants(); -} + return 0; +}(); } // namespace diff --git a/src/legate_sparse/array/conv/dense_to_csr.h b/src/legate_sparse/array/conv/dense_to_csr.h index a0ebddc0..c9cf504a 100644 --- a/src/legate_sparse/array/conv/dense_to_csr.h +++ b/src/legate_sparse/array/conv/dense_to_csr.h @@ -29,7 +29,8 @@ struct DenseToCSRNNZArgs { class DenseToCSRNNZ : public SparseTask { public: - static constexpr auto TASK_ID = legate::LocalTaskID{LEGATE_SPARSE_DENSE_TO_CSR_NNZ}; + static inline const auto TASK_CONFIG = + legate::TaskConfig{legate::LocalTaskID{LEGATE_SPARSE_DENSE_TO_CSR_NNZ}}; static void cpu_variant(legate::TaskContext ctx); #ifdef LEGATE_USE_OPENMP static void omp_variant(legate::TaskContext ctx); @@ -48,7 +49,8 @@ struct DenseToCSRArgs { class DenseToCSR : public SparseTask { public: - static constexpr auto TASK_ID = 
legate::LocalTaskID{LEGATE_SPARSE_DENSE_TO_CSR}; + static inline const auto TASK_CONFIG = + legate::TaskConfig{legate::LocalTaskID{LEGATE_SPARSE_DENSE_TO_CSR}}; static void cpu_variant(legate::TaskContext ctx); #ifdef LEGATE_USE_OPENMP static void omp_variant(legate::TaskContext ctx); diff --git a/src/legate_sparse/array/conv/pos_to_coordinates.cc b/src/legate_sparse/array/conv/pos_to_coordinates.cc index 6b781134..7cadb10e 100644 --- a/src/legate_sparse/array/conv/pos_to_coordinates.cc +++ b/src/legate_sparse/array/conv/pos_to_coordinates.cc @@ -44,10 +44,10 @@ struct ExpandPosToCoordinatesImplBody { namespace // unnamed { -static void __attribute__((constructor)) register_tasks(void) -{ +static const auto sparse_reg_task_ = []() -> char { ExpandPosToCoordinates::register_variants(); -} + return 0; +}(); } // namespace } // namespace sparse diff --git a/src/legate_sparse/array/conv/pos_to_coordinates.h b/src/legate_sparse/array/conv/pos_to_coordinates.h index 70e351a6..ad21ff95 100644 --- a/src/legate_sparse/array/conv/pos_to_coordinates.h +++ b/src/legate_sparse/array/conv/pos_to_coordinates.h @@ -29,7 +29,8 @@ struct ExpandPosToCoordinatesArgs { class ExpandPosToCoordinates : public SparseTask { public: - static constexpr auto TASK_ID = legate::LocalTaskID{LEGATE_SPARSE_EXPAND_POS_TO_COORDINATES}; + static inline const auto TASK_CONFIG = + legate::TaskConfig{legate::LocalTaskID{LEGATE_SPARSE_EXPAND_POS_TO_COORDINATES}}; public: static void cpu_variant(legate::TaskContext ctx); diff --git a/src/legate_sparse/array/csr/get_diagonal.cc b/src/legate_sparse/array/csr/get_diagonal.cc index 1e39b82b..cace6438 100644 --- a/src/legate_sparse/array/csr/get_diagonal.cc +++ b/src/legate_sparse/array/csr/get_diagonal.cc @@ -50,10 +50,11 @@ struct GetCSRDiagonalImplBody { namespace // unnamed { -static void __attribute__((constructor)) register_tasks(void) -{ +static const auto sparse_reg_task_ = []() -> char { GetCSRDiagonal::register_variants(); -} + return 0; +}(); + } // 
namespace } // namespace sparse diff --git a/src/legate_sparse/array/csr/get_diagonal.h b/src/legate_sparse/array/csr/get_diagonal.h index 6dd842bf..0c3d44a7 100644 --- a/src/legate_sparse/array/csr/get_diagonal.h +++ b/src/legate_sparse/array/csr/get_diagonal.h @@ -32,7 +32,8 @@ struct GetCSRDiagonalArgs { class GetCSRDiagonal : public SparseTask { public: - static constexpr auto TASK_ID = legate::LocalTaskID{LEGATE_SPARSE_CSR_DIAGONAL}; + static inline const auto TASK_CONFIG = + legate::TaskConfig{legate::LocalTaskID{LEGATE_SPARSE_CSR_DIAGONAL}}; // TODO (rohany): We could rewrite this having each implementation just make // a call to thrust::transform, but the implementations are simple enough // anyway. diff --git a/src/legate_sparse/array/csr/indexing.cc b/src/legate_sparse/array/csr/indexing.cc index 8fc0c11b..f40c901b 100644 --- a/src/legate_sparse/array/csr/indexing.cc +++ b/src/legate_sparse/array/csr/indexing.cc @@ -90,10 +90,11 @@ struct CSRIndexingCSRImplBody { namespace // unnamed { -static void __attribute__((constructor)) register_tasks(void) -{ +static const auto sparse_reg_task_ = []() -> char { CSRIndexingCSR::register_variants(); -} + return 0; +}(); + } // namespace } // namespace sparse diff --git a/src/legate_sparse/array/csr/indexing.h b/src/legate_sparse/array/csr/indexing.h index 7bd6240c..8962370c 100644 --- a/src/legate_sparse/array/csr/indexing.h +++ b/src/legate_sparse/array/csr/indexing.h @@ -33,7 +33,8 @@ struct CSRIndexingCSRArgs { class CSRIndexingCSR : public SparseTask { public: - static constexpr auto TASK_ID = legate::LocalTaskID{LEGATE_SPARSE_CSR_INDEXING_CSR}; + static inline const auto TASK_CONFIG = + legate::TaskConfig{legate::LocalTaskID{LEGATE_SPARSE_CSR_INDEXING_CSR}}; // TODO: The implementatio of the below three variants are // identical and hence need to be templated (DRY) diff --git a/src/legate_sparse/array/csr/spgemm_csr_csr_csr.cc b/src/legate_sparse/array/csr/spgemm_csr_csr_csr.cc index 47ed6d34..6c4945de 100644 
--- a/src/legate_sparse/array/csr/spgemm_csr_csr_csr.cc +++ b/src/legate_sparse/array/csr/spgemm_csr_csr_csr.cc @@ -177,12 +177,13 @@ struct SpGEMMCSRxCSRxCSRImplBody { namespace // unnamed { -static void __attribute__((constructor)) register_tasks(void) -{ +static const auto sparse_reg_task_ = []() -> char { SpGEMMCSRxCSRxCSRNNZ::register_variants(); SpGEMMCSRxCSRxCSR::register_variants(); SpGEMMCSRxCSRxCSRGPU::register_variants(); -} + return 0; +}(); + } // namespace } // namespace sparse diff --git a/src/legate_sparse/array/csr/spgemm_csr_csr_csr.h b/src/legate_sparse/array/csr/spgemm_csr_csr_csr.h index bf5d526d..c1b004ce 100644 --- a/src/legate_sparse/array/csr/spgemm_csr_csr_csr.h +++ b/src/legate_sparse/array/csr/spgemm_csr_csr_csr.h @@ -32,7 +32,8 @@ struct SpGEMMCSRxCSRxCSRNNZArgs { class SpGEMMCSRxCSRxCSRNNZ : public SparseTask { public: - static constexpr auto TASK_ID = legate::LocalTaskID{LEGATE_SPARSE_SPGEMM_CSR_CSR_CSR_NNZ}; + static inline const auto TASK_CONFIG = + legate::TaskConfig{legate::LocalTaskID{LEGATE_SPARSE_SPGEMM_CSR_CSR_CSR_NNZ}}; static constexpr legate::VariantOptions CPU_VARIANT_OPTIONS = legate::VariantOptions{}.with_has_allocations(true); @@ -60,7 +61,8 @@ struct SpGEMMCSRxCSRxCSRArgs { class SpGEMMCSRxCSRxCSR : public SparseTask { public: - static constexpr auto TASK_ID = legate::LocalTaskID{LEGATE_SPARSE_SPGEMM_CSR_CSR_CSR}; + static inline const auto TASK_CONFIG = + legate::TaskConfig{legate::LocalTaskID{LEGATE_SPARSE_SPGEMM_CSR_CSR_CSR}}; static constexpr legate::VariantOptions CPU_VARIANT_OPTIONS = legate::VariantOptions{}.with_has_allocations(true); @@ -94,7 +96,8 @@ struct SpGEMMCSRxCSRxCSRGPUArgs { // we take a different approach than on CPUs and OMPs. 
class SpGEMMCSRxCSRxCSRGPU : public SparseTask { public: - static constexpr auto TASK_ID = legate::LocalTaskID{LEGATE_SPARSE_SPGEMM_CSR_CSR_CSR_GPU}; + static inline const auto TASK_CONFIG = + legate::TaskConfig{legate::LocalTaskID{LEGATE_SPARSE_SPGEMM_CSR_CSR_CSR_GPU}}; static constexpr legate::VariantOptions GPU_VARIANT_OPTIONS = legate::VariantOptions{}.with_has_allocations(true); diff --git a/src/legate_sparse/array/csr/spmv.cc b/src/legate_sparse/array/csr/spmv.cc index 63c305c6..d9efa4fd 100644 --- a/src/legate_sparse/array/csr/spmv.cc +++ b/src/legate_sparse/array/csr/spmv.cc @@ -51,10 +51,11 @@ struct CSRSpMVRowSplitImplBody { namespace // unnamed { -static void __attribute__((constructor)) register_tasks(void) -{ +static const auto sparse_reg_task_ = []() -> char { CSRSpMVRowSplit::register_variants(); -} + return 0; +}(); + } // namespace } // namespace sparse diff --git a/src/legate_sparse/array/csr/spmv.h b/src/legate_sparse/array/csr/spmv.h index 8c46ba7f..7d718990 100644 --- a/src/legate_sparse/array/csr/spmv.h +++ b/src/legate_sparse/array/csr/spmv.h @@ -32,7 +32,8 @@ struct CSRSpMVRowSplitArgs { class CSRSpMVRowSplit : public SparseTask { public: - static constexpr auto TASK_ID = legate::LocalTaskID{LEGATE_SPARSE_CSR_SPMV_ROW_SPLIT}; + static inline const auto TASK_CONFIG = + legate::TaskConfig{legate::LocalTaskID{LEGATE_SPARSE_CSR_SPMV_ROW_SPLIT}}; static constexpr legate::VariantOptions GPU_VARIANT_OPTIONS = legate::VariantOptions{}.with_has_allocations(true); diff --git a/src/legate_sparse/array/util/scale_rect.cc b/src/legate_sparse/array/util/scale_rect.cc index bad54157..c2d2df90 100644 --- a/src/legate_sparse/array/util/scale_rect.cc +++ b/src/legate_sparse/array/util/scale_rect.cc @@ -39,7 +39,11 @@ struct ScaleRect1ImplBody { namespace // unnamed { -static void __attribute__((constructor)) register_tasks(void) { ScaleRect1::register_variants(); } +static const auto sparse_reg_task_ = []() -> char { + ScaleRect1::register_variants(); + 
return 0; +}(); + } // namespace } // namespace sparse diff --git a/src/legate_sparse/array/util/scale_rect.h b/src/legate_sparse/array/util/scale_rect.h index e9e7ffda..0b559036 100644 --- a/src/legate_sparse/array/util/scale_rect.h +++ b/src/legate_sparse/array/util/scale_rect.h @@ -29,7 +29,8 @@ struct ScaleRect1Args { class ScaleRect1 : public SparseTask { public: - static constexpr auto TASK_ID = legate::LocalTaskID{LEGATE_SPARSE_SCALE_RECT_1}; + static inline const auto TASK_CONFIG = + legate::TaskConfig{legate::LocalTaskID{LEGATE_SPARSE_SCALE_RECT_1}}; static void cpu_variant(legate::TaskContext context); #ifdef LEGATE_USE_OPENMP static void omp_variant(legate::TaskContext context); diff --git a/src/legate_sparse/array/util/unzip_rect.cc b/src/legate_sparse/array/util/unzip_rect.cc index 9a9e8708..1272e9cc 100644 --- a/src/legate_sparse/array/util/unzip_rect.cc +++ b/src/legate_sparse/array/util/unzip_rect.cc @@ -42,7 +42,11 @@ struct UnZipRect1ImplBody { namespace // unnamed { -static void __attribute__((constructor)) register_tasks(void) { UnZipRect1::register_variants(); } +static const auto sparse_reg_task_ = []() -> char { + UnZipRect1::register_variants(); + return 0; +}(); + } // namespace } // namespace sparse diff --git a/src/legate_sparse/array/util/unzip_rect.h b/src/legate_sparse/array/util/unzip_rect.h index e470c541..08293ef2 100644 --- a/src/legate_sparse/array/util/unzip_rect.h +++ b/src/legate_sparse/array/util/unzip_rect.h @@ -30,7 +30,8 @@ struct UnZipRect1Args { class UnZipRect1 : public SparseTask { public: - static constexpr auto TASK_ID = legate::LocalTaskID{LEGATE_SPARSE_UNZIP_RECT_1}; + static inline const auto TASK_CONFIG = + legate::TaskConfig{legate::LocalTaskID{LEGATE_SPARSE_UNZIP_RECT_1}}; static void cpu_variant(legate::TaskContext ctx); #ifdef LEGATE_USE_OPENMP static void omp_variant(legate::TaskContext ctx); diff --git a/src/legate_sparse/array/util/zip_to_rect.cc b/src/legate_sparse/array/util/zip_to_rect.cc index 
39634664..c8871583 100644 --- a/src/legate_sparse/array/util/zip_to_rect.cc +++ b/src/legate_sparse/array/util/zip_to_rect.cc @@ -41,7 +41,11 @@ struct ZipToRect1ImplBody { namespace // unnamed { -static void __attribute__((constructor)) register_tasks(void) { ZipToRect1::register_variants(); } +static const auto sparse_reg_task_ = []() -> char { + ZipToRect1::register_variants(); + return 0; +}(); + } // namespace } // namespace sparse diff --git a/src/legate_sparse/array/util/zip_to_rect.h b/src/legate_sparse/array/util/zip_to_rect.h index 3851a195..4bc5ac70 100644 --- a/src/legate_sparse/array/util/zip_to_rect.h +++ b/src/legate_sparse/array/util/zip_to_rect.h @@ -30,7 +30,8 @@ struct ZipToRect1Args { class ZipToRect1 : public SparseTask { public: - static constexpr auto TASK_ID = legate::LocalTaskID{LEGATE_SPARSE_ZIP_TO_RECT_1}; + static inline const auto TASK_CONFIG = + legate::TaskConfig{legate::LocalTaskID{LEGATE_SPARSE_ZIP_TO_RECT_1}}; static void cpu_variant(legate::TaskContext ctx); #ifdef LEGATE_USE_OPENMP static void omp_variant(legate::TaskContext ctx); diff --git a/src/legate_sparse/cudalibs.cu b/src/legate_sparse/cudalibs.cu index 733ea8f1..6ec45bd5 100644 --- a/src/legate_sparse/cudalibs.cu +++ b/src/legate_sparse/cudalibs.cu @@ -77,7 +77,8 @@ cusparseHandle_t get_cusparse() class LoadCUDALibsTask : public SparseTask { public: - static constexpr auto TASK_ID = legate::LocalTaskID{LEGATE_SPARSE_LOAD_CUDALIBS}; + static inline const auto TASK_CONFIG = + legate::TaskConfig{legate::LocalTaskID{LEGATE_SPARSE_LOAD_CUDALIBS}}; public: static void gpu_variant(legate::TaskContext context) @@ -90,7 +91,8 @@ class LoadCUDALibsTask : public SparseTask { class UnloadCUDALibsTask : public SparseTask { public: - static constexpr auto TASK_ID = legate::LocalTaskID{LEGATE_SPARSE_UNLOAD_CUDALIBS}; + static inline const auto TASK_CONFIG = + legate::TaskConfig{legate::LocalTaskID{LEGATE_SPARSE_UNLOAD_CUDALIBS}}; public: static void gpu_variant(legate::TaskContext 
context) @@ -101,10 +103,10 @@ class UnloadCUDALibsTask : public SparseTask { } }; -static void __attribute__((constructor)) register_tasks(void) -{ +static const auto sparse_reg_task_ = []() -> char { LoadCUDALibsTask::register_variants(); UnloadCUDALibsTask::register_variants(); -} + return 0; +}(); } // namespace sparse diff --git a/src/legate_sparse/io/mtx_to_coo.cc b/src/legate_sparse/io/mtx_to_coo.cc index d2c85667..71afd22e 100644 --- a/src/legate_sparse/io/mtx_to_coo.cc +++ b/src/legate_sparse/io/mtx_to_coo.cc @@ -35,13 +35,13 @@ using val_ty = double; // within DISTAL. assert(ctx.is_single_task()); // Regardless of how inputs are added, scalar future return values are at the front. - auto& m_store = ctx.outputs()[0]; - auto& n_store = ctx.outputs()[1]; - auto& nnz_store = ctx.outputs()[2]; - auto& rows = ctx.outputs()[3]; - auto& cols = ctx.outputs()[4]; - auto& vals = ctx.outputs()[5]; - auto filename = ctx.scalars()[0].value(); + auto m_store = ctx.output(0); + auto n_store = ctx.output(1); + auto nnz_store = ctx.output(2); + auto rows = ctx.output(3); + auto cols = ctx.output(4); + auto vals = ctx.output(5); + auto filename = ctx.scalar(0).value(); std::fstream file; file.open(filename, std::fstream::in); @@ -148,7 +148,11 @@ using val_ty = double; namespace // unnamed { -static void __attribute__((constructor)) register_tasks(void) { ReadMTXToCOO::register_variants(); } +static const auto sparse_reg_task_ = []() -> char { + ReadMTXToCOO::register_variants(); + return 0; +}(); + } // namespace } // namespace sparse diff --git a/src/legate_sparse/io/mtx_to_coo.h b/src/legate_sparse/io/mtx_to_coo.h index 4dde28fa..c8e1d7ff 100644 --- a/src/legate_sparse/io/mtx_to_coo.h +++ b/src/legate_sparse/io/mtx_to_coo.h @@ -24,7 +24,8 @@ namespace sparse { class ReadMTXToCOO : public SparseTask { public: - static constexpr auto TASK_ID = legate::LocalTaskID{LEGATE_SPARSE_READ_MTX_TO_COO}; + static inline const auto TASK_CONFIG = + 
legate::TaskConfig{legate::LocalTaskID{LEGATE_SPARSE_READ_MTX_TO_COO}}; static constexpr legate::VariantOptions CPU_VARIANT_OPTIONS = legate::VariantOptions{}.with_has_allocations(true); diff --git a/src/legate_sparse/linalg/axpby.cc b/src/legate_sparse/linalg/axpby.cc index 1e61a1bf..547ad927 100644 --- a/src/legate_sparse/linalg/axpby.cc +++ b/src/legate_sparse/linalg/axpby.cc @@ -52,7 +52,11 @@ struct AXPBYImplBody { namespace // unnamed { -static void __attribute__((constructor)) register_tasks(void) { AXPBY::register_variants(); } +static const auto sparse_reg_task_ = []() -> char { + AXPBY::register_variants(); + return 0; +}(); + } // namespace } // namespace sparse diff --git a/src/legate_sparse/linalg/axpby.h b/src/legate_sparse/linalg/axpby.h index 132c9f22..256e8070 100644 --- a/src/legate_sparse/linalg/axpby.h +++ b/src/legate_sparse/linalg/axpby.h @@ -33,7 +33,8 @@ struct AXPBYArgs { class AXPBY : public SparseTask { public: - static constexpr auto TASK_ID = legate::LocalTaskID{LEGATE_SPARSE_AXPBY}; + static inline const auto TASK_CONFIG = + legate::TaskConfig{legate::LocalTaskID{LEGATE_SPARSE_AXPBY}}; static void cpu_variant(legate::TaskContext ctx); #ifdef LEGATE_USE_OPENMP static void omp_variant(legate::TaskContext ctx); diff --git a/src/legate_sparse/mapper/mapper.cc b/src/legate_sparse/mapper/mapper.cc index 43ed8b14..6357d898 100644 --- a/src/legate_sparse/mapper/mapper.cc +++ b/src/legate_sparse/mapper/mapper.cc @@ -32,9 +32,9 @@ std::vector LegateSparseMapper::store_mappings( const Task& task, const std::vector& options) { const auto& inputs = task.inputs(); - std::vector mappings(inputs.size()); + std::vector mappings; for (size_t i = 0; i < inputs.size(); i++) { - mappings[i] = StoreMapping::default_mapping(inputs[i].data(), options.front()); + mappings.push_back(StoreMapping::default_mapping(inputs[i].data(), options.front())); } return std::move(mappings); } @@ -64,9 +64,10 @@ std::optional LegateSparseMapper::allocation_pool_size(const 
Task& auto crd = task.inputs()[1]; auto vals = task.inputs()[2]; - std::size_t nrows_plus_one = pos.domain().get_volume() + 1; - std::size_t nnz = vals.domain().get_volume(); - std::size_t factor_of_safety = 1.15; // make sure we don't fail; 1.15 is arbitrary + std::size_t nrows_plus_one = pos.domain().get_volume() + 1; + std::size_t nnz = vals.domain().get_volume(); + // make sure we don't fail; 1.15 is arbitrary + std::size_t factor_of_safety = static_cast(1.15); std::size_t cusparseSpMV_buffer_size = factor_of_safety * std::ceil(nnz / 32.0) * sizeof(double); std::size_t legate_buffer_size = nrows_plus_one * (vals.type().size() + crd.type().size()); @@ -124,10 +125,13 @@ std::optional LegateSparseMapper::allocation_pool_size(const Task& // and then update the estimate here return std::nullopt; } - } - LEGATE_ABORT("Unsupported Legate Sparse task_id: " + std::to_string(task_id)); - return {}; + default: { + // Handle any unhandled enum values + LEGATE_ABORT("Unsupported Legate Sparse task_id: " + std::to_string(task_id)); + return {}; + } + } } Scalar LegateSparseMapper::tunable_value(legate::TunableID tunable_id) diff --git a/src/legate_sparse/partition/fast_image_partition.cc b/src/legate_sparse/partition/fast_image_partition.cc index c9ee27ef..13801faa 100644 --- a/src/legate_sparse/partition/fast_image_partition.cc +++ b/src/legate_sparse/partition/fast_image_partition.cc @@ -23,10 +23,10 @@ using namespace legate; namespace // unnamed { -static void __attribute__((constructor)) register_tasks(void) -{ +static const auto sparse_reg_task_ = []() -> char { FastImageRange::register_variants(); -} + return 0; +}(); } // namespace -} // namespace sparse \ No newline at end of file +} // namespace sparse diff --git a/src/legate_sparse/partition/fast_image_partition.h b/src/legate_sparse/partition/fast_image_partition.h index 2d09ff08..992843c5 100644 --- a/src/legate_sparse/partition/fast_image_partition.h +++ b/src/legate_sparse/partition/fast_image_partition.h @@ 
-31,7 +31,8 @@ struct FastImageRangeArgs { // only for CSR SpGEMM on GPU right now class FastImageRange : public SparseTask { public: - static constexpr auto TASK_ID = legate::LocalTaskID{LEGATE_SPARSE_FAST_IMAGE_RANGE}; + static inline const auto TASK_CONFIG = + legate::TaskConfig{legate::LocalTaskID{LEGATE_SPARSE_FAST_IMAGE_RANGE}}; static constexpr legate::VariantOptions GPU_VARIANT_OPTIONS = legate::VariantOptions{}.with_has_allocations(true); diff --git a/src/legate_sparse/util/upcast_future.cc b/src/legate_sparse/util/upcast_future.cc index c7e8f4ec..ad66b755 100644 --- a/src/legate_sparse/util/upcast_future.cc +++ b/src/legate_sparse/util/upcast_future.cc @@ -82,10 +82,12 @@ void upcast_impl(legate::TaskContext ctx) namespace // unnamed { -static void __attribute__((constructor)) register_tasks(void) -{ + +static const auto sparse_reg_task_ = []() -> char { UpcastFutureToRegion::register_variants(); -} + return 0; +}(); + } // namespace } // namespace sparse diff --git a/src/legate_sparse/util/upcast_future.h b/src/legate_sparse/util/upcast_future.h index 7c78df88..b38dbab9 100644 --- a/src/legate_sparse/util/upcast_future.h +++ b/src/legate_sparse/util/upcast_future.h @@ -24,7 +24,8 @@ namespace sparse { class UpcastFutureToRegion : public SparseTask { public: - static constexpr auto TASK_ID = legate::LocalTaskID{LEGATE_SPARSE_UPCAST_FUTURE_TO_REGION}; + static inline const auto TASK_CONFIG = + legate::TaskConfig{legate::LocalTaskID{LEGATE_SPARSE_UPCAST_FUTURE_TO_REGION}}; static void cpu_variant(legate::TaskContext ctx); private: diff --git a/test.py b/test.py index 581dac4b..72165ebf 100755 --- a/test.py +++ b/test.py @@ -44,5 +44,4 @@ def stage_env(self, feature: FeatureType) -> EnvDict: plan = TestPlan(config, system) - plan.execute() - sys.exit(0) + sys.exit(plan.execute()) diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index 8c7ffa2f..a8b2f17e 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ 
-8,8 +8,30 @@ @pytest.fixture def create_mask(): - """ - Create a boolean mask matrix with a random sparsity pattern + """Create a boolean mask matrix with a random sparsity pattern. + + This fixture creates equivalent boolean mask matrices in both SciPy and + Legate Sparse formats for testing purposes. + + Parameters + ---------- + rows : int + Number of rows (and columns) in the square matrix. + density : float, optional + Density of non-zero elements. Default is 0.3. + + Returns + ------- + tuple + (A_scipy, A_sparse) - Equivalent boolean matrices in SciPy and + Legate Sparse formats. + + Notes + ----- + The fixture ensures that both matrices have identical sparsity patterns + and values. It verifies equivalence by converting both to dense format + and checking that they are numerically close. + """ def _create_mask(rows, density=0.3): @@ -39,8 +61,32 @@ def _create_mask(rows, density=0.3): @pytest.fixture def create_matrix(): - """ - Create matrices in SciPy and Legate Sparse that are equivalent + """Create matrices in SciPy and Legate Sparse that are equivalent. + + This fixture creates equivalent sparse matrices in both SciPy and + Legate Sparse formats for testing purposes. + + Parameters + ---------- + N : int + Number of rows (and columns) in the square matrix. + tol : float, optional + Threshold for sparsity. Values below this threshold are set to zero. + Default is 0.5. + + Returns + ------- + tuple + (A_scipy, A_sparse) - Equivalent sparse matrices in SciPy and + Legate Sparse formats. + + Notes + ----- + The fixture uses simple_system_gen to create a dense matrix, then + converts it to sparse format in both libraries. It verifies equivalence + by converting both to dense format and checking that they are numerically + close. 
+ """ def _create_matrix(N, tol=0.5): diff --git a/tests/integration/test_cg_solve.py b/tests/integration/test_cg_solve.py index fe8b3578..d8e046e3 100644 --- a/tests/integration/test_cg_solve.py +++ b/tests/integration/test_cg_solve.py @@ -21,8 +21,26 @@ def test_cg_solve(): - N, D = 1000, 1000 - seed = 471014 + """Test conjugate gradient solver with a positive definite matrix. + + This test verifies that the conjugate gradient solver correctly + solves the linear system Ax = b for a positive definite matrix A. + + Notes + ----- + The test creates a random sparse matrix A and ensures it is positive + definite by: + 1. Making it symmetric: A = 0.5 * (A + A.T) + 2. Adding a multiple of the identity: A = A + N * I + + It then generates a random solution vector x and computes b = Ax. + The CG solver is used to solve Ax = b, and the result is verified + by checking that A * x_pred ≈ b. + + The test uses a tolerance of 1e-8 for convergence and verification. + """ + N, D = 20, 20 + seed = 42 A = sample_dense(N, D, 0.1, seed) A = 0.5 * (A + A.T) A = A + N * np.eye(N) @@ -36,8 +54,28 @@ def test_cg_solve(): def test_cg_solve_with_callback(): - N, D = 1000, 1000 - seed = 471014 + """Test conjugate gradient solver with a callback function. + + This test verifies that the conjugate gradient solver correctly + handles callback functions during iteration. + + Notes + ----- + The test creates a positive definite matrix and solves the linear + system Ax = b using CG with a callback function. The callback + computes the residual at each iteration and stores it in a list. + + The test verifies that: + 1. The solver converges to the correct solution + 2. The callback function is called during iteration + 3. The residuals are computed correctly + + This ensures that the callback mechanism works properly and can + be used for monitoring convergence or implementing custom stopping + criteria. 
+ """ + N, D = 20, 20 + seed = 42 A = sample_dense(N, D, 0.1, seed) A = 0.5 * (A + A.T) A = A + N * np.eye(N) @@ -59,8 +97,8 @@ def callback(x): # def test_cg_solve_with_identity_preconditioner(): -# N, D = 1000, 1000 -# seed = 471014 +# N, D = 20, 20 +# seed = 42 # A = sample_dense(N, D, 0.1, seed) # A = 0.5 * (A + A.T) # A = A + N * np.eye(N) @@ -75,8 +113,28 @@ def callback(x): def test_cg_solve_with_linear_operator(): - N, D = 1000, 1000 - seed = 471014 + """Test conjugate gradient solver with LinearOperator objects. + + This test verifies that the conjugate gradient solver correctly + works with LinearOperator objects that provide matrix-vector + multiplication functionality. + + Notes + ----- + The test creates a positive definite matrix A and wraps it in + a LinearOperator object. It then solves the linear system using + CG with the LinearOperator instead of the sparse matrix directly. + + The test verifies two different LinearOperator implementations: + 1. Using the @ operator: matvec(x) = A @ x + 2. Using the dot method: matvec(x, out=None) = A.dot(x, out=out) + + This ensures that the solver can work with any object that provides + the required matrix-vector multiplication interface, not just + sparse matrices. + """ + N, D = 20, 20 + seed = 42 A = sample_dense(N, D, 0.1, seed) A = 0.5 * (A + A.T) A = A + N * np.eye(N) diff --git a/tests/integration/test_comparison.py b/tests/integration/test_comparison.py index 03c28128..3e58d9c6 100644 --- a/tests/integration/test_comparison.py +++ b/tests/integration/test_comparison.py @@ -33,18 +33,32 @@ @pytest.mark.parametrize("threshold", [0.3, 0.5]) @pytest.mark.parametrize("op_name, op_func", COMPARISON_OPS) def test_comparison_operation(N, threshold, op_name, op_func): - """Test element-wise comparison operations on non-zero entries of the matrix + """Test element-wise comparison operations on non-zero entries of the matrix. 
+ + This test verifies that comparison operations work correctly on sparse + matrices by comparing results with dense matrix operations. Parameters ---------- N : int - Size of the test matrix + Size of the test matrix. threshold : float - Value to compare against + Value to compare against. op_name : str - Name of the comparison operation + Name of the comparison operation. op_func : callable - The comparison function to test + The comparison function to test. + + Notes + ----- + The test creates a sparse matrix and applies a comparison operation + against a threshold value. It then compares the number of True values + in the sparse result with the dense result (considering only non-zero + elements). + + This verifies that sparse comparison operations produce the same + logical result as dense operations when applied to non-zero elements. + """ A_dense, A_sparse, _ = simple_system_gen(N, N, sparse.csr_array, tol=0.7) @@ -58,12 +72,30 @@ def test_comparison_operation(N, threshold, op_name, op_func): def test_comparison_error_cases(op_name, op_func): """Test error cases for comparison operations. + This test verifies that comparison operations properly handle invalid + input types by raising appropriate exceptions. + Parameters ---------- op_name : str - Name of the comparison operation + Name of the comparison operation. op_func : callable - The comparison function to test + The comparison function to test. + + Notes + ----- + The test attempts to compare a sparse matrix with various invalid + types including: + - 1D arrays + - 2D arrays + - Strings + - Lists + + All of these should raise AssertionError since sparse matrix + comparison operations only support scalar values. + + This ensures that the implementation properly validates input + types and provides clear error messages for unsupported operations. 
""" N = 8 _, A_sparse, _ = simple_system_gen(N, N, sparse.csr_array, tol=0.7) diff --git a/tests/integration/test_diagonal.py b/tests/integration/test_diagonal.py index 8c53aa00..3ed7dd54 100644 --- a/tests/integration/test_diagonal.py +++ b/tests/integration/test_diagonal.py @@ -24,6 +24,36 @@ @pytest.mark.parametrize("N", [7, 13]) @pytest.mark.parametrize("with_zeros", [True, False]) def test_csr_diagonal(N, with_zeros): + """Test diagonal extraction from CSR matrices. + + This test verifies that the diagonal() method correctly extracts + the main diagonal from CSR matrices, comparing results with dense + matrix diagonal extraction. + + Parameters + ---------- + N : int + Size of the square matrix (N x N). + with_zeros : bool + Whether to include zeros on the diagonal (True) or ensure + non-zero diagonal elements (False). + + Notes + ----- + The test creates a random sparse matrix and optionally adds the + identity matrix to ensure non-zero diagonal elements. It then + extracts the diagonal using both the sparse matrix's diagonal() + method and numpy's diagonal() function on the dense version. + + The test verifies that: + 1. The diagonal elements are extracted correctly + 2. The results match between sparse and dense implementations + 3. The method works for both sparse and dense diagonals + + This is important because diagonal extraction is a common operation + in linear algebra and should work consistently across different + matrix formats. + """ M = N np.random.seed(0) A_dense, _, _ = simple_system_gen(N, M, None, tol=0.2) diff --git a/tests/integration/test_gmres_solve.py b/tests/integration/test_gmres_solve.py index 1acd8d68..37c8dd76 100644 --- a/tests/integration/test_gmres_solve.py +++ b/tests/integration/test_gmres_solve.py @@ -21,6 +21,28 @@ def test_gmres_solve(): + """Test GMRES solver with a positive definite matrix. + + This test verifies that the GMRES solver correctly solves the linear + system Ax = b for a positive definite matrix A. 
+ + Notes + ----- + The test creates a random sparse matrix A and ensures it is positive + definite by: + 1. Making it symmetric: A = 0.5 * (A + A.T) + 2. Adding a multiple of the identity: A = A + N * I + + It then generates a random solution vector x and computes b = Ax. + The GMRES solver is used to solve Ax = b, and the result is verified + by checking that A * x_pred ≈ b. + + The test uses: + - atol=1e-5: Absolute tolerance for convergence + - tol=1e-5: Relative tolerance (legacy parameter) + - maxiter=300: Maximum number of iterations + - atol=1e-8: Tolerance for final verification + """ N, D = 1000, 1000 seed = 471014 A = sample_dense(N, D, 0.1, seed) diff --git a/tests/integration/test_indexing.py b/tests/integration/test_indexing.py index 80d078ae..259c7996 100644 --- a/tests/integration/test_indexing.py +++ b/tests/integration/test_indexing.py @@ -20,15 +20,39 @@ class TestIndexingSetItem: + """Test class for sparse matrix indexing and assignment operations. + + This class contains tests for various indexing scenarios including + boolean masking, derived masks, and edge cases for sparse matrix + assignment operations. + """ + @pytest.mark.parametrize("N", [6, 9, 17]) def test_incompatible_mask(self, N, create_matrix, create_mask): - """ + """Test indexing with incompatible mask sparsity patterns. + This test checks that the mask is applied correctly to the matrix when - the sparsity of mask is from that of the matrix. + the sparsity of mask is different from that of the matrix. + + Parameters + ---------- + N : int + Size of the square matrix. + create_matrix : fixture + Fixture to create test matrices. + create_mask : fixture + Fixture to create boolean masks. + + Notes + ----- While SciPy will apply the mask to all entries, Legate Sparse will only apply the mask to the non-zero entries of the matrix, so we can't compare - to SciPy results for all entries. 
Instead, we check that the number of - non-zero entries are updated correctly and the values are updated correctly. + to SciPy results for all entries. Instead, we check that: + 1. The number of non-zero entries are updated correctly + 2. The values are updated correctly for masked positions + + This test verifies that the sparse implementation correctly handles + cases where the mask has a different sparsity pattern than the matrix. """ _, A = create_matrix(N) _, mask = create_mask(N) @@ -54,11 +78,25 @@ def test_incompatible_mask(self, N, create_matrix, create_mask): @pytest.mark.parametrize("N", [8, 13, 24]) def test_mask_derived_from_self(self, N, create_matrix): - """ + """Test indexing with mask derived from the matrix itself. + This test checks that the mask is applied correctly to the matrix when - the sparsity of mask is derived from the matrix. Our behavior - matches that of SciPy, so we can compare against SciPy - results for all entries. + the sparsity of mask is derived from the matrix. + + Parameters + ---------- + N : int + Size of the square matrix. + create_matrix : fixture + Fixture to create test matrices. + + Notes + ----- + Our behavior matches that of SciPy when the mask is derived from + the matrix itself, so we can compare against SciPy results for all entries. + + The test creates a mask based on a threshold comparison (A > threshold) + and verifies that both SciPy and Legate Sparse produce identical results. """ A_scipy, A_sparse = create_matrix(N) threshold = 0.2 @@ -81,9 +119,23 @@ def test_mask_derived_from_self(self, N, create_matrix): @pytest.mark.parametrize("N", [8, 13, 24]) def test_mask_all_true(self, N, create_matrix): - """ + """Test indexing behavior with a mask that is all True. + This test checks indexing behavior when using a mask that is all True. Every non-zero element should be updated to the new value. + + Parameters + ---------- + N : int + Size of the square matrix. 
+ create_matrix : fixture + Fixture to create test matrices. + + Notes + ----- + The test creates a mask with the same sparsity pattern as the matrix + but with all True values. This should result in all non-zero elements + being updated to the specified value. """ _, A = create_matrix(N) value = 10.0 @@ -99,9 +151,24 @@ def test_mask_all_true(self, N, create_matrix): @pytest.mark.parametrize("N", [8, 13, 24]) def test_mask_all_false(self, N, create_matrix, create_mask): - """ + """Test indexing behavior with a mask that is all False. + This test checks indexing behavior when using a mask that is all False. No elements should be modified. + + Parameters + ---------- + N : int + Size of the square matrix. + create_matrix : fixture + Fixture to create test matrices. + create_mask : fixture + Fixture to create boolean masks. + + Notes + ----- + The test creates a mask with density=0 (all False values) and verifies + that the matrix remains unchanged after the assignment operation. """ _, A = create_matrix(N) _, mask_all_false = create_mask(N, density=0) @@ -114,7 +181,25 @@ def test_mask_all_false(self, N, create_matrix, create_mask): assert numpy.all(A_copy.get_data() == A.get_data()) def test_random_column_order(self): - "The ordering of the matrix is random" "" + """Test indexing with randomly ordered column indices. + + This test verifies that indexing works correctly even when the + column indices are not in sorted order within each row. + + Notes + ----- + The test creates a matrix with randomly ordered column indices + within rows. During instantiation, these indices get sorted to + ensure proper indexing behavior. + + The test verifies that: + 1. The matrix is created correctly despite random column ordering + 2. Boolean indexing operations work correctly + 3. The number of elements replaced matches the expected count + + This is important because CSR format requires column indices to be + sorted within each row for efficient operations. 
+ """ row_indices = cupynumeric.array( [ 2, diff --git a/tests/integration/test_io.py b/tests/integration/test_io.py index 8d237ba4..c307941c 100644 --- a/tests/integration/test_io.py +++ b/tests/integration/test_io.py @@ -25,6 +25,29 @@ @pytest.fixture def test_mtx_files(): + """Fixture providing paths to test Matrix Market files. + + This fixture returns a list of paths to various Matrix Market (.mtx) + files that are used for testing the mmread functionality. + + Returns + ------- + list + List of file paths to test Matrix Market files. + + Notes + ----- + The fixture includes various types of matrices: + - test.mtx: Basic test matrix + - GlossGT.mtx: Graph theory matrix + - Ragusa18.mtx: Scientific computing matrix + - cage4.mtx: Graph matrix + - karate.mtx: Social network matrix + + These files are located in the testdata directory and provide + different sparsity patterns and matrix properties for comprehensive + testing of the Matrix Market reader. + """ mtx_files = [ "test.mtx", "GlossGT.mtx", @@ -36,6 +59,31 @@ def test_mtx_files(): def test_mmread(test_mtx_files): + """Test Matrix Market file reading functionality. + + This test verifies that the legate_sparse Matrix Market reader + produces the same results as SciPy's mmread function. + + Parameters + ---------- + test_mtx_files : list + List of Matrix Market file paths to test. + + Notes + ----- + The test reads each Matrix Market file using both legate_sparse.io.mmread + and scipy.io.mmread, then compares the results by converting both to + dense format and checking for equality. + + This ensures that: + 1. The Matrix Market format is parsed correctly + 2. The sparse matrix structure is preserved + 3. The numerical values are read accurately + 4. The implementation is compatible with SciPy's reference implementation + + The test covers various matrix types and sizes to ensure robust + parsing of the Matrix Market format. 
+ """ for mtx_file in test_mtx_files: arr = legate_io.mmread(mtx_file) s = sci_io.mmread(mtx_file) diff --git a/tests/integration/test_spgemm.py b/tests/integration/test_spgemm.py index 7106281c..5954df79 100644 --- a/tests/integration/test_spgemm.py +++ b/tests/integration/test_spgemm.py @@ -25,6 +25,25 @@ @pytest.mark.parametrize("N", [5, 29]) def test_csr_spgemm(N): + """Test sparse matrix-matrix multiplication with CSR matrices. + + This test verifies that sparse matrix-matrix multiplication works + correctly for different matrix sizes. + + Parameters + ---------- + N : int + Size of the square matrices (N x N). + + Notes + ----- + The test creates a random sparse matrix A and computes A @ A using + the sparse implementation. It then compares the result with the + dense matrix multiplication A_dense @ A_dense to verify correctness. + + The test uses different matrix sizes to ensure the implementation + works correctly for both small and larger matrices. + """ np.random.seed(0) A_dense, A, _ = simple_system_gen(N, N, sparse.csr_array) @@ -38,6 +57,29 @@ def test_csr_spgemm(N): @pytest.mark.parametrize("N", [5, 29]) @pytest.mark.parametrize("unsupported_dtype", ["int", "bool"]) def test_csr_spgemm_unsupported_dtype(N, unsupported_dtype): + """Test that unsupported datatypes raise appropriate exceptions for SpGEMM. + + This test verifies that sparse matrix-matrix multiplication + properly handles unsupported datatypes by raising NotImplementedError + when running on GPU. + + Parameters + ---------- + N : int + Size of the square matrices. + unsupported_dtype : str + Datatype that is not supported for SpGEMM operations. + + Notes + ----- + The test creates banded matrices with unsupported datatypes and + attempts to perform matrix-matrix multiplication. On GPU systems, + this should raise NotImplementedError since only floating-point + and complex datatypes are supported for SpGEMM. 
+ + Currently supported datatypes are float32, float64, complex64, + and complex128. + """ np.random.seed(0) nnz_per_row = 3 diff --git a/tests/integration/test_spmv.py b/tests/integration/test_spmv.py index 0aca095e..0c3590df 100644 --- a/tests/integration/test_spmv.py +++ b/tests/integration/test_spmv.py @@ -27,6 +27,32 @@ @pytest.mark.parametrize("M", [7, 17]) @pytest.mark.parametrize("inline", [True, False]) def test_csr_spmv(N, M, inline): + """Test sparse matrix-vector multiplication with CSR matrices. + + This test verifies that sparse matrix-vector multiplication works + correctly for different matrix sizes and computation methods. + + Parameters + ---------- + N : int + Number of rows in the matrix. + M : int + Number of columns in the matrix. + inline : bool + Whether to use inline computation (A.dot(x, out=y)) or + standard multiplication (A @ x). + + Notes + ----- + The test creates a random sparse matrix and vector, then computes + the matrix-vector product using both the sparse implementation + and a dense reference. It verifies that the results are numerically + close. + + The inline parameter tests two different computation methods: + - inline=True: Uses A.dot(x, out=y) with pre-allocated output + - inline=False: Uses A @ x with automatic output allocation + """ np.random.seed(0) A_dense, A, x = simple_system_gen(N, M, sparse.csr_array) @@ -43,6 +69,31 @@ def test_csr_spmv(N, M, inline): @pytest.mark.parametrize("nnz_per_row", [3, 9]) @pytest.mark.parametrize("unsupported_dtype", ["int", "bool"]) def test_csr_spmv_unsupported_dtype(N, nnz_per_row, unsupported_dtype): + """Test that unsupported datatypes raise appropriate exceptions. + + This test verifies that sparse matrix-vector multiplication + properly handles unsupported datatypes by raising NotImplementedError + when running on GPU. + + Parameters + ---------- + N : int + Size of the square matrix. + nnz_per_row : int + Number of non-zeros per row in the banded matrix. 
+ unsupported_dtype : str + Datatype that is not supported for SpMV operations. + + Notes + ----- + The test creates a banded matrix with an unsupported datatype + and attempts to perform matrix-vector multiplication. On GPU + systems, this should raise NotImplementedError since only + floating-point and complex datatypes are supported for SpMV. + + Currently supported datatypes are float32, float64, complex64, + and complex128. + """ np.random.seed(0) A = banded_matrix(N, nnz_per_row).astype(unsupported_dtype) diff --git a/tests/integration/utils/banded_matrix.py b/tests/integration/utils/banded_matrix.py index 4cff897a..fda5ef5f 100644 --- a/tests/integration/utils/banded_matrix.py +++ b/tests/integration/utils/banded_matrix.py @@ -24,22 +24,51 @@ def banded_matrix( init_with_ones: bool = True, verbose: bool = False, ): - """ + """Create a banded sparse matrix for testing purposes. + Parameters ---------- - N: int - Size of the NxN sparse matrix - nnz_per_row: int - Number of non-zero elements per row (odd number) - from_diags: bool - use sparse.diags to generate the banded matrix (default = True) - init_with_ones: bool - Initialize the matrix with ones instead of arange + N : int + Size of the NxN sparse matrix. + nnz_per_row : int + Number of non-zero elements per row (must be odd). + from_diags : bool, optional + Use sparse.diags to generate the banded matrix. Default is True. + init_with_ones : bool, optional + Initialize the matrix with ones instead of arange. Default is True. + verbose : bool, optional + Print detailed information about the matrix construction. Default is False. Returns ------- - csr_array: - Return a sparse matrix + csr_array + A banded sparse matrix in CSR format. + + Raises + ------ + AssertionError + If N <= nnz_per_row or nnz_per_row is not odd. + + Notes + ----- + This function creates a banded matrix with a specific sparsity pattern. + When from_diags=True, it uses the sparse.diags function which is simpler + but may be slower. 
When from_diags=False, it constructs the CSR matrix + directly for better performance. + + The matrix has a banded structure with nnz_per_row non-zeros per row, + centered around the main diagonal. The function handles the boundary + conditions by masking out-of-bounds indices. + + Examples + -------- + >>> A = banded_matrix(5, 3, from_diags=True) + >>> print(A.toarray()) + [[1. 1. 0. 0. 0.] + [1. 1. 1. 0. 0.] + [0. 1. 1. 1. 0.] + [0. 0. 1. 1. 1.] + [0. 0. 0. 1. 1.]] """ if from_diags: diff --git a/tests/integration/utils/sample.py b/tests/integration/utils/sample.py index a201d6f2..e5444987 100644 --- a/tests/integration/utils/sample.py +++ b/tests/integration/utils/sample.py @@ -19,11 +19,62 @@ class Normal(stats.rv_continuous): + """Custom normal distribution class for reproducible random sampling. + + This class extends scipy.stats.rv_continuous to provide a custom + normal distribution that can be used with scipy.sparse.random for + generating sparse matrices with reproducible random values. + + Notes + ----- + The _rvs method generates standard normal random variates using + the provided random_state for reproducibility. + """ + def _rvs(self, *args, size=None, random_state=None): + """Generate standard normal random variates. + + Parameters + ---------- + size : int or tuple, optional + Number of random variates to generate. + random_state : numpy.random.RandomState, optional + Random state for reproducibility. + + Returns + ------- + numpy.ndarray + Array of standard normal random variates. + """ return random_state.standard_normal(size) def sample(N: int, D: int, density: float, seed: int): + """Generate a sparse matrix with random values from a normal distribution. + + Parameters + ---------- + N : int + Number of rows in the matrix. + D : int + Number of columns in the matrix. + density : float + Density of non-zero elements (between 0 and 1). + seed : int + Random seed for reproducibility. 
+ + Returns + ------- + scipy.sparse.csr_matrix + A sparse matrix in CSR format with random normal values. + + Notes + ----- + This function uses scipy.sparse.random with a custom normal distribution + to generate sparse matrices with reproducible random values. The matrix + is returned in CSR format. + + """ NormalType = Normal(seed=seed) SeededNormal = NormalType() return scpy.random( @@ -38,14 +89,91 @@ def sample(N: int, D: int, density: float, seed: int): def sample_dense(N: int, D: int, density: float, seed: int): + """Generate a dense matrix with random values from a normal distribution. + + Parameters + ---------- + N : int + Number of rows in the matrix. + D : int + Number of columns in the matrix. + density : float + Density of non-zero elements (between 0 and 1). + seed : int + Random seed for reproducibility. + + Returns + ------- + numpy.ndarray + A dense matrix with random normal values. + + Notes + ----- + This function generates a sparse matrix using sample() and then + converts it to dense format. This is useful for creating test + matrices that can be compared with sparse implementations. + + """ return numpy.asarray(sample(N, D, density, seed).todense()) def sample_dense_vector(N: int, density: float, seed: int): + """Generate a dense vector with random values from a normal distribution. + + Parameters + ---------- + N : int + Length of the vector. + density : float + Density of non-zero elements (between 0 and 1). + seed : int + Random seed for reproducibility. + + Returns + ------- + numpy.ndarray + A dense vector with random normal values. + + Notes + ----- + This function generates a dense matrix with one column using + sample_dense() and then squeezes it to a 1D vector. + + """ return sample_dense(N, 1, density, seed).squeeze() def simple_system_gen(N, M, cls, tol=0.5): + """Generate a simple linear system for testing. + + Parameters + ---------- + N : int + Number of rows in the matrix. + M : int + Number of columns in the matrix. 
+ cls : type or None + Class to use for creating the sparse matrix. If None, no sparse + matrix is created. + tol : float, optional + Threshold for sparsity. Values below this threshold are set to zero. + Default is 0.5. + + Returns + ------- + tuple + (a_dense, a_sparse, x) where: + - a_dense: Dense matrix + - a_sparse: Sparse matrix (or None if cls is None) + - x: Dense vector + + Notes + ----- + This function generates a random dense matrix and vector, then + applies a threshold to create sparsity. The sparse matrix is + created using the provided class if specified. + + """ a_dense = np.random.rand(N, M) x = np.random.rand(M) a_dense = np.where(a_dense < tol, a_dense, 0) From 89d12e6979b95c7358776a184a33b339d0324e12 Mon Sep 17 00:00:00 2001 From: Marcin Zalewski Date: Fri, 13 Feb 2026 19:19:56 -0800 Subject: [PATCH 2/3] v26.02.00 --- .clang-format | 54 +- .flake8 | 10 - .pre-commit-config.yaml | 71 ++- .style.yapf | 1 - CMakeLists.txt | 3 +- CONTRIBUTING.md | 16 +- LICENSE | 2 +- README.md | 36 +- cmake/thirdparty/get_cudss.cmake | 28 + cmake/thirdparty/get_legate.cmake | 45 +- cmake/versions.json | 42 +- conda/conda-build/build.sh | 0 conda/conda-build/conda_build_config.yaml | 11 + conda/conda-build/meta.yaml | 29 +- examples/common.py | 108 ++-- examples/direct_solve_banded_system.py | 86 +++ examples/gmg.py | 492 --------------- examples/matrix_power.py | 38 +- examples/pde.py | 63 +- examples/poisson_5point_example.py | 193 ++++++ examples/spgemm_microbenchmark.py | 32 +- examples/spmv_microbenchmark.py | 30 +- install.py | 20 +- legate_sparse/__init__.py | 7 +- legate_sparse/_version.py | 28 +- legate_sparse/base.py | 178 +++--- legate_sparse/config.py | 34 +- legate_sparse/construct.py | 260 ++++++++ legate_sparse/coverage.py | 18 +- legate_sparse/csr.py | 585 +++++++++++++----- legate_sparse/dia.py | 71 ++- legate_sparse/gallery.py | 57 +- legate_sparse/install_info.py.in | 14 +- legate_sparse/io.py | 47 +- legate_sparse/linalg.py | 502 ++++++++++++--- 
legate_sparse/module.py | 11 +- legate_sparse/runtime.py | 27 +- legate_sparse/settings.py | 17 +- legate_sparse/types.py | 15 +- legate_sparse/utils.py | 131 ++-- legate_sparse_cpp.cmake | 20 +- pyproject.toml | 118 ++++ scripts/memlog_analysis.py | 28 +- scripts/memlog_cli.py | 14 +- scripts/memlog_parser.py | 21 +- scripts/pre-commit/yamllint.yml | 6 + setup.cfg | 7 + setup.py | 7 +- src/legate_sparse/array/conv/csr_to_dense.cc | 3 + src/legate_sparse/array/conv/csr_to_dense.cu | 5 +- .../array/conv/csr_to_dense_omp.cc | 3 + .../array/conv/csr_to_dense_template.inl | 7 +- src/legate_sparse/array/conv/dense_to_csr.cc | 6 + src/legate_sparse/array/conv/dense_to_csr.cu | 10 +- .../array/conv/dense_to_csr_omp.cc | 6 + .../array/conv/dense_to_csr_template.inl | 14 +- .../array/conv/pos_to_coordinates.cc | 3 + .../array/conv/pos_to_coordinates.cu | 5 +- .../array/conv/pos_to_coordinates_omp.cc | 3 + .../conv/pos_to_coordinates_template.inl | 8 +- src/legate_sparse/array/csr/geam.cc | 79 +++ src/legate_sparse/array/csr/geam.cu | 144 +++++ src/legate_sparse/array/csr/geam.h | 91 +++ src/legate_sparse/array/csr/geam_kernels.h | 129 ++++ src/legate_sparse/array/csr/geam_omp.cc | 87 +++ src/legate_sparse/array/csr/geam_template.inl | 139 +++++ src/legate_sparse/array/csr/get_diagonal.cc | 3 + src/legate_sparse/array/csr/get_diagonal.cu | 5 +- .../array/csr/get_diagonal_omp.cc | 3 + .../array/csr/get_diagonal_template.inl | 7 +- src/legate_sparse/array/csr/indexing.cc | 3 + src/legate_sparse/array/csr/indexing.cu | 7 +- src/legate_sparse/array/csr/indexing_omp.cc | 4 +- .../array/csr/indexing_template.inl | 7 +- .../array/csr/spgemm_csr_csr_csr.cc | 6 + .../array/csr/spgemm_csr_csr_csr.cu | 15 +- .../array/csr/spgemm_csr_csr_csr_omp.cc | 6 + .../array/csr/spgemm_csr_csr_csr_template.inl | 34 +- src/legate_sparse/array/csr/spmv.cc | 3 + src/legate_sparse/array/csr/spmv.cu | 11 +- src/legate_sparse/array/csr/spmv_omp.cc | 3 + src/legate_sparse/array/csr/spmv_template.inl | 7 
+- src/legate_sparse/array/util/scale_rect.cc | 3 + src/legate_sparse/array/util/scale_rect.cu | 5 +- .../array/util/scale_rect_omp.cc | 3 + .../array/util/scale_rect_template.inl | 7 +- src/legate_sparse/array/util/unzip_rect.cc | 3 + src/legate_sparse/array/util/unzip_rect.cu | 5 +- .../array/util/unzip_rect_omp.cc | 3 + .../array/util/unzip_rect_template.inl | 7 +- src/legate_sparse/array/util/zip_to_rect.cc | 3 + src/legate_sparse/array/util/zip_to_rect.cu | 5 +- .../array/util/zip_to_rect_omp.cc | 3 + .../array/util/zip_to_rect_template.inl | 9 +- src/legate_sparse/cffi.h | 7 + src/legate_sparse/cudalibs.cu | 30 +- src/legate_sparse/cudalibs.h | 3 + src/legate_sparse/linalg/axpby.cc | 3 + src/legate_sparse/linalg/axpby.cu | 5 +- src/legate_sparse/linalg/axpby_omp.cc | 3 + src/legate_sparse/linalg/axpby_template.inl | 13 +- src/legate_sparse/linalg/spsolve.cc | 34 + src/legate_sparse/linalg/spsolve.cu | 184 ++++++ src/legate_sparse/linalg/spsolve.h | 48 ++ src/legate_sparse/mapper/mapper.cc | 4 + .../partition/fast_image_partition.cu | 5 +- .../fast_image_partition_template.inl | 7 +- src/legate_sparse/util/cuda_help.h | 37 +- src/legate_sparse/util/cudss_utils.h | 73 +++ src/legate_sparse/util/cusparse_utils.h | 20 +- tests/integration/conftest.py | 323 +++++++++- tests/integration/test_block_array.py | 176 ++++++ tests/integration/test_cg_solve.py | 8 +- tests/integration/test_csr_from_csr.py | 8 +- tests/integration/test_csr_to_dense.py | 12 +- tests/integration/test_diags.py | 4 +- tests/integration/test_eigsh.py | 392 ++++++++++++ tests/integration/test_geam.py | 269 ++++++++ tests/integration/test_indexing.py | 32 +- tests/integration/test_manual_sorting.py | 1 + tests/integration/test_negate.py | 38 ++ tests/integration/test_spgemm.py | 2 +- tests/integration/test_spmv.py | 36 +- tests/integration/test_spsolve.py | 199 ++++++ tests/integration/test_unary_operation.py | 12 +- tests/integration/utils/banded_matrix.py | 4 +- tests/testdata/GlossGT.mtx | 14 
+- tests/testdata/Ragusa18.mtx | 4 +- tests/testdata/karate.mtx | 12 +- 129 files changed, 5134 insertions(+), 1443 deletions(-) delete mode 100644 .flake8 create mode 100644 cmake/thirdparty/get_cudss.cmake mode change 100644 => 100755 conda/conda-build/build.sh create mode 100644 examples/direct_solve_banded_system.py delete mode 100644 examples/gmg.py create mode 100644 examples/poisson_5point_example.py create mode 100644 legate_sparse/construct.py create mode 100644 pyproject.toml mode change 100644 => 100755 scripts/memlog_analysis.py mode change 100644 => 100755 scripts/memlog_cli.py mode change 100644 => 100755 scripts/memlog_parser.py create mode 100644 scripts/pre-commit/yamllint.yml create mode 100644 setup.cfg mode change 100644 => 100755 setup.py create mode 100644 src/legate_sparse/array/csr/geam.cc create mode 100644 src/legate_sparse/array/csr/geam.cu create mode 100644 src/legate_sparse/array/csr/geam.h create mode 100644 src/legate_sparse/array/csr/geam_kernels.h create mode 100644 src/legate_sparse/array/csr/geam_omp.cc create mode 100644 src/legate_sparse/array/csr/geam_template.inl create mode 100644 src/legate_sparse/linalg/spsolve.cc create mode 100644 src/legate_sparse/linalg/spsolve.cu create mode 100644 src/legate_sparse/linalg/spsolve.h create mode 100644 src/legate_sparse/util/cudss_utils.h create mode 100644 tests/integration/test_block_array.py create mode 100644 tests/integration/test_eigsh.py create mode 100644 tests/integration/test_geam.py create mode 100644 tests/integration/test_negate.py create mode 100644 tests/integration/test_spsolve.py diff --git a/.clang-format b/.clang-format index 6d5353f9..4f33a094 100644 --- a/.clang-format +++ b/.clang-format @@ -1,3 +1,4 @@ +--- Language: Cpp # BasedOnStyle: Google AccessModifierOffset: -1 @@ -17,22 +18,22 @@ AllowShortLoopsOnASingleLine: true AlwaysBreakAfterDefinitionReturnType: None AlwaysBreakAfterReturnType: None AlwaysBreakBeforeMultilineStrings: true 
-AlwaysBreakTemplateDeclarations: Yes -BinPackArguments: false +AlwaysBreakTemplateDeclarations: true +BinPackArguments: false BinPackParameters: false BraceWrapping: - AfterClass: false + AfterClass: false AfterControlStatement: false - AfterEnum: false - AfterFunction: false - AfterNamespace: false - AfterObjCDeclaration: false - AfterStruct: false - AfterUnion: false - AfterExternBlock: false - BeforeCatch: false - BeforeElse: false - IndentBraces: false + AfterEnum: false + AfterFunction: false + AfterNamespace: false + AfterObjCDeclaration: false + AfterStruct: false + AfterUnion: false + AfterExternBlock: false + BeforeCatch: false + BeforeElse: false + IndentBraces: false # disabling the below splits, else, they'll just add to the vertical length of source files! SplitEmptyFunction: false SplitEmptyRecord: false @@ -61,23 +62,23 @@ FixNamespaceComments: true ForEachMacros: IncludeBlocks: Preserve IncludeCategories: - - Regex: '^<.*\.h>' - Priority: 1 - - Regex: '^<.*' - Priority: 2 - - Regex: '.*' - Priority: 3 -IncludeIsMainRegex: '([-_](test|unittest))?$' + - Regex: ^<.*\.h> + Priority: 1 + - Regex: ^<.* + Priority: 2 + - Regex: .* + Priority: 3 +IncludeIsMainRegex: ([-_](test|unittest))?$ IndentCaseLabels: true IndentPPDirectives: None -IndentWidth: 2 +IndentWidth: 2 IndentWrappedFunctionNames: false InsertBraces: true JavaScriptQuotes: Leave JavaScriptWrapImports: true KeepEmptyLinesAtTheStartOfBlocks: false MacroBlockBegin: '' -MacroBlockEnd: '' +MacroBlockEnd: '' MaxEmptyLinesToKeep: 1 NamespaceIndentation: None ObjCBinPackProtocolList: Never @@ -95,14 +96,7 @@ PenaltyReturnTypeOnItsOwnLine: 200 PointerAlignment: Left RawStringFormats: - Language: Cpp - Delimiters: - - cc - - CC - - cpp - - Cpp - - CPP - - 'c++' - - 'C++' + Delimiters: [cc, CC, cpp, Cpp, CPP, c++, C++] CanonicalDelimiter: '' # Enabling comment reflow causes doxygen comments to be messed up in their formats! 
ReflowComments: true diff --git a/.flake8 b/.flake8 deleted file mode 100644 index 04eea09b..00000000 --- a/.flake8 +++ /dev/null @@ -1,10 +0,0 @@ -[flake8] -ignore = - # 'foo' is too complex (N) - C901, - # continuation line missing indentation or outdented - E122, - E203, E501, - F403, F821, W503 -max-line-length = 80 -max-complexity = 18 diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index b24026ed..6083ac22 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,31 +1,44 @@ +--- repos: - - repo: https://github.com/pre-commit/mirrors-mypy - rev: 'v1.5.1' - hooks: - - id: mypy - language: system - pass_filenames: false - args: ['legate_sparse'] - - repo: https://github.com/psf/black - rev: 23.9.1 - hooks: - - id: black - - repo: https://github.com/PyCQA/isort - rev: 5.12.0 - hooks: - - id: isort - args: ["--profile", "black"] - - repo: https://github.com/PyCQA/flake8 - rev: 6.1.0 - hooks: - - id: flake8 - args: [--config=.flake8] - - repo: https://github.com/pre-commit/mirrors-clang-format - rev: 'v16.0.6' # Use the sha / tag you want to point at - hooks: - - id: clang-format - files: \.(cu|cuh|h|cc|inl)$ - types_or: [] - + - repo: https://github.com/adrienverge/yamllint + rev: v1.37.1 + hooks: + - id: yamllint + types: [yaml] + args: [-c, ./scripts/pre-commit/yamllint.yml] + exclude: meta\.yaml$ + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v5.0.0 + hooks: + - id: check-json # checks that all json files have proper syntax + - id: check-toml # checks that all toml files have proper syntax + - id: end-of-file-fixer # check all files end in a newline + # handled by clang-format + exclude_types: [c, c++, cuda] + - id: pretty-format-json + args: [--autofix, --indent=4] + - id: trailing-whitespace # remove trailing whitespace + # don't mess up diff files + exclude: ^src/cmake/patches/.*\.diff$ + # handled by clang-format + exclude_types: [c, c++, cuda] + - id: check-symlinks + - id: check-executables-have-shebangs + - 
id: check-merge-conflict + - id: check-shebang-scripts-are-executable + - repo: https://github.com/astral-sh/ruff-pre-commit + rev: v0.12.4 + hooks: + - id: ruff-format + - id: ruff + args: ["--config=./pyproject.toml", "--fix"] + - repo: https://github.com/pre-commit/mirrors-clang-format + rev: v16.0.6 # Use the sha / tag you want to point at + hooks: + - id: clang-format + files: \.(cu|cuh|h|cc|inl)$ + types_or: [] +ci: + autoupdate_schedule: quarterly default_language_version: - python: python3 + python: python3 diff --git a/.style.yapf b/.style.yapf index 02b96779..df2b8071 100644 --- a/.style.yapf +++ b/.style.yapf @@ -339,4 +339,3 @@ split_penalty_logical_operator=300 # Use the Tab character for indentation. use_tabs=False - diff --git a/CMakeLists.txt b/CMakeLists.txt index c32254c3..d91a3119 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -60,7 +60,7 @@ include(rapids-find) ################################### # Project -set(legate_sparse_version 25.07.00) +set(legate_sparse_version 26.02.00) set(CMAKE_CXX_FLAGS_DEBUG "-O0 -g") set(CMAKE_CUDA_FLAGS_DEBUG "-O0 -g") @@ -110,4 +110,3 @@ if(CMAKE_GENERATOR STREQUAL "Ninja") endfunction() add_touch_legate_sparse_ninja_build_target() endif() - diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 7b663aad..8b55877c 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -55,25 +55,25 @@ git push -u origin : ``` Developer Certificate of Origin Version 1.1 - + Copyright (C) 2004, 2006 The Linux Foundation and its contributors. 1 Letterman Drive Suite D4700 San Francisco, CA, 94129 - + Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed. 
``` ``` Developer's Certificate of Origin 1.1 - + By making a contribution to this project, I certify that: - + (a) The contribution was created in whole or in part by me and I have the right to submit it under the open source license indicated in the file; or - + (b) The contribution is based upon previous work that, to the best of my knowledge, is covered under an appropriate open source license and I have the right under that license to submit that work with modifications, whether created in whole or in part by me, under the same open source license (unless I am permitted to submit under a different license), as indicated in the file; or - + (c) The contribution was provided directly to me by some other person who certified (a), (b) or (c) and I have not modified it. - + (d) I understand and agree that this project and the contribution are public and that a record of the contribution (including all personal information I submit with it, including my sign-off) is maintained indefinitely and may be redistributed consistent with this project or the open source license(s) involved. - ``` \ No newline at end of file + ``` diff --git a/LICENSE b/LICENSE index 4947287f..f433b1a5 100644 --- a/LICENSE +++ b/LICENSE @@ -174,4 +174,4 @@ incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. - END OF TERMS AND CONDITIONS \ No newline at end of file + END OF TERMS AND CONDITIONS diff --git a/README.md b/README.md index b03ea8c6..5cc45556 100644 --- a/README.md +++ b/README.md @@ -21,27 +21,27 @@ limitations under the License. Legate Sparse is a [Legate](https://github.com/nv-legate/legate) library that aims to provide a distributed and accelerated drop-in replacement for the [scipy.sparse](https://docs.scipy.org/doc/scipy/reference/sparse.html) library -on top of the [Legate](https://github.com/nv-legate/legate) runtime. 
-Legate Sparse interoperates with +on top of the [Legate](https://github.com/nv-legate/legate) runtime. +Legate Sparse interoperates with [cuPyNumeric](https://github.com/nv-legate/cupynumeric), -a distributed and accelerated drop-in replacement +a distributed and accelerated drop-in replacement for [NumPy](https://numpy.org/doc/stable/reference/index.html#reference), to enable writing programs that operate on distributed dense and sparse arrays. -Take a look at the `examples` directory for some applications that can +Take a look at the `examples` directory for some applications that can use Legate Sparse. We have implemented an explicit partial-differential equation (PDE) [solver](examples/pde.py). More complex and interesting applications are on the way -- stay tuned! -Legate Sparse is currently in alpha and supports a subset of APIs -and options from scipy.sparse, so if you need an API, please open -an issue and give us a summary of its usage. +Legate Sparse is currently in alpha and supports a subset of APIs +and options from scipy.sparse, so if you need an API, please open +an issue and give us a summary of its usage. # Installation -To use Legate Sparse, `legate` and `cupynumeric` libraries have to be installed. -They can be installed either by pulling the respective conda packages -or by manually building from source. For more information, -see build instructions for [Legate](https://github.com/nv-legate/legate) +To use Legate Sparse, `legate` and `cupynumeric` libraries have to be installed. +They can be installed either by pulling the respective conda packages +or by manually building from source. For more information, +see build instructions for [Legate](https://github.com/nv-legate/legate) and [cuPyNumeric](https://github.com/nv-legate/cupynumeric). Follow the steps in this section. @@ -51,7 +51,7 @@ Follow the steps in this section. 
The `legate-sparse` conda package already depends on `legate` and `cupynumeric`, and it will install these dependencies automatically. -To create a new environment and install: +To create a new environment and install: ``` conda create -n myenv -c legate -c conda-forge legate-sparse ``` @@ -65,9 +65,9 @@ conda install -c legate -c conda-forge legate-sparse To write programs using Legate Sparse, import the `legate_sparse` module, which contains methods and types found in `scipy.sparse`. Note that the module is imported as `legate_sparse` -and not `legate.sparse`. Here is an example program saved as `main.py`. +and not `legate.sparse`. Here is an example program saved as `main.py`. -For more details on how to run legate programs, check +For more details on how to run legate programs, check our [documentation](https://docs.nvidia.com/cupynumeric). To run the application on a single GPU, use this command: @@ -79,10 +79,10 @@ import legate_sparse as sparse import cupynumeric as np # number of diagonals in the matrix (including main diagonal) -n_diagonals = 3 +n_diagonals = 3 # number of rows in the matrix -nrows = 5 +nrows = 5 # generate two tridiaonal matrices (n_diagonals=3) and multiply them A = sparse.diags( @@ -102,13 +102,13 @@ B = sparse.diags( ) # spGEMM operation: multiplication of two sparse matrices -C = A @ B +C = A @ B print(C.todense()) print() # spMV operation: multiplication of a sparse matrix and a dense vector x = np.ones(nrows) -C = A @ x +C = A @ x print(C) assert np.array_equal(A.todense().sum(axis=1), C) diff --git a/cmake/thirdparty/get_cudss.cmake b/cmake/thirdparty/get_cudss.cmake new file mode 100644 index 00000000..0ebfc199 --- /dev/null +++ b/cmake/thirdparty/get_cudss.cmake @@ -0,0 +1,28 @@ +#============================================================================= +# Copyright 2022-2024 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the 
License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +#============================================================================= + +function(find_or_configure_cudss) + + if(TARGET cudss) + return() + endif() + + # cuDSS provides its own CMake config, so we use find_package directly + find_package(cudss REQUIRED) + +endfunction() + +find_or_configure_cudss() diff --git a/cmake/thirdparty/get_legate.cmake b/cmake/thirdparty/get_legate.cmake index 727671fd..142bd22a 100644 --- a/cmake/thirdparty/get_legate.cmake +++ b/cmake/thirdparty/get_legate.cmake @@ -18,56 +18,19 @@ function(find_or_configure_legate) set(oneValueArgs VERSION REPOSITORY BRANCH EXCLUDE_FROM_ALL) cmake_parse_arguments(PKG "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) - include("${rapids-cmake-dir}/export/detail/parse_version.cmake") - rapids_export_parse_version(${PKG_VERSION} legate PKG_VERSION) - include("${rapids-cmake-dir}/cpm/detail/package_details.cmake") rapids_cpm_package_details(legate version git_repo git_branch shallow exclude_from_all) - set(version ${PKG_VERSION}) - set(exclude_from_all ${PKG_EXCLUDE_FROM_ALL}) - if(PKG_BRANCH) - set(git_branch "${PKG_BRANCH}") - endif() - if(PKG_REPOSITORY) - set(git_repo "${PKG_REPOSITORY}") - endif() + # Normalize version to match conda pkg naming (e.g., 26.01.00 -> 26.01.0) + string(REPLACE "00" "0" version "${version}") set(FIND_PKG_ARGS GLOBAL_TARGETS legate::legate BUILD_EXPORT_SET legate-sparse-exports INSTALL_EXPORT_SET legate-sparse-exports) - # First try to find legate via find_package() - # so the `Legion_USE_*` variables are visible - # Use QUIET find by 
default. - set(_find_mode QUIET) - # If legate_DIR/legate_ROOT are defined as something other than empty or NOTFOUND - # use a REQUIRED find so that the build does not silently download legate. - if(legate_DIR OR legate_ROOT) - set(_find_mode REQUIRED) - endif() - rapids_find_package(legate ${version} EXACT CONFIG ${_find_mode} ${FIND_PKG_ARGS}) - - if(legate_FOUND) - message(STATUS "CPM: using local package legate@${version}") - else() - include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/Modules/cpm_helpers.cmake) - get_cpm_git_args(legate_cpm_git_args REPOSITORY ${git_repo} BRANCH ${git_branch}) - - message(VERBOSE "legate.sparse: legate version: ${version}") - message(VERBOSE "legate.sparse: legate git_repo: ${git_repo}") - message(VERBOSE "legate.sparse: legate git_branch: ${git_branch}") - message(VERBOSE "legate.sparse: legate exclude_from_all: ${exclude_from_all}") - message(VERBOSE "legate.sparse: legate legate_cpm_git_args: ${legate_cpm_git_args}") - - rapids_cpm_find(legate ${version} ${FIND_PKG_ARGS} - CPM_ARGS - ${legate_cpm_git_args} - FIND_PACKAGE_ARGUMENTS EXACT - EXCLUDE_FROM_ALL ${exclude_from_all} - ) - endif() + # Require legate to be pre-installed; do not fall back to cloning. 
+ rapids_find_package(legate ${version} EXACT CONFIG REQUIRED ${FIND_PKG_ARGS}) set(Legion_USE_CUDA ${Legion_USE_CUDA} PARENT_SCOPE) set(Legion_USE_OpenMP ${Legion_USE_OpenMP} PARENT_SCOPE) diff --git a/cmake/versions.json b/cmake/versions.json index 6c5440f4..85c7e7ae 100644 --- a/cmake/versions.json +++ b/cmake/versions.json @@ -1,24 +1,24 @@ { - "packages" : { - "legate" : { - "repo": "legate.internal", - "org": "nv-legate", - "version": "25.07.00", - "git_url" : "git@github.com:nv-legate/legate.git", - "git_shallow": false, - "always_download": false, - "git_tag" : "a46dc3d5b176ff9546bc831409c394c1bbc3b936", - "anaconda_label": "main" - }, - "cupynumeric" : { - "repo": "cupynumeric.internal", - "org": "nv-legate", - "version": "25.07.00", - "git_url" : "git@github.com:nv-legate/cupynumeric", - "git_shallow": false, - "always_download": false, - "git_tag" : "6132d8450049a7abd7786fb4d60444eb5b4e25db", - "anaconda_label": "main" + "packages": { + "cupynumeric": { + "always_download": false, + "anaconda_label": "main", + "git_shallow": false, + "git_tag": "ae1c787828a9327ad00a076739706f41d196a043", + "git_url": "git@github.com:nv-legate/cupynumeric.internal", + "org": "nv-legate", + "repo": "cupynumeric.internal", + "version": "26.01.00" + }, + "legate": { + "always_download": false, + "anaconda_label": "main", + "git_shallow": false, + "git_tag": "3ccb639605eecd8e9fee52c2d7d56ea799f4864e", + "git_url": "git@github.com:nv-legate/legate.internal.git", + "org": "nv-legate", + "repo": "legate.internal", + "version": "26.01.00" + } } - } } diff --git a/conda/conda-build/build.sh b/conda/conda-build/build.sh old mode 100644 new mode 100755 diff --git a/conda/conda-build/conda_build_config.yaml b/conda/conda-build/conda_build_config.yaml index ada8dda2..a67aaba9 100644 --- a/conda/conda-build/conda_build_config.yaml +++ b/conda/conda-build/conda_build_config.yaml @@ -10,6 +10,14 @@ python: - 3.12 - 3.13 +# Pin sysroot glibc to match Legate's current baseline and avoid 
newer +# toolchains needing RELR-aware binutils. +c_stdlib: + - sysroot + +c_stdlib_version: + - "2.28" + numpy_version: # Not 2.1.0 which segfaults on asarray() sometimes, see # https://github.com/numpy/numpy/pull/27249 @@ -17,3 +25,6 @@ numpy_version: cmake_version: - ">=3.20.1,!=3.23.0" + +cuda_compiler: + - cuda-nvcc diff --git a/conda/conda-build/meta.yaml b/conda/conda-build/meta.yaml index 9bdad28f..209abd04 100644 --- a/conda/conda-build/meta.yaml +++ b/conda/conda-build/meta.yaml @@ -10,8 +10,11 @@ ## The placeholder version is strictly for making two-pass conda build process. ## It should not be used for any other purpose, and this is not a default version. {% set placeholder_version = '0.0.0.dev' %} -{% set default_cuda_version = '12.2.2' %} -{% set cuda_version='.'.join(environ.get('CUDA', default_cuda_version).split('.')[:2]) %} +{% set legate_cuda_version = environ.get('LEGATE_CUDA_VERSION') %} +{% if not legate_cuda_version %} +invalid_yaml_missing_cuda_version: LEGATE_CUDA_VERSION must be set +{% endif %} +{% set cuda_version='.'.join(legate_cuda_version.split('.')[:2]) %} {% set cuda_major=cuda_version.split('.')[0]|int %} {% set py_version=environ.get('CONDA_PY', '') %} @@ -100,14 +103,16 @@ requirements: - make - ninja - cmake {{ cmake_version }} - - {{ compiler('c') }} =11.2 - - {{ compiler('cxx') }} =11.2 - # the nvcc requirement is necessary because it contains crt/host_config.h used by cuda runtime. This is a packaging bug that has been reported. - - cuda-nvcc - # cudart needed for CPU and GPU builds because of curand - - cuda-cudart-dev + - {{ stdlib("c") }} + - {{ compiler('c') }} =14 + - {{ compiler('cxx') }} =14 + - pkg-config +{% if gpu_enabled_bool %} - cuda-version ={{ cuda_version }} + - {{ compiler('cuda') }} + - cuda-cudart-dev - libcusparse-dev +{% endif %} host: @@ -127,11 +132,12 @@ requirements: # legate, there may not be a cupynumeric package that is compatible. 
So, we # list cupynumeric here to get a pair of legate and cupynumeric that are # compatible. + - cuda-version ={{ cuda_version }} - cuda-cccl - libcusparse - - cuda-version ={{ cuda_version }} + - libcudss-dev + - nccl <2.29 - cuda-cudart - - nccl {% endif %} run: @@ -142,6 +148,9 @@ requirements: {% if gpu_enabled_bool %} - libnvjitlink - libcusparse + # ship the NCCL comm layer so multi-GPU cudss runs can load libcudss_commlayer_nccl.so + - libcudss-commlayer-nccl + - nccl >=2.0,<2.29 # Pin to all minor versions of CUDA newer than the one built against, within the same major version. # cuda-version constrains the CUDA runtime version and ensures a compatible driver is available - {{ pin_compatible('cuda-version', min_pin='x.x', max_pin='x') }} diff --git a/examples/common.py b/examples/common.py index 99174ed6..e7cfb396 100644 --- a/examples/common.py +++ b/examples/common.py @@ -11,15 +11,25 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import annotations import argparse import importlib +from typing import TYPE_CHECKING, Any, Protocol, cast -import numpy -from typing_extensions import Protocol +if TYPE_CHECKING: + from types import ModuleType + import numpy.typing as npt + from legate.timing._lib.timing import PyTime + from legate_sparse import csr_array -def get_arg_number(arg): +np: ModuleType +sparse: ModuleType +linalg: ModuleType + + +def get_arg_number(arg: str) -> int: """Parse a string argument that may contain size suffixes. Parameters @@ -68,11 +78,11 @@ class Timer(Protocol): for measuring execution time in the examples. """ - def start(self): + def start(self) -> None: """Start timing.""" ... - def stop(self): + def stop(self) -> float: """Stop timing and return duration. Blocks execution until everything before it has completed. 
@@ -92,19 +102,21 @@ class LegateTimer(Timer): measurement of GPU operations. """ - def __init__(self): - self._start = None + def __init__(self) -> None: + self._start: PyTime | None = None - def start(self): + def start(self) -> None: """Start timing using Legate's time function.""" from legate.timing import time self._start = time() - def stop(self): + def stop(self) -> float: """Stop timing and return duration in milliseconds.""" from legate.timing import time + assert self._start is not None + _end = time() return (_end - self._start) / 1000.0 @@ -116,24 +128,26 @@ class CuPyTimer(Timer): in CuPy applications. """ - def __init__(self): - self._start_event = None + def __init__(self) -> None: + self._start_event: Any | None = None - def start(self): + def start(self) -> None: """Start timing using CUDA events.""" - from cupy import cuda + from cupy import cuda # type: ignore [import-untyped] self._start_event = cuda.Event() self._start_event.record() - def stop(self): + def stop(self) -> float: """Stop timing and return duration in milliseconds.""" from cupy import cuda + assert self._start_event is not None + end_event = cuda.Event() end_event.record() end_event.synchronize() - return cuda.get_elapsed_time(self._start_event, end_event) + return cast(float, cuda.get_elapsed_time(self._start_event, end_event)) class NumPyTimer(Timer): @@ -143,19 +157,21 @@ class NumPyTimer(Timer): of CPU operations in NumPy/SciPy applications. 
""" - def __init__(self): - self._start_time = None + def __init__(self) -> None: + self._start_time: float | None = None - def start(self): + def start(self) -> None: """Start timing using perf_counter_ns.""" from time import perf_counter_ns self._start_time = perf_counter_ns() / 1000.0 - def stop(self): + def stop(self) -> float: """Stop timing and return duration in milliseconds.""" from time import perf_counter_ns + assert self._start_time is not None + end_time = perf_counter_ns() / 1000.0 return (end_time - self._start_time) / 1000.0 @@ -171,32 +187,31 @@ class DummyScope: that may or may not use resource scoping. """ - def __init__(self): - ... + def __init__(self) -> None: ... - def __enter__(self): + def __enter__(self) -> None: """Enter the context (no-op).""" ... - def __exit__(self, _, __, ___): + def __exit__(self, _: Any, __: Any, ___: Any) -> None: """Exit the context (no-op).""" ... - def __getitem__(self, item): + def __getitem__(self, item: Any) -> DummyScope: """Return self for any indexing (no-op).""" return self - def count(self, _): + def count(self, _: Any) -> int: """Return 1 for any count operation.""" return 1 @property - def preferred_kind(self): + def preferred_kind(self) -> None: """Return None for preferred kind.""" return None -def get_phase_procs(use_legate: bool): +def get_phase_procs(use_legate: bool) -> tuple[Any, Any]: """Get processor configurations for different phases of computation. Parameters @@ -252,7 +267,9 @@ def get_phase_procs(use_legate: bool): return DummyScope(), DummyScope() -def parse_common_args(): +def parse_common_args() -> tuple[ + str, Timer, ModuleType, ModuleType, ModuleType, bool +]: """Parse common command line arguments for example scripts. 
Returns @@ -274,6 +291,8 @@ def parse_common_args(): - "cupy": Uses cupy, cupyx.scipy.sparse, and cupyx.scipy.sparse.linalg - "scipy": Uses numpy, scipy.sparse, and scipy.sparse.linalg """ + global np, sparse, linalg + parser = argparse.ArgumentParser() parser.add_argument( "--package", @@ -283,6 +302,8 @@ def parse_common_args(): ) args, _ = parser.parse_known_args() + timer: Timer + if args.package == "legate": timer = LegateTimer() np_name = "cupynumeric" @@ -306,9 +327,9 @@ def parse_common_args(): use_legate = False - globals()["np"] = importlib.import_module(np_name) - globals()["sparse"] = importlib.import_module(sp_name) - globals()["linalg"] = importlib.import_module(lg_name) + np = importlib.import_module(np_name) + sparse = importlib.import_module(sp_name) + linalg = importlib.import_module(lg_name) return args.package, timer, np, sparse, linalg, use_legate @@ -317,7 +338,9 @@ def parse_common_args(): # # `diags` construct csr from dia array, while when from_diags=False # we construct csr arrya directly - might be slightly faster -def banded_matrix(N, nnz_per_row, from_diags=False): +def banded_matrix( + N: int, nnz_per_row: int, from_diags: bool = False +) -> csr_array: """Construct a banded matrix with 1.0 as values. 
Parameters @@ -375,7 +398,9 @@ def banded_matrix(N, nnz_per_row, from_diags=False): pred = np.arange(nnz_per_row - half_nnz, nnz_per_row + 1) post = np.flip(pred) - nnz_arr = np.concatenate((pred, np.ones(main_rows) * nnz_per_row, post)) + nnz_arr = np.concatenate( + (pred, np.ones(main_rows) * nnz_per_row, post) + ) row_offsets = np.zeros(N + 1).astype(sparse.coord_ty) row_offsets[1 : N + 1] = np.cumsum(nnz_arr) nnz = row_offsets[-1] @@ -399,7 +424,12 @@ def banded_matrix(N, nnz_per_row, from_diags=False): ) -def stencil_grid(S, grid, dtype=None, format=None): +def stencil_grid( + S: Any, + grid: tuple[int, int], + dtype: npt.dtype[Any] | None = None, + format: str | None = None, +) -> csr_array: """Construct a sparse matrix resulting from a stencil discretization on rectilinear grids. @@ -437,6 +467,8 @@ def stencil_grid(S, grid, dtype=None, format=None): >>> A = stencil_grid(S, (3, 3)) >>> print(A.toarray()) """ + import numpy + N_v = int(numpy.prod(grid)) # number of vertices in the mesh N_s = int((S != 0).sum(dtype=int)) # number of nonzero stencil entries @@ -497,7 +529,7 @@ def stencil_grid(S, grid, dtype=None, format=None): return sparse.dia_array((data, diags), shape=(N_v, N_v)).tocsr() -def poisson2D(N): +def poisson2D(N: int) -> csr_array: """Construct the 2D Poisson matrix. Parameters @@ -536,7 +568,9 @@ def poisson2D(N): diag_size = N * N - 1 first = np.full((N - 1), -1.0) chunks = np.concatenate([np.zeros(1), first]) - diag_a = np.concatenate([first, np.tile(chunks, (diag_size - (N - 1)) // N)]) + diag_a = np.concatenate( + [first, np.tile(chunks, (diag_size - (N - 1)) // N)] + ) diag_g = -1.0 * np.ones(N * (N - 1)) diag_c = 4.0 * np.ones(N * N) @@ -549,7 +583,7 @@ def poisson2D(N): return sparse.diags(diagonals, offsets, dtype=np.float64).tocsr() -def diffusion2D(N, epsilon=1.0, theta=0.0): +def diffusion2D(N: int, epsilon: float = 1.0, theta: float = 0.0) -> csr_array: """Construct a 2D diffusion matrix with anisotropy. 
Parameters diff --git a/examples/direct_solve_banded_system.py b/examples/direct_solve_banded_system.py new file mode 100644 index 00000000..1876acfa --- /dev/null +++ b/examples/direct_solve_banded_system.py @@ -0,0 +1,86 @@ +import argparse +from common import get_arg_number, parse_common_args + +"""Sparse Direct Solve Benchmark. + +This script benchmarks sparse direct solve for a banded system of equations + +""" + + +def create_system_of_eqns(nrows, dtype): + """ + Creates a system of equations A*x = b where: + - A has 4 on the main diagonal (k=0), 1 on the first and second upper diagonal (k=1, 2) + - and 1 on the first lower diagonal (k=-1) + - The solution x is [1, 2, 3, ..., nrows] + - b is computed as A @ x + """ + + main_diag = np.full(nrows, 4.0) + upper1_diag = np.ones(nrows - 1) + upper2_diag = np.ones(nrows - 2) + lower1_diag = np.ones(nrows - 1) + + A = sparse.diags( + [lower1_diag, main_diag, upper1_diag, upper2_diag], + offsets=[-1, 0, 1, 2], + shape=(nrows, nrows), + dtype=np.float64, + format="csr", + ) + x_expected = np.arange(1, nrows + 1, dtype=dtype) + b = A @ x_expected + + return (A, b, x_expected) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + + parser.add_argument( + "-n", + "--nrows", + type=str, + default="12", + dest="nrows", + help="Number of rows in the generated matrix (accepts suffixes 'k', 'm', 'g')", + ) + + parser.add_argument( + "--nwarmups", + type=int, + default=2, + dest="nwarmups", + help="Number of warmup iterations before spsolve is timed", + ) + + args, _ = parser.parse_known_args() + package, timer, np, sparse, _, _ = parse_common_args() + + nrows = get_arg_number(args.nrows) + nwarmups = args.nwarmups + + assert nrows > 0, "Matrix must contain atleast one row" + assert nwarmups >= 0, "Warmup iterations must be >= 0" + + timer.start() + A, b, x_expected = create_system_of_eqns(nrows, np.float64) + elapsed_time_setup = timer.stop() + + for _ in range(nwarmups): + x = sparse.linalg.spsolve(A, b) + + 
timer.start() + x = sparse.linalg.spsolve(A, b) + elapsed_time_solve = timer.stop() + + error_l2_norm = np.linalg.norm(x_expected - x) / np.linalg.norm(x_expected) + + print(f"Dimension of A : {A.shape}") + print(f"Dimension of b : {b.shape}") + print(f"Dimension of x : {x.shape}") + print(f"NNZ of A : {A.nnz}") + print(f"Elapsed time for setup (ms) : {elapsed_time_setup}") + print(f"Elapsed time for solve (ms) : {elapsed_time_solve}") + print(f"Error in solution : {error_l2_norm}") diff --git a/examples/gmg.py b/examples/gmg.py deleted file mode 100644 index 6491ef59..00000000 --- a/examples/gmg.py +++ /dev/null @@ -1,492 +0,0 @@ -# Copyright 2022-2024 NVIDIA Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -# Portions of this file are also subject to the following license: -# -# The MIT License (MIT) -# -# Copyright (c) 2008-2015 PyAMG Developers -# -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included in all -# copies or substantial portions of the Software. 
-# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. - -import argparse - -# for some small data manipulations on host -import numpy -from common import diffusion2D, get_phase_procs, parse_common_args, poisson2D - - -def max_eigenvalue(A, iters=15): - # Compute eigenvector associated with maximum eigenvalue via power - # iteration. This is the same as Steven's imp for estimating spectral - # radius. - x1 = np.random.rand(A.shape[1]).reshape(-1, 1) - for _ in range(iters): - x1 = A @ x1 - x1 /= np.linalg.norm(x1) - # Compute and return max eigenvalue via Raleigh quotient. - # This is np.dot(A @ x1, x1) / np.dot(x1, x1) - # but since x1 is a unit vector, we can assume denominator is 1. - return np.dot(x1.T, A @ x1).item() - - -class GMG(object): - """ - Geometric Multigrid solver for the 2D Poisson problem. 
- - - Source on correctness of restriction / prolongation operators: [1] - - Sources on V-cycle algorithm: [1, 2, 3, 4] - - Source on preconditioned conjugate gradient and Gauss-Seidel smoothing: [4] - - [1] https://www.researchgate.net/publication/220690328_A_Multigrid_Tutorial_2nd_Edition - [2] https://github.com/pyamg/pyamg - [3] http://www.cs.columbia.edu/cg/pdfs/28_GPUSim.pdf - [4] https://netlib.org/utk/people/JackDongarra/PAPERS/HPCG-benchmark.pdf - """ # noqa: E501 - - def __init__(self, A, shape, levels, smoother, gridop, machine): - self.A = A - self.shape = shape - self.N = numpy.prod(self.shape) - self.levels = levels - self.restriction_op = { - "injection": injection_operator, - "linear": linear_operator, - }[gridop] - self.smoother = {"jacobi": WeightedJacobi}[smoother]() - self.operators = self.compute_operators(A) - self.temp = None - self.machine = machine - self.proc_kind = machine.preferred_target - - def compute_operators(self, A): - operators = [] - dim = self.N - self.smoother.init_level_params(A, 0) - for level in range(self.levels): - R, dim = self.compute_restriction_level(dim) - P = R.T - # assert sparse.issparse(P) - A = R @ A @ P - # assert sparse.issparse(A) - self.smoother.init_level_params(A, level + 1) - operators.append((R, A, P)) - return operators - - def cycle(self, r): - # Kick off the cycle with the top-level machine. - # TODO (marsaev): there are issues with scoping - # disabling it for now - return self._cycle(self.A, r, 0, self.machine) - - def _cycle(self, A, r, level, machine): - if level == self.levels - 1: - return self.smoother.coarse(A, r, None, level=level) - x = None - # Do one pre-smoothing iteration. - R, coarse_A, P = self.operators[level] - x = self.smoother.pre(A, r, x, level=level) - # Compute the residual. - fine_r = r - A.dot(x) - - # Restrict the residual. 
- if use_legate: - # TODO (marsaev): there col-split splmv optimization - coarse_r = R.dot(fine_r) - else: - coarse_r = R.dot(fine_r) - - # Compute coarse solution using a subset of the machine. - # TODO (marsaev): there are issues with scoping - # disabling it for now - coarse_x = self._cycle(coarse_A, coarse_r, level + 1, self.machine) - - fine_x = P @ coarse_x - x_corrected = x + fine_x - # Do one post-smoothing iteration. - return self.smoother.post(A, r, x_corrected, level=level) - - def compute_restriction_level(self, fine_dim): - return self.restriction_op(fine_dim) - - def linear_operator(self): - return linalg.LinearOperator( - self.A.shape, dtype=float, matvec=lambda r: self.cycle(r) - ) - - -class WeightedJacobi(object): - def __init__(self, omega=4.0 / 3.0): - # Basically, similar solution to PyAMG. - self.level_params = [] - self._init_omega = omega - - def init_level_params(self, A, level): - D_inv = 1.0 / A.diagonal() - # We need to create a new sparse matrix with just this modified - # diagonal of A. sparse.eye doesn't have this nob, but we can take - # the output of sparse.eye and mess with it to get the matrix - # that we want. 
- D_inv_nnz = min(A.shape[0], A.shape[1]) - D_inv_mat = sparse.csr_array( - ( - np.ones(D_inv_nnz).astype(A.dtype), - ( - np.arange(D_inv_nnz).astype(sparse.coord_ty), - np.arange(D_inv_nnz).astype(sparse.coord_ty), - ), - ), - shape=A.shape, - dtype=A.dtype, - copy=False, - ) - """ - sparse.eye( - A.shape[0], n=A.shape[1], dtype=A.dtype, format="csr" - ) - """ - D_inv_mat.data = 1.0 / D_inv - spectral_radius = max_eigenvalue(A @ D_inv_mat, 1) - omega = self._init_omega / spectral_radius - self.level_params.append((omega, D_inv)) - assert len(self.level_params) - 1 == level - - def __call__(self, A, r, x, level): - omega, D_inv = self.level_params[level] - return (1 - omega) * x + omega * (r - A @ x + x / D_inv) * D_inv - - def pre(self, A, r, x, level): - if x is not None: - raise Exception("Expected x is None.") - omega, D_inv = self.level_params[level] - return omega * r * D_inv - - def post(self, A, r, x, level): - omega, D_inv = self.level_params[level] - return x + omega * (r - A @ x) * D_inv - - def coarse(self, A, r, x, level): - return self.pre(A, r, x, level) - # return sparse.linalg.spsolve(A, r) - - -def injection_operator(fine_dim): - fine_shape = (int(np.sqrt(fine_dim)),) * 2 - coarse_shape = fine_shape[0] // 2, fine_shape[1] // 2 - coarse_dim = numpy.prod(coarse_shape) - Rp = np.arange(coarse_dim + 1) - Rx = np.ones((coarse_dim,), dtype=np.float64) - ij = np.arange(coarse_dim, dtype=np.int64) - i = ij % coarse_shape[1] - j = ij // coarse_shape[1] - Rj = 2 * i + 2 * j * coarse_shape[1] - R = sparse.csr_matrix((Rx, Rj, Rp), shape=(coarse_dim, fine_dim), dtype=np.float64) - return R, coarse_dim - - -def linear_operator(fine_dim): - fine_shape = (int(np.sqrt(fine_dim)),) * 2 - coarse_shape = fine_shape[0] // 2, fine_shape[1] // 2 - coarse_dim = np.prod(coarse_shape) - # Construct CSR directly. - Rp = numpy.empty(coarse_dim + 1, dtype=np.int64) - # Get an upper bound on the total number of non-zeroes, and construct Rj - # and Rx based on this bound. 
Computing this value exactly is tedious and - # the extra allocation can be truncated at the end. We won't need more - # than 9*coarse_dim rows. - nnz = 9 * coarse_dim - Rj = numpy.empty((nnz,), dtype=np.int64) - Rx = numpy.empty((nnz,), dtype=np.float64) - p = 0 - - def flatten(i, j): - return i * fine_shape[1] + j - - for ij in range(coarse_dim): - Rp[ij] = p - # For linear interpolation, - # we have 9 points over which to average in the 2d case. - # The coefficient matrix will act as a stencil operator. - i, j = (ij // coarse_shape[1]), (ij % coarse_shape[1]) - # Corners. - # r[2*i-1, 2*j-1] = 1/16 - # r[2*i-1, 2*j+1] = 1/16 - # r[2*i+1, 2*j-1] = 1/16 - # r[2*i+1, 2*j+1] = 1/16 - # Edges. - # r[2*i, 2*j+1] = 2/16 - # r[2*i, 2*j-1] = 2/16 - # r[2*i-1, 2*j] = 2/16 - # r[2*i+1, 2*j] = 2/16 - # Center. - # r[2 * i, 2 * j] = 4/16 - # Ensure indices are constructed in order. - # Assumes row-major ordering. - if 0 <= 2 * i - 1: - if 0 <= 2 * j - 1: - # top-left - Rj[p], Rx[p] = flatten(2 * i - 1, 2 * j - 1), 1 / 16 - p += 1 - # top-middle - Rj[p], Rx[p] = flatten(2 * i - 1, 2 * j), 2 / 16 - p += 1 - if 2 * j + 1 < fine_dim: - # top-right - Rj[p], Rx[p] = flatten(2 * i - 1, 2 * j + 1), 1 / 16 - p += 1 - if 0 <= 2 * j - 1: - # middle-left - Rj[p], Rx[p] = flatten(2 * i, 2 * j - 1), 2 / 16 - p += 1 - # middle-middle - Rj[p], Rx[p] = flatten(2 * i, 2 * j), 4 / 16 - p += 1 - if 2 * j + 1 < fine_dim: - # middle-right - Rj[p], Rx[p] = flatten(2 * i, 2 * j + 1), 2 / 16 - p += 1 - if 2 * i + 1 < fine_dim: - if 0 <= 2 * j - 1: - # bottom-left - Rj[p], Rx[p] = flatten(2 * i + 1, 2 * j - 1), 1 / 16 - p += 1 - # bottom-middle - Rj[p], Rx[p] = flatten(2 * i + 1, 2 * j), 2 / 16 - p += 1 - if 2 * j + 1 < fine_dim: - # bottom-right - Rj[p], Rx[p] = flatten(2 * i + 1, 2 * j + 1), 1 / 16 - p += 1 - - Rp[coarse_dim] = p - Rx, Rj, Rp = np.array(Rx[:p]), np.array(Rj[:p]), np.array(Rp) - R = sparse.csr_matrix((Rx[:p], Rj[:p], Rp), shape=(coarse_dim, fine_dim)) - return R, coarse_dim - - -def 
required_driver_memory(N): - NN = N * N - fine_shape = (int(np.sqrt(NN)),) * 2 - coarse_shape = fine_shape[0] // 2, fine_shape[1] // 2 - coarse_dim = numpy.prod(coarse_shape) - nnz = 9 * coarse_dim - elements = nnz + coarse_dim + 1 - bytes = elements * 8 - mb = bytes / 10**6 - print("Max required driver memory for N=%d is %fMB" % (N, mb)) - - -def print_diagnostics(operators): - """Print basic statistics about the multigrid hierarchy.""" - output = "MultilevelSolver\n" - output += f"Number of Levels: {len(operators)}\n" - # output += f"Operator Complexity: {operator_complexity(levels):6.3f}\n" - # output += f"Grid Complexity: {grid_complexity(levels):6.3f}\n" - - total_nnz = sum(level[1].nnz for level in operators) - - # 123456712345678901 123456789012 123456789 - # 0 10000 49600 [52.88%] - output += " level unknowns nonzeros\n" - for n, level in enumerate(operators): - A = level[1] - ratio = 100 * A.nnz / total_nnz - output += f"{n:>6} {A.shape[1]:>11} {A.nnz:>12} [{ratio:2.2f}%]\n" - - print(output) - - -def execute(N, data, smoother, gridop, levels, maxiter, tol, verbose, warmup, timer): - build, solve = get_phase_procs(use_legate) - - if warmup: - tA = diffusion2D(64, epsilon=0.1, theta=np.pi / 4) - tB = tA.T - tC = tB @ tA # noqa: F841 - - # Generate matrix - timer.start() - if data == "poisson": - A = poisson2D(N) - b = np.random.rand(N**2) - elif data == "diffusion": - A = diffusion2D(N) - b = np.random.rand(N**2) - else: - raise NotImplementedError(data) - print(f"GMG: {A.shape}") - print(f"Data creation time: {timer.stop()} ms") - - assert smoother == "jacobi", "Only Jacobi smoother is currently supported." 
- - if verbose: - - def callback(x): - print(f"Residual: {np.linalg.norm(b - (A @ x))}") - - else: - callback = None - - required_driver_memory(N) - # Setup - timer.start() - mg_solver = GMG( - A=A, - shape=(N, N), - levels=levels, - smoother=smoother, - gridop=gridop, - machine=solve, - ) - M = mg_solver.linear_operator() - print(f"GMG init time: {timer.stop()} ms") - - print_diagnostics(mg_solver.operators) - - # Warm up the runtime. - float( - np.linalg.norm( - A.dot( - np.zeros( - A.shape[1], - ) - ) - ) - ) - float( - np.linalg.norm( - M.matvec( - np.zeros( - M.shape[1], - ) - ) - ) - ) - # Make another call to random here as well. - float(np.linalg.norm(np.random.rand(b.shape[0]))) - - # Solve - timer.start() - x, iters = linalg.cg(A, b, rtol=tol, maxiter=maxiter, M=M, callback=callback) - total = timer.stop() - - norm_ini = np.linalg.norm(b) - norm_res = np.linalg.norm(b - (A @ x)) - - # Check convergence with relative tolerance - convergence_status = True if norm_res <= norm_ini * tol else False - print(f"Dimension of A : {A.shape}") - print(f"Did the solution converge : {convergence_status}") - print(f"Final relative residual norm : {norm_res / norm_ini}") - print(f"Number of iterations : {iters}") - print(f"Total elapsed time (ms) : {total}") - print(f"Time per iteration (ms) : {total / iters}") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "-n", - "--num", - type=int, - default=16, - dest="N", - help="number of elements in one dimension", - ) - parser.add_argument( - "-d", - "--data", - dest="data", - choices=["poisson", "diffusion"], - type=str, - default="poisson", - help="The problem instance to solve.", - ) - parser.add_argument( - "-s", - "--smoother", - dest="smoother", - choices=["jacobi"], - type=str, - default="jacobi", - help="Smoother to use.", - ) - parser.add_argument( - "-g", - "--gridop", - dest="gridop", - choices=["linear", "injection"], - type=str, - default="injection", - help="Intergrid 
transfer operator to use.", - ) - parser.add_argument( - "-l", - "--levels", - dest="levels", - type=int, - default=2, - help="Number of multigrid levels.", - ) - parser.add_argument( - "-m", - "--maxiter", - type=int, - default=200, - dest="maxiter", - help="bound the maximum number of iterations", - ) - parser.add_argument( - "-v", - "--verbose", - dest="verbose", - action="store_true", - help="print verbose output", - ) - parser.add_argument( - "--tol", - type=float, - default=1e-10, - dest="tol", - help="Convergence relative norm check threshold", - ) - - parser.add_argument( - "-w", - "--warmup", - dest="warmup", - action="store_true", - help="Perform some Warmup operations before running timings", - ) - - args, _ = parser.parse_known_args() - _, timer, np, sparse, linalg, use_legate = parse_common_args() - execute(**vars(args), timer=timer) diff --git a/examples/matrix_power.py b/examples/matrix_power.py index cc52c08b..69a807aa 100644 --- a/examples/matrix_power.py +++ b/examples/matrix_power.py @@ -32,11 +32,17 @@ --package: Backend to use (legate, cupy, scipy) """ +from __future__ import annotations + import argparse from functools import reduce +from typing import TYPE_CHECKING + +from common import Timer, get_arg_number, parse_common_args -import numpy.typing as npt -from common import get_arg_number, parse_common_args +if TYPE_CHECKING: + import numpy.typing as npt + from legate_sparse import csr_array # global states random_seed, rng global random_seed, rng @@ -46,7 +52,9 @@ # ---------------------------- -def create_csr_with_nnz_per_row(nrows, nnz_per_row: int, dtype: npt.DTypeLike = None): +def create_csr_with_nnz_per_row( + nrows: int, nnz_per_row: int, dtype: npt.DTypeLike | None = None +) -> csr_array: """Create a CSR matrix with a prescribed number of nonzeros in each row. 
Parameters @@ -84,7 +92,9 @@ def create_csr_with_nnz_per_row(nrows, nnz_per_row: int, dtype: npt.DTypeLike = return matrix -def create_csr_with_nnz_total(nrows, nnz_total, dtype: npt.DTypeLike = None): +def create_csr_with_nnz_total( + nrows: int, nnz_total: int, dtype: npt.DTypeLike | None = None +) -> csr_array: """Create a CSR matrix with a prescribed total number of nonzeros. Parameters @@ -113,7 +123,9 @@ def create_csr_with_nnz_total(nrows, nnz_total, dtype: npt.DTypeLike = None): coo_rows = rng.integers(0, nrows, nnz_total) coo_cols = rng.integers(0, ncols, nnz_total) vals = np.ones(nnz_total, dtype=dtype) - matrix = sparse.csr_matrix((vals, (coo_rows, coo_cols)), shape=(nrows, ncols)) + matrix = sparse.csr_matrix( + (vals, (coo_rows, coo_cols)), shape=(nrows, ncols) + ) return matrix @@ -123,7 +135,9 @@ def create_csr_with_nnz_total(nrows, nnz_total, dtype: npt.DTypeLike = None): # ------------------------ -def compute_A_power_k(A, timer, nwarmups: int = 2, k: int = 4): +def compute_A_power_k( + A: csr_array, timer: Timer, nwarmups: int = 2, k: int = 4 +) -> None: """Compute A^k and measure performance. 
Parameters @@ -180,7 +194,9 @@ def compute_A_power_k(A, timer, nwarmups: int = 2, k: int = 4): print( f"Elapsed time for spgemm for hop {hop} (ms) : {elapsed_time_spgemm[hop]}" ) - print(f"Elapsed time for copy for hop {hop} (ms) : {elapsed_time_copy[hop]}") + print( + f"Elapsed time for copy for hop {hop} (ms) : {elapsed_time_copy[hop]}" + ) if __name__ == "__main__": @@ -243,13 +259,11 @@ def compute_A_power_k(A, timer, nwarmups: int = 2, k: int = 4): ) args, _ = parser.parse_known_args() - _, timer, np, sparse, linalg, use_legate = parse_common_args() + package, timer, np, sparse, linalg, use_legate = parse_common_args() nrows = get_arg_number(args.nrows) nnz_total = get_arg_number(args.nnz_total) - # this is a global variable - global random_seed, rng random_seed = args.random_seed if args.same_sparsity_for_cpu_and_gpu: @@ -277,4 +291,6 @@ def compute_A_power_k(A, timer, nwarmups: int = 2, k: int = 4): compute_A_power_k(A, timer, int(args.nwarmups), int(args.k)) - print(f"Elapsed time in matrix creation (ms) : {elapsed_time_matrix_gen}") + print( + f"Elapsed time in matrix creation (ms) : {elapsed_time_matrix_gen}" + ) diff --git a/examples/pde.py b/examples/pde.py index d9ca0095..6745ee32 100644 --- a/examples/pde.py +++ b/examples/pde.py @@ -38,16 +38,19 @@ --package: Backend to use (legate, cupy, scipy) """ +from __future__ import annotations + # This PDE solving application is derived from # https://aquaulb.github.io/book_solving_pde_mooc/solving_pde_mooc/notebooks/05_IterativeMethods/05_01_Iteration_and_2D.html. 
import argparse import sys +from typing import Any -from common import get_phase_procs, parse_common_args +from common import Timer, get_phase_procs, parse_common_args -def d2_mat_dirichlet_2d(nx, ny, dx, dy): +def d2_mat_dirichlet_2d(nx: int, ny: int, dx: float, dy: float) -> Any: """ Constructs the matrix for the centered second-order accurate second-order derivative for Dirichlet boundary conditions in 2D @@ -114,7 +117,7 @@ def d2_mat_dirichlet_2d(nx, ny, dx, dy): return d2mat -def p_exact_2d(X, Y): +def p_exact_2d(X: Any, Y: Any) -> Any: """Computes the exact solution of the Poisson equation in the domain [0, 1]x[-0.5, 0.5] with rhs: b = (np.sin(np.pi * X) * np.cos(np.pi * Y) + @@ -133,14 +136,26 @@ def p_exact_2d(X, Y): exact solution of the Poisson equation """ - sol = -1.0 / (2.0 * np.pi**2) * np.sin(np.pi * X) * np.cos(np.pi * Y) - 1.0 / ( - 50.0 * np.pi**2 - ) * np.sin(5.0 * np.pi * X) * np.cos(5.0 * np.pi * Y) + sol = -1.0 / (2.0 * np.pi**2) * np.sin(np.pi * X) * np.cos( + np.pi * Y + ) - 1.0 / (50.0 * np.pi**2) * np.sin(5.0 * np.pi * X) * np.cos( + 5.0 * np.pi * Y + ) return sol -def execute(nx, ny, plot, plot_fname, throughput, tol, max_iters, warmup_iters, timer): +def execute( + nx: int, + ny: int, + plot: bool, + plot_fname: str, + throughput: bool, + tol: float, + max_iters: int, + warmup_iters: int, + timer: Timer, +) -> None: # Grid parameters. xmin, xmax = 0.0, 1.0 # limits in the x direction ymin, ymax = -0.5, 0.5 # limits in the y direction @@ -181,9 +196,9 @@ def execute(nx, ny, plot, plot_fname, throughput, tol, max_iters, warmup_iters, # Compute the rhs. Note that we non-dimensionalize the coordinates # x and y with the size of the domain in their respective dire- # ctions. - b = np.sin(np.pi * X) * np.cos(np.pi * Y) + np.sin(5.0 * np.pi * X) * np.cos( - 5.0 * np.pi * Y - ) + b = np.sin(np.pi * X) * np.cos(np.pi * Y) + np.sin( + 5.0 * np.pi * X + ) * np.cos(5.0 * np.pi * Y) # b is currently a 2D array. 
 We need to convert it to a column-major # ordered 1D array. This is done with the flatten numpy function. @@ -194,7 +209,7 @@ def execute(nx, ny, plot, plot_fname, throughput, tol, max_iters, warmup_iters, # count combinations as well. Even more annoyingly, doing any sort # of flatten results in some bad assignment of equivalence sets within # Legion's dependence analysis. So if we're just testing solve - # throughput, use an array of all ones. + # throughput, use an array of all ones. if throughput: n = b.shape[0] - 2 bflat = np.ones((n * n,)) @@ -218,7 +233,13 @@ def execute(nx, ny, plot, plot_fname, throughput, tol, max_iters, warmup_iters, # If we're testing throughput, run only the prescribed number of iterations. if throughput: if use_legate: - p_sol, iters = linalg.cg(A, bflat, rtol=tol, maxiter=max_iters, conv_test_iters=max_iters) + p_sol, iters = linalg.cg( + A, + bflat, + rtol=tol, + maxiter=max_iters, + conv_test_iters=max_iters, + ) else: p_sol, iters = linalg.cg(A, bflat, rtol=tol, maxiter=max_iters) else: @@ -242,8 +263,12 @@ def execute(nx, ny, plot, plot_fname, throughput, tol, max_iters, warmup_iters, # Check convergence with relative tolerance convergence_status = True if norm_res <= norm_ini * tol else False - print(f"Did the solution converge : {convergence_status}") - print(f"Final relative residual norm : {norm_res / norm_ini}") + print( + f"Did the solution converge : {convergence_status}" + ) + print( + f"Final relative residual norm : {norm_res / norm_ini}" + ) if iters > 0: print(f"Number of iterations : {iters}") print(f"Time per iteration (ms) : {total / iters}") @@ -321,10 +346,14 @@ def execute(nx, ny, plot, plot_fname, throughput, tol, max_iters, warmup_iters, ) args, _ = parser.parse_known_args() - _, timer, np, sparse, linalg, use_legate = parse_common_args() + package, timer, np, sparse, linalg, use_legate = parse_common_args() - if args.throughput and (args.max_iters is None or args.warmup_iters is None): - print("Must
provide --max-iters and --warmup-iters when using --throughput.") + if args.throughput and ( + args.max_iters is None or args.warmup_iters is None + ): + print( + "Must provide --max-iters and --warmup-iters when using --throughput." + ) sys.exit(1) execute(**vars(args), timer=timer) diff --git a/examples/poisson_5point_example.py b/examples/poisson_5point_example.py new file mode 100644 index 00000000..e2685a17 --- /dev/null +++ b/examples/poisson_5point_example.py @@ -0,0 +1,193 @@ +# Copyright 2022-2025 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Solve Poisson equation: -∇²u = f(x,y) on domain [0,1]×[0,1] +With Dirichlet boundary conditions: u = 0 on boundary. We use +a manufactured solution approach u(x,y) = sin(2πx) * sin(2πy) +and use that to compute the RHS. +""" + +from __future__ import annotations + +import argparse + +from common import parse_common_args, get_arg_number + + +def create_poisson_mat(n, h): + """ + Create the 2D Poisson equation discretization matrix using 5-point stencil. 
+ + The 5-point stencil for -∇²u at point (i,j) is: + -u_{i,j-1} - u_{i-1,j} + 4*u_{i,j} - u_{i+1,j} - u_{i,j+1} = h²*f_{i,j} + + Parameters + ---------- + n : int + Number of interior grid points in each direction (n×n grid) + h : float + Grid spacing (h = 1/(n+1)) + + Returns + ------- + A : sparse CSR matrix + The discretization matrix of shape (n**2, n**2) + """ + N = n * n # Total number of unknowns + + # stencil: + # -1 + # -1 4 -1 + # -1 + main_diag = 4.0 * np.ones(N) / (h * h) + off_diag1 = -1.0 * np.ones(N - 1) / (h * h) + off_diag2 = -1.0 * np.ones(N - n) / (h * h) + + # cupynumeric doesn't support non-unit strides in indexing, + # so use a mask array to set every "n" elements to zero + zero_out_indices = np.array(range(n - 1, N - 1, n), dtype=int) + off_diag1[zero_out_indices] = 0.0 + + # The offsets : [-n, -1, 0, 1, n ] + # correspond to : [below, left, center, right, above ] + diagonals = [off_diag2, off_diag1, main_diag, off_diag1, off_diag2] + offsets = [-n, -1, 0, 1, n] + + # Create the sparse matrix and convert to CSR format + return sparse.diags( + diagonals, offsets, shape=(N, N), dtype=np.float64, format="csr" + ) + + +def manufactured_solution(x, y): + "u(x,y) = sin(2πx) * sin(2πy) satisfies u=0 on the boundary of [0,1]×[0,1]" + return np.sin(2 * np.pi * x) * np.sin(2 * np.pi * y) + + +def compute_rhs(x, y): + """ + Compute the right-hand side f(x,y) for the manufactured solution. + + For u(x,y) = sin(2πx) * sin(2πy), we have: + -∇²u = 8π² * sin(2πx) * sin(2πy) = f(x,y) + """ + return 8 * np.pi**2 * np.sin(2 * np.pi * x) * np.sin(2 * np.pi * y) + + +def solve_poisson_2d(n, verbose=True) -> float: + """ + Solve the 2D Poisson equation with Dirichlet boundary conditions. 
+ + Parameters + ---------- + n : int + Number of interior grid points in each direction + verbose : bool + Whether to print detailed output + + Returns + ------- + error : float + The L2 error between numerical and analytical solutions + """ + h = 1.0 / (n + 1) + + if verbose: + print(f"Solving 2D Poisson equation on {n}×{n} grid") + print(f"Grid spacing h = {h:.6f}") + print(f"Total unknowns: {n * n}") + + # Create grid points (interior points only) and flatten it + x = np.linspace(h, 1 - h, n) + y = np.linspace(h, 1 - h, n) + X, Y = np.meshgrid(x, y, indexing="ij") + X_flat = X.flatten() + Y_flat = Y.flatten() + + A = create_poisson_mat(n, h) + b = compute_rhs(X_flat, Y_flat) + + if verbose: + print(f"Matrix shape : {A.shape}") + print(f"Matrix non-zeros : {A.nnz}") + print(f"Sparsity : {A.nnz / (n * n) ** 2:.6f}") + print("\nSolving linear system Ax = b using spsolve...") + + x_numerical = linalg.spsolve(A, b) + x_analytical = manufactured_solution(X_flat, Y_flat) + + error_vec = x_numerical - x_analytical + l2_error = np.linalg.norm(error_vec) * h # Scale by h for L2 norm + l_inf_error = np.max(np.abs(error_vec)) + relative_error = l2_error / (np.linalg.norm(x_analytical) * h) + + residual = A @ x_numerical - b + residual_norm = np.linalg.norm(residual) + + if verbose: + print("\nResults:") + print(f"L2 error : {l2_error:.6e}") + print(f"L∞ error : {l_inf_error:.6e}") + print(f"Relative L2 error: {relative_error:.6e}") + print(f"Residual norm ||Ax - b||: {residual_norm:.6e}") + + return l2_error + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Solve 2D Poisson equation with 5-point stencil" + ) + parser.add_argument( + "--size", + "-n", + type=str, + default="32", + help="Number of interior grid points in each direction (default: 32)", + ) + parser.add_argument( + "--verbose", + action="store_true", + help="Use this argument for verbose output", + ) + + args, _ = parser.parse_known_args() + package, timer, np, sparse, linalg, 
use_legate = parse_common_args() + + n_interior = get_arg_number(args.size) + + solve_poisson_2d(n_interior, verbose=args.verbose) + + print("\n" + "=" * 60) + print("Verification: Testing with smaller grid for convergence check") + print("=" * 60) + + # Perform convergence tests + n1, n2 = n_interior, n_interior * 2 + l2_error1 = solve_poisson_2d(n1, verbose=False) + l2_error2 = solve_poisson_2d(n2, verbose=False) + + convergence_rate = np.log2(l2_error1 / l2_error2) + print(f"Grid refinement : {n1}×{n1} → {n2}×{n2}") + print(f"Error reduction factor : {l2_error1 / l2_error2:.3f}") + print(f"Convergence rate : {convergence_rate:.3f}") + print("Expected rate for 5-point stencil: ~2.0") + + if abs(convergence_rate - 2.0) < 0.5: + print( + "\n✓ Solution verified: convergence rate is close to expected value" + ) + else: + print("\n⚠ Warning: convergence rate differs from expected value") diff --git a/examples/spgemm_microbenchmark.py b/examples/spgemm_microbenchmark.py index e30c05dd..0f97be69 100644 --- a/examples/spgemm_microbenchmark.py +++ b/examples/spgemm_microbenchmark.py @@ -32,12 +32,24 @@ --package: Backend to use (legate, cupy, scipy) """ +from __future__ import annotations + import argparse +from typing import TYPE_CHECKING + +from common import ( + Timer, + banded_matrix, + get_arg_number, + get_phase_procs, + parse_common_args, +) -from common import banded_matrix, get_arg_number, get_phase_procs, parse_common_args +if TYPE_CHECKING: + from legate_sparse import csr_array -def spgemm_dispatch(A, B): +def spgemm_dispatch(A: csr_array, B: csr_array) -> csr_array: """Dispatch sparse matrix-matrix multiplication operation. Parameters @@ -61,7 +73,9 @@ def spgemm_dispatch(A, B): return C -def get_matrices(N, nnz_per_row, fname1, fname2): +def get_matrices( + N: int, nnz_per_row: int, fname1: str, fname2: str +) -> tuple[csr_array, csr_array]: """Get matrices for SpGEMM benchmark. 
Parameters @@ -100,7 +114,15 @@ def get_matrices(N, nnz_per_row, fname1, fname2): return A, A.copy() -def run_spgemm(N, nnz_per_row, fname1, fname2, iters, stable, timer): +def run_spgemm( + N: int, + nnz_per_row: int, + fname1: str, + fname2: str, + iters: int, + stable: bool, + timer: Timer, +) -> None: """Run sparse matrix-matrix multiplication benchmark. Parameters @@ -229,7 +251,7 @@ def run_spgemm(N, nnz_per_row, fname1, fname2, iters, stable, timer): ) args, _ = parser.parse_known_args() - _, timer, np, sparse, linalg, use_legate = parse_common_args() + package, timer, np, sparse, linalg, use_legate = parse_common_args() init_procs, bench_procs = get_phase_procs(use_legate) diff --git a/examples/spmv_microbenchmark.py b/examples/spmv_microbenchmark.py index c6f11ff8..b449b026 100644 --- a/examples/spmv_microbenchmark.py +++ b/examples/spmv_microbenchmark.py @@ -34,13 +34,27 @@ --package: Backend to use (legate, cupy, scipy) """ +from __future__ import annotations + import argparse +from typing import TYPE_CHECKING, Any + +from common import ( + Timer, + banded_matrix, + get_arg_number, + get_phase_procs, + parse_common_args, +) -from common import banded_matrix, get_arg_number, get_phase_procs, parse_common_args +if TYPE_CHECKING: + from legate_sparse import csr_array # Writing to pre-allocated array is preferred -def spmv_dispatch(A, x, y, i, repartition): +def spmv_dispatch( + A: csr_array, x: Any, y: Any, i: int, repartition: bool +) -> None: """Dispatch sparse matrix-vector multiplication operation. Parameters @@ -77,7 +91,9 @@ def spmv_dispatch(A, x, y, i, repartition): y = A @ x -def run_spmv(A, iters, repartition, timer): +def run_spmv( + A: csr_array, iters: int, repartition: bool, timer: Timer +) -> None: """Run sparse matrix-vector multiplication benchmark. 
Parameters @@ -105,9 +121,9 @@ def run_spmv(A, iters, repartition, timer): x = np.ones((A.shape[1],)) y = np.zeros((A.shape[0],)) - assert not repartition or ( - A.shape[0] == A.shape[1] - ), "Matrix should be square for switching x and y" + assert not repartition or (A.shape[0] == A.shape[1]), ( + "Matrix should be square for switching x and y" + ) # Warm up runs warmup_iters = 5 @@ -186,7 +202,7 @@ def run_spmv(A, iters, repartition, timer): ) args, _ = parser.parse_known_args() - _, timer, np, sparse, linalg, use_legate = parse_common_args() + package, timer, np, sparse, linalg, use_legate = parse_common_args() init_procs, bench_procs = get_phase_procs(use_legate) diff --git a/install.py b/install.py index c46e03a7..6be4fdae 100755 --- a/install.py +++ b/install.py @@ -109,7 +109,9 @@ def was_previously_built_with_different_build_isolation( legate_sparse_build_dir is not None and os.path.exists(legate_sparse_build_dir) and os.path.exists( - cmake_cache := os.path.join(legate_sparse_build_dir, "CMakeCache.txt") + cmake_cache := os.path.join( + legate_sparse_build_dir, "CMakeCache.txt" + ) ) ): try: @@ -298,9 +300,15 @@ def validate_path(path): cmake_flags += ["--log-level=%s" % ("DEBUG" if debug else "VERBOSE")] cmake_flags += f"""\ --DCMAKE_BUILD_TYPE={( - "Debug" if debug else "RelWithDebInfo" if debug_release else "Release" -)} +-DCMAKE_BUILD_TYPE={ + ( + "Debug" + if debug + else "RelWithDebInfo" + if debug_release + else "Release" + ) + } -DBUILD_SHARED_LIBS=ON -DBUILD_MARCH={str(march)} -DCMAKE_CUDA_ARCHITECTURES={str(arch)} @@ -345,7 +353,9 @@ def validate_path(path): } ) - execute_command(pip_install_cmd, verbose, cwd=legate_sparse_dir, env=cmd_env) + execute_command( + pip_install_cmd, verbose, cwd=legate_sparse_dir, env=cmd_env + ) def driver(): diff --git a/legate_sparse/__init__.py b/legate_sparse/__init__.py index c8f44589..35d65fb9 100644 --- a/legate_sparse/__init__.py +++ b/legate_sparse/__init__.py @@ -17,12 +17,15 @@ """ -import scipy.sparse as 
_sp # type: ignore +from __future__ import annotations + +import scipy.sparse as _sp from .coverage import clone_module # noqa: F401 from .csr import csr_array, csr_matrix # noqa: F401 from .dia import dia_array, dia_matrix # noqa: F401 -from .module import * # noqa: F401 +from .module import * # noqa: F401,F403 +from .construct import block_array # noqa: F401 clone_module(_sp, globals()) diff --git a/legate_sparse/_version.py b/legate_sparse/_version.py index b50be7bd..ff2762af 100644 --- a/legate_sparse/_version.py +++ b/legate_sparse/_version.py @@ -69,7 +69,9 @@ def decorate(f): return decorate -def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, env=None): +def run_command( + commands, args, cwd=None, verbose=False, hide_stderr=False, env=None +): """Call the given command(s).""" assert isinstance(commands, list) process = None @@ -263,7 +265,9 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, runner=run_command): env.pop("GIT_DIR", None) runner = functools.partial(runner, env=env) - _, rc = runner(GITS, ["rev-parse", "--git-dir"], cwd=root, hide_stderr=True) + _, rc = runner( + GITS, ["rev-parse", "--git-dir"], cwd=root, hide_stderr=True + ) if rc != 0: if verbose: print("Directory %s not under git control" % root) @@ -292,7 +296,9 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, runner=run_command): pieces["short"] = full_out[:7] # maybe improved later pieces["error"] = None - branch_name, rc = runner(GITS, ["rev-parse", "--abbrev-ref", "HEAD"], cwd=root) + branch_name, rc = runner( + GITS, ["rev-parse", "--abbrev-ref", "HEAD"], cwd=root + ) # --abbrev-ref was added in git-1.6.3 if rc != 0 or branch_name is None: raise NotThisMethod("'git rev-parse --abbrev-ref' returned error") @@ -341,7 +347,9 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, runner=run_command): mo = re.search(r"^(.+)-(\d+)-g([0-9a-f]+)$", git_describe) if not mo: # unparsable. Maybe git-describe is misbehaving? 
- pieces["error"] = "unable to parse git-describe output: '%s'" % describe_out + pieces["error"] = ( + "unable to parse git-describe output: '%s'" % describe_out + ) return pieces # tag @@ -370,7 +378,9 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, runner=run_command): pieces["distance"] = int(count_out) # total number of commits # commit date: see ISO-8601 comment in git_versions_from_keywords() - date = runner(GITS, ["show", "-s", "--format=%ci", "HEAD"], cwd=root)[0].strip() + date = runner(GITS, ["show", "-s", "--format=%ci", "HEAD"], cwd=root)[ + 0 + ].strip() # Use only the last line. Previous lines may contain GPG signature # information. date = date.splitlines()[-1] @@ -458,7 +468,9 @@ def render_pep440_pre(pieces): if pieces["closest-tag"]: if pieces["distance"]: # update the post release segment - tag_version, post_version = pep440_split_post(pieces["closest-tag"]) + tag_version, post_version = pep440_split_post( + pieces["closest-tag"] + ) rendered = tag_version if post_version is not None: rendered += ".post%d.dev%d" % ( @@ -647,7 +659,9 @@ def get_versions(): verbose = cfg.verbose try: - return git_versions_from_keywords(get_keywords(), cfg.tag_prefix, verbose) + return git_versions_from_keywords( + get_keywords(), cfg.tag_prefix, verbose + ) except NotThisMethod: pass diff --git a/legate_sparse/base.py b/legate_sparse/base.py index c9d99a31..fd01a266 100644 --- a/legate_sparse/base.py +++ b/legate_sparse/base.py @@ -44,8 +44,12 @@ # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+from __future__ import annotations -import cupynumeric +from typing import TYPE_CHECKING + +import cupynumeric as cn +import numpy as np from legate.core import LogicalStore, align from .config import SparseOpCode, rect1 @@ -58,6 +62,13 @@ store_to_cupynumeric_array, ) +if TYPE_CHECKING: + from typing import Any, Callable + + import numpy.typing as npt + + from cupynumeric.types import CastingKind + # CompressedBase is a base class for several different kinds of sparse # matrices, such as CSR, CSC, COO and DIA. @@ -74,8 +85,28 @@ class CompressedBase: Use specific format classes like csr_array instead. """ + shape: tuple[int, ...] + pos: LogicalStore + dtype: npt.dtype[Any] + format: str + crd: LogicalStore + _data: cn.ndarray + + def __init__(self, *args: Any, **kw: Any) -> None: + super().__init__(*args, **kw) + + @property + def data(self) -> cn.ndarray: + return self._data + + @property + def size(self) -> int: + raise NotImplementedError + @classmethod - def nnz_to_pos_cls(cls, q_nnz: LogicalStore): + def nnz_to_pos_cls( + cls, q_nnz: LogicalStore + ) -> tuple[LogicalStore, cn.ndarray]: """Convert non-zero counts to position arrays. This class method converts an array of non-zero counts per row/column @@ -93,15 +124,13 @@ def nnz_to_pos_cls(cls, q_nnz: LogicalStore): is the total number of non-zeros. """ q_nnz_arr = store_to_cupynumeric_array(q_nnz) - cs = cupynumeric.cumsum(q_nnz_arr) + cs = cn.cumsum(q_nnz_arr) cs_shifted = cs - q_nnz_arr cs_store = get_store_from_cupynumeric_array(cs) cs_shifted_store = get_store_from_cupynumeric_array(cs_shifted) # Zip the scan result into a rect1 region for the pos. 
pos = runtime.create_store( - rect1, # type: ignore - shape=(q_nnz.shape[0],), - optimize_scalar=False, + rect1, shape=(q_nnz.shape[0],), optimize_scalar=False ) task = runtime.create_auto_task(SparseOpCode.ZIP_TO_RECT1) pos_var = task.add_output(pos) @@ -113,7 +142,9 @@ def nnz_to_pos_cls(cls, q_nnz: LogicalStore): # Don't convert cs[-1] to an int to avoid blocking. return pos, cs[-1] - def nnz_to_pos(self, q_nnz: LogicalStore): + def nnz_to_pos( + self, q_nnz: LogicalStore + ) -> tuple[LogicalStore, cn.ndarray]: """Convert non-zero counts to position arrays for this instance. Parameters @@ -129,7 +160,12 @@ def nnz_to_pos(self, q_nnz: LogicalStore): """ return CompressedBase.nnz_to_pos_cls(q_nnz) - def asformat(self, format, copy=False): + def copy(self) -> CompressedBase: + raise NotImplementedError() + + def asformat( + self, format: str | None, copy: bool = False + ) -> CompressedBase: """Convert the matrix to a specified format. Parameters @@ -158,7 +194,9 @@ def asformat(self, format, copy=False): return self else: try: - convert_method = getattr(self, "to" + format) + convert_method: Callable[..., CompressedBase] = getattr( + self, "to" + format + ) except AttributeError as e: raise ValueError("Format {} is unknown.".format(format)) from e @@ -169,7 +207,12 @@ def asformat(self, format, copy=False): return convert_method() # The implementation of sum is mostly lifted from scipy.sparse. - def sum(self, axis=None, dtype=None, out=None): + def sum( + self, + axis: int | None = None, + dtype: npt.dtype[Any] | None = None, + out: cn.ndarray | None = None, + ) -> cn.ndarray: """Sum the matrix elements over a given axis. 
Parameters @@ -237,10 +280,10 @@ def sum(self, axis=None, dtype=None, out=None): # TODO: (marsaev) currently not supported as we don't have rmatmul yet # (need CSC to have easier sum over columns) raise NotImplementedError - ret = self.__rmatmul__(cupynumeric.ones((1, m), dtype=res_dtype)) + # ret = self.__rmatmul__(cn.ones((1, m), dtype=res_dtype)) else: # sum over rows - ret = self @ cupynumeric.ones((n, 1), dtype=res_dtype) + ret = self @ cn.ones((n, 1), dtype=res_dtype) if out is not None and out.shape != ret.shape: raise ValueError("dimensions do not match") @@ -248,7 +291,7 @@ def sum(self, axis=None, dtype=None, out=None): return ret.sum(axis=axis, dtype=dtype, out=out) # needed by _data_matrix - def _with_data(self, data, copy=True): + def _with_data(self, data: Any, copy: bool = True) -> CompressedBase: """Returns a matrix object with the same sparsity structure as self, but with different data. @@ -290,8 +333,13 @@ def _with_data(self, data, copy=True): copy=False, ) - def astype(self, dtype, casting="unsafe", copy=True): - dtype = cupynumeric.dtype(dtype) + def astype( + self, + dtype: npt.dtype[Any], + casting: CastingKind = "unsafe", + copy: bool = True, + ) -> CompressedBase: + dtype = np.dtype(dtype) # if type doesn't match, create a matrix copy with casted data array if self.dtype != dtype: return self._with_data( @@ -304,24 +352,24 @@ def astype(self, dtype, casting="unsafe", copy=True): # These univariate ufuncs preserve zeros. 
_ufuncs_with_fixed_point_at_zero = frozenset( [ - cupynumeric.sin, - cupynumeric.tan, - cupynumeric.arcsin, - cupynumeric.arctan, - cupynumeric.sinh, - cupynumeric.tanh, - cupynumeric.arcsinh, - cupynumeric.arctanh, - cupynumeric.rint, - cupynumeric.sign, - cupynumeric.expm1, - cupynumeric.log1p, - cupynumeric.deg2rad, - cupynumeric.rad2deg, - cupynumeric.floor, - cupynumeric.ceil, - cupynumeric.trunc, - cupynumeric.sqrt, + cn.sin, + cn.tan, + cn.arcsin, + cn.arctan, + cn.sinh, + cn.tanh, + cn.arcsinh, + cn.arctanh, + cn.rint, + cn.sign, + cn.expm1, + cn.log1p, + cn.deg2rad, + cn.rad2deg, + cn.floor, + cn.ceil, + cn.trunc, + cn.sqrt, ] ) @@ -329,14 +377,14 @@ def astype(self, dtype, casting="unsafe", copy=True): for npfunc in _ufuncs_with_fixed_point_at_zero: name = npfunc.__name__ - def _create_method(op): - def method(self): + def _create_method(op: Callable[[Any], Any]) -> Callable[[Any], Any]: + def method(self: Any) -> Any: result = op(self.data) return self._with_data(result) - method.__doc__ = "Element-wise %s.\n\nSee `numpy.%s` for more information." % ( - name, - name, + method.__doc__ = ( + "Element-wise %s.\n\nSee `numpy.%s` for more information." + % (name, name) ) method.__name__ = name @@ -345,56 +393,8 @@ def method(self): setattr(CompressedBase, name, _create_method(npfunc)) -# DenseSparseBase is a base class for sparse matrices that have a TACO -# format of {Dense, Sparse}. For our purposes, that means CSC and CSR -# matrices. -class DenseSparseBase: - """Base class for sparse matrices with dense-sparse format. - - This class provides functionality for sparse matrices that have a TACO - format of {Dense, Sparse}, which includes CSR and CSC matrices. - - Notes - ----- - This is an internal base class and should not be instantiated directly. - Use specific format classes like csr_array instead. 
- """ - - def __init__(self): - """Initialize the DenseSparseBase class.""" - self._balanced_pos_partition = None - - # consider using _with_data() here - @classmethod - def make_with_same_nnz_structure(cls, mat, arg, shape=None, dtype=None): - """Create a new matrix with the same non-zero structure as mat. - - Parameters - ---------- - mat : sparse matrix - The reference matrix whose structure to copy. - arg : array_like - The data for the new matrix. - shape : tuple, optional - The shape of the new matrix. If None, uses mat.shape. - dtype : dtype, optional - The data type of the new matrix. If None, uses mat.dtype. - - Returns - ------- - sparse matrix - A new matrix with the same structure as mat but with data from arg. - """ - if shape is None: - shape = mat.shape - if dtype is None: - dtype = mat.dtype - result = cls(arg, shape=shape, dtype=dtype) - return result - - # unpack_rect1_store unpacks a rect1 store into two int64 stores. -def unpack_rect1_store(pos): +def unpack_rect1_store(pos: LogicalStore) -> tuple[LogicalStore, LogicalStore]: """Unpack a rect1 store into two int64 stores. This function unpacks the compressed position array used in CSR/CSC @@ -423,7 +423,9 @@ def unpack_rect1_store(pos): # pack_to_rect1_store packs two int64 stores into a rect1 store. -def pack_to_rect1_store(lo, hi, output=None): +def pack_to_rect1_store( + lo: LogicalStore, hi: LogicalStore, output: LogicalStore | None = None +) -> LogicalStore: """Pack two int64 stores into a rect1 store. This function packs separate start and end position arrays into the diff --git a/legate_sparse/config.py b/legate_sparse/config.py index 8c601981..ab146521 100644 --- a/legate_sparse/config.py +++ b/legate_sparse/config.py @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+from __future__ import annotations import os import platform @@ -29,6 +30,10 @@ class _LegateSparseSharedLib: implements the core sparse matrix operations. """ + LEGATE_SPARSE_LOAD_CUDALIBS: int + LEGATE_SPARSE_UNLOAD_CUDALIBS: int + + LEGATE_SPARSE_CSR_TO_DENSE: int LEGATE_SPARSE_DENSE_TO_CSR: int LEGATE_SPARSE_DENSE_TO_CSR_NNZ: int LEGATE_SPARSE_ZIP_TO_RECT_1: int @@ -49,6 +54,9 @@ class _LegateSparseSharedLib: LEGATE_SPARSE_SPGEMM_CSR_CSR_CSR: int LEGATE_SPARSE_SPGEMM_CSR_CSR_CSR_GPU: int LEGATE_SPARSE_AXPBY: int + LEGATE_SPARSE_SPSOLVE: int + LEGATE_SPARSE_GEAM_CSR_CSR_SYMBOLIC: int + LEGATE_SPARSE_GEAM_CSR_CSR_COMPUTE: int def dlopen_no_autoclose(ffi: Any, lib_path: str) -> Any: @@ -88,7 +96,7 @@ class LegateSparseLib: library with the Legate runtime. """ - def __init__(self, name): + def __init__(self, name: str) -> None: """Initialize the Legate sparse library. Parameters @@ -98,9 +106,6 @@ def __init__(self, name): """ self.name = name self.runtime = None - self.shared_object = None - - self.name = name shared_lib_path = self.get_shared_library() assert shared_lib_path is not None @@ -118,7 +123,9 @@ def __init__(self, name): def register(self) -> None: """Register the library with the Legate runtime.""" - callback = getattr(self.shared_object, "legate_sparse_perform_registration") + callback = getattr( + self.shared_object, "legate_sparse_perform_registration" + ) callback() def get_shared_library(self) -> str: @@ -131,7 +138,9 @@ def get_shared_library(self) -> str: """ from legate_sparse.install_info import libpath - return os.path.join(libpath, "liblegate_sparse" + self.get_library_extension()) + return os.path.join( + libpath, "liblegate_sparse" + self.get_library_extension() + ) def get_legate_library(self) -> Library: """Get the Legate library object. 
@@ -181,7 +190,14 @@ def get_library_extension() -> str: """Name of the Legate sparse library.""" sparse_lib = LegateSparseLib(SPARSE_LIB_NAME) -sparse_lib.register() + +# Guard against double registration (can happen during Sphinx documentation builds) +try: + sparse_lib.register() +except Exception: + # Library may already be registered from a previous import + pass + _sparse = sparse_lib.shared_object # has to be called after register() _library = sparse_lib.get_legate_library() @@ -225,6 +241,10 @@ class SparseOpCode(IntEnum): SPGEMM_CSR_CSR_CSR = _sparse.LEGATE_SPARSE_SPGEMM_CSR_CSR_CSR SPGEMM_CSR_CSR_CSR_GPU = _sparse.LEGATE_SPARSE_SPGEMM_CSR_CSR_CSR_GPU + SPSOLVE = _sparse.LEGATE_SPARSE_SPSOLVE + GEAM_CSR_CSR_SYMBOLIC = _sparse.LEGATE_SPARSE_GEAM_CSR_CSR_SYMBOLIC + GEAM_CSR_CSR_COMPUTE = _sparse.LEGATE_SPARSE_GEAM_CSR_CSR_COMPUTE + # Register some types for us to use. rect1 = types.rect_type(1) diff --git a/legate_sparse/construct.py b/legate_sparse/construct.py new file mode 100644 index 00000000..89a16c6e --- /dev/null +++ b/legate_sparse/construct.py @@ -0,0 +1,260 @@ +# Copyright 2022-2024 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Portions of this file are also subject to the following license: +# +# Copyright (c) 2001-2002 Enthought, Inc. 2003-2022, SciPy Developers. +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# 1. Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials provided +# with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from __future__ import annotations + +import cupynumeric as cn + +from .csr import csr_array + + +def _block(blocks, format="csr", dtype=None): + """Build a sparse CSR array from sparse sub-blocks using COO intermediate. + + 1. Extracts (row, col, data) from each block + 2. Adjusts indices by block offsets + 3. Concatenates all coordinates + 4. 
Builds CSR from COO format + """ + if format != "csr": + raise ValueError("Only 'csr' format is supported for block_array") + + if not isinstance(blocks, (list, tuple)): + blocks = list(blocks) + + blocks = [ + list(row) if isinstance(row, (list, tuple)) else [row] + for row in blocks + ] + + n_block_rows = len(blocks) + if n_block_rows == 0: + raise ValueError("blocks cannot be empty") + + n_block_cols = len(blocks[0]) + if n_block_cols == 0: + raise ValueError("blocks cannot be empty") + + # Row height and col width for a sub-block looks like this. + # +--------------+ + # | ^ | + # | | row height | + # | v | + # +--------------+ + # <- col width -> + + # store row heights and col widths of each sub-block + row_heights = [None] * n_block_rows + col_widths = [None] * n_block_cols + + for i in range(n_block_rows): + for j in range(n_block_cols): + block = blocks[i][j] + if block is None: + continue + + if not isinstance(block, csr_array): + raise TypeError( + f"blocks[{i}][{j}] must be a csr_array or None, " + f"got {type(block).__name__}" + ) + + block_nrows, block_ncols = block.shape + + # Check/set row height for this block row. + # The row heights of all the sub-blocks in a row of the input + # should be the same, else we can't concatenate horizontally + if row_heights[i] is None: + row_heights[i] = block_nrows + elif row_heights[i] != block_nrows: + raise ValueError( + f"blocks[{i}][{j}] has {block_nrows} rows, " + f"expected {row_heights[i]}" + ) + + # Check/set column width for this block column. + # The col widths of all the sub-blocks in a col of the input + # should be the same, else we can't concatenate vertically + if col_widths[j] is None: + col_widths[j] = block_ncols + elif col_widths[j] != block_ncols: + raise ValueError( + f"blocks[{i}][{j}] has {block_ncols} columns, " + f"expected {col_widths[j]}" + ) + + # The input can have None instead of a csr matrix. 
To correctly compute
+    # the row offsets for those cases, we set the row height to 0 if the
+    # input is None.
+    row_heights = cn.array([h if h is not None else 0 for h in row_heights])
+    col_widths = cn.array([w if w is not None else 0 for w in col_widths])
+
+    # Compute the no. of rows and cols in the output matrix.
+    total_nrows = cn.sum(row_heights).item()
+    total_ncols = cn.sum(col_widths).item()
+
+    # When the output matrix is empty, we don't need to concatenate.
+    if total_nrows == 0 or total_ncols == 0:
+        result_dtype = dtype if dtype is not None else cn.float64
+        return csr_array((total_nrows, total_ncols), dtype=result_dtype)
+
+    row_offsets = cn.concatenate([cn.array([0]), cn.cumsum(row_heights)])
+    col_offsets = cn.concatenate([cn.array([0]), cn.cumsum(col_widths)])
+
+    if dtype is None:
+        dtypes = [b.dtype for row in blocks for b in row if b is not None]
+        dtype = cn.result_type(*dtypes) if dtypes else cn.float64
+
+    all_rows = []
+    all_cols = []
+    all_data = []
+
+    # Populate the concatenated (rows, cols, data) arrays for the
+    # output matrix. The outer loop concatenates the sub-blocks vertically
+    # while the inner loop concatenates them horizontally. This is done
+    # without creating any intermediate csr representation.
+    for i in range(n_block_rows):
+        row_offset = row_offsets[i].item()
+
+        for j in range(n_block_cols):
+            block = blocks[i][j]
+
+            # If block is empty, the (rows, cols, data) of the output matrix
+            # doesn't get modified, so we continue with the loop.
+            if block is None:
+                continue
+
+            col_offset = col_offsets[j].item()
+            block_nrows = block.shape[0]
+
+            indptr = block.indptr
+            indices = block.indices
+            data = block.data
+
+            # Empty csr matrices don't modify the output matrix either, so we
+            # continue with the loop.
+            if data.size == 0:
+                continue
+
+            # Expand the indptr array to store the row indices.
+            # For each row r, repeat r (indptr[r+1] - indptr[r]) times,
+            # i.e. once per non-zero entry stored in that row.
+            nnz_per_row = cn.diff(indptr)
+            block_rows = cn.repeat(cn.arange(block_nrows), nnz_per_row)
+
+            # After concatenating the matrices, we get one block matrix that
+            # can be represented by (rows, cols, data) arrays. Note that
+            # we have to add the offsets for both the row and col indices
+            # that correspond to the non-zeros in the previous sub-blocks as
+            # we concatenate them horizontally. This is because the output
+            # matrix is going to be represented as one giant CSR matrix.
+            all_rows.append(block_rows + row_offset)
+            all_cols.append(indices + col_offset)
+            all_data.append(data)
+
+    if not all_data:
+        result_dtype = dtype if dtype is not None else cn.float64
+        return csr_array((total_nrows, total_ncols), dtype=result_dtype)
+
+    concatenated_rows = cn.concatenate(all_rows)
+    concatenated_cols = cn.concatenate(all_cols)
+    concatenated_data = cn.concatenate(all_data).astype(dtype)
+
+    return csr_array(
+        (concatenated_data, (concatenated_rows, concatenated_cols)),
+        shape=(total_nrows, total_ncols),
+        dtype=dtype,
+    )
+
+
+def block_array(blocks, format="csr", dtype=None):
+    """Build a sparse array from sparse sub-blocks.
+
+    Parameters
+    ----------
+    blocks : array_like
+        A 2-D array-like of shape (M, N) where each element is a sparse
+        CSR array or None. None elements are treated as zero matrices.
+    format : str, optional
+        Output format. Currently only 'csr' is supported. Default is 'csr'.
+    dtype : dtype, optional
+        Data type of the output array. If None, inferred from the blocks.
+
+    Returns
+    -------
+    csr_array
+        A sparse CSR array formed by combining the sub-blocks.
+
+    Raises
+    ------
+    ValueError
+        - If `format` is not 'csr'.
+        - If `blocks` is empty (has zero rows or zero columns).
+        - If sub-blocks in the same row have different numbers of rows.
+        - If sub-blocks in the same column have different numbers of columns.
+    TypeError
+        - If any non-None block is not a csr_array.
+ + Notes + ----- + This function may not be performant when the number of sub-blocks is large, + as it iterates over all blocks sequentially to extract and concatenate their + COO coordinates. + + Examples + -------- + >>> import legate_sparse as sparse + >>> A = sparse.csr_array([[1, 2], [3, 4]]) + >>> B = sparse.csr_array([[5], [6]]) + >>> C = sparse.csr_array([[7, 8, 9]]) + >>> result = sparse.block_array([[A, B], [C, None]]) + >>> result.todense() + array([[1, 2, 5], + [3, 4, 6], + [7, 8, 9]]) + """ + return _block(blocks, format, dtype) diff --git a/legate_sparse/coverage.py b/legate_sparse/coverage.py index 8765044e..a6fa2bae 100644 --- a/legate_sparse/coverage.py +++ b/legate_sparse/coverage.py @@ -16,7 +16,7 @@ from functools import wraps from types import FunctionType, MethodDescriptorType, MethodType, ModuleType -from typing import Any, Container, Mapping, Optional, cast +from typing import Any, Callable, Container, Mapping, TypeVar, cast from legate.core import track_provenance from typing_extensions import Protocol @@ -27,7 +27,7 @@ def filter_namespace( ns: Mapping[str, Any], *, - omit_names: Optional[Container[str]] = None, + omit_names: Container[str] | None = None, omit_types: tuple[type, ...] = (), ) -> dict[str, Any]: omit_names = omit_names or set() @@ -43,8 +43,7 @@ def should_wrap(obj: object) -> bool: class AnyCallable(Protocol): - def __call__(self, *args: Any, **kwargs: Any) -> Any: - ... + def __call__(self, *args: Any, **kwargs: Any) -> Any: ... def wrap(func: AnyCallable) -> Any: @@ -56,7 +55,9 @@ def wrapper(*args: Any, **kwargs: Any) -> Any: return wrapper -def clone_module(origin_module: ModuleType, new_globals: dict[str, Any]) -> None: +def clone_module( + origin_module: ModuleType, new_globals: dict[str, Any] +) -> None: """Copy attributes from one module to another, excluding submodules Function types are wrapped with a decorator to report API calls. 
All @@ -84,7 +85,10 @@ def clone_module(origin_module: ModuleType, new_globals: dict[str, Any]) -> None new_globals[attr] = wrapped -def clone_scipy_arr_kind(origin_class: type) -> Any: +T = TypeVar("T") + + +def clone_scipy_arr_kind(origin_class: type) -> Callable[[T], T]: """Copy attributes from an origin class to the input class. Method types are wrapped with a decorator to report API calls. All @@ -92,7 +96,7 @@ def clone_scipy_arr_kind(origin_class: type) -> Any: """ - def body(cls: type): + def body(cls: T) -> T: for attr, value in cls.__dict__.items(): # Only need to wrap things that are in the origin class to begin # with diff --git a/legate_sparse/csr.py b/legate_sparse/csr.py index 3008356e..051298e9 100644 --- a/legate_sparse/csr.py +++ b/legate_sparse/csr.py @@ -44,12 +44,14 @@ # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+from __future__ import annotations import warnings +from typing import TYPE_CHECKING, cast -import cupynumeric -import numpy -import scipy # type: ignore +import cupynumeric as cn +import numpy as np +import scipy from legate.core import ( ImageComputationHint, Scalar, @@ -60,12 +62,7 @@ types, ) -from .base import ( - CompressedBase, - DenseSparseBase, - pack_to_rect1_store, - unpack_rect1_store, -) +from .base import CompressedBase, pack_to_rect1_store, unpack_rect1_store from .config import SparseOpCode, rect1 from .coverage import clone_scipy_arr_kind from .runtime import runtime @@ -75,22 +72,31 @@ SUPPORTED_DATATYPES, array_from_store_or_array, cast_arr, - cast_to_common_type, cast_to_store, copy_store, + find_common_type, find_last_user_stacklevel, get_storage_type, get_store_from_cupynumeric_array, + is_dense, is_dtype_supported, is_scalar_like, + is_sparse, sort_by_rows_then_cols, store_from_store_or_array, store_to_cupynumeric_array, ) +if TYPE_CHECKING: + from typing import Any, Callable + + import numpy.typing as npt + + from cupynumeric.types import CastingKind + @clone_scipy_arr_kind(scipy.sparse.csr_array) -class csr_array(CompressedBase, DenseSparseBase): +class csr_array(CompressedBase): """Compressed Sparse Row array. This can be instantiated in several ways: @@ -187,7 +193,13 @@ class csr_array(CompressedBase, DenseSparseBase): [4, 5, 6]]) """ - def __init__(self, arg, shape=None, dtype=None, copy=False): + def __init__( + self, + arg: Any, + shape: tuple[int, ...] | None = None, + dtype: npt.dtype[Any] | None = None, + copy: bool = False, + ) -> None: """Initialize a CSR array. 
Parameters @@ -233,11 +245,11 @@ def __init__(self, arg, shape=None, dtype=None, copy=False): # Note that cupynumeric.dtype(None) returns float64, so make # sure dtype is passed to csr_array if it is known apriori, # especially when copying the matrix - dtype = cupynumeric.dtype(dtype) + dtype = np.dtype(dtype) # If from numpy.array - convert to cupynumeric array first - if isinstance(arg, numpy.ndarray): - arg = cupynumeric.array(arg) + if isinstance(arg, np.ndarray): + arg = cn.array(arg) # from scipy.sparse.csr_array if isinstance(arg, scipy.sparse.csr_array) or isinstance( @@ -247,7 +259,7 @@ def __init__(self, arg, shape=None, dtype=None, copy=False): arg = (arg.data, arg.indices, arg.indptr) # from dense cupynumeric array - if isinstance(arg, cupynumeric.ndarray): + if isinstance(arg, cn.ndarray): assert arg.ndim == 2 shape = arg.shape @@ -257,18 +269,18 @@ def __init__(self, arg, shape=None, dtype=None, copy=False): src_store = get_store_from_cupynumeric_array(arg) q_nnz = runtime.create_store(nnz_ty, shape=Shape((shape[0],))) - task = runtime.create_auto_task(SparseOpCode.DENSE_TO_CSR_NNZ) + task1 = runtime.create_auto_task(SparseOpCode.DENSE_TO_CSR_NNZ) promoted_q_nnz = q_nnz.promote(1, shape[1]) - nnz_per_row_part = task.add_output(promoted_q_nnz) - src_part = task.add_input(src_store) - task.add_constraint(broadcast(nnz_per_row_part, (1,))) - task.add_constraint(align(nnz_per_row_part, src_part)) - task.execute() + nnz_per_row_part = task1.add_output(promoted_q_nnz) + src_part = task1.add_input(src_store) + task1.add_constraint(broadcast(nnz_per_row_part, (1,))) + task1.add_constraint(align(nnz_per_row_part, src_part)) + task1.execute() # Assemble the output CSR array using the non-zeros per row. - self.pos, nnz = self.nnz_to_pos(q_nnz) + self.pos, nnz_scalar = self.nnz_to_pos(q_nnz) # Block and convert the nnz future into an int. 
- nnz = int(nnz) + nnz = int(nnz_scalar) self.crd = runtime.create_store(coord_ty, shape=((nnz,))) self.vals = runtime.create_store(arg.dtype, shape=((nnz,))) @@ -276,14 +288,14 @@ def __init__(self, arg, shape=None, dtype=None, copy=False): # and 2-D input array, our only option is launch single process # which will handle all of the data, which makes this funciton not usable # on scale. - task = runtime.create_manual_task(SparseOpCode.DENSE_TO_CSR, (1,)) + task2 = runtime.create_manual_task(SparseOpCode.DENSE_TO_CSR, (1,)) promoted_pos = self.pos.promote(1, shape[1]) - task.add_input(promoted_pos) - src_part = task.add_input(src_store) - task.add_output(self.crd) - task.add_output(self.vals) - task.execute() + task2.add_input(promoted_pos) + task2.add_input(src_store) + task2.add_output(self.crd) + task2.add_output(self.vals) + task2.execute() # we ignore dtype (TODO: is this behaviour matches SciPy?) and use arg.dtype dtype = arg.dtype @@ -298,7 +310,9 @@ def __init__(self, arg, shape=None, dtype=None, copy=False): self.canonical_format = arg.canonical_format elif isinstance(arg, tuple): - dtype, shape = self._init_from_tuple_inputs(arg, dtype, shape, copy) + dtype, shape = self._init_from_tuple_inputs( + arg, dtype, shape, copy + ) else: raise NotImplementedError("Can't convert to CSR from the input") @@ -315,13 +329,19 @@ def __init__(self, arg, shape=None, dtype=None, copy=False): if dtype is None: dtype = temp_vals_type if temp_vals_type is not dtype: - self.data = self.data.astype(dtype) - if not isinstance(dtype, numpy.dtype): - dtype = numpy.dtype(dtype) + self._data = self._data.astype(dtype) + if not isinstance(dtype, np.dtype): + dtype = np.dtype(dtype) # Saving the type self._dtype = dtype - def _init_from_tuple_inputs(self, arg, dtype, shape, copy): + def _init_from_tuple_inputs( + self, + arg: tuple[Any, ...], + dtype: npt.dtype[Any] | None, + shape: tuple[int, ...] 
| None, + copy: bool, + ) -> tuple[npt.dtype[Any], tuple[int, ...]]: """Initialize CSR array from tuple inputs. This internal method handles the various tuple-based constructor formats: @@ -333,9 +353,9 @@ def _init_from_tuple_inputs(self, arg, dtype, shape, copy): ---------- arg : tuple The input tuple in one of the supported formats. - dtype : dtype, optional + dtype : dtype The desired data type. - shape : tuple, optional + shape : tuple The shape of the array. copy : bool Whether to copy the input data. @@ -353,12 +373,14 @@ def _init_from_tuple_inputs(self, arg, dtype, shape, copy): If the tuple format is not supported. """ - def _get_empty_csr(dtype, nrows_plus_one): + def _get_empty_csr( + dtype: npt.dtype[Any] | None, nrows_plus_one: int + ) -> tuple[cn.ndarray, cn.ndarray, cn.ndarray]: """Helper function to create empty CSR arrays.""" return ( - cupynumeric.zeros(0, dtype=dtype), - cupynumeric.zeros(0, dtype=coord_ty), - cupynumeric.zeros(nrows_plus_one, dtype=coord_ty), + cn.zeros(0, dtype=dtype), + cn.zeros(0, dtype=coord_ty), + cn.zeros(nrows_plus_one, dtype=coord_ty), ) # Couple of options here @@ -367,16 +389,14 @@ def _get_empty_csr(dtype, nrows_plus_one): # csr_array((M, N), [dtype]) if not isinstance(arg[1], tuple): (M, N) = arg - if not isinstance(M, (int, numpy.integer)) or not isinstance( - N, (int, numpy.integer) + if not isinstance(M, (int, np.integer)) or not isinstance( + N, (int, np.integer) ): NotImplementedError( "Input tuple for empty CSR ctor should be it's shape" ) shape = arg - dtype = ( - cupynumeric.float64 if dtype is None else cupynumeric.dtype(dtype) - ) + dtype = np.float64 if dtype is None else np.dtype(dtype) # and pass this to next ctor arg = _get_empty_csr(dtype, M + 1) @@ -394,12 +414,12 @@ def _get_empty_csr(dtype, nrows_plus_one): copy = False else: # if passed numpy arrays - convert them - if isinstance(st_row, numpy.ndarray): - st_row = cupynumeric.array(st_row) - if isinstance(st_col, numpy.ndarray): - st_col = 
cupynumeric.array(st_col) - if isinstance(st_data, numpy.ndarray): - st_data = cupynumeric.array(st_data) + if isinstance(st_row, np.ndarray): + st_row = cn.array(st_row) + if isinstance(st_col, np.ndarray): + st_col = cn.array(st_col) + if isinstance(st_data, np.ndarray): + st_data = cn.array(st_data) if not self.indices_sorted: # NOTE that CSR format does not require sorting the data @@ -407,9 +427,15 @@ def _get_empty_csr(dtype, nrows_plus_one): # sorted by rows and then by columns, so we sort the data # by columns as well - row_array = array_from_store_or_array(st_row, copy=copy) - col_array = array_from_store_or_array(st_col, copy=copy) - new_data = array_from_store_or_array(st_data, copy=copy) + row_array = array_from_store_or_array( + st_row, copy=copy + ) + col_array = array_from_store_or_array( + st_col, copy=copy + ) + new_data = array_from_store_or_array( + st_data, copy=copy + ) indices = sort_by_rows_then_cols(row_array, col_array) @@ -417,10 +443,10 @@ def _get_empty_csr(dtype, nrows_plus_one): row_array = row_array[indices] col_array = col_array[indices] - row_offsets = cupynumeric.append( - cupynumeric.array([0]), - cupynumeric.cumsum( - cupynumeric.bincount(row_array, minlength=shape[0]) + row_offsets = cn.append( + cn.array([0]), + cn.cumsum( + cn.bincount(row_array, minlength=shape[0]) ), ) @@ -432,10 +458,10 @@ def _get_empty_csr(dtype, nrows_plus_one): else: # we need to convert row indices to row offsets/indptr row_array = array_from_store_or_array(st_row) - row_offsets = cupynumeric.append( - cupynumeric.array([0]), - cupynumeric.cumsum( - cupynumeric.bincount(row_array, minlength=shape[0]) + row_offsets = cn.append( + cn.array([0]), + cn.cumsum( + cn.bincount(row_array, minlength=shape[0]) ), ) if copy: @@ -452,12 +478,12 @@ def _get_empty_csr(dtype, nrows_plus_one): (data, indices, indptr) = arg # if passed numpy arrays - convert them - if isinstance(data, numpy.ndarray): - data = cupynumeric.array(data) - if isinstance(indices, 
numpy.ndarray): - indices = cupynumeric.array(indices).astype(coord_ty) - if isinstance(indptr, numpy.ndarray): - indptr = cupynumeric.array(indptr).astype(coord_ty) + if isinstance(data, np.ndarray): + data = cn.array(data) + if isinstance(indices, np.ndarray): + indices = cn.array(indices).astype(coord_ty) + if isinstance(indptr, np.ndarray): + indptr = cn.array(indptr).astype(coord_ty) # checking that shape matches with expectations for row_offsets if indptr.shape[0] == shape[0] + 1: @@ -470,8 +496,12 @@ def _get_empty_csr(dtype, nrows_plus_one): ) # copy explicitly, just in case (there are paths that won't create temp object) # For crd we enforce our internal type - self.crd = store_from_store_or_array(cast_arr(indices, coord_ty), copy) - self.vals = store_from_store_or_array(cast_to_store(data), copy) + self.crd = store_from_store_or_array( + cast_arr(indices, coord_ty), copy + ) + self.vals = store_from_store_or_array( + cast_to_store(data), copy + ) # Otherwise we assume that we are passing pos store from existing csr_array # This is internal only functionality, and we assume here only Store or cupynumeric.array @@ -487,15 +517,22 @@ def _get_empty_csr(dtype, nrows_plus_one): dtype = get_storage_type(data) + assert dtype is not None + assert shape is not None + return dtype, shape + # correct return type value on this subclass + def _with_data(self, data: Any, copy: bool = True) -> csr_array: + return cast(csr_array, super()._with_data(data, copy)) + @property - def dim(self): + def dim(self) -> int: """Number of dimensions (always 2 for CSR arrays).""" return self.ndim @property - def nnz(self): + def nnz(self) -> int: """Number of stored values, including explicit zeros. Returns @@ -506,7 +543,12 @@ def nnz(self): return self.vals.shape[0] @property - def dtype(self): + def size(self) -> int: + """Number of stored values""" + return self.nnz + + @property + def dtype(self) -> npt.dtype[Any]: """Data type of the array. 
Returns @@ -518,7 +560,7 @@ def dtype(self): return self._dtype # Enable direct operation on the values array. - def get_data(self): + def get_data(self) -> cn.ndarray: """Get the data array of the CSR matrix. Returns @@ -529,7 +571,7 @@ def get_data(self): return store_to_cupynumeric_array(self.vals) # From array, - def set_data(self, data): + def set_data(self, data: cn.ndarray) -> None: """Set the data array of the CSR matrix. Parameters @@ -542,9 +584,9 @@ def set_data(self, data): AssertionError If data is not a cupynumeric.ndarray. """ - if isinstance(data, numpy.ndarray): - data = cupynumeric.array(data) - assert isinstance(data, cupynumeric.ndarray) + if isinstance(data, np.ndarray): + data = cn.array(data) + assert isinstance(data, cn.ndarray) self.vals = get_store_from_cupynumeric_array(data) self._dtype = data.dtype @@ -553,7 +595,7 @@ def set_data(self, data): ) # Enable direct operation on the indices array. - def get_indices(self): + def get_indices(self) -> cn.ndarray: """Get the column indices array of the CSR matrix. Returns @@ -563,7 +605,7 @@ def get_indices(self): """ return store_to_cupynumeric_array(self.crd) - def set_indices(self, indices): + def set_indices(self, indices: cn.ndarray) -> None: """Set the column indices array of the CSR matrix. Parameters @@ -581,19 +623,21 @@ def set_indices(self, indices): Setting new indices will mark the matrix as not having sorted indices and not being in canonical format. 
""" - if isinstance(indices, numpy.ndarray): - indices = cupynumeric.array(indices) - assert isinstance(indices, cupynumeric.ndarray) + if isinstance(indices, np.ndarray): + indices = cn.array(indices) + assert isinstance(indices, cn.ndarray) self.crd = get_store_from_cupynumeric_array(indices) # we can't guarantee new indices are sorted self.canonical_format = False self.indices_sorted = False indices = property( - fget=get_indices, fset=set_indices, doc="CSR format index array of the matrix" + fget=get_indices, + fset=set_indices, + doc="CSR format index array of the matrix", ) - def get_indptr(self): + def get_indptr(self) -> cn.ndarray: """Get the index pointer array of the CSR matrix. Returns @@ -605,14 +649,14 @@ def get_indptr(self): """ row_start_st, row_end_st = unpack_rect1_store(self.pos) row_start = store_to_cupynumeric_array(row_start_st) - return cupynumeric.append(row_start, [self.nnz]) + return cn.append(row_start, [self.nnz]) # Disallow changing intptrs directly indptr = property( fget=get_indptr, doc="CSR format index pointer array of the matrix" ) - def _get_row_indices(self): + def _get_row_indices(self) -> cn.ndarray: """Helper routine that converts pos to row indices. This internal method expands the compressed row storage format's position @@ -638,7 +682,7 @@ def _get_row_indices(self): task.execute() return store_to_cupynumeric_array(row_indices) - def has_sorted_indices(self): + def has_sorted_indices(self) -> bool: """Determine whether the matrix has sorted indices. Returns @@ -648,7 +692,7 @@ def has_sorted_indices(self): """ return self.indices_sorted - def has_canonical_format(self): + def has_canonical_format(self) -> bool: """Determine whether the matrix is in canonical format. Returns @@ -665,7 +709,7 @@ def has_canonical_format(self): return self.canonical_format # The rest of the methods - def diagonal(self, k=0): + def diagonal(self, k: int = 0) -> cn.ndarray: """Return the k-th diagonal of the matrix. 
Parameters @@ -691,7 +735,7 @@ def diagonal(self, k=0): """ rows, cols = self.shape if k <= -rows or k >= cols: - return cupynumeric.empty(0, dtype=self.dtype) + return cn.empty(0, dtype=self.dtype) output = runtime.create_store( self.dtype, shape=Shape((min(rows + min(k, 0), cols - max(k, 0)),)) ) @@ -713,7 +757,9 @@ def diagonal(self, k=0): task.execute() return store_to_cupynumeric_array(output) - def todense(self, order=None, out=None): + def todense( + self, order: str | None = None, out: cn.ndarray | None = None + ) -> cn.ndarray: """Return a dense matrix representation of this matrix. Parameters @@ -744,25 +790,25 @@ def todense(self, order=None, out=None): if order is not None: raise NotImplementedError if out is not None: - out = cupynumeric.array(out) + out = cn.array(out) if out.dtype != self.dtype: raise ValueError( f"Output type {out.dtype} is not consistent with dtype {self.dtype}" ) - out = get_store_from_cupynumeric_array(out) + out_store = get_store_from_cupynumeric_array(out) elif out is None: - out = runtime.create_store(self.dtype, shape=self.shape) + out_store = runtime.create_store(self.dtype, shape=self.shape) task = runtime.create_manual_task(SparseOpCode.CSR_TO_DENSE, (1,)) self.pos.promote(1, self.shape[1]) - task.add_output(out) + task.add_output(out_store) task.add_input(self.pos) task.add_input(self.crd) task.add_input(self.vals) task.execute() - return store_to_cupynumeric_array(out) + return store_to_cupynumeric_array(out_store) - def multiply(self, other): + def multiply(self, other: Any) -> csr_array: """Point-wise multiplication by another matrix, vector, or scalar. Parameters @@ -779,9 +825,9 @@ def multiply(self, other): ----- This is equivalent to the * operator. """ - return self * other + return cast(csr_array, self * other) - def __rmul__(self, other): + def __rmul__(self, other: Any) -> csr_array: """Right multiplication by a scalar. 
    def _compare_scalar(
        self, other: object, op: Callable[..., cn.ndarray]
    ) -> csr_array:
        """Apply an element-wise comparison `op` against a scalar.

        Builds a boolean CSR array that is True exactly at the stored
        entries of this matrix for which ``op(value, other)`` holds.
        Entries not stored in the matrix are never compared (and never
        appear in the result).

        Parameters
        ----------
        other : scalar-like
            The scalar operand of the comparison.
        op : callable
            A cupynumeric element-wise comparison ufunc
            (e.g. ``cn.greater``).

        Returns
        -------
        csr_array
            Boolean CSR array over the matching stored entries.
        """
        # Boolean mask over the stored values only.
        mask = op(store_to_cupynumeric_array(self.vals), other)
        col_indices = store_to_cupynumeric_array(self.crd)[mask]
        # Expand pos into explicit row indices so the masked (row, col)
        # pairs can be fed to the COO-style csr_array constructor.
        row_indices = self._get_row_indices()[mask]
        vals = cn.ones(row_indices.size, dtype=bool)

        # NOTE:
        # If the data was already sorted by rows and cols in self,
        # it will be sorted in the output as well,
        # but there's no clean way to pass to the class that the data
        # is already sorted
        return csr_array(
            (vals, (row_indices, col_indices)), shape=self.shape, dtype=bool
        )

    def __gt__(self, other: object) -> csr_array:
        """Element-wise ``>`` against a scalar, over stored entries only."""
        return self._compare_scalar(other, cn.greater)

    def __lt__(self, other: object) -> csr_array:
        """Element-wise ``<`` against a scalar, over stored entries only."""
        return self._compare_scalar(other, cn.less)

    def __ge__(self, other: object) -> csr_array:
        """Element-wise ``>=`` against a scalar, over stored entries only."""
        return self._compare_scalar(other, cn.greater_equal)

    def __le__(self, other: object) -> csr_array:
        """Element-wise ``<=`` against a scalar, over stored entries only."""
        return self._compare_scalar(other, cn.less_equal)

    # NOTE(review): defining __eq__ in a class normally disables hashing
    # (sets __hash__ to None) unless __hash__ is restored elsewhere -- confirm
    # instances of this class are not used as dict keys / set members.
    def __eq__(self, other: object) -> csr_array:  # type: ignore [override]
        """Element-wise ``==`` against a scalar, over stored entries only.

        Unlike ordinary Python equality, this returns a boolean CSR array
        rather than a bool (scipy.sparse-style semantics).
        """
        return self._compare_scalar(other, cn.equal)

    def __ne__(self, other: object) -> csr_array:  # type: ignore [override]
        """Element-wise ``!=`` against a scalar, over stored entries only.

        Stored entries equal to `other` compare False; absent entries are
        not represented in the result at all.
        """
        return self._compare_scalar(other, cn.not_equal)
Parameters @@ -1118,7 +1166,9 @@ def __setitem__(self, key, value): assert key.shape == self.shape assert key.dtype == bool - value_store = runtime.legate_runtime.create_store_from_scalar(Scalar(value)) + value_store = runtime.legate_runtime.create_store_from_scalar( + Scalar(value) + ) # launch c++ task task = runtime.create_auto_task(SparseOpCode.CSR_INDEXING_CSR) @@ -1144,7 +1194,76 @@ def __setitem__(self, key, value): return self - def dot(self, other, out=None): + def __neg__(self) -> csr_array: + """Return -self (negation of all values).""" + return self._with_data( + -store_to_cupynumeric_array(self.vals), copy=True + ) + + # self - other + def __sub__(self, other) -> csr_array: + if is_scalar_like(other): + if other == 0: + return self.copy() + raise NotImplementedError( + "Subtraction of a scalar from a Legate Sparse array " + "will break sparsity and is not supported." + "Use the method data() to manipulate only the nonzeros." + ) + elif is_sparse(other): + if other.shape != self.shape: + raise ValueError( + "Inconsistent shapes: ({self.shape}, {other.shape})" + ) + return geam(self, other, 1.0, -1.0, None) + elif is_dense(other): + return self.todense() - other + + return NotImplemented + + # other - self + def __rsub__(self, other: csr_array) -> csr_array: + if is_scalar_like(other): + if other == 0: + return -self.copy() + raise NotImplementedError( + "Subtraction of a scalar from a Legate Sparse array " + "will break sparsity and is not supported." + "Use the method data() to manipulate only the nonzeros." + ) + elif is_dense(other): + return other - self.todense() + + return NotImplemented + + # self + other + def __add__(self, other): + if is_scalar_like(other): + if other == 0: + return self.copy() + raise NotImplementedError( + "Addition of a scalar to a Legate Sparse array " + "will break sparsity and is not supported." + "Use the method data() to manipulate only the nonzeros." 
+ ) + elif is_sparse(other): + if other.shape != self.shape: + raise ValueError( + "Inconsistent shapes: ({self.shape}, {other.shape})" + ) + return geam(self, other, 1.0, 1.0, None) + elif is_dense(other): + return self.todense() + other + + return NotImplemented + + # other + self + def __radd__(self, other): + return self.__add__(other) + + def dot( + self, other: cn.ndarray | csr_array, out: cn.ndarray | None = None + ) -> cn.ndarray | csr_array: """Ordinary dot product. Parameters @@ -1200,7 +1319,7 @@ def dot(self, other, out=None): """ # If output specified - it should be cupynumeric array if out is not None: - assert isinstance(out, cupynumeric.ndarray) + assert isinstance(out, cn.ndarray) # only floating point operations are supported by cusparse at the moment if runtime.num_gpus > 0: @@ -1214,10 +1333,12 @@ def dot(self, other, out=None): raise NotImplementedError(msg) # If other.shape = (M,) then it's SpMV - if len(other.shape) == 1 or (len(other.shape) == 2 and other.shape[1] == 1): + if len(other.shape) == 1 or ( + len(other.shape) == 2 and other.shape[1] == 1 + ): # convert X to the cupynumeric array if needed - if not isinstance(other, cupynumeric.ndarray): - other = cupynumeric.array(other) + if not isinstance(other, cn.ndarray): + other = cn.array(other) assert self.shape[1] == other.shape[0] # for the case of X shape == (M, 1) other_originally_2d = False @@ -1233,11 +1354,13 @@ def dot(self, other, out=None): category=RuntimeWarning, stacklevel=level, ) - other = cupynumeric.array(other) + other = cn.array(other) # Coerce A and x into a common type. Use that coerced type # to find the type of the output. 
- A, x = cast_to_common_type(self, other) + common_dtype = find_common_type(self, other) + A = self.astype(common_dtype, copy=False) + x = other.astype(common_dtype, copy=False) if out is None: y = store_to_cupynumeric_array( runtime.create_store(A.dtype, shape=(self.shape[0],)) @@ -1272,12 +1395,16 @@ def dot(self, other, out=None): if out is not None: raise ValueError("Cannot provide out for CSRxCSR matmul.") assert self.shape[1] == other.shape[0] - return spgemm_csr_csr_csr(*cast_to_common_type(self, other)) + common_dtype = find_common_type(self, other) + return spgemm_csr_csr_csr( + self.astype(common_dtype, copy=False), + other.astype(common_dtype, copy=False), + ) else: raise NotImplementedError # Misc - def _getpos(self): + def _getpos(self) -> list[tuple[int, int]]: """Helper method to get row start and end positions. This internal method unpacks the compressed row storage format's position array @@ -1295,7 +1422,7 @@ def _getpos(self): row_end = store_to_cupynumeric_array(row_end_st) return [(i, j) for (i, j) in zip(row_start, row_end)] - def copy(self): + def copy(self) -> csr_array: """Returns a copy of this matrix. Returns @@ -1305,7 +1432,7 @@ def copy(self): """ return csr_array(self, dtype=self.dtype) - def conj(self, copy=True): + def conj(self, copy: bool = True) -> csr_array: """Element-wise complex conjugate. Parameters @@ -1329,7 +1456,9 @@ def conj(self, copy=True): get_store_from_cupynumeric_array(self.data.conj()), copy=False ) - def transpose(self, axes=None, copy=False): + def transpose( + self, axes: Any | None = None, copy: bool = False + ) -> csr_array: """Reverses the dimensions of the sparse matrix. 
Parameters @@ -1373,7 +1502,9 @@ def transpose(self, axes=None, copy=False): task.execute() # sort - sort_mask = cupynumeric.argsort(self.crd, kind="stable") + sort_mask = cn.argsort( + store_to_cupynumeric_array(self.crd), kind="stable" + ) new_rows = self.get_indices()[sort_mask] new_ci = store_to_cupynumeric_array(rows_expanded)[sort_mask] new_data = self.get_data()[sort_mask] @@ -1388,7 +1519,7 @@ def transpose(self, axes=None, copy=False): T = property(transpose, doc="Transpose of the matrix") - def asformat(self, format, copy=False): + def asformat(self, format: str | None, copy: bool = False) -> csr_array: """Convert this matrix to a specified format. Parameters @@ -1417,7 +1548,16 @@ def asformat(self, format, copy=False): else: raise NotImplementedError("Only CSR format is supported right now") - def tocsr(self, copy=False): + # correct return type value on this subclass + def astype( + self, + dtype: npt.dtype[Any], + casting: CastingKind = "unsafe", + copy: bool = True, + ) -> csr_array: + return cast(csr_array, super().astype(dtype, casting, copy)) + + def tocsr(self, copy: bool = False) -> csr_array: """Convert this matrix to a CSR matrix. Parameters @@ -1439,7 +1579,7 @@ def tocsr(self, copy=False): return self.copy().tocsr(copy=False) return self - def nonzero(self): + def nonzero(self) -> tuple[cn.ndarray, cn.ndarray]: """Return the indices of the non-zero elements. 
Returns @@ -1455,13 +1595,15 @@ def nonzero(self): """ task = runtime.create_auto_task(SparseOpCode.EXPAND_POS_TO_COORDINATES) - row_indices = runtime.create_store(coord_ty, shape=self.crd.shape) - row_indices_part = task.add_output(row_indices) + row_indices_store = runtime.create_store( + coord_ty, shape=self.crd.shape + ) + row_indices_part = task.add_output(row_indices_store) pos_part = task.add_input(self.pos) task.add_constraint(image(pos_part, row_indices_part)) task.execute() - row_indices = store_to_cupynumeric_array(row_indices) + row_indices = store_to_cupynumeric_array(row_indices_store) col_indices = store_to_cupynumeric_array(self.crd) vals_array = store_to_cupynumeric_array(self.vals) mask = vals_array != 0.0 @@ -1474,7 +1616,7 @@ def nonzero(self): # spmv computes y = A @ x. -def spmv(A: csr_array, x: cupynumeric.ndarray, y: cupynumeric.ndarray): +def spmv(A: csr_array, x: cn.ndarray, y: cn.ndarray) -> None: """Perform sparse matrix vector product y = A @ x. Parameters @@ -1506,10 +1648,16 @@ def spmv(A: csr_array, x: cupynumeric.ndarray, y: cupynumeric.ndarray): x_var = task.add_input(x_store) task.add_constraint(align(y_var, pos_var)) - task.add_constraint(image(pos_var, crd_var, hint=ImageComputationHint.FIRST_LAST)) - task.add_constraint(image(pos_var, vals_var, hint=ImageComputationHint.FIRST_LAST)) + task.add_constraint( + image(pos_var, crd_var, hint=ImageComputationHint.FIRST_LAST) + ) + task.add_constraint( + image(pos_var, vals_var, hint=ImageComputationHint.FIRST_LAST) + ) # exact or approximate image to X - task.add_constraint(image(crd_var, x_var, hint=ImageComputationHint.MIN_MAX)) + task.add_constraint( + image(crd_var, x_var, hint=ImageComputationHint.MIN_MAX) + ) task.execute() @@ -1553,7 +1701,7 @@ def spgemm_csr_csr_csr(A: csr_array, B: csr_array) -> csr_array: if runtime.num_gpus > 0: # replacement for the ImagePartition functor to get dense image # for rows of B, run separate task for this - pos_rect = 
def geam(
    A: csr_array,
    B: csr_array,
    alpha: Any,
    beta: Any,
    C: csr_array | None = None,
) -> csr_array:
    """Compute C = alpha * A + beta * B for CSR matrices.

    Parameters
    ----------
    A : csr_array
        First input sparse matrix.
    B : csr_array
        Second input sparse matrix. Must have same shape as A.
    alpha : scalar-like
        Scalar multiplier for A. Will be cast to A.dtype.
    beta : scalar-like
        Scalar multiplier for B. Will be cast to A.dtype.
    C : csr_array, optional
        Output sparse matrix. If provided, must have the correct sparsity
        pattern to hold the result. If None, a new matrix is allocated.

    Returns
    -------
    csr_array
        The result C = alpha * A + beta * B.

    Notes
    -----
    If C is provided, it is the user's responsibility to ensure the sparsity
    pattern matches the result. Behavior is undefined otherwise.

    alpha and beta may be integers, floats, or complex values. They are
    converted to A.dtype before computation. For complex inputs, A.dtype
    should be a complex dtype to preserve the imaginary component.
    """

    if C is None:
        perform_symbolic_phase = True
    else:
        # If C is provided, assume it has the correct sparsity pattern
        assert isinstance(C, csr_array), "C must be a Legate Sparse CSR array"
        perform_symbolic_phase = False

    # Symbolic phase: compute the sparsity pattern of the result
    if perform_symbolic_phase:
        nnz_per_row = runtime.create_store(nnz_ty, A.pos.shape)
        task = runtime.create_auto_task(SparseOpCode.GEAM_CSR_CSR_SYMBOLIC)
        A_pos_part = task.add_input(A.pos)
        A_crd_part = task.add_input(A.crd)
        B_pos_part = task.add_input(B.pos)
        B_crd_part = task.add_input(B.crd)
        nnz_per_row_part = task.add_output(nnz_per_row)

        # crd stores are partitioned through the image of their pos stores;
        # A, B and the per-row nnz output share one row partition.
        task.add_constraint(image(A_pos_part, A_crd_part))
        task.add_constraint(image(B_pos_part, B_crd_part))
        task.add_constraint(align(A_pos_part, B_pos_part))
        task.add_constraint(align(A_pos_part, nnz_per_row_part))

        task.execute()

        # Compute C_pos from nnz_per_row using the helper from CompressedBase
        C_pos, nnz_scalar = CompressedBase.nnz_to_pos_cls(nnz_per_row)
        # Converting the nnz future to int blocks until it is ready.
        nnz_total = int(nnz_scalar)

    # Allocate output arrays if needed
    if perform_symbolic_phase:
        C_vals = runtime.create_store(A.dtype, shape=(nnz_total,))
        C_crd = runtime.create_store(coord_ty, shape=(nnz_total,))
    else:
        C_vals = C.vals
        C_crd = C.crd
        C_pos = C.pos

    # Create scalar stores for alpha and beta
    # NOTE(review): both scalars are cast to A.dtype even when C is supplied;
    # confirm this is intended if C.dtype differs from A.dtype.
    alpha_store = runtime.legate_runtime.create_store_from_scalar(
        Scalar(A.dtype.type(alpha))
    )
    beta_store = runtime.legate_runtime.create_store_from_scalar(
        Scalar(A.dtype.type(beta))
    )

    # Compute phase: C = alpha * A + beta * B
    task = runtime.create_auto_task(SparseOpCode.GEAM_CSR_CSR_COMPUTE)

    # Inputs (order must match C++ template expectations)
    A_pos_part = task.add_input(A.pos)
    A_crd_part = task.add_input(A.crd)
    A_vals_part = task.add_input(A.vals)
    B_pos_part = task.add_input(B.pos)
    B_crd_part = task.add_input(B.crd)
    B_vals_part = task.add_input(B.vals)

    # C_pos is an INPUT (already computed in symbolic phase)
    C_pos_part = task.add_input(C_pos)

    # C_crd and C_vals are outputs
    C_crd_part = task.add_output(C_crd)
    C_vals_part = task.add_output(C_vals)

    # Scalar inputs (alpha and beta)
    task.add_input(alpha_store)
    task.add_input(beta_store)

    # Align row partitions: A, B, C all partitioned by the same rows
    task.add_constraint(align(A_pos_part, B_pos_part))
    task.add_constraint(align(A_pos_part, C_pos_part))

    # Image constraints: crd and vals are partitioned via pos
    task.add_constraint(image(A_pos_part, A_crd_part))
    task.add_constraint(image(A_pos_part, A_vals_part))
    task.add_constraint(image(B_pos_part, B_crd_part))
    task.add_constraint(image(B_pos_part, B_vals_part))
    task.add_constraint(image(C_pos_part, C_crd_part))
    task.add_constraint(image(C_pos_part, C_vals_part))

    task.execute()

    if perform_symbolic_phase:
        return csr_array(
            (C_vals, C_crd, C_pos), shape=A.shape, dtype=A.dtype, copy=False
        )
    return C
20f2dc5c..4fc035ca 100644 --- a/legate_sparse/dia.py +++ b/legate_sparse/dia.py @@ -44,10 +44,13 @@ # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +from __future__ import annotations -import cupynumeric -import numpy -import scipy # type: ignore +from typing import TYPE_CHECKING + +import cupynumeric as cn +import numpy as np +import scipy from .base import CompressedBase from .coverage import clone_scipy_arr_kind @@ -59,6 +62,11 @@ store_to_cupynumeric_array, ) +if TYPE_CHECKING: + from typing import Any + + import numpy.typing as npt + # Temporary implementation for matrix generation in examples @clone_scipy_arr_kind(scipy.sparse.dia_array) @@ -128,7 +136,13 @@ class dia_array(CompressedBase): [0, 7, 9]]) """ - def __init__(self, arg, shape=None, dtype=None, copy=False): + def __init__( + self, + arg: tuple[cn.ndarray, cn.ndarray], + shape: tuple[int, ...] | None = None, + dtype: npt.dtype[Any] | None = None, + copy: bool = False, + ) -> None: """Initialize a DIA array. Parameters @@ -169,14 +183,14 @@ def __init__(self, arg, shape=None, dtype=None, copy=False): assert isinstance(arg, tuple) data, offsets = arg if isinstance(offsets, int): - offsets = cupynumeric.full((1,), offsets) + offsets = cn.full((1,), offsets) data, offsets = cast_arr(data), cast_arr(offsets) if dtype is not None: data = data.astype(dtype) dtype = data.dtype assert dtype is not None - if not isinstance(dtype, numpy.dtype): - dtype = numpy.dtype(dtype) + if not isinstance(dtype, np.dtype): + dtype = np.dtype(dtype) self.dtype = dtype # Ensure that we don't accidentally include ndarray @@ -185,10 +199,10 @@ def __init__(self, arg, shape=None, dtype=None, copy=False): # legate under the hood. 
self.shape = tuple(int(i) for i in shape) self._offsets = get_store_from_cupynumeric_array(offsets, copy=copy) - self._data = get_store_from_cupynumeric_array(data, copy=copy) + self._store = get_store_from_cupynumeric_array(data, copy=copy) @property - def nnz(self): + def nnz(self) -> int: """Number of stored values, including explicit zeros. Returns @@ -211,7 +225,7 @@ def nnz(self): return int(nnz) @property - def data(self): + def data(self) -> cn.ndarray: """Get the data array of the DIA matrix. Returns @@ -220,10 +234,10 @@ def data(self): The data array containing the diagonal values. Each row represents a diagonal, with shape (n_diagonals, max_diagonal_length). """ - return store_to_cupynumeric_array(self._data) + return store_to_cupynumeric_array(self._store) @property - def offsets(self): + def offsets(self) -> cn.ndarray: """Get the offsets array of the DIA matrix. Returns @@ -235,7 +249,7 @@ def offsets(self): """ return store_to_cupynumeric_array(self._offsets) - def copy(self): + def copy(self) -> dia_array: """Returns a copy of this matrix. Returns @@ -243,11 +257,13 @@ def copy(self): dia_array A copy of the matrix with the same data and structure. """ - data = cupynumeric.array(self.data) - offsets = cupynumeric.array(self.offsets) + data = cn.array(self.data) + offsets = cn.array(self.offsets) return dia_array((data, offsets), shape=self.shape, dtype=self.dtype) - def transpose(self, axes=None, copy=False): + def transpose( + self, axes: tuple[int, ...] | None = None, copy: bool = False + ) -> dia_array: """Reverses the dimensions of the sparse matrix. 
Parameters @@ -295,13 +311,13 @@ def transpose(self, axes=None, copy=False): offsets = -self.offsets # re-align the data matrix - r = cupynumeric.arange(len(offsets), dtype=coord_ty)[:, None] - c = cupynumeric.arange(num_rows, dtype=coord_ty) - (offsets % max_dim)[:, None] + r = cn.arange(len(offsets), dtype=coord_ty)[:, None] + c = cn.arange(num_rows, dtype=coord_ty) - (offsets % max_dim)[:, None] pad_amount = max(0, max_dim - self.data.shape[1]) - data = cupynumeric.hstack( + data = cn.hstack( ( self.data, - cupynumeric.zeros( + cn.zeros( (self.data.shape[0], pad_amount), dtype=self.data.dtype ), ) @@ -316,7 +332,7 @@ def transpose(self, axes=None, copy=False): T = property(transpose, doc="Transpose of the matrix") - def tocsr(self, copy=False): + def tocsr(self, copy: bool = False) -> csr_array: """Convert this matrix to a CSR matrix. Parameters @@ -341,7 +357,7 @@ def tocsr(self, copy=False): return self.transpose(copy=copy)._tocsr_transposed(copy=False) # This routine is lifted from scipy.sparse's converter. - def _tocsr_transposed(self, copy=False): + def _tocsr_transposed(self, copy: bool = False) -> csr_array: """Convert the transposed DIA matrix to CSR format. This internal method converts a transposed DIA matrix to CSR format. 
@@ -374,7 +390,7 @@ def _tocsr_transposed(self, copy=False): num_rows, num_cols = self.shape num_offsets, offset_len = self.data.shape - offset_inds = cupynumeric.arange(offset_len) + offset_inds = cn.arange(offset_len) row = offset_inds - self.offsets[:, None] mask = row >= 0 @@ -383,14 +399,14 @@ def _tocsr_transposed(self, copy=False): mask &= self.data != 0 idx_dtype = coord_ty - indptr = cupynumeric.zeros(num_cols + 1, dtype=idx_dtype) + indptr = cn.zeros(num_cols + 1, dtype=idx_dtype) # note that the output dtype in a reduction (e.g, sum) determines # the dtype of the accumulator that is used in the reduction # in cupynumeric, it looks like the output dtype is set to the src # dtype if unspecified and that results in the output not performing # an integer sum. But we want the integer sum, so specify # dtype as idx_dtype to mask.sum() - indptr[1 : offset_len + 1] = cupynumeric.cumsum( + indptr[1 : offset_len + 1] = cn.cumsum( mask.sum(axis=0, dtype=idx_dtype)[:num_cols] ) if offset_len < num_cols: @@ -398,7 +414,10 @@ def _tocsr_transposed(self, copy=False): indices = row.T[mask.T].astype(idx_dtype, copy=False) data = self.data.T[mask.T] return csr_array( - (data, indices, indptr), shape=self.shape, dtype=self.dtype, copy=False + (data, indices, indptr), + shape=self.shape, + dtype=self.dtype, + copy=False, ) diff --git a/legate_sparse/gallery.py b/legate_sparse/gallery.py index 371a4c44..e5583e6e 100644 --- a/legate_sparse/gallery.py +++ b/legate_sparse/gallery.py @@ -66,15 +66,29 @@ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN # THE SOFTWARE. 
+from __future__ import annotations +from typing import TYPE_CHECKING -import cupynumeric -import numpy +import cupynumeric as cn +import numpy as np +from .csr import csr_array from .dia import dia_array +if TYPE_CHECKING: + from typing import Any, Sequence -def diags(diagonals, offsets=0, shape=None, format=None, dtype=None): + import numpy.typing as npt + + +def diags( + diagonals: Sequence[cn.ndarray], + offsets: Sequence[int] | int = 0, + shape: tuple[int, ...] | None = None, + format: str | None = None, + dtype: npt.dtype[Any] | None = None, +) -> csr_array | dia_array: """Construct a sparse matrix from diagonals. Parameters @@ -159,22 +173,25 @@ def diags(diagonals, offsets=0, shape=None, format=None, dtype=None): [ 0., 0., 0., 0.]]) """ # if offsets is not a sequence, assume that there's only one diagonal - if numpy.isscalar(offsets): + diags: list[cn.ndarray] + if np.isscalar(offsets): # now check that there's actually only one diagonal - if len(diagonals) == 0 or numpy.isscalar(diagonals[0]): - diagonals = [cupynumeric.atleast_1d(diagonals)] + if len(diagonals) == 0 or np.isscalar(diagonals[0]): + diags = [cn.atleast_1d(diagonals)] # type: ignore [list-item, arg-type] else: raise ValueError("Different number of diagonals and offsets.") else: - diagonals = list(map(cupynumeric.atleast_1d, diagonals)) + diags = cn.atleast_1d(*diagonals) # type: ignore [assignment] + + assert not isinstance(offsets, int) # Basic check - if len(diagonals) != len(offsets): + if len(diags) != len(offsets): raise ValueError("Different number of diagonals and offsets.") # Determine shape, if omitted if shape is None: - m = len(diagonals[0]) + abs(int(offsets[0])) + m = len(diags[0]) + abs(int(offsets[0])) shape = (m, m) # Determine data type, if omitted @@ -187,34 +204,38 @@ def diags(diagonals, offsets=0, shape=None, format=None, dtype=None): # Construct data array m, n = shape - M = max([min(m + offset, n - offset) + max(0, offset) for offset in offsets]) + M = max( + [min(m + 
offset, n - offset) + max(0, offset) for offset in offsets] + ) M = max(0, M) - data_arr = cupynumeric.zeros((len(offsets), M), dtype=dtype) + data_arr = cn.zeros((len(offsets), M), dtype=dtype) K = min(m, n) - for j, diagonal in enumerate(diagonals): + for j, diag in enumerate(diags): offset = int(offsets[j]) k = max(0, offset) length = min(m + offset, n - offset, K) if length < 0: - raise ValueError("Offset %d (index %d) out of bounds" % (offset, j)) + raise ValueError( + "Offset %d (index %d) out of bounds" % (offset, j) + ) try: - data_arr[j, k : k + length] = diagonal[..., :length] + data_arr[j, k : k + length] = diag[..., :length] except ValueError as e: - if len(diagonal) != length and len(diagonal) != 1: + if len(diag) != length and len(diag) != 1: raise ValueError( "Diagonal length (index %d: %d at offset %d) does not " "agree with matrix size (%d, %d)." - % (j, len(diagonal), offset, m, n) + % (j, len(diag), offset, m, n) ) from e raise # We importantly don't perform this conversion to cupynumeric (involving # an attach operation) until we're done indexing into the list. This # avoid a cupynumeric crash involving restrictions in attach in pde.py. 
- offsets = cupynumeric.atleast_1d(offsets) - dia = dia_array((data_arr, offsets), shape=(m, n), dtype=dtype) + offsets_array: cn.ndarray = cn.atleast_1d(offsets) # type: ignore [arg-type, assignment] + dia = dia_array((data_arr, offsets_array), shape=(m, n), dtype=dtype) if format == "csr": return dia.tocsr() return dia diff --git a/legate_sparse/install_info.py.in b/legate_sparse/install_info.py.in index 84799ee4..3ad3ecd1 100644 --- a/legate_sparse/install_info.py.in +++ b/legate_sparse/install_info.py.in @@ -11,9 +11,13 @@ # IMPORTANT: # * install_info.py is a generated file and should not be modified by hand +from __future__ import annotations + def get_libpath(): - import os, sys, platform + import os + import platform + import sys join = os.path.join exists = os.path.exists dirname = os.path.dirname @@ -32,10 +36,10 @@ def get_libpath(): return None return ( - find_liblegate_sparse(join(cn_path, "build", "lib")) or - find_liblegate_sparse(join(dirname(dirname(dirname(cn_path))), "lib")) or - find_liblegate_sparse(join(dirname(dirname(sys.executable)), "lib")) or - "" + find_liblegate_sparse(join(cn_path, "build", "lib")) + or find_liblegate_sparse(join(dirname(dirname(dirname(cn_path))), "lib")) + or find_liblegate_sparse(join(dirname(dirname(sys.executable)), "lib")) + or "" ) diff --git a/legate_sparse/io.py b/legate_sparse/io.py index ecaf8e3c..6ce90ba5 100644 --- a/legate_sparse/io.py +++ b/legate_sparse/io.py @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import annotations import numpy as np from legate.core import track_provenance, types @@ -22,8 +23,8 @@ from .utils import store_to_cupynumeric_array -@track_provenance(runtime.sparse_library) -def mmread(source): +@track_provenance() +def mmread(source: str) -> csr_array: """Read a sparse matrix from a Matrix Market (.mtx) file. 
Parameters @@ -59,28 +60,34 @@ def mmread(source): # TODO (rohany): We'll assume for now that all of the nodes in the system # can access the file passed in, so we don't need to worry about where this # task gets mapped to. - rows = runtime.create_store(coord_ty, ndim=1) - cols = runtime.create_store(coord_ty, ndim=1) - vals = runtime.create_store(float64, ndim=1) - m = runtime.create_store(coord_ty, optimize_scalar=True, shape=(1,)) - n = runtime.create_store(coord_ty, optimize_scalar=True, shape=(1,)) - nnz = runtime.create_store(nnz_ty, optimize_scalar=True, shape=(1,)) + rows_store = runtime.create_store(coord_ty, ndim=1) + cols_store = runtime.create_store(coord_ty, ndim=1) + vals_store = runtime.create_store(float64, ndim=1) + m_store = runtime.create_store(coord_ty, optimize_scalar=True, shape=(1,)) + n_store = runtime.create_store(coord_ty, optimize_scalar=True, shape=(1,)) + nnz_store = runtime.create_store(nnz_ty, optimize_scalar=True, shape=(1,)) task = runtime.create_auto_task(SparseOpCode.READ_MTX_TO_COO) - task.add_output(m) - task.add_output(n) - task.add_output(nnz) - task.add_output(rows) - task.add_output(cols) - task.add_output(vals) + task.add_output(m_store) + task.add_output(n_store) + task.add_output(nnz_store) + task.add_output(rows_store) + task.add_output(cols_store) + task.add_output(vals_store) task.add_scalar_arg(source, types.string_type) task.execute() - m = int(np.asarray(m.get_physical_store().get_inline_allocation())[0]) - n = int(np.asarray(n.get_physical_store().get_inline_allocation())[0]) - nnz = int(np.asarray(nnz.get_physical_store().get_inline_allocation())[0]) + m = int( + np.asarray(m_store.get_physical_store().get_inline_allocation())[0] + ) + n = int( + np.asarray(n_store.get_physical_store().get_inline_allocation())[0] + ) + nnz = int( + np.asarray(nnz_store.get_physical_store().get_inline_allocation())[0] + ) # Slice down each store from the resulting size into the actual size. 
sl = slice(0, nnz) - rows = store_to_cupynumeric_array(rows.slice(0, sl)) - cols = store_to_cupynumeric_array(cols.slice(0, sl)) - vals = store_to_cupynumeric_array(vals.slice(0, sl)) + rows = store_to_cupynumeric_array(rows_store.slice(0, sl)) + cols = store_to_cupynumeric_array(cols_store.slice(0, sl)) + vals = store_to_cupynumeric_array(vals_store.slice(0, sl)) return csr_array((vals, (rows, cols)), shape=(m, n)) diff --git a/legate_sparse/linalg.py b/legate_sparse/linalg.py index 82aa0edb..789cd2a9 100644 --- a/legate_sparse/linalg.py +++ b/legate_sparse/linalg.py @@ -93,15 +93,30 @@ """ +from __future__ import annotations + import inspect import warnings +from typing import TYPE_CHECKING, Protocol -import cupynumeric as np -from legate.core import track_provenance, types +import cupynumeric as cn +import numpy as np +from legate.core import align, image, track_provenance, types from .config import SparseOpCode from .runtime import runtime -from .utils import get_store_from_cupynumeric_array +from .utils import get_store_from_cupynumeric_array, store_to_cupynumeric_array + +if TYPE_CHECKING: + from typing import Any + + import numpy.typing as npt + + +class LOCallable(Protocol): + def __call__( + self, x: cn.ndarray, out: cn.ndarray | None = None + ) -> cn.ndarray: ... # We have to implement our own / copy the LinearOperator class from @@ -196,7 +211,7 @@ class LinearOperator: ndim = 2 - def __new__(cls, *args, **kwargs): + def __new__(cls, *args: Any, **kwargs: Any) -> LinearOperator: if cls is LinearOperator: # Operate as _CustomLinearOperator factory. return super(LinearOperator, cls).__new__(_CustomLinearOperator) @@ -216,7 +231,9 @@ def __new__(cls, *args, **kwargs): return obj - def __init__(self, dtype, shape): + def __init__( + self, dtype: npt.dtype[Any] | None, shape: tuple[int, ...] + ) -> None: """Initialize this LinearOperator. To be called by subclasses. 
``dtype`` may be None; ``shape`` should @@ -229,13 +246,21 @@ def __init__(self, dtype, shape): self.dtype = dtype self.shape = shape - def _init_dtype(self): + def _init_dtype(self) -> None: """Called from subclasses at the end of the __init__ routine.""" if self.dtype is None: - v = np.zeros(self.shape[-1]) - self.dtype = np.asarray(self.matvec(v)).dtype + v = cn.zeros(self.shape[-1]) + self.dtype = cn.asarray(self.matvec(v)).dtype + + def _matmat( + self, x: cn.ndarray, out: cn.ndarray | None = None + ) -> cn.ndarray: + """Default matrix-matrix multiplication handler.""" + raise NotImplementedError - def _matvec(self, x, out=None): + def _matvec( + self, x: cn.ndarray, out: cn.ndarray | None = None + ) -> cn.ndarray: """Default matrix-vector multiplication handler. If self is a linear operator of shape (M, N), then this method will @@ -247,7 +272,9 @@ def _matvec(self, x, out=None): """ raise NotImplementedError - def matvec(self, x, out=None): + def matvec( + self, x: cn.ndarray, out: cn.ndarray | None = None + ) -> cn.ndarray: """Matrix-vector multiplication. Performs the operation y=A*x where A is an MxN linear @@ -275,7 +302,7 @@ def matvec(self, x, out=None): if x.shape != (N,) and x.shape != (N, 1): raise ValueError("dimension mismatch") - y = np.asarray(self._matvec(x, out=out)) + y = cn.asarray(self._matvec(x, out=out)) if x.ndim == 1: # TODO (hme): This is a cuPyNumeric bug, reshape should accept an @@ -288,11 +315,15 @@ def matvec(self, x, out=None): return y - def _rmatvec(self, x, out=None): + def _rmatvec( + self, x: cn.ndarray, out: cn.ndarray | None = None + ) -> cn.ndarray: """Default implementation of _rmatvec; defers to adjoint.""" raise NotImplementedError - def rmatvec(self, x, out=None): + def rmatvec( + self, x: cn.ndarray, out: cn.ndarray | None = None + ) -> cn.ndarray: """Adjoint matrix-vector multiplication. 
Performs the operation y = A^H * x where A is an MxN linear @@ -320,14 +351,16 @@ def rmatvec(self, x, out=None): if x.shape != (M,) and x.shape != (M, 1): raise ValueError("dimension mismatch") - y = np.asarray(self._rmatvec(x, out=out)) + y = cn.asarray(self._rmatvec(x, out=out)) if x.ndim == 1: y = y.reshape(N) elif x.ndim == 2: y = y.reshape(N, 1) else: - raise ValueError("invalid shape returned by user-defined rmatvec()") + raise ValueError( + "invalid shape returned by user-defined rmatvec()" + ) return y @@ -337,88 +370,104 @@ def rmatvec(self, x, out=None): class _CustomLinearOperator(LinearOperator): """Linear operator defined in terms of user-specified operations.""" + _matvec_impl: LOCallable + _rmatvec_impl: LOCallable | None + def __init__( self, - shape, - matvec, - rmatvec=None, - matmat=None, - dtype=None, - rmatmat=None, - ): + shape: tuple[int, ...], + matvec: LOCallable, + rmatvec: LOCallable | None = None, + matmat: LOCallable | None = None, + dtype: npt.dtype[Any] | None = None, + rmatmat: LOCallable | None = None, + ) -> None: super().__init__(dtype, shape) self.args = () - self.__matvec_impl = matvec - self.__rmatvec_impl = rmatvec + self._matvec_impl = matvec + self._rmatvec_impl = rmatvec # Check if the implementations of matvec and rmatvec have the out= # parameter. 
- self._matvec_has_out = self._has_out(self.__matvec_impl) - self._rmatvec_has_out = self._has_out(self.__rmatvec_impl) + self._matvec_has_out = self._has_out(self._matvec_impl) + self._rmatvec_has_out = self._has_out(self._rmatvec_impl) self._init_dtype() - def _matvec(self, x, out=None): + def _matvec( + self, x: cn.ndarray, out: cn.ndarray | None = None + ) -> cn.ndarray: if self._matvec_has_out: - return self.__matvec_impl(x, out=out) + return self._matvec_impl(x, out=out) else: if out is None: - return self.__matvec_impl(x) + return self._matvec_impl(x) else: - out[:] = self.__matvec_impl(x) + out[:] = self._matvec_impl(x) return out - def _rmatvec(self, x, out=None): - func = self.__rmatvec_impl + def _rmatvec( + self, x: cn.ndarray, out: cn.ndarray | None = None + ) -> cn.ndarray: + assert self._rmatvec_impl is not None + func = self._rmatvec_impl if func is None: raise NotImplementedError("rmatvec is not defined") if self._rmatvec_has_out: - return self.__rmatvec_impl(x, out=out) + return self._rmatvec_impl(x, out=out) else: if out is None: - return self.__rmatvec_impl(x) + return self._rmatvec_impl(x) else: - result = self.__rmatvec_impl(x) + result = self._rmatvec_impl(x) out[:] = result return out - def _has_out(self, o): + def _has_out(self, o: LOCallable | None) -> bool: if o is None: return False sig = inspect.signature(o) - for key, param in sig.parameters.items(): - if key == "out": - return True - return False + return "out" in sig.parameters # _SparseMatrixLinearOperator is an overload of LinearOperator to wrap # sparse matrices as a linear operator. It caches the conjugate transpose # of the sparse matrices to avoid repeat conversions. 
class _SparseMatrixLinearOperator(LinearOperator): - def __init__(self, A): + AH: cn.ndarray | None + + def __init__(self, A: cn.ndarray) -> None: self.A = A self.AH = None super().__init__(A.dtype, A.shape) - def _matvec(self, x, out=None): + def _matvec( + self, x: cn.ndarray, out: cn.ndarray | None = None + ) -> cn.ndarray: return self.A.dot(x, out=out) - def _rmatvec(self, x, out=None): + def _rmatvec( + self, x: cn.ndarray, out: cn.ndarray | None = None + ) -> cn.ndarray: if self.AH is None: - self.AH = self.A.T.conj(copy=False) + self.AH = self.A.T.conj() + assert self.AH is not None return self.AH.dot(x, out=out) # IdentityOperator is a no-op linear operator, and is lifted from # scipy.sparse. class IdentityOperator(LinearOperator): - def __init__(self, shape, dtype=None): + def __init__( + self, shape: tuple[int, ...], dtype: npt.dtype[Any] | None = None + ) -> None: super().__init__(dtype, shape) - def _matvec(self, x, out=None): + def _matvec( + self, x: cn.ndarray, out: cn.ndarray | None = None + ) -> cn.ndarray: # If out is specified, copy the input into the output. if out is not None: out[:] = x @@ -428,7 +477,9 @@ def _matvec(self, x, out=None): # the input to avoid silently aliasing the input array. return x.copy() - def _rmatvec(self, x, out=None): + def _rmatvec( + self, x: cn.ndarray, out: cn.ndarray | None = None + ) -> cn.ndarray: # If out is specified, copy the input into the output. if out is not None: out[:] = x @@ -439,7 +490,7 @@ def _rmatvec(self, x, out=None): return x.copy() -def make_linear_operator(A): +def make_linear_operator(A: Any | LinearOperator) -> LinearOperator: """Convert a matrix to a LinearOperator. Parameters @@ -473,7 +524,14 @@ def make_linear_operator(A): # future operations to compute new futures, and avoids # allocating unnecessary futures. 
@track_provenance(nested=True) -def cg_axpby(y, x, a, b, isalpha=True, negate=False): +def cg_axpby( + y: cn.ndarray, + x: cn.ndarray, + a: cn.ndarray, + b: cn.ndarray, + isalpha: bool = True, + negate: bool = False, +) -> cn.ndarray: """Perform fused vector operation for CG solvers. This function performs the operation y = alpha * x + beta * y where @@ -526,7 +584,12 @@ def cg_axpby(y, x, a, b, isalpha=True, negate=False): return y -def _get_atol_rtol(b_norm, tol=None, atol=0.0, rtol=1e-5): +def _get_atol_rtol( + b_norm: float | cn.ndarray, + tol: float | None = None, + atol: float = 0.0, + rtol: float = 1e-5, +) -> tuple[float, float]: """Compute absolute and relative tolerances for convergence. Parameters @@ -561,17 +624,17 @@ def _get_atol_rtol(b_norm, tol=None, atol=0.0, rtol=1e-5): def cg( - A, - b, - x0=None, - tol=None, - maxiter=None, - M=None, - callback=None, - atol=0.0, - rtol=1e-5, - conv_test_iters=25, -): + A: Any | LinearOperator, + b: cn.ndarray, + x0: cn.ndarray | None = None, + tol: float | None = None, + maxiter: int | None = None, + M: Any | LinearOperator | None = None, + callback: Any | None = None, + atol: float = 0.0, + rtol: float = 1e-5, + conv_test_iters: int = 25, +) -> tuple[cn.ndarray, int]: """Solve a linear system using the Conjugate Gradient method. 
Parameters @@ -631,8 +694,8 @@ def cg( assert len(b.shape) == 1 or (len(b.shape) == 2 and b.shape[1] == 1) assert len(A.shape) == 2 and A.shape[0] == A.shape[1] - bnrm2 = np.linalg.norm(b) - atol, _ = _get_atol_rtol(bnrm2, tol, atol, rtol) + b_norm = cn.linalg.norm(b) + atol, _ = _get_atol_rtol(b_norm, tol, atol, rtol) n = b.shape[0] if maxiter is None: @@ -644,15 +707,15 @@ def cg( if M is None else make_linear_operator(M) ) - x = np.zeros(n) if x0 is None else x0.copy() - p = np.zeros(n) + x = cn.zeros(n) if x0 is None else x0.copy() + p = cn.zeros(n) # This implementation is adapted from CuPy's CG solve: # https://github.com/cupy/cupy/blob/master/cupyx/scipy/sparse/linalg/_iterative.py. # # Hold onto several temps to store allocations used in each iteration. r = b - A.matvec(x) iters = 0 - rho = 0 + rho: int | cn.ndarray = 0 z = None q = None @@ -679,9 +742,9 @@ def cg( iters += 1 if callback is not None: callback(x) - if (iters % conv_test_iters == 0 or iters == (maxiter - 1)) and np.linalg.norm( - r - ) < atol: + if ( + iters % conv_test_iters == 0 or iters == (maxiter - 1) + ) and cn.linalg.norm(r) < atol: converged = True # Test convergence every conv_test_iters iterations. break @@ -696,19 +759,19 @@ def cg( # This implementation of GMRES is lifted from the cupy implementation: # https://github.com/cupy/cupy/blob/9d2e2381ae7f33a42291d1bf8271484c9d2a55ac/cupyx/scipy/sparse/linalg/_iterative.py#L94. 
def gmres( - A, - b, - x0=None, - tol=None, - restart=None, - maxiter=None, - M=None, - callback=None, - restrt=None, - atol=0.0, - callback_type=None, - rtol=1e-5, -): + A: Any | LinearOperator, + b: cn.ndarray, + x0: cn.ndarray | None = None, + tol: float | None = None, + restart: int | None = None, + maxiter: int | None = None, + M: Any | LinearOperator | None = None, + callback: Any = None, + restrt: int | None = None, + atol: float = 0.0, + callback_type: str | None = None, + rtol: float = 1e-5, +) -> tuple[cn.ndarray, int]: """Solve a linear system using the Generalized Minimal Residual method. Parameters @@ -796,10 +859,10 @@ def gmres( if M is None else make_linear_operator(M) ) - x = np.zeros(n) if x0 is None else x0.copy() + x = cn.zeros(n) if x0 is None else x0.copy() - bnrm2 = np.linalg.norm(b) - atol, _ = _get_atol_rtol(bnrm2, tol, atol, rtol) + b_norm = cn.linalg.norm(b) + atol, _ = _get_atol_rtol(b_norm, tol, atol, rtol) if maxiter is None: maxiter = n * 10 @@ -813,11 +876,11 @@ def gmres( if callback is None: callback_type = None - V = np.empty((n, restart), dtype=A.dtype) - H = np.zeros((restart + 1, restart), dtype=A.dtype) - e = np.zeros((restart + 1,), dtype=A.dtype) + V = cn.empty((n, restart), dtype=A.dtype) + H: Any = cn.zeros((restart + 1, restart), dtype=A.dtype) + e: Any = cn.zeros((restart + 1,), dtype=A.dtype) - def compute_hu(u, j): + def compute_hu(u: cn.ndarray, j: int) -> tuple[cn.ndarray, cn.ndarray]: """Compute Householder transformation for Arnoldi iteration. 
Parameters @@ -847,7 +910,7 @@ def compute_hu(u, j): while True: mx = M.matvec(x) r = b - A.matvec(mx) - r_norm = np.linalg.norm(r) + r_norm = cn.linalg.norm(r) if callback_type == "x": callback(mx) elif callback_type == "pr_norm" and iters > 0: @@ -863,14 +926,14 @@ def compute_hu(u, j): z = M.matvec(v) u = A.matvec(z) H[: j + 1, j], u = compute_hu(u, j) - H[j + 1, j] = np.linalg.norm(u) + H[j + 1, j] = cn.linalg.norm(u) if j + 1 < restart: v = u / H[j + 1, j] V[:, j + 1] = v # Note: The least-square solution to equation Hy = e is computed on CPU # because it is faster if tha matrix size is small. - ret = np.linalg.lstsq(H, e) + ret = cn.linalg.lstsq(H, e) # type: ignore [attr-defined] y = ret[0] x += V @ y iters += restart @@ -879,3 +942,258 @@ def compute_hu(u, j): if iters == maxiter and not (r_norm <= atol): info = iters return mx, info + + +def spsolve(A: Any, b: np.ndarray) -> np.ndarray: + """ + Solve a linear system of equation Ax=b by factorizing A + + Parameters + ---------- + A : csr_array + Input sparse matrix of shape (N, N). + b : cupynumeric.ndarray + Dense vector of shape (N,). + + Returns + ------- + x : cupynumeric.ndarray + Dense vector of shape (N,), that solves A x = b. + + Raises + ------ + RuntimeError + If attempted to solve on any configuration other than one GPU + ValueError + If the RHS is not one-dimensional + + Notes + ----- + This function uses cuDSS to perform the sparse direct solve, which + computes the reordering on Host. + + """ + + # TODO: + # Support multi-dimensional RHS. Note that cuDSS only supports + # column-major order for x and b, so we need to update the + # mapper for those stores. Partitioning constraints will also need to + # be changed since alignment constraints will need both stores + # to be of the same dimension (e.g., we cannot align pos (1D) + # and b (say, 2D) without manipulating the stores + + # NOTE: multi-gpu runs might hang with cuda < 13.0.0. 
+ # For multi-gpu runs, the user is expected to set the path to + # libcudss_comm_nccl.so in the env CUDSS_COMM_LIB + if runtime.num_gpus == 0: + raise RuntimeError("spsolve is currently supported only for GPU(s)") + + if b.ndim != 1: + raise ValueError(f"RHS must be 1D. Dimension of b is: {b.ndim}") + + b_store = get_store_from_cupynumeric_array(b) + x_store = runtime.create_store(b.dtype, shape=(A.shape[1],)) + + task = runtime.create_auto_task(SparseOpCode.SPSOLVE) + + pos_part = task.add_input(A.pos) + crd_part = task.add_input(A.crd) + vals_part = task.add_input(A.vals) + b_part = task.add_input(b_store) + x_part = task.add_output(x_store) + task.add_scalar_arg(A.shape[0], types.uint64) # global nrows + task.add_scalar_arg(A.vals.size, types.uint64) # global nnz + + # Add communicator + task.add_communicator("nccl") + + # Since we don't support multi-gpu or multi-cpu runs, these constraints + # are not particularly relevant right now, but they enable + # debugging the multi-gpu hang. The matrix and the vectors are + # partitioned row-wise without any sparsity-dependent constraints + # that is typical in other API implementations in legate-sparse + # that use mathlibs (e.g., cuSparse). This passes on the responsibility + # of inserting appropriate communication primitives to the + # underlying math library, cuDSS. 
This is why we don't constraint the + # partition of x to the image of crd (e.g., like in SpMv in csr.py) + task.add_constraint(image(pos_part, crd_part)) + task.add_constraint(image(pos_part, vals_part)) + task.add_constraint(align(x_part, pos_part)) + task.add_constraint(align(b_part, pos_part)) + + task.execute() + + return store_to_cupynumeric_array(x_store) + + +# this function has been adapted from cupy's implementation of `eigsh`: +# https://github.com/cupy/cupy/blob/v13.6.0/cupyx/scipy/sparse/linalg/_eigen.py +def eigsh( + a, + k=6, + *, + which="LM", + v0=None, + ncv=None, + maxiter=None, + tol=0, + return_eigenvectors=True, +): + def _lanczos(a, V, u, alpha, beta, i_start, i_end): + for i in range(i_start, i_end): + u[...] = a.matvec(V[i]) + alpha[i] = cn.dot(V[i].conj(), u) + + # Full reorthogonalization with "twice is enough" strategy + # for improved numerical stability. This matches the approach + # used in robust Lanczos implementations. + # First pass + coeffs = V[: i + 1].conj() @ u + u -= coeffs @ V[: i + 1] + # Second pass for numerical stability + coeffs2 = V[: i + 1].conj() @ u + u -= coeffs2 @ V[: i + 1] + + beta[i] = cn.linalg.norm(u) + if i >= i_end - 1: + break + V[i + 1] = u / beta[i] + + def _eigsh_solve_ritz(alpha, beta, beta_k, k, which): + # Note: This is done on the CPU using numpy, following CuPy's approach. + # This avoids numerical issues that can occur with GPU-based eigh + # on small tridiagonal matrices from the thick-restart Lanczos. 
+        alpha_np = np.array(alpha)
+        beta_np = np.array(beta)
+        t = np.diag(alpha_np)
+        t = t + np.diag(beta_np[:-1], k=1)
+        t = t + np.diag(beta_np[:-1], k=-1)
+        if beta_k is not None:
+            beta_k_np = np.array(beta_k)
+            t[k, :k] = beta_k_np
+            t[:k, k] = beta_k_np
+        w, s = np.linalg.eigh(t)
+
+        # Pick-up k ritz-values and ritz-vectors
+        if which == "LA":
+            idx = np.argsort(w)
+            wk = w[idx[-k:]]
+            sk = s[:, idx[-k:]]
+        elif which == "LM":
+            idx = np.argsort(np.absolute(w))
+            wk = w[idx[-k:]]
+            sk = s[:, idx[-k:]]
+        elif which == "SA":
+            idx = np.argsort(w)
+            wk = w[idx[:k]]
+            sk = s[:, idx[:k]]
+        # Convert back to cupynumeric arrays
+        return cn.array(wk), cn.array(sk)
+
+    # Convert to LinearOperator for uniform matvec interface
+    a = make_linear_operator(a)
+    n = a.shape[0]
+    if a.ndim != 2 or a.shape[0] != a.shape[1]:
+        raise ValueError("expected square matrix (shape: {})".format(a.shape))
+    if a.dtype.char not in "fdFD":
+        raise TypeError("unsupported dtype (actual: {})".format(a.dtype))
+    if k <= 0:
+        raise ValueError("k must be greater than 0 (actual: {})".format(k))
+    if k >= n:
+        raise ValueError("k must be smaller than n (actual: {})".format(k))
+    if which not in ("LM", "LA", "SA"):
+        raise ValueError(
+            "which must be 'LM', 'LA' or 'SA' (actual: {})".format(which)
+        )
+    if ncv is None:
+        ncv = min(max(2 * k, k + 32), n - 1)
+    else:
+        ncv = min(max(ncv, k + 2), n - 1)
+    if maxiter is None:
+        maxiter = 10 * n
+    if tol == 0:
+        tol = cn.finfo(a.dtype).eps
+
+    if k + 1 == ncv:
+        raise ValueError(
+            f"k must be smaller than ncv - 1 (k + 1 < ncv < n)."
+ f" ncv: {ncv}, k: {k}, n: {n}" + ) + + alpha = cn.zeros((ncv,), dtype=a.dtype) + beta = cn.zeros((ncv,), dtype=a.dtype.char.lower()) + V = cn.empty((ncv, n), dtype=a.dtype) + + if v0 is None: + u = cn.random.random((n,)).astype(a.dtype) + V[0] = u / cn.linalg.norm(u) + else: + u = v0 + V[0] = v0 / cn.linalg.norm(v0) + + _lanczos(a, V, u, alpha, beta, 0, ncv) + + iter_current = ncv + w, s = _eigsh_solve_ritz(alpha, beta, None, k, which) + x = V.T @ s + + beta_k = beta[-1] * s[-1, :] + res = cn.linalg.norm(beta_k) + + iter_increment = ncv - k + # Track initial beta scale for detecting relative breakdown + # When beta[k] is too small relative to the typical beta values, + # the thick restart becomes numerically unstable + initial_beta_scale = cn.max(cn.abs(beta[:-1])) + + while res > tol and iter_current < maxiter: + beta[:k] = 0 + alpha[:k] = w + V[:k] = x.T + + # Full reorthogonalization with "twice is enough" (same as in _lanczos) + coeffs = V[:k].conj() @ u + u = u - coeffs @ V[:k] + coeffs2 = V[:k].conj() @ u + u = u - coeffs2 @ V[:k] + + u_norm = cn.linalg.norm(u) + # Check for numerical breakdown: if u_norm is too small relative + # to initial scale, the thick restart becomes numerically unstable. + # A ratio < 0.1 indicates potential numerical issues. + if u_norm < 0.1 * initial_beta_scale: + # Accept current eigenvalues as converged + break + + V[k] = u / u_norm + u[...] 
= a.matvec(V[k]) + alpha[k] = cn.dot(V[k].conj(), u) + u -= alpha[k] * V[k] + u -= V[:k].T @ beta_k + beta[k] = cn.linalg.norm(u) + + # Check for numerical breakdown after computing beta[k] + # If beta[k] is very small relative to initial scale, + # continuing will cause numerical instability + if beta[k] < 0.1 * initial_beta_scale: + # Accept current eigenvalues as converged + break + + # note that this can run into Out of bounds error + # in legate if `k` is not properly constrained + # in the initial part of the algorithm + V[k + 1] = u / beta[k] + + _lanczos(a, V, u, alpha, beta, k + 1, ncv) + w, s = _eigsh_solve_ritz(alpha, beta, beta_k, k, which) + x = V.T @ s + beta_k = beta[-1] * s[-1, :] + res = cn.linalg.norm(beta_k) + + iter_current += iter_increment + + if return_eigenvectors: + idx = cn.argsort(w) + return w[idx], x[:, idx] + else: + return cn.sort(w) diff --git a/legate_sparse/module.py b/legate_sparse/module.py index 56f22fa1..2fe4c5dd 100644 --- a/legate_sparse/module.py +++ b/legate_sparse/module.py @@ -44,7 +44,9 @@ # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +from __future__ import annotations +from typing import Any from .csr import csr_array # noqa: F401 from .dia import dia_array # noqa: F401 @@ -56,10 +58,11 @@ # returns whether or not an object is a legate sparse created sparse matrix. -def _is_sparse_matrix(obj) -> bool: +def _is_sparse_matrix(obj: Any) -> bool: return any((isinstance(obj, csr_array), isinstance(obj, dia_array))) -def isspmatrix(obj) -> bool: + +def isspmatrix(obj: Any) -> bool: """Check if an object is a legate sparse matrix. Parameters @@ -81,7 +84,7 @@ def isspmatrix(obj) -> bool: return _is_sparse_matrix(obj) -def issparse(obj) -> bool: +def issparse(obj: Any) -> bool: """Check if an object is a legate sparse matrix. 
Parameters @@ -104,7 +107,7 @@ def issparse(obj) -> bool: # Variants for each particular format type. -def isspmatrix_csr(obj): +def isspmatrix_csr(obj: Any) -> bool: """Check if an object is a CSR sparse matrix. Parameters diff --git a/legate_sparse/runtime.py b/legate_sparse/runtime.py index e7a3dc41..84e12d3e 100644 --- a/legate_sparse/runtime.py +++ b/legate_sparse/runtime.py @@ -30,11 +30,12 @@ from .config import SparseOpCode, _library if TYPE_CHECKING: - from typing import Optional, Union + from typing import Any import numpy.typing as npt + from legate.core import Library -TO_CORE_DTYPES = { +TO_CORE_DTYPES: dict[npt.DTypeLike, types.Type] = { np.dtype(np.bool_): types.bool_, np.dtype(np.int8): types.int8, np.dtype(np.int16): types.int16, @@ -54,7 +55,7 @@ # TODO (marsaev): rename to SparseRuntime to avoid confusion? class Runtime: - def __init__(self, sparse_library): + def __init__(self, sparse_library: Library) -> None: self.sparse_library = sparse_library self.legate_runtime = get_legate_runtime() self.legate_machine = get_machine() @@ -66,25 +67,25 @@ def __init__(self, sparse_library): task = self.legate_runtime.create_manual_task( self.sparse_library, SparseOpCode.LOAD_CUDALIBS, - launch_shape=Shape((self.num_gpus,)), + launch_shape=(self.num_gpus,), ) task.execute() self.legate_runtime.issue_execution_fence(block=True) @property - def num_procs(self): + def num_procs(self) -> int: return self.legate_machine.count(self.legate_machine.preferred_target) @property - def num_gpus(self): + def num_gpus(self) -> int: return self.legate_machine.count(TaskTarget.GPU) def create_store( self, - ty: Union[npt.DTypeLike], - shape: Optional[Union[tuple[int, ...], Shape]] = None, + ty: npt.dtype[Any] | types.Type, + shape: Shape | tuple[int, ...] 
| None = None, optimize_scalar: bool = False, - ndim: Optional[int] = None, + ndim: int | None = None, ) -> LogicalStore: core_ty = TO_CORE_DTYPES[ty] if isinstance(ty, np.dtype) else ty return self.legate_runtime.create_store( @@ -92,11 +93,13 @@ def create_store( ) # only OpCode - def create_auto_task(self, OpCode) -> AutoTask: - return self.legate_runtime.create_auto_task(self.sparse_library, OpCode) + def create_auto_task(self, OpCode: int) -> AutoTask: + return self.legate_runtime.create_auto_task( + self.sparse_library, OpCode + ) # OpCode and launch domains - def create_manual_task(self, OpCode, *args) -> ManualTask: + def create_manual_task(self, OpCode: int, *args: Any) -> ManualTask: return self.legate_runtime.create_manual_task( self.sparse_library, OpCode, *args ) diff --git a/legate_sparse/settings.py b/legate_sparse/settings.py index 31e48a0c..7d518777 100644 --- a/legate_sparse/settings.py +++ b/legate_sparse/settings.py @@ -14,7 +14,12 @@ # from __future__ import annotations -from legate.util.settings import PrioritizedSetting, Settings, convert_bool +from legate.util.settings import ( + PrioritizedSetting, + Settings, + convert_bool, + convert_str, +) __all__ = ("settings",) @@ -32,5 +37,15 @@ class SparseRuntimeSettings(Settings): """, ) + cudss_commnccl_loc: PrioritizedSetting[bool] = PrioritizedSetting( + "cudss-comm-lib", + "CUDSS_COMM_LIB", + default="", + convert=convert_str, + help=""" + For multi-gpu runs, set CUDSS_COMM_LIB env to /path/to/libcudss_commlayer_nccl.so + """, + ) + settings = SparseRuntimeSettings() diff --git a/legate_sparse/types.py b/legate_sparse/types.py index 923767f2..a566f617 100644 --- a/legate_sparse/types.py +++ b/legate_sparse/types.py @@ -11,26 +11,27 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+from __future__ import annotations -import numpy +import numpy as np # Define some common types. Hopefully as we make more # progress in generalizing the compute kernels, we can # remove this code. -coord_ty = numpy.dtype(numpy.int64) +coord_ty = np.dtype(np.int64) """Data type for coordinate indices in sparse matrices (int64).""" -nnz_ty = numpy.dtype(numpy.uint64) +nnz_ty = np.dtype(np.uint64) """Data type for non-zero counts in sparse matrices (uint64).""" -float64 = numpy.dtype(numpy.float64) +float64 = np.dtype(np.float64) """64-bit floating point data type.""" -int32 = numpy.dtype(numpy.int32) +int32 = np.dtype(np.int32) """32-bit integer data type.""" -int64 = numpy.dtype(numpy.int64) +int64 = np.dtype(np.int64) """64-bit integer data type.""" -uint64 = numpy.dtype(numpy.uint64) +uint64 = np.dtype(np.uint64) """64-bit unsigned integer data type.""" diff --git a/legate_sparse/utils.py b/legate_sparse/utils.py index 2c072f2b..31b28a28 100644 --- a/legate_sparse/utils.py +++ b/legate_sparse/utils.py @@ -11,26 +11,29 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+from __future__ import annotations import math import traceback -from typing import Any +from typing import TYPE_CHECKING, cast -import cupynumeric -import numpy +import cupynumeric as cn +import numpy as np from legate.core import LogicalStore import legate_sparse from .runtime import runtime +if TYPE_CHECKING: + from typing import Any + + import numpy.typing as npt + + from .csr import csr_array + # Datatypes that spmv and spgemm operations are supported for -SUPPORTED_DATATYPES = ( - numpy.float32, - numpy.float64, - numpy.complex64, - numpy.complex128, -) +SUPPORTED_DATATYPES = (np.float32, np.float64, np.complex64, np.complex128) """Supported datatypes for sparse matrix operations (SpMV and SpGEMM).""" @@ -59,7 +62,7 @@ def find_last_user_stacklevel() -> int: # store_to_cupynumeric_array converts a store to a cuPyNumeric array. -def store_to_cupynumeric_array(store: LogicalStore): +def store_to_cupynumeric_array(store: LogicalStore) -> cn.ndarray: """Convert a LogicalStore to a cupynumeric array. Parameters @@ -72,13 +75,12 @@ def store_to_cupynumeric_array(store: LogicalStore): cupynumeric.ndarray The cupynumeric array representation of the store. """ - return cupynumeric.asarray(store) + return cn.asarray(store) # get_store_from_cupynumeric_array extracts a store from a cuPyNumeric array. def get_store_from_cupynumeric_array( - arr: cupynumeric.ndarray, - copy=False, + arr: cn.ndarray, copy: bool = False ) -> LogicalStore: """Extract a LogicalStore from a cupynumeric array. @@ -96,17 +98,17 @@ def get_store_from_cupynumeric_array( """ if copy: # If requested to make a copy, do so. - arr = cupynumeric.array(arr) + arr = cn.array(arr) data = arr.__legate_data_interface__["data"] array = data[next(iter(data))] store = array.data - return store + return cast(LogicalStore, store) # cast_to_store attempts to cast an arbitrary object into a store. 
-def cast_to_store(arr): +def cast_to_store(arr: cn.ndarray | LogicalStore) -> LogicalStore: """Cast an arbitrary object to a LogicalStore. Parameters @@ -126,16 +128,18 @@ def cast_to_store(arr): """ if isinstance(arr, LogicalStore): return arr - if isinstance(arr, numpy.ndarray): - arr = cupynumeric.array(arr) - if isinstance(arr, cupynumeric.ndarray): + if isinstance(arr, np.ndarray): + arr = cn.array(arr) + if isinstance(arr, cn.ndarray): return get_store_from_cupynumeric_array(arr) raise NotImplementedError # cast_arr attempts to cast an arbitrary object into a cupynumeric # ndarray, with an optional desired type. -def cast_arr(arr, dtype=None): +def cast_arr( + arr: cn.ndarray | LogicalStore, dtype: npt.dtype[Any] | None = None +) -> cn.ndarray: """Cast an arbitrary object to a cupynumeric array. Parameters @@ -152,14 +156,16 @@ def cast_arr(arr, dtype=None): """ if isinstance(arr, LogicalStore): arr = store_to_cupynumeric_array(arr) - elif not isinstance(arr, cupynumeric.ndarray): - arr = cupynumeric.array(arr) + elif not isinstance(arr, cn.ndarray): + arr = cn.array(arr) if dtype is not None: arr = arr.astype(dtype) return arr -def find_common_type(*args): +def find_common_type( + *args: cn.ndarray | csr_array | np.ndarray, +) -> npt.dtype[Any]: """Find the common data type for a set of arrays. This function performs a similar analysis to cupynumeric.ndarray.find_common_type @@ -190,33 +196,10 @@ def find_common_type(*args): scalar_types.append(array.dtype) else: array_types.append(array.dtype) - return numpy.result_type(*array_types, *scalar_types) - - -def cast_to_common_type(*args): - """Cast all arguments to the same common data type. - - Parameters - ---------- - *args : array_like - Arrays to cast to a common type. - - Returns - ------- - tuple - Tuple of arrays, all cast to the same common data type. 
+ return np.result_type(*array_types, *scalar_types) - Notes - ----- - This function first finds the common type using find_common_type, - then casts each input to that type. If all arguments are already - the common type, this will be a no-op. - """ - common_type = find_common_type(*args) - return tuple(arg.astype(common_type, copy=False) for arg in args) - -def factor_int(n): +def factor_int(n: int) -> tuple[int, int]: """Split an integer into two close factors. Parameters @@ -242,7 +225,9 @@ def factor_int(n): return val, val2 -def broadcast_store(store: LogicalStore, shape: Any) -> LogicalStore: +def broadcast_store( + store: LogicalStore, shape: tuple[int, ...] +) -> LogicalStore: """Broadcast a LogicalStore to the desired shape. Parameters @@ -294,12 +279,14 @@ def copy_store(store: LogicalStore) -> LogicalStore: LogicalStore A new LogicalStore with the same data as the input. """ - res = runtime.create_store(store.type, store.shape) # type: ignore + res = runtime.create_store(store.type, store.shape) runtime.legate_runtime.issue_copy(res, store) return res -def store_from_store_or_array(src, copy=False) -> LogicalStore: # type: ignore +def store_from_store_or_array( + src: LogicalStore | cn.ndarray, copy: bool = False +) -> LogicalStore: """Get LogicalStore from a LogicalStore or array, potentially creating a copy. Parameters @@ -319,15 +306,19 @@ def store_from_store_or_array(src, copy=False) -> LogicalStore: # type: ignore AssertionError If the input type is not supported. 
""" - if isinstance(src, cupynumeric.ndarray): + if isinstance(src, cn.ndarray): return get_store_from_cupynumeric_array(src, copy) elif isinstance(src, LogicalStore): return copy_store(src) if copy else src else: - AssertionError("Wrong type for 'store_from_store_or_array()' utility") + raise AssertionError( + "Wrong type for 'store_from_store_or_array()' utility" + ) -def array_from_store_or_array(src, copy=False) -> cupynumeric.ndarray: # type: ignore +def array_from_store_or_array( + src: LogicalStore | cn.ndarray, copy: bool = False +) -> cn.ndarray: """Get array from a LogicalStore or array, potentially creating a copy. Parameters @@ -347,7 +338,7 @@ def array_from_store_or_array(src, copy=False) -> cupynumeric.ndarray: # type: AssertionError If the input type is not supported. """ - if isinstance(src, cupynumeric.ndarray): + if isinstance(src, cn.ndarray): return src.copy() if copy else src elif isinstance(src, LogicalStore): return ( @@ -356,11 +347,12 @@ def array_from_store_or_array(src, copy=False) -> cupynumeric.ndarray: # type: else store_to_cupynumeric_array(src) ) else: - AssertionError("Wrong type for 'array_from_store_or_array()' utility") - # type: ignore + raise AssertionError( + "Wrong type for 'array_from_store_or_array()' utility" + ) -def get_storage_type(src): +def get_storage_type(src: LogicalStore | cn.ndarray) -> npt.dtype[Any]: """Get the storage type of an object. Parameters @@ -378,18 +370,17 @@ def get_storage_type(src): AssertionError If the input type is not supported. 
""" - if isinstance(src, cupynumeric.ndarray): + if isinstance(src, cn.ndarray): return src.dtype elif isinstance(src, LogicalStore): # there is legate.core to_core_dtype(), but here we need the opposite # doing via array now return cast_arr(src).dtype else: - AssertionError("Wrong type for 'get_storage_type()' utility") - # type: ignore + raise AssertionError("Wrong type for 'get_storage_type()' utility") -def is_dtype_supported(dtype: numpy.dtype) -> bool: +def is_dtype_supported(dtype: npt.dtype[Any]) -> bool: """Check if a datatype supports SpMV and SpGEMM operations. Parameters @@ -409,7 +400,7 @@ def is_dtype_supported(dtype: numpy.dtype) -> bool: return dtype in SUPPORTED_DATATYPES -def is_dense(x) -> bool: +def is_dense(x: Any) -> bool: """Check if an object is a dense cupynumeric array. Parameters @@ -422,10 +413,10 @@ def is_dense(x) -> bool: bool True if x is a cupynumeric.ndarray, False otherwise. """ - return isinstance(x, cupynumeric.ndarray) + return isinstance(x, cn.ndarray) -def is_scalar_like(x) -> bool: +def is_scalar_like(x: Any) -> bool: """Check if an object is a scalar-like type. Parameters @@ -445,10 +436,10 @@ def is_scalar_like(x) -> bool: """ if isinstance(x, str): return False - return cupynumeric.isscalar(x) or (is_dense(x) and x.ndim == 0) + return cn.isscalar(x) or (is_dense(x) and x.ndim == 0) -def is_sparse(x) -> bool: +def is_sparse(x: Any) -> bool: """Check if an object is a legate sparse matrix. Parameters @@ -464,7 +455,7 @@ def is_sparse(x) -> bool: return legate_sparse.isspmatrix(x) -def sort_by_rows_then_cols(rows: cupynumeric.ndarray, cols: cupynumeric.ndarray): +def sort_by_rows_then_cols(rows: cn.ndarray, cols: cn.ndarray) -> cn.ndarray: """Sort indices by rows first, then by columns. 
This function is a quick and dirty hack that does what np.lexsort does @@ -501,7 +492,7 @@ def sort_by_rows_then_cols(rows: cupynumeric.ndarray, cols: cupynumeric.ndarray) # note that the lexsort reverses the order of key, # so this would be equivalent to np.lexsort((cols, rows)) - indices = cupynumeric.argsort(cols, kind="stable") - order = cupynumeric.argsort(rows[indices], kind="stable") + indices = cn.argsort(cols, kind="stable") + order = cn.argsort(rows[indices], kind="stable") return indices[order] diff --git a/legate_sparse_cpp.cmake b/legate_sparse_cpp.cmake index 6a90e3b3..2f37b63d 100644 --- a/legate_sparse_cpp.cmake +++ b/legate_sparse_cpp.cmake @@ -105,6 +105,7 @@ if(Legion_USE_CUDA) ) include(cmake/thirdparty/get_nccl.cmake) + include(cmake/thirdparty/get_cudss.cmake) endif() # End From cupynumeric @@ -134,7 +135,7 @@ list(APPEND legate_sparse_SOURCES src/legate_sparse/array/csr/spmv.cc src/legate_sparse/array/csr/spgemm_csr_csr_csr.cc src/legate_sparse/array/csr/indexing.cc - + src/legate_sparse/array/util/unzip_rect.cc src/legate_sparse/array/util/zip_to_rect.cc @@ -142,6 +143,8 @@ list(APPEND legate_sparse_SOURCES src/legate_sparse/io/mtx_to_coo.cc src/legate_sparse/linalg/axpby.cc + src/legate_sparse/linalg/spsolve.cc + src/legate_sparse/array/csr/geam.cc ) if(Legion_USE_OpenMP) @@ -154,6 +157,7 @@ if(Legion_USE_OpenMP) src/legate_sparse/array/csr/spmv_omp.cc src/legate_sparse/array/csr/spgemm_csr_csr_csr_omp.cc src/legate_sparse/array/csr/indexing_omp.cc + src/legate_sparse/array/csr/geam_omp.cc src/legate_sparse/array/util/unzip_rect_omp.cc src/legate_sparse/array/util/zip_to_rect_omp.cc @@ -164,7 +168,7 @@ endif() if(Legion_USE_CUDA) list(APPEND legate_sparse_SOURCES - src/legate_sparse/cudalibs.cu + src/legate_sparse/cudalibs.cu src/legate_sparse/array/conv/dense_to_csr.cu src/legate_sparse/array/conv/csr_to_dense.cu @@ -174,19 +178,21 @@ if(Legion_USE_CUDA) src/legate_sparse/array/csr/spmv.cu src/legate_sparse/array/csr/spgemm_csr_csr_csr.cu 
src/legate_sparse/array/csr/indexing.cu + src/legate_sparse/array/csr/geam.cu src/legate_sparse/array/util/unzip_rect.cu src/legate_sparse/array/util/zip_to_rect.cu - + src/legate_sparse/partition/fast_image_partition.cu src/legate_sparse/linalg/axpby.cu + src/legate_sparse/linalg/spsolve.cu ) endif() list(APPEND legate_sparse_SOURCES - + # This must always be the last file! # It guarantees we do our registration callback # only after all task variants are recorded @@ -237,17 +243,21 @@ set_target_properties(legate_sparse CUDA_STANDARD_REQUIRED ON LIBRARY_OUTPUT_DIRECTORY lib) +# NOTE: For multi-GPU runs, the env CUDSS_COMM_LIB must be set to path to libcudss_commlayer_nccl.so +# conda install -c conda-forge libcudss libcudss-dev libcudss-commlayer-nccl +# should install it in ${CONDA_PREFIX}/lib/ target_link_libraries(legate_sparse PUBLIC legate::legate $ # do we need to put this dependency here? # what is the correct target? # cupynumeric::cupynumeric - PRIVATE + PRIVATE # Add Conda library and include paths $ $ $ + $ $) diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 00000000..93344517 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,118 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +[build-system] +requires = [ + "wheel", + "ninja", + "setuptools", + "scikit-build>=0.13.1", + "cmake>=3.30.4", +] +build-backend = "setuptools.build_meta" + +[tool.pytest.ini_options] +addopts = "--capture=sys" +cache_dir = "./.cache/pytest" + +[tool.mypy] +python_version = "3.11" +cache_dir = "./.cache/mypy" + +pretty = true +show_error_codes = true +show_error_context = true +show_column_numbers = true + +namespace_packages = true +ignore_missing_imports = false + +disallow_any_unimported = true +disallow_any_expr = false +disallow_any_decorated = false +disallow_any_explicit = false +disallow_any_generics = true +disallow_subclassing_any = true + +disallow_untyped_calls = true +disallow_untyped_defs = true +disallow_incomplete_defs = true +check_untyped_defs = true +disallow_untyped_decorators = true +no_implicit_optional = true +strict_optional = true + +warn_redundant_casts = true +warn_unused_ignores = false +warn_no_return = true +warn_return_any = true +warn_unreachable = true + +ignore_errors = false + +allow_untyped_globals = false +allow_redefinition = false +implicit_reexport = true +strict_equality = true + +warn_unused_configs = true + +[[tool.mypy.overrides]] +# ignore auto-generated files +# or files depending on auto-generated field +module = [ + "legate_sparse.install_info", + "legate_sparse._version", + "legate._version", + "legate.__main__", + "legate.install_info", +] +ignore_errors = true + +[tool.ruff] +cache-dir = "./.cache/ruff" +extend-exclude = [ + "arch-*", + "*-arch", + "venv", + "*venv", + "build", +] +line-length = 79 +src = [".", "legate_sparse"] + +[tool.ruff.format] +skip-magic-trailing-comma = true + +[tool.ruff.lint.isort.sections] +legion = ["legion_cffi", "legion_top"] +legate = ["legate"] +testing = ["pytest", "pytest_mock"] + +[tool.ruff.lint.isort] +known-third-party = ["numpy", "scipy"] +known-first-party = ["legate_sparse"] +length-sort-straight = true +combine-as-imports = true +split-on-trailing-comma = false 
+required-imports = ["from __future__ import annotations"] +section-order = [ + "future", + "standard-library", + "third-party", + "legion", + "legate", + "first-party", + "local-folder", +] diff --git a/scripts/memlog_analysis.py b/scripts/memlog_analysis.py old mode 100644 new mode 100755 index ee8bd3c6..e16a5369 --- a/scripts/memlog_analysis.py +++ b/scripts/memlog_analysis.py @@ -16,10 +16,10 @@ # Parse the log file allocations = parse_memlog('memlog.txt') - + # Export to CSV export_to_csv(allocations, 'memory_analysis.csv') - + # Create visualizations (requires pandas, matplotlib, seaborn) visualize_allocations(allocations) """ # noqa: W293 @@ -116,7 +116,9 @@ def export_to_csv( # If unique_mb_only is enabled, check for similar memory sizes if unique_mb_only: is_similar = any( - are_similar_sizes(mb_size, seen_size, threshold_percent) + are_similar_sizes( + mb_size, seen_size, threshold_percent + ) for seen_size in seen_mb_sizes ) if is_similar: @@ -145,7 +147,9 @@ def export_to_csv( ) -def export_to_excel(allocations: List[BufferAllocation], output_file: str) -> bool: +def export_to_excel( + allocations: List[BufferAllocation], output_file: str +) -> bool: """ Export memory allocation data to formatted Excel file. 
@@ -299,7 +303,9 @@ def visualize_allocations( """ if not all([PANDAS_AVAILABLE, MATPLOTLIB_AVAILABLE, SEABORN_AVAILABLE]): print("Error: Visualization requires pandas, matplotlib, and seaborn.") - print("Please install them with: pip install pandas matplotlib seaborn") + print( + "Please install them with: pip install pandas matplotlib seaborn" + ) return False # Convert to DataFrame @@ -360,7 +366,9 @@ def visualize_allocations( else: # Memory usage by description (top 10) plt.subplot(2, 2, 1) - top_descriptions = df.groupby("Description")["Size_MB"].sum().nlargest(10) + top_descriptions = ( + df.groupby("Description")["Size_MB"].sum().nlargest(10) + ) sns.barplot(x=top_descriptions.values, y=top_descriptions.index) plt.title("Top 10 Memory Usage by Description") plt.xlabel("Memory (MB)") @@ -372,7 +380,9 @@ def visualize_allocations( plt.title("Memory Distribution by Type") plt.tight_layout() - plt.savefig(f"{output_dir}/memory_analysis.png", dpi=300, bbox_inches="tight") + plt.savefig( + f"{output_dir}/memory_analysis.png", dpi=300, bbox_inches="tight" + ) plt.close() return True @@ -383,7 +393,9 @@ def main(): from memlog_parser import parse_memlog - parser = argparse.ArgumentParser(description="Analyze memory allocation logs") + parser = argparse.ArgumentParser( + description="Analyze memory allocation logs" + ) parser.add_argument("file", help="Path to the memory log file") parser.add_argument( "--output-dir", default=".", help="Directory to save output files" diff --git a/scripts/memlog_cli.py b/scripts/memlog_cli.py old mode 100644 new mode 100755 index ef45a129..94cf430c --- a/scripts/memlog_cli.py +++ b/scripts/memlog_cli.py @@ -14,7 +14,11 @@ import os import sys -from memlog_analysis import export_to_csv, export_to_excel, visualize_allocations +from memlog_analysis import ( + export_to_csv, + export_to_excel, + visualize_allocations, +) from memlog_parser import ( filter_allocations, parse_memlog, @@ -49,8 +53,12 @@ def check_dependencies(format: str) 
-> bool: import pandas # noqa: F401 import seaborn # noqa: F401 except ImportError: - print("Error: Visualization requires pandas, matplotlib, and seaborn.") - print("Please install them with: pip install pandas matplotlib seaborn") + print( + "Error: Visualization requires pandas, matplotlib, and seaborn." + ) + print( + "Please install them with: pip install pandas matplotlib seaborn" + ) return False return True diff --git a/scripts/memlog_parser.py b/scripts/memlog_parser.py old mode 100644 new mode 100755 index 0024b41d..854b3a72 --- a/scripts/memlog_parser.py +++ b/scripts/memlog_parser.py @@ -60,7 +60,9 @@ class BufferAllocation: def total_bytes(self) -> int: """Calculate total bytes allocated including data type size.""" - type_size = TYPE_SIZES.get(self.type, 1) # Default to 1 byte if type not found + type_size = TYPE_SIZES.get( + self.type, 1 + ) # Default to 1 byte if type not found return self.size * type_size def total_mb(self) -> float: @@ -68,7 +70,9 @@ def total_mb(self) -> float: return self.total_bytes() / (1024 * 1024) -def are_similar_sizes(size1: float, size2: float, threshold_percent: float) -> bool: +def are_similar_sizes( + size1: float, size2: float, threshold_percent: float +) -> bool: """ Check if two sizes are similar within the given percentage threshold. 
@@ -190,7 +194,10 @@ def filter_allocations( filtered = [] for alloc in allocations: - if alloc.description not in ignore_descriptions and alloc.total_mb() >= min_mb: + if ( + alloc.description not in ignore_descriptions + and alloc.total_mb() >= min_mb + ): filtered.append(alloc) return filtered @@ -215,9 +222,9 @@ def print_description_group( max_bytes = max(alloc.total_bytes() for alloc in allocs) print(f"\n{desc}:") print( - f" Total bytes (includes non-unique allocs): {desc_total_bytes / (1024*1024):.2f} MB" + f" Total bytes (includes non-unique allocs): {desc_total_bytes / (1024 * 1024):.2f} MB" ) - print(f" Max bytes : {max_bytes / (1024*1024):.2f} MB") + print(f" Max bytes : {max_bytes / (1024 * 1024):.2f} MB") # Track seen entries for this description seen_entries = set() @@ -275,9 +282,9 @@ def print_size_group( print(f"\nSize: {size} elements:") print( - f" Total bytes (includes non-unique allocs): {size_total_bytes / (1024*1024):.2f} MB" + f" Total bytes (includes non-unique allocs): {size_total_bytes / (1024 * 1024):.2f} MB" ) - print(f" Max bytes : {max_bytes / (1024*1024):.2f} MB") + print(f" Max bytes : {max_bytes / (1024 * 1024):.2f} MB") for alloc in allocs: mb_size = alloc.total_mb() diff --git a/scripts/pre-commit/yamllint.yml b/scripts/pre-commit/yamllint.yml new file mode 100644 index 00000000..2017e01d --- /dev/null +++ b/scripts/pre-commit/yamllint.yml @@ -0,0 +1,6 @@ +--- +extends: default +rules: + truthy: + ignore: ".github/workflows/*.yml" + line-length: disable diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 00000000..eed372ce --- /dev/null +++ b/setup.cfg @@ -0,0 +1,7 @@ +[mypy] +python_version = 3.11 +strict = True +implicit_reexport = true + +[mypy-legate_sparse._version] +ignore_errors = True diff --git a/setup.py b/setup.py old mode 100644 new mode 100755 index 68efb75c..c358d32e --- a/setup.py +++ b/setup.py @@ -38,7 +38,7 @@ setup( name="legate-sparse", - version="25.07.00", + version="26.02.00", description="An 
Aspiring Drop-In Replacement for SciPy Sparse module at Scale", author="NVIDIA Corporation", license="Apache 2.0", @@ -52,10 +52,7 @@ "Programming Language :: Python :: 3.12", "Programming Language :: Python :: 3.13", ], - packages=find_packages( - where=".", - include=["legate_sparse*"], - ), + packages=find_packages(where=".", include=["legate_sparse*"]), include_package_data=True, zip_safe=False, ) diff --git a/src/legate_sparse/array/conv/csr_to_dense.cc b/src/legate_sparse/array/conv/csr_to_dense.cc index de9a8958..5ff66d63 100644 --- a/src/legate_sparse/array/conv/csr_to_dense.cc +++ b/src/legate_sparse/array/conv/csr_to_dense.cc @@ -23,6 +23,9 @@ using namespace legate; template struct CSRToDenseImplBody { + TaskContext context; + explicit CSRToDenseImplBody(TaskContext context) : context(context) {} + using INDEX_TY = type_of; using VAL_TY = type_of; diff --git a/src/legate_sparse/array/conv/csr_to_dense.cu b/src/legate_sparse/array/conv/csr_to_dense.cu index 2d3c159f..de98b015 100644 --- a/src/legate_sparse/array/conv/csr_to_dense.cu +++ b/src/legate_sparse/array/conv/csr_to_dense.cu @@ -48,6 +48,9 @@ __global__ void CSRtoDenseKernel(size_t rows, template <> struct CSRToDenseImpl { + TaskContext context; + explicit CSRToDenseImpl(TaskContext context) : context(context) {} + template void operator()(CSRToDenseArgs& args) const { @@ -64,7 +67,7 @@ struct CSRToDenseImpl { return; } - auto stream = get_cached_stream(); + auto stream = context.get_task_stream(); auto B_domain = B_pos.domain(); auto rows = B_domain.hi()[0] - B_domain.lo()[0] + 1; diff --git a/src/legate_sparse/array/conv/csr_to_dense_omp.cc b/src/legate_sparse/array/conv/csr_to_dense_omp.cc index ec5da532..d048e0d6 100644 --- a/src/legate_sparse/array/conv/csr_to_dense_omp.cc +++ b/src/legate_sparse/array/conv/csr_to_dense_omp.cc @@ -23,6 +23,9 @@ using namespace legate; template struct CSRToDenseImplBody { + TaskContext context; + explicit CSRToDenseImplBody(TaskContext context) : 
context(context) {} + using INDEX_TY = type_of; using VAL_TY = type_of; diff --git a/src/legate_sparse/array/conv/csr_to_dense_template.inl b/src/legate_sparse/array/conv/csr_to_dense_template.inl index 58529312..9fb8d4dd 100644 --- a/src/legate_sparse/array/conv/csr_to_dense_template.inl +++ b/src/legate_sparse/array/conv/csr_to_dense_template.inl @@ -31,6 +31,9 @@ struct CSRToDenseImplBody; template struct CSRToDenseImpl { + TaskContext context; + explicit CSRToDenseImpl(TaskContext context) : context(context) {} + template void operator()(CSRToDenseArgs& args) const { @@ -45,7 +48,7 @@ struct CSRToDenseImpl { if (args.A_vals.domain().empty()) { return; } - CSRToDenseImplBody()( + CSRToDenseImplBody{context}( A_vals, B_pos, B_crd, B_vals, args.A_vals.shape<2>()); } }; @@ -61,7 +64,7 @@ static void csr_to_dense_template(TaskContext context) CSRToDenseArgs args{outputs[0], context.inputs()[0], context.inputs()[1], context.inputs()[2]}; index_type_value_type_dispatch( - args.B_crd.code(), args.A_vals.code(), CSRToDenseImpl{}, args); + args.B_crd.code(), args.A_vals.code(), CSRToDenseImpl{context}, args); } } // namespace sparse diff --git a/src/legate_sparse/array/conv/dense_to_csr.cc b/src/legate_sparse/array/conv/dense_to_csr.cc index 3304b558..410b37d8 100644 --- a/src/legate_sparse/array/conv/dense_to_csr.cc +++ b/src/legate_sparse/array/conv/dense_to_csr.cc @@ -23,6 +23,9 @@ using namespace legate; template struct DenseToCSRNNZImplBody { + TaskContext context; + explicit DenseToCSRNNZImplBody(TaskContext context) : context(context) {} + using VAL_TY = type_of; void operator()(const AccessorWO& nnz, @@ -43,6 +46,9 @@ struct DenseToCSRNNZImplBody { template struct DenseToCSRImplBody { + TaskContext context; + explicit DenseToCSRImplBody(TaskContext context) : context(context) {} + using INDEX_TY = type_of; using VAL_TY = type_of; diff --git a/src/legate_sparse/array/conv/dense_to_csr.cu b/src/legate_sparse/array/conv/dense_to_csr.cu index e38d906b..34698a4d 
100644 --- a/src/legate_sparse/array/conv/dense_to_csr.cu +++ b/src/legate_sparse/array/conv/dense_to_csr.cu @@ -44,6 +44,9 @@ __global__ void denseToCSRNNZKernel(size_t rows, template <> struct DenseToCSRNNZImpl { + TaskContext context; + explicit DenseToCSRNNZImpl(TaskContext context) : context(context) {} + template void operator()(DenseToCSRNNZArgs& args) const { @@ -57,7 +60,7 @@ struct DenseToCSRNNZImpl { return; } - auto stream = get_cached_stream(); + auto stream = context.get_task_stream(); // #if (CUSPARSE_VER_MAJOR < 11 || (CUSPARSE_VER_MAJOR == 11 && CUSPARSE_VER_MINOR < 2)) #if 1 @@ -149,6 +152,9 @@ __global__ void denseToCSRKernel(size_t rows, template <> struct DenseToCSRImpl { + TaskContext context; + explicit DenseToCSRImpl(TaskContext context) : context(context) {} + template void operator()(DenseToCSRArgs& args) const { @@ -166,7 +172,7 @@ struct DenseToCSRImpl { } // Get context sensitive objects. - auto stream = get_cached_stream(); + auto stream = context.get_task_stream(); auto B_domain = B_vals.domain(); auto rows = B_domain.hi()[0] - B_domain.lo()[0] + 1; diff --git a/src/legate_sparse/array/conv/dense_to_csr_omp.cc b/src/legate_sparse/array/conv/dense_to_csr_omp.cc index 78e060de..7de5334d 100644 --- a/src/legate_sparse/array/conv/dense_to_csr_omp.cc +++ b/src/legate_sparse/array/conv/dense_to_csr_omp.cc @@ -23,6 +23,9 @@ using namespace legate; template struct DenseToCSRNNZImplBody { + TaskContext context; + explicit DenseToCSRNNZImplBody(TaskContext context) : context(context) {} + using VAL_TY = type_of; void operator()(const AccessorWO& nnz, @@ -44,6 +47,9 @@ struct DenseToCSRNNZImplBody { template struct DenseToCSRImplBody { + TaskContext context; + explicit DenseToCSRImplBody(TaskContext context) : context(context) {} + using INDEX_TY = type_of; using VAL_TY = type_of; diff --git a/src/legate_sparse/array/conv/dense_to_csr_template.inl b/src/legate_sparse/array/conv/dense_to_csr_template.inl index 31c81686..bbf98cb4 100644 --- 
a/src/legate_sparse/array/conv/dense_to_csr_template.inl +++ b/src/legate_sparse/array/conv/dense_to_csr_template.inl @@ -32,6 +32,9 @@ struct DenseToCSRNNZImplBody; template struct DenseToCSRNNZImpl { + TaskContext context; + explicit DenseToCSRNNZImpl(TaskContext context) : context(context) {} + template void operator()(DenseToCSRNNZArgs& args) const { @@ -43,7 +46,7 @@ struct DenseToCSRNNZImpl { if (args.nnz.domain().empty()) { return; } - DenseToCSRNNZImplBody()(nnz, B_vals, args.B_vals.shape<2>()); + DenseToCSRNNZImplBody{context}(nnz, B_vals, args.B_vals.shape<2>()); } }; @@ -52,6 +55,9 @@ struct DenseToCSRImplBody; template struct DenseToCSRImpl { + TaskContext context; + explicit DenseToCSRImpl(TaskContext context) : context(context) {} + template void operator()(DenseToCSRArgs& args) const { @@ -66,7 +72,7 @@ struct DenseToCSRImpl { if (args.A_pos.domain().empty()) { return; } - DenseToCSRImplBody()( + DenseToCSRImplBody{context}( A_pos, A_crd, A_vals, B_vals, args.B_vals.shape<2>()); } }; @@ -78,7 +84,7 @@ static void dense_to_csr_nnz_template(TaskContext context) context.output(0), // nnz_per_row context.input(0) // B_vals }; - value_type_dispatch(args.B_vals.code(), DenseToCSRNNZImpl{}, args); + value_type_dispatch(args.B_vals.code(), DenseToCSRNNZImpl{context}, args); } template @@ -92,7 +98,7 @@ static void dense_to_csr_template(TaskContext context) }; index_type_value_type_dispatch( - args.A_crd.code(), args.A_vals.code(), DenseToCSRImpl{}, args); + args.A_crd.code(), args.A_vals.code(), DenseToCSRImpl{context}, args); } } // namespace sparse diff --git a/src/legate_sparse/array/conv/pos_to_coordinates.cc b/src/legate_sparse/array/conv/pos_to_coordinates.cc index 7cadb10e..20773a22 100644 --- a/src/legate_sparse/array/conv/pos_to_coordinates.cc +++ b/src/legate_sparse/array/conv/pos_to_coordinates.cc @@ -23,6 +23,9 @@ using namespace legate; template struct ExpandPosToCoordinatesImplBody { + TaskContext context; + explicit 
ExpandPosToCoordinatesImplBody(TaskContext context) : context(context) {} + using INDEX_TY = type_of; void operator()(const AccessorRO, 1>& pos, diff --git a/src/legate_sparse/array/conv/pos_to_coordinates.cu b/src/legate_sparse/array/conv/pos_to_coordinates.cu index c74a5c3f..ced2335d 100644 --- a/src/legate_sparse/array/conv/pos_to_coordinates.cu +++ b/src/legate_sparse/array/conv/pos_to_coordinates.cu @@ -44,13 +44,16 @@ __global__ void fill_row_indices(size_t rows, template struct ExpandPosToCoordinatesImplBody { + TaskContext context; + explicit ExpandPosToCoordinatesImplBody(TaskContext context) : context(context) {} + using INDEX_TY = type_of; void operator()(const AccessorRO, 1>& pos, const AccessorWO& row_indices, const Rect<1>& rect) { - auto stream = get_cached_stream(); + auto stream = context.get_task_stream(); auto blocks = get_num_blocks_1d(rect.volume()); size_t rows = rect.volume(); diff --git a/src/legate_sparse/array/conv/pos_to_coordinates_omp.cc b/src/legate_sparse/array/conv/pos_to_coordinates_omp.cc index 80da99a5..01d51002 100644 --- a/src/legate_sparse/array/conv/pos_to_coordinates_omp.cc +++ b/src/legate_sparse/array/conv/pos_to_coordinates_omp.cc @@ -23,6 +23,9 @@ using namespace legate; template struct ExpandPosToCoordinatesImplBody { + TaskContext context; + explicit ExpandPosToCoordinatesImplBody(TaskContext context) : context(context) {} + using INDEX_TY = type_of; void operator()(const AccessorRO, 1>& pos, diff --git a/src/legate_sparse/array/conv/pos_to_coordinates_template.inl b/src/legate_sparse/array/conv/pos_to_coordinates_template.inl index 39142e70..160bc53f 100644 --- a/src/legate_sparse/array/conv/pos_to_coordinates_template.inl +++ b/src/legate_sparse/array/conv/pos_to_coordinates_template.inl @@ -28,6 +28,9 @@ struct ExpandPosToCoordinatesImplBody; template struct ExpandPosToCoordinatesImpl { + TaskContext context; + explicit ExpandPosToCoordinatesImpl(TaskContext context) : context(context) {} + template void 
operator()(ExpandPosToCoordinatesArgs& args) const { @@ -41,7 +44,8 @@ struct ExpandPosToCoordinatesImpl { if (pos_domain.empty() || row_indices_domain.empty()) { return; } - ExpandPosToCoordinatesImplBody()(pos, row_indices, args.pos.shape<1>()); + ExpandPosToCoordinatesImplBody{context}( + pos, row_indices, args.pos.shape<1>()); } }; @@ -52,7 +56,7 @@ static void pos_to_coordinates_template(TaskContext context) context.outputs()[0], context.inputs()[0], }; - index_type_dispatch(args.row_indices.code(), ExpandPosToCoordinatesImpl(), args); + index_type_dispatch(args.row_indices.code(), ExpandPosToCoordinatesImpl{context}, args); } } // namespace sparse diff --git a/src/legate_sparse/array/csr/geam.cc b/src/legate_sparse/array/csr/geam.cc new file mode 100644 index 00000000..ced2d73b --- /dev/null +++ b/src/legate_sparse/array/csr/geam.cc @@ -0,0 +1,79 @@ +#include "legate_sparse/array/csr/geam.h" +#include "legate_sparse/array/csr/geam_template.inl" +#include "legate_sparse/array/csr/geam_kernels.h" + +namespace sparse { +using namespace legate; + +template +struct GeamComputeImplBody { + TaskContext context; + explicit GeamComputeImplBody(TaskContext context) : context(context) {} + + using INDEX_TY = type_of; + using VAL_TY = type_of; + + void operator()(const AccessorRO, 1>& A_pos, + const AccessorRO& A_crd, + const AccessorRO& A_vals, + const AccessorRO, 1>& B_pos, + const AccessorRO& B_crd, + const AccessorRO& B_vals, + const AccessorRO, 1>& C_pos, + const AccessorWO& C_crd, + const AccessorWO& C_vals, + const AccessorRO& alpha, + const AccessorRO& beta, + const Rect<1>& rect) + { + VAL_TY alpha_val = alpha[0]; + VAL_TY beta_val = beta[0]; + + for (size_t row = rect.lo[0]; row < rect.hi[0] + 1; row++) { + geam_compute_row( + row, A_pos, A_crd, A_vals, B_pos, B_crd, B_vals, C_pos, C_crd, C_vals, alpha_val, beta_val); + } + } +}; + +template +struct GeamSymbolicImplBody { + TaskContext context; + explicit GeamSymbolicImplBody(TaskContext context) : 
context(context) {} + + using INDEX_TY = type_of; + + void operator()(const AccessorRO, 1>& A_pos, + const AccessorRO& A_crd, + const AccessorRO, 1>& B_pos, + const AccessorRO& B_crd, + const AccessorRW& nnz_per_row, + const Rect<1>& rect) + { + for (size_t row = rect.lo[0]; row < rect.hi[0] + 1; row++) { + nnz_per_row[row] = geam_symbolic_row(row, A_pos, A_crd, B_pos, B_crd); + } + } +}; + +/* static */ void GeamCSRCSRSymbolic::cpu_variant(legate::TaskContext context) +{ + geam_csr_csr_symbolic_template(context); +} + +/* static */ void GeamCSRCSRCompute::cpu_variant(legate::TaskContext context) +{ + geam_csr_csr_compute_template(context); +} + +namespace // unnamed +{ +static const auto sparse_reg_task_ = []() -> char { + GeamCSRCSRSymbolic::register_variants(); + GeamCSRCSRCompute::register_variants(); + return 0; +}(); + +} // namespace + +} // namespace sparse diff --git a/src/legate_sparse/array/csr/geam.cu b/src/legate_sparse/array/csr/geam.cu new file mode 100644 index 00000000..fdf467d3 --- /dev/null +++ b/src/legate_sparse/array/csr/geam.cu @@ -0,0 +1,144 @@ +/* Copyright 2022-2024 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + */ + +#include "legate_sparse/array/csr/geam.h" +#include "legate_sparse/array/csr/geam_template.inl" +#include "legate_sparse/array/csr/geam_kernels.h" +#include "legate_sparse/util/cuda_help.h" + +namespace sparse { +using namespace legate; + +// GPU kernel for symbolic phase: compute nnz_per_row +template +__global__ void geam_symbolic_kernel(const size_t nrows, + const AccessorRO, 1> A_pos, + const AccessorRO A_crd, + const AccessorRO, 1> B_pos, + const AccessorRO B_crd, + const AccessorRW nnz_per_row) +{ + const size_t row = global_tid_1d(); + if (row >= nrows) { + return; + } + + nnz_per_row[row] = geam_symbolic_row(row, A_pos, A_crd, B_pos, B_crd); +} + +// GPU kernel for compute phase: C = alpha * A + beta * B +template +__global__ void geam_compute_kernel(const size_t nrows, + const AccessorRO, 1> A_pos, + const AccessorRO A_crd, + const AccessorRO A_vals, + const AccessorRO, 1> B_pos, + const AccessorRO B_crd, + const AccessorRO B_vals, + const AccessorRO, 1> C_pos, + const AccessorWO C_crd, + const AccessorWO C_vals, + const AccessorRO alpha_acc, + const AccessorRO beta_acc) +{ + const size_t row = global_tid_1d(); + if (row >= nrows) { + return; + } + + VAL_TY alpha = alpha_acc[0]; + VAL_TY beta = beta_acc[0]; + + geam_compute_row( + row, A_pos, A_crd, A_vals, B_pos, B_crd, B_vals, C_pos, C_crd, C_vals, alpha, beta); +} + +// GPU implementation of the symbolic phase +template +struct GeamSymbolicImplBody { + TaskContext context; + explicit GeamSymbolicImplBody(TaskContext context) : context(context) {} + + using INDEX_TY = type_of; + + void operator()(const AccessorRO, 1>& A_pos, + const AccessorRO& A_crd, + const AccessorRO, 1>& B_pos, + const AccessorRO& B_crd, + const AccessorRW& nnz_per_row, + const Rect<1>& rect) + { + auto stream = context.get_task_stream(); + auto nrows = rect.hi[0] - rect.lo[0] + 1; + auto num_blocks = get_num_blocks_1d(nrows); + + if (nrows == 0) { + return; + } + + geam_symbolic_kernel<<>>( + nrows, A_pos, A_crd, B_pos, 
B_crd, nnz_per_row); + LEGATE_SPARSE_CHECK_CUDA_STREAM(stream); + } +}; + +/*static*/ void GeamCSRCSRSymbolic::gpu_variant(TaskContext context) +{ + geam_csr_csr_symbolic_template(context); +} + +// GPU implementation of the compute phase +template +struct GeamComputeImplBody { + TaskContext context; + explicit GeamComputeImplBody(TaskContext context) : context(context) {} + + using INDEX_TY = type_of; + using VAL_TY = type_of; + + void operator()(const AccessorRO, 1>& A_pos, + const AccessorRO& A_crd, + const AccessorRO& A_vals, + const AccessorRO, 1>& B_pos, + const AccessorRO& B_crd, + const AccessorRO& B_vals, + const AccessorRO, 1>& C_pos, + const AccessorWO& C_crd, + const AccessorWO& C_vals, + const AccessorRO& alpha, + const AccessorRO& beta, + const Rect<1>& rect) + { + auto stream = context.get_task_stream(); + auto nrows = rect.hi[0] - rect.lo[0] + 1; + auto num_blocks = get_num_blocks_1d(nrows); + + if (nrows == 0) { + return; + } + + geam_compute_kernel<<>>( + nrows, A_pos, A_crd, A_vals, B_pos, B_crd, B_vals, C_pos, C_crd, C_vals, alpha, beta); + LEGATE_SPARSE_CHECK_CUDA_STREAM(stream); + } +}; + +/*static*/ void GeamCSRCSRCompute::gpu_variant(TaskContext context) +{ + geam_csr_csr_compute_template(context); +} + +} // namespace sparse diff --git a/src/legate_sparse/array/csr/geam.h b/src/legate_sparse/array/csr/geam.h new file mode 100644 index 00000000..6329f307 --- /dev/null +++ b/src/legate_sparse/array/csr/geam.h @@ -0,0 +1,91 @@ +/* Copyright 2022-2024 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +#pragma once + +#include "legate_sparse/sparse.h" +#include "legate_sparse/sparse_c.h" +#include "legate.h" + +namespace sparse { + +struct GeamCSRCSRSymbolicArgs { + // Symbolic phase: compute the sparsity pattern of C = alpha * A + beta * B + // This phase only needs the positions and coordinates, not the values or scalars + const legate::PhysicalStore& A_pos; + const legate::PhysicalStore& A_crd; + const legate::PhysicalStore& B_pos; + const legate::PhysicalStore& B_crd; + const legate::PhysicalStore& nnz_per_row; // output: number of non-zeros per row +}; + +struct GeamCSRCSRComputeArgs { + // Compute phase: compute the output C where C = alpha * A + beta * B + // Inputs + const legate::PhysicalStore& A_pos; + const legate::PhysicalStore& A_crd; + const legate::PhysicalStore& A_vals; + const legate::PhysicalStore& B_pos; + const legate::PhysicalStore& B_crd; + const legate::PhysicalStore& B_vals; + + // C_pos is an INPUT (computed in symbolic phase, read-only here) + const legate::PhysicalStore& C_pos; + + // C_crd and C_vals are outputs + const legate::PhysicalStore& C_crd; + const legate::PhysicalStore& C_vals; + + // Scalar constants + const legate::PhysicalStore& alpha; + const legate::PhysicalStore& beta; +}; + +class GeamCSRCSRCompute : public SparseTask { + public: + static inline const auto TASK_CONFIG = + legate::TaskConfig{legate::LocalTaskID{LEGATE_SPARSE_GEAM_CSR_CSR_COMPUTE}}; + + public: + static void cpu_variant(legate::TaskContext context); + +#ifdef LEGATE_USE_OPENMP + static void omp_variant(legate::TaskContext context); +#endif + +#ifdef LEGATE_USE_CUDA + static void gpu_variant(legate::TaskContext context); +#endif +}; + +class GeamCSRCSRSymbolic : public SparseTask { + public: + static inline const auto TASK_CONFIG = + legate::TaskConfig{legate::LocalTaskID{LEGATE_SPARSE_GEAM_CSR_CSR_SYMBOLIC}}; + + public: + static void 
cpu_variant(legate::TaskContext context); + +#ifdef LEGATE_USE_OPENMP + static void omp_variant(legate::TaskContext context); +#endif + +#ifdef LEGATE_USE_CUDA + static void gpu_variant(legate::TaskContext context); +#endif +}; + +} // namespace sparse diff --git a/src/legate_sparse/array/csr/geam_kernels.h b/src/legate_sparse/array/csr/geam_kernels.h new file mode 100644 index 00000000..6ae7c6f0 --- /dev/null +++ b/src/legate_sparse/array/csr/geam_kernels.h @@ -0,0 +1,129 @@ +/* Copyright 2022-2024 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + */ + +#pragma once + +#include "legate_sparse/util/typedefs.h" +#include "legate.h" + +namespace sparse { +using namespace legate; + +// ============================================================================= +// Symbolic Phase: Compute nnz per row for C = A + B +// ============================================================================= + +// Computes the number of non-zeros in a single row of C = A + B +template +LEGATE_HOST_DEVICE inline nnz_ty geam_symbolic_row(size_t row, + const AccessorRO, 1> A_pos, + const AccessorRO A_crd, + const AccessorRO, 1> B_pos, + const AccessorRO B_crd) +{ + size_t A_pos_start = A_pos[row].lo; + size_t A_pos_end = A_pos[row].hi + 1; + size_t B_pos_start = B_pos[row].lo; + size_t B_pos_end = B_pos[row].hi + 1; + + size_t a_pos = A_pos_start; + size_t b_pos = B_pos_start; + nnz_ty count = 0; + + // Merge sorted column indices and count unique entries + while (a_pos < A_pos_end && b_pos < B_pos_end) { + if (A_crd[a_pos] < B_crd[b_pos]) { + a_pos++; + } else if (A_crd[a_pos] > B_crd[b_pos]) { + b_pos++; + } else { + a_pos++; + b_pos++; + } + count++; + } + + // Add remaining elements + count += (A_pos_end - a_pos) + (B_pos_end - b_pos); + return count; +} + +// ============================================================================= +// Compute Phase: Compute C = alpha * A + beta * B for a single row +// ============================================================================= + +// Computes a single row of C = alpha * A + beta * B +template +LEGATE_HOST_DEVICE inline void geam_compute_row(size_t row, + const AccessorRO, 1> A_pos, + const AccessorRO A_crd, + const AccessorRO A_vals, + const AccessorRO, 1> B_pos, + const AccessorRO B_crd, + const AccessorRO B_vals, + const AccessorRO, 1> C_pos, + const AccessorWO C_crd, + const AccessorWO C_vals, + VAL_TY alpha, + VAL_TY beta) +{ + size_t A_pos_start = A_pos[row].lo; + size_t A_pos_end = A_pos[row].hi + 1; + size_t B_pos_start = B_pos[row].lo; + size_t 
B_pos_end = B_pos[row].hi + 1; + size_t C_pos_start = C_pos[row].lo; + + size_t a_pos = A_pos_start; + size_t b_pos = B_pos_start; + size_t c_pos = C_pos_start; + + // Merge sorted column indices and compute values + while (a_pos < A_pos_end && b_pos < B_pos_end) { + if (A_crd[a_pos] < B_crd[b_pos]) { + C_crd[c_pos] = A_crd[a_pos]; + C_vals[c_pos] = alpha * A_vals[a_pos]; + a_pos++; + } else if (A_crd[a_pos] > B_crd[b_pos]) { + C_crd[c_pos] = B_crd[b_pos]; + C_vals[c_pos] = beta * B_vals[b_pos]; + b_pos++; + } else { + C_crd[c_pos] = A_crd[a_pos]; + C_vals[c_pos] = alpha * A_vals[a_pos] + beta * B_vals[b_pos]; + a_pos++; + b_pos++; + } + c_pos++; + } + + // Add remaining elements from A + while (a_pos < A_pos_end) { + C_crd[c_pos] = A_crd[a_pos]; + C_vals[c_pos] = alpha * A_vals[a_pos]; + a_pos++; + c_pos++; + } + + // Add remaining elements from B + while (b_pos < B_pos_end) { + C_crd[c_pos] = B_crd[b_pos]; + C_vals[c_pos] = beta * B_vals[b_pos]; + b_pos++; + c_pos++; + } +} + +} // namespace sparse diff --git a/src/legate_sparse/array/csr/geam_omp.cc b/src/legate_sparse/array/csr/geam_omp.cc new file mode 100644 index 00000000..26c52361 --- /dev/null +++ b/src/legate_sparse/array/csr/geam_omp.cc @@ -0,0 +1,87 @@ +/* Copyright 2022-2024 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + */ + +#include "legate_sparse/array/csr/geam.h" +#include "legate_sparse/array/csr/geam_template.inl" +#include "legate_sparse/array/csr/geam_kernels.h" + +namespace sparse { +using namespace legate; + +template +struct GeamSymbolicImplBody { + TaskContext context; + explicit GeamSymbolicImplBody(TaskContext context) : context(context) {} + + using INDEX_TY = type_of; + + void operator()(const AccessorRO, 1>& A_pos, + const AccessorRO& A_crd, + const AccessorRO, 1>& B_pos, + const AccessorRO& B_crd, + const AccessorRW& nnz_per_row, + const Rect<1>& rect) + { +#pragma omp parallel for + for (size_t row = rect.lo[0]; row < rect.hi[0] + 1; row++) { + nnz_per_row[row] = geam_symbolic_row(row, A_pos, A_crd, B_pos, B_crd); + } + } +}; + +/* static */ void GeamCSRCSRSymbolic::omp_variant(TaskContext context) +{ + geam_csr_csr_symbolic_template(context); +} + +template +struct GeamComputeImplBody { + TaskContext context; + explicit GeamComputeImplBody(TaskContext context) : context(context) {} + + using INDEX_TY = type_of; + using VAL_TY = type_of; + + void operator()(const AccessorRO, 1>& A_pos, + const AccessorRO& A_crd, + const AccessorRO& A_vals, + const AccessorRO, 1>& B_pos, + const AccessorRO& B_crd, + const AccessorRO& B_vals, + const AccessorRO, 1>& C_pos, + const AccessorWO& C_crd, + const AccessorWO& C_vals, + const AccessorRO& alpha, + const AccessorRO& beta, + const Rect<1>& rect) + { + VAL_TY alpha_val = alpha[0]; + VAL_TY beta_val = beta[0]; + +#pragma omp parallel for + for (size_t row = rect.lo[0]; row < rect.hi[0] + 1; row++) { + geam_compute_row( + row, A_pos, A_crd, A_vals, B_pos, B_crd, B_vals, C_pos, C_crd, C_vals, alpha_val, beta_val); + } + } +}; + +/* static */ void GeamCSRCSRCompute::omp_variant(TaskContext context) +{ + geam_csr_csr_compute_template(context); +} + +} // namespace sparse diff --git a/src/legate_sparse/array/csr/geam_template.inl b/src/legate_sparse/array/csr/geam_template.inl new file mode 100644 index 00000000..0dc13513 --- 
/dev/null +++ b/src/legate_sparse/array/csr/geam_template.inl @@ -0,0 +1,139 @@ +/* Copyright 2022-2024 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +#pragma once + +#include "legate_sparse/array/csr/geam.h" +#include "legate_sparse/util/dispatch.h" +#include "legate_sparse/util/typedefs.h" + +namespace sparse { +using namespace legate; + +// ============================================================================ +// Symbolic phase templates +// ============================================================================ + +template +struct GeamSymbolicImplBody; + +template +struct GeamSymbolicImpl { + TaskContext context; + explicit GeamSymbolicImpl(TaskContext context) : context(context) {} + + template + void operator()(const GeamCSRCSRSymbolicArgs& args) + { + using INDEX_TY = type_of; + + auto A_pos = args.A_pos.read_accessor, 1>(); + auto A_crd = args.A_crd.read_accessor(); + auto B_pos = args.B_pos.read_accessor, 1>(); + auto B_crd = args.B_crd.read_accessor(); + + auto nnz_per_row = args.nnz_per_row.read_write_accessor(); + + GeamSymbolicImplBody{context}( + A_pos, A_crd, B_pos, B_crd, nnz_per_row, args.A_pos.shape<1>()); + } +}; + +template +static void geam_csr_csr_symbolic_template(TaskContext context) +{ + GeamCSRCSRSymbolicArgs args{ + context.inputs()[0], // A_pos + context.inputs()[1], // A_crd + context.inputs()[2], // B_pos + context.inputs()[3], // B_crd + context.outputs()[0], // nnz_per_row + }; + + 
index_type_dispatch(args.A_crd.code(), GeamSymbolicImpl{context}, args); +} + +// ============================================================================ +// Compute phase templates +// ============================================================================ + +template +struct GeamComputeImplBody; + +template +struct GeamComputeImpl { + TaskContext context; + explicit GeamComputeImpl(TaskContext context) : context(context) {} + + template + void operator()(const GeamCSRCSRComputeArgs& args) + { + using INDEX_TY = type_of; + using VAL_TY = type_of; + + auto A_pos = args.A_pos.read_accessor, 1>(); + auto A_crd = args.A_crd.read_accessor(); + auto A_vals = args.A_vals.read_accessor(); + auto B_pos = args.B_pos.read_accessor, 1>(); + auto B_crd = args.B_crd.read_accessor(); + auto B_vals = args.B_vals.read_accessor(); + + // C_pos is read-only (computed in symbolic phase) + auto C_pos = args.C_pos.read_accessor, 1>(); + auto C_crd = args.C_crd.write_accessor(); + auto C_vals = args.C_vals.write_accessor(); + + // Read scalar values + auto alpha = args.alpha.read_accessor(); + auto beta = args.beta.read_accessor(); + + GeamComputeImplBody{context}(A_pos, + A_crd, + A_vals, + B_pos, + B_crd, + B_vals, + C_pos, + C_crd, + C_vals, + alpha, + beta, + args.A_pos.shape<1>()); + } +}; + +template +static void geam_csr_csr_compute_template(TaskContext context) +{ + GeamCSRCSRComputeArgs args{ + context.inputs()[0], // A_pos + context.inputs()[1], // A_crd + context.inputs()[2], // A_vals + context.inputs()[3], // B_pos + context.inputs()[4], // B_crd + context.inputs()[5], // B_vals + context.inputs()[6], // C_pos (read-only, computed in symbolic phase) + context.outputs()[0], // C_crd + context.outputs()[1], // C_vals + context.inputs()[7], // alpha + context.inputs()[8], // beta + }; + + index_type_value_type_dispatch( + args.A_crd.code(), args.A_vals.code(), GeamComputeImpl{context}, args); +} + +} // namespace sparse diff --git 
a/src/legate_sparse/array/csr/get_diagonal.cc b/src/legate_sparse/array/csr/get_diagonal.cc index cace6438..47a8c7d1 100644 --- a/src/legate_sparse/array/csr/get_diagonal.cc +++ b/src/legate_sparse/array/csr/get_diagonal.cc @@ -23,6 +23,9 @@ using namespace legate; template struct GetCSRDiagonalImplBody { + TaskContext context; + explicit GetCSRDiagonalImplBody(TaskContext context) : context(context) {} + using INDEX_TY = type_of; using VAL_TY = type_of; diff --git a/src/legate_sparse/array/csr/get_diagonal.cu b/src/legate_sparse/array/csr/get_diagonal.cu index 15e5e8a4..f0a32dfd 100644 --- a/src/legate_sparse/array/csr/get_diagonal.cu +++ b/src/legate_sparse/array/csr/get_diagonal.cu @@ -45,6 +45,9 @@ __global__ void compute_diag_kernel(size_t rows, template struct GetCSRDiagonalImplBody { + TaskContext context; + explicit GetCSRDiagonalImplBody(TaskContext context) : context(context) {} + using INDEX_TY = type_of; using VAL_TY = type_of; @@ -54,7 +57,7 @@ struct GetCSRDiagonalImplBody { const AccessorRO& vals, const Rect<1>& rect) { - auto stream = get_cached_stream(); + auto stream = context.get_task_stream(); auto blocks = get_num_blocks_1d(rect.volume()); compute_diag_kernel <<>>(rect.volume(), rect.lo[0], diag, pos, crd, vals); diff --git a/src/legate_sparse/array/csr/get_diagonal_omp.cc b/src/legate_sparse/array/csr/get_diagonal_omp.cc index ad698eed..c3d114ef 100644 --- a/src/legate_sparse/array/csr/get_diagonal_omp.cc +++ b/src/legate_sparse/array/csr/get_diagonal_omp.cc @@ -23,6 +23,9 @@ using namespace legate; template struct GetCSRDiagonalImplBody { + TaskContext context; + explicit GetCSRDiagonalImplBody(TaskContext context) : context(context) {} + using INDEX_TY = type_of; using VAL_TY = type_of; diff --git a/src/legate_sparse/array/csr/get_diagonal_template.inl b/src/legate_sparse/array/csr/get_diagonal_template.inl index 74ac61bb..0cee5e9a 100644 --- a/src/legate_sparse/array/csr/get_diagonal_template.inl +++ 
b/src/legate_sparse/array/csr/get_diagonal_template.inl @@ -29,6 +29,9 @@ struct GetCSRDiagonalImplBody; template struct GetCSRDiagonalImpl { + TaskContext context; + explicit GetCSRDiagonalImpl(TaskContext context) : context(context) {} + template void operator()(GetCSRDiagonalArgs& args) const { @@ -45,7 +48,7 @@ struct GetCSRDiagonalImpl { return; } - GetCSRDiagonalImplBody()( + GetCSRDiagonalImplBody{context}( diag, pos, crd, vals, args.diag.shape<1>()); } }; @@ -56,6 +59,6 @@ static void get_csr_diagonal_template(TaskContext context) auto inputs = context.inputs(); GetCSRDiagonalArgs args{context.outputs()[0], inputs[0], inputs[1], inputs[2]}; index_type_value_type_dispatch( - args.crd.code(), args.diag.code(), GetCSRDiagonalImpl{}, args); + args.crd.code(), args.diag.code(), GetCSRDiagonalImpl{context}, args); } } // namespace sparse diff --git a/src/legate_sparse/array/csr/indexing.cc b/src/legate_sparse/array/csr/indexing.cc index f40c901b..1ec4f57a 100644 --- a/src/legate_sparse/array/csr/indexing.cc +++ b/src/legate_sparse/array/csr/indexing.cc @@ -23,6 +23,9 @@ using namespace legate; template struct CSRIndexingCSRImplBody { + TaskContext context; + explicit CSRIndexingCSRImplBody(TaskContext context) : context(context) {} + using INDEX_TY = type_of; using VAL_TY = type_of; diff --git a/src/legate_sparse/array/csr/indexing.cu b/src/legate_sparse/array/csr/indexing.cu index 25e96097..68d03ffd 100644 --- a/src/legate_sparse/array/csr/indexing.cu +++ b/src/legate_sparse/array/csr/indexing.cu @@ -84,6 +84,9 @@ __global__ void csr_indexing_csr_kernel(const size_t num_rows, template struct CSRIndexingCSRImplBody { + TaskContext context; + explicit CSRIndexingCSRImplBody(TaskContext context) : context(context) {} + using INDEX_TY = type_of; using VAL_TY = type_of; @@ -98,9 +101,7 @@ struct CSRIndexingCSRImplBody { // Get the number of rows in the matrix size_t num_rows = rect.hi[0] - rect.lo[0] + 1; - std::cout << "GPU variant" << std::endl; - - auto stream = 
get_cached_stream(); + auto stream = context.get_task_stream(); auto blocks = get_num_blocks_1d(rect.volume()); csr_indexing_csr_kernel<<>>( num_rows, A_pos, A_crd, A_vals, mask_pos, mask_crd, value); diff --git a/src/legate_sparse/array/csr/indexing_omp.cc b/src/legate_sparse/array/csr/indexing_omp.cc index c429481f..a96fc270 100644 --- a/src/legate_sparse/array/csr/indexing_omp.cc +++ b/src/legate_sparse/array/csr/indexing_omp.cc @@ -23,6 +23,9 @@ using namespace legate; template struct CSRIndexingCSRImplBody { + TaskContext context; + explicit CSRIndexingCSRImplBody(TaskContext context) : context(context) {} + using INDEX_TY = type_of; using VAL_TY = type_of; @@ -34,7 +37,6 @@ struct CSRIndexingCSRImplBody { const AccessorRO& value, const Rect<1>& rect) { - std::cout << "OMP variant" << std::endl; #pragma omp parallel for for (size_t row = rect.lo[0]; row < rect.hi[0] + 1; row++) { size_t j_pos_start = A_pos[row].lo; diff --git a/src/legate_sparse/array/csr/indexing_template.inl b/src/legate_sparse/array/csr/indexing_template.inl index 381ca45b..e73efa60 100644 --- a/src/legate_sparse/array/csr/indexing_template.inl +++ b/src/legate_sparse/array/csr/indexing_template.inl @@ -28,6 +28,9 @@ struct CSRIndexingCSRImplBody; template struct CSRIndexingCSRImpl { + TaskContext context; + explicit CSRIndexingCSRImpl(TaskContext context) : context(context) {} + template void operator()(const CSRIndexingCSRArgs& args) { @@ -44,7 +47,7 @@ struct CSRIndexingCSRImpl { auto value = args.value.read_accessor(); // TODO: Rect is based on A_pos.shape, is that correct? 
- CSRIndexingCSRImplBody()( + CSRIndexingCSRImplBody{context}( A_pos, A_crd, A_vals, key_pos, key_crd, value, args.A_pos.shape<1>()); } }; @@ -62,7 +65,7 @@ static void csr_indexing_csr_template(TaskContext context) }; index_type_value_type_dispatch( - args.A_crd.code(), args.A_vals.code(), CSRIndexingCSRImpl(), args); + args.A_crd.code(), args.A_vals.code(), CSRIndexingCSRImpl{context}, args); } } // namespace sparse diff --git a/src/legate_sparse/array/csr/spgemm_csr_csr_csr.cc b/src/legate_sparse/array/csr/spgemm_csr_csr_csr.cc index 6c4945de..71728397 100644 --- a/src/legate_sparse/array/csr/spgemm_csr_csr_csr.cc +++ b/src/legate_sparse/array/csr/spgemm_csr_csr_csr.cc @@ -27,6 +27,9 @@ using namespace legate; template struct SpGEMMCSRxCSRxCSRNNZImplBody { + TaskContext context; + explicit SpGEMMCSRxCSRxCSRNNZImplBody(TaskContext context) : context(context) {} + using INDEX_TY = type_of; void operator()(const AccessorWO& nnz, @@ -94,6 +97,9 @@ struct SpGEMMCSRxCSRxCSRNNZImplBody { template struct SpGEMMCSRxCSRxCSRImplBody { + TaskContext context; + explicit SpGEMMCSRxCSRxCSRImplBody(TaskContext context) : context(context) {} + using INDEX_TY = type_of; using VAL_TY = type_of; diff --git a/src/legate_sparse/array/csr/spgemm_csr_csr_csr.cu b/src/legate_sparse/array/csr/spgemm_csr_csr_csr.cu index 66827db6..3aa14a26 100644 --- a/src/legate_sparse/array/csr/spgemm_csr_csr_csr.cu +++ b/src/legate_sparse/array/csr/spgemm_csr_csr_csr.cu @@ -41,10 +41,10 @@ __global__ void cast_and_offset(size_t elems, DST* dst, const SRC* src, int64_t dst[idx] = static_cast(src[idx] - offset); } -int64_t local_offset_from_nnz(ncclComm_t comm, coord_t task_id, coord_t task_num, int64_t A_nnz) +int64_t local_offset_from_nnz( + ncclComm_t comm, coord_t task_id, coord_t task_num, int64_t A_nnz, cudaStream_t stream) { ThrustAllocator alloc(Memory::GPU_FB_MEM); - auto stream = get_cached_stream(); auto policy = thrust::cuda::par(alloc).on(stream); auto buf = CREATE_BUFFER(int64_t, task_num, 
Memory::GPU_FB_MEM, "nnz_reduce_buf"); auto nnz_reduce_buf = buf.ptr(0); @@ -67,6 +67,9 @@ int64_t local_offset_from_nnz(ncclComm_t comm, coord_t task_id, coord_t task_num } struct SpGEMMCSRxCSRxCSRGPUImpl { + TaskContext context; + explicit SpGEMMCSRxCSRxCSRGPUImpl(TaskContext context) : context(context) {} + template void operator()(SpGEMMCSRxCSRxCSRGPUArgs& args, coord_t task_id, coord_t task_size) const { @@ -106,7 +109,7 @@ struct SpGEMMCSRxCSRxCSRGPUImpl { // Get context sensitive objects. auto handle = get_cusparse(); - auto stream = get_cached_stream(); + auto stream = context.get_task_stream(); CHECK_CUSPARSE(cusparseSetStream(handle, stream)); auto B_rows = B_pos.domain().get_volume(); @@ -331,7 +334,7 @@ struct SpGEMMCSRxCSRxCSRGPUImpl { //@TODO (marsaev): we don't really need nccl comm here // latency for 1 int and host comm should be much better ncclComm_t* comm = args.comms[0].get(); - offset_nnz = local_offset_from_nnz(*comm, task_id, task_num, A_nnz); + offset_nnz = local_offset_from_nnz(*comm, task_id, task_num, A_nnz, stream); } // Convert the A_indptr array into a pos array. @@ -472,7 +475,7 @@ struct SpGEMMCSRxCSRxCSRGPUImpl { //@TODO (marsaev): we don't really need nccl comm here // latency for 1 int and host comm should be much better ncclComm_t* comm = args.comms[0].get(); - offset_nnz = local_offset_from_nnz(*comm, task_id, task_num, A_nnz); + offset_nnz = local_offset_from_nnz(*comm, task_id, task_num, A_nnz, stream); } // Convert the A_indptr array into a pos array. 
@@ -524,7 +527,7 @@ struct SpGEMMCSRxCSRxCSRGPUImpl { context.communicators()}; index_type_floating_point_value_type_dispatch(args.A_crd.code(), args.A_vals.code(), - SpGEMMCSRxCSRxCSRGPUImpl{}, + SpGEMMCSRxCSRxCSRGPUImpl{context}, args, context.get_task_index()[0], context.get_launch_domain().hi()[0]); diff --git a/src/legate_sparse/array/csr/spgemm_csr_csr_csr_omp.cc b/src/legate_sparse/array/csr/spgemm_csr_csr_csr_omp.cc index e6ac4ef6..addf5d59 100644 --- a/src/legate_sparse/array/csr/spgemm_csr_csr_csr_omp.cc +++ b/src/legate_sparse/array/csr/spgemm_csr_csr_csr_omp.cc @@ -28,6 +28,9 @@ using namespace legate; template struct SpGEMMCSRxCSRxCSRNNZImplBody { + TaskContext context; + explicit SpGEMMCSRxCSRxCSRNNZImplBody(TaskContext context) : context(context) {} + using INDEX_TY = type_of; void operator()(const AccessorWO& nnz, @@ -96,6 +99,9 @@ struct SpGEMMCSRxCSRxCSRNNZImplBody { template struct SpGEMMCSRxCSRxCSRImplBody { + TaskContext context; + explicit SpGEMMCSRxCSRxCSRImplBody(TaskContext context) : context(context) {} + using INDEX_TY = type_of; using VAL_TY = type_of; diff --git a/src/legate_sparse/array/csr/spgemm_csr_csr_csr_template.inl b/src/legate_sparse/array/csr/spgemm_csr_csr_csr_template.inl index c958752a..0b99743a 100644 --- a/src/legate_sparse/array/csr/spgemm_csr_csr_csr_template.inl +++ b/src/legate_sparse/array/csr/spgemm_csr_csr_csr_template.inl @@ -32,6 +32,9 @@ struct SpGEMMCSRxCSRxCSRNNZImplBody; template struct SpGEMMCSRxCSRxCSRNNZImpl { + TaskContext context; + explicit SpGEMMCSRxCSRxCSRNNZImpl(TaskContext context) : context(context) {} + template void operator()(SpGEMMCSRxCSRxCSRNNZArgs& args) const { @@ -43,7 +46,7 @@ struct SpGEMMCSRxCSRxCSRNNZImpl { auto C_pos = args.C_pos.read_accessor, 1>(); auto C_crd = args.C_crd.read_accessor(); - SpGEMMCSRxCSRxCSRNNZImplBody()( + SpGEMMCSRxCSRxCSRNNZImplBody{context}( nnz, B_pos, B_crd, C_pos, C_crd, args.B_pos.shape<1>(), args.C_crd.shape<1>()); } }; @@ -53,6 +56,9 @@ struct 
SpGEMMCSRxCSRxCSRImplBody; template struct SpGEMMCSRxCSRxCSRImpl { + TaskContext context; + explicit SpGEMMCSRxCSRxCSRImpl(TaskContext context) : context(context) {} + template void operator()(SpGEMMCSRxCSRxCSRArgs& args) const { @@ -69,17 +75,17 @@ struct SpGEMMCSRxCSRxCSRImpl { auto C_crd = args.C_crd.read_accessor(); auto C_vals = args.C_vals.read_accessor(); - SpGEMMCSRxCSRxCSRImplBody()(A_pos, - A_crd, - A_vals, - B_pos, - B_crd, - B_vals, - C_pos, - C_crd, - C_vals, - args.B_pos.shape<1>(), - args.C_crd.shape<1>()); + SpGEMMCSRxCSRxCSRImplBody{context}(A_pos, + A_crd, + A_vals, + B_pos, + B_crd, + B_vals, + C_pos, + C_crd, + C_vals, + args.B_pos.shape<1>(), + args.C_crd.shape<1>()); } }; @@ -95,7 +101,7 @@ static void spgemm_csr_csr_csr_nnz_template(TaskContext context) inputs[3], }; - index_type_dispatch(args.B_crd.code(), SpGEMMCSRxCSRxCSRNNZImpl{}, args); + index_type_dispatch(args.B_crd.code(), SpGEMMCSRxCSRxCSRNNZImpl{context}, args); } template @@ -115,7 +121,7 @@ static void spgemm_csr_csr_csr_template(TaskContext context) inputs[5], }; index_type_floating_point_value_type_dispatch( - args.A_crd.code(), args.A_vals.code(), SpGEMMCSRxCSRxCSRImpl{}, args); + args.A_crd.code(), args.A_vals.code(), SpGEMMCSRxCSRxCSRImpl{context}, args); } } // namespace sparse diff --git a/src/legate_sparse/array/csr/spmv.cc b/src/legate_sparse/array/csr/spmv.cc index d9efa4fd..42d2576c 100644 --- a/src/legate_sparse/array/csr/spmv.cc +++ b/src/legate_sparse/array/csr/spmv.cc @@ -23,6 +23,9 @@ using namespace legate; template struct CSRSpMVRowSplitImplBody { + TaskContext context; + explicit CSRSpMVRowSplitImplBody(TaskContext context) : context(context) {} + using INDEX_TY = type_of; using VAL_TY = type_of; diff --git a/src/legate_sparse/array/csr/spmv.cu b/src/legate_sparse/array/csr/spmv.cu index f2c5f1a1..536a18d8 100644 --- a/src/legate_sparse/array/csr/spmv.cu +++ b/src/legate_sparse/array/csr/spmv.cu @@ -29,6 +29,9 @@ namespace sparse { template <> struct 
CSRSpMVRowSplitImpl { + TaskContext context; + explicit CSRSpMVRowSplitImpl(TaskContext context) : context(context) {} + template void operator()(CSRSpMVRowSplitArgs& args) const { @@ -48,7 +51,7 @@ struct CSRSpMVRowSplitImpl { // Get context sensitive objects. auto handle = get_cusparse(); - auto stream = get_cached_stream(); + auto stream = context.get_task_stream(); CHECK_CUSPARSE(cusparseSetStream(handle, stream)); // Older cusparse has bug when output vector is not aligned to 16 bytes @@ -109,7 +112,7 @@ struct CSRSpMVRowSplitImpl { CHECK_CUSPARSE(cusparseCreateDnVec( &cusparse_y, y_domain_size /* size */, output_ptr, cusparseDataType())); - auto cusparse_A = makeCuSparseCSR(A_pos, A_crd, A_vals, cols); + auto cusparse_A = makeCuSparseCSR(A_pos, A_crd, A_vals, cols, stream); // Make the CUSPARSE calls. VAL_TY alpha = 1.0; @@ -153,7 +156,7 @@ struct CSRSpMVRowSplitImpl { workspacePtr)); // if we used temporary buffer, copy result to output if (y_aligned) { - LEGATE_CHECK_CUDA(cudaMemcpyAsync( + LEGATE_SPARSE_CHECK_CUDA(cudaMemcpyAsync( y_raw_ptr, output_ptr, y_domain_size * sizeof(VAL_TY), cudaMemcpyDeviceToDevice, stream)); } // Destroy the created objects. 
@@ -170,7 +173,7 @@ struct CSRSpMVRowSplitImpl { CSRSpMVRowSplitArgs args{context.outputs()[0], inputs[0], inputs[1], inputs[2], inputs[3]}; index_type_floating_point_value_type_dispatch( - args.A_crd.code(), args.y.code(), CSRSpMVRowSplitImpl{}, args); + args.A_crd.code(), args.y.code(), CSRSpMVRowSplitImpl{context}, args); } } // namespace sparse diff --git a/src/legate_sparse/array/csr/spmv_omp.cc b/src/legate_sparse/array/csr/spmv_omp.cc index 40b84e83..2937d848 100644 --- a/src/legate_sparse/array/csr/spmv_omp.cc +++ b/src/legate_sparse/array/csr/spmv_omp.cc @@ -23,6 +23,9 @@ using namespace legate; template struct CSRSpMVRowSplitImplBody { + TaskContext context; + explicit CSRSpMVRowSplitImplBody(TaskContext context) : context(context) {} + using INDEX_TY = type_of; using VAL_TY = type_of; diff --git a/src/legate_sparse/array/csr/spmv_template.inl b/src/legate_sparse/array/csr/spmv_template.inl index f339e2d1..ba55490a 100644 --- a/src/legate_sparse/array/csr/spmv_template.inl +++ b/src/legate_sparse/array/csr/spmv_template.inl @@ -31,6 +31,9 @@ struct CSRSpMVRowSplitImplBody; template struct CSRSpMVRowSplitImpl { + TaskContext context; + explicit CSRSpMVRowSplitImpl(TaskContext context) : context(context) {} + template void operator()(CSRSpMVRowSplitArgs& args) const { @@ -48,7 +51,7 @@ struct CSRSpMVRowSplitImpl { return; } - CSRSpMVRowSplitImplBody()( + CSRSpMVRowSplitImplBody{context}( y, A_pos, A_crd, A_vals, x, args.y.shape<1>()); } }; @@ -60,7 +63,7 @@ static void csr_spmv_row_split_template(TaskContext context) CSRSpMVRowSplitArgs args{context.outputs()[0], inputs[0], inputs[1], inputs[2], inputs[3]}; index_type_value_type_dispatch( - args.A_crd.code(), args.y.code(), CSRSpMVRowSplitImpl{}, args); + args.A_crd.code(), args.y.code(), CSRSpMVRowSplitImpl{context}, args); } } // namespace sparse diff --git a/src/legate_sparse/array/util/scale_rect.cc b/src/legate_sparse/array/util/scale_rect.cc index c2d2df90..50bbedd8 100644 --- 
a/src/legate_sparse/array/util/scale_rect.cc +++ b/src/legate_sparse/array/util/scale_rect.cc @@ -23,6 +23,9 @@ using namespace legate; template <> struct ScaleRect1ImplBody { + TaskContext context; + explicit ScaleRect1ImplBody(TaskContext context) : context(context) {} + void operator()(const AccessorRW, 1>& output, const int64_t scale, const Rect<1>& rect) { for (coord_t i = rect.lo[0]; i < rect.hi[0] + 1; i++) { diff --git a/src/legate_sparse/array/util/scale_rect.cu b/src/legate_sparse/array/util/scale_rect.cu index 22132340..fc07bcd6 100644 --- a/src/legate_sparse/array/util/scale_rect.cu +++ b/src/legate_sparse/array/util/scale_rect.cu @@ -38,11 +38,14 @@ __global__ void scale_rect1_kernel(size_t elems, template <> struct ScaleRect1ImplBody { + TaskContext context; + explicit ScaleRect1ImplBody(TaskContext context) : context(context) {} + void operator()(const AccessorRW, 1>& output, const int64_t scale, const Rect<1>& rect) { auto elems = rect.volume(); auto blocks = get_num_blocks_1d(elems); - auto stream = get_cached_stream(); + auto stream = context.get_task_stream(); scale_rect1_kernel<<>>(elems, rect.lo, output, scale); LEGATE_SPARSE_CHECK_CUDA_STREAM(stream); } diff --git a/src/legate_sparse/array/util/scale_rect_omp.cc b/src/legate_sparse/array/util/scale_rect_omp.cc index fc850bf3..1783d335 100644 --- a/src/legate_sparse/array/util/scale_rect_omp.cc +++ b/src/legate_sparse/array/util/scale_rect_omp.cc @@ -23,6 +23,9 @@ using namespace legate; template <> struct ScaleRect1ImplBody { + TaskContext context; + explicit ScaleRect1ImplBody(TaskContext context) : context(context) {} + void operator()(const AccessorRW, 1>& output, const int64_t scale, const Rect<1>& rect) { #pragma omp parallel for schedule(static) diff --git a/src/legate_sparse/array/util/scale_rect_template.inl b/src/legate_sparse/array/util/scale_rect_template.inl index 11724c24..512dc2c8 100644 --- a/src/legate_sparse/array/util/scale_rect_template.inl +++ 
b/src/legate_sparse/array/util/scale_rect_template.inl @@ -29,13 +29,16 @@ struct ScaleRect1ImplBody; template struct ScaleRect1Impl { + TaskContext context; + explicit ScaleRect1Impl(TaskContext context) : context(context) {} + void operator()(ScaleRect1Args& args) const { auto output = args.out.read_write_accessor, 1>(); if (args.out.domain().empty()) { return; } - ScaleRect1ImplBody()(output, args.scale, args.out.shape<1>()); + ScaleRect1ImplBody{context}(output, args.scale, args.out.shape<1>()); } }; @@ -45,7 +48,7 @@ static void scale_rect_1_template(TaskContext context) auto task = context.task_; auto scale = task->futures[0].get_result(); ScaleRect1Args args{context.outputs()[0], scale}; - ScaleRect1Impl{}(args); + ScaleRect1Impl{context}(args); } } // namespace sparse diff --git a/src/legate_sparse/array/util/unzip_rect.cc b/src/legate_sparse/array/util/unzip_rect.cc index 1272e9cc..08170da7 100644 --- a/src/legate_sparse/array/util/unzip_rect.cc +++ b/src/legate_sparse/array/util/unzip_rect.cc @@ -23,6 +23,9 @@ using namespace legate; template <> struct UnZipRect1ImplBody { + TaskContext context; + explicit UnZipRect1ImplBody(TaskContext context) : context(context) {} + void operator()(const AccessorWO& out1, const AccessorWO& out2, const AccessorRO, 1>& in, diff --git a/src/legate_sparse/array/util/unzip_rect.cu b/src/legate_sparse/array/util/unzip_rect.cu index 28067190..d7964c01 100644 --- a/src/legate_sparse/array/util/unzip_rect.cu +++ b/src/legate_sparse/array/util/unzip_rect.cu @@ -39,6 +39,9 @@ __global__ void unzip_rect1_kernel(size_t elems, template <> struct UnZipRect1ImplBody { + TaskContext context; + explicit UnZipRect1ImplBody(TaskContext context) : context(context) {} + void operator()(const AccessorWO& out1, const AccessorWO& out2, const AccessorRO, 1>& in, @@ -46,7 +49,7 @@ struct UnZipRect1ImplBody { { auto elems = rect.volume(); auto blocks = get_num_blocks_1d(elems); - auto stream = get_cached_stream(); + auto stream = 
context.get_task_stream(); unzip_rect1_kernel<<>>(elems, rect.lo, out1, out2, in); LEGATE_SPARSE_CHECK_CUDA_STREAM(stream); } diff --git a/src/legate_sparse/array/util/unzip_rect_omp.cc b/src/legate_sparse/array/util/unzip_rect_omp.cc index e57c43cd..b0345795 100644 --- a/src/legate_sparse/array/util/unzip_rect_omp.cc +++ b/src/legate_sparse/array/util/unzip_rect_omp.cc @@ -23,6 +23,9 @@ using namespace legate; template <> struct UnZipRect1ImplBody { + TaskContext context; + explicit UnZipRect1ImplBody(TaskContext context) : context(context) {} + void operator()(const AccessorWO& out1, const AccessorWO& out2, const AccessorRO, 1>& in, diff --git a/src/legate_sparse/array/util/unzip_rect_template.inl b/src/legate_sparse/array/util/unzip_rect_template.inl index d8bd9d2e..2c97d28d 100644 --- a/src/legate_sparse/array/util/unzip_rect_template.inl +++ b/src/legate_sparse/array/util/unzip_rect_template.inl @@ -29,6 +29,9 @@ struct UnZipRect1ImplBody; template struct UnZipRect1Impl { + TaskContext context; + explicit UnZipRect1Impl(TaskContext context) : context(context) {} + void operator()(UnZipRect1Args& args) const { auto out1 = args.out1.write_accessor(); @@ -37,7 +40,7 @@ struct UnZipRect1Impl { if (args.in.domain().empty()) { return; } - UnZipRect1ImplBody()(out1, out2, in, args.in.shape<1>()); + UnZipRect1ImplBody{context}(out1, out2, in, args.in.shape<1>()); } }; @@ -46,7 +49,7 @@ static void unzip_rect_1_template(TaskContext context) { auto outputs = context.outputs(); UnZipRect1Args args{outputs[0], outputs[1], context.inputs()[0]}; - UnZipRect1Impl{}(args); + UnZipRect1Impl{context}(args); } } // namespace sparse diff --git a/src/legate_sparse/array/util/zip_to_rect.cc b/src/legate_sparse/array/util/zip_to_rect.cc index c8871583..dcbd8dfb 100644 --- a/src/legate_sparse/array/util/zip_to_rect.cc +++ b/src/legate_sparse/array/util/zip_to_rect.cc @@ -23,6 +23,9 @@ using namespace legate; template struct ZipToRect1ImplBody { + TaskContext context; + explicit 
ZipToRect1ImplBody(TaskContext context) : context(context) {} + void operator()(const AccessorWO, 1>& output, const AccessorRO& lo, const AccessorRO& hi, diff --git a/src/legate_sparse/array/util/zip_to_rect.cu b/src/legate_sparse/array/util/zip_to_rect.cu index 697422e0..393c5860 100644 --- a/src/legate_sparse/array/util/zip_to_rect.cu +++ b/src/legate_sparse/array/util/zip_to_rect.cu @@ -39,12 +39,15 @@ __global__ void zip_rect1_kernel(size_t elems, template struct ZipToRect1ImplBody { + TaskContext context; + explicit ZipToRect1ImplBody(TaskContext context) : context(context) {} + void operator()(const AccessorWO, 1>& output, const AccessorRO& lo, const AccessorRO& hi, const Rect<1>& rect) { - auto stream = get_cached_stream(); + auto stream = context.get_task_stream(); auto elems = rect.volume(); auto blocks = get_num_blocks_1d(elems); zip_rect1_kernel<<>>(elems, rect.lo, output, lo, hi); diff --git a/src/legate_sparse/array/util/zip_to_rect_omp.cc b/src/legate_sparse/array/util/zip_to_rect_omp.cc index 03738d36..0b72d8d3 100644 --- a/src/legate_sparse/array/util/zip_to_rect_omp.cc +++ b/src/legate_sparse/array/util/zip_to_rect_omp.cc @@ -23,6 +23,9 @@ using namespace legate; template struct ZipToRect1ImplBody { + TaskContext context; + explicit ZipToRect1ImplBody(TaskContext context) : context(context) {} + void operator()(const AccessorWO, 1>& output, const AccessorRO& lo, const AccessorRO& hi, diff --git a/src/legate_sparse/array/util/zip_to_rect_template.inl b/src/legate_sparse/array/util/zip_to_rect_template.inl index c53411e8..7ebad169 100644 --- a/src/legate_sparse/array/util/zip_to_rect_template.inl +++ b/src/legate_sparse/array/util/zip_to_rect_template.inl @@ -29,6 +29,9 @@ struct ZipToRect1ImplBody; template struct ZipToRect1Impl { + TaskContext context; + explicit ZipToRect1Impl(TaskContext context) : context(context) {} + void operator()(ZipToRect1Args& args) const { auto output = args.out.write_accessor, 1>(); @@ -37,7 +40,7 @@ struct 
ZipToRect1Impl { if (args.out.domain().empty()) { return; } - ZipToRect1ImplBody()(output, lo, hi, args.out.shape<1>()); + ZipToRect1ImplBody{context}(output, lo, hi, args.out.shape<1>()); } }; @@ -47,10 +50,10 @@ static void zip_to_rect_1_template(TaskContext context) auto inputs = context.inputs(); ZipToRect1Args args{context.outputs()[0], inputs[0], inputs[1]}; if (inputs[0].data().type().code() == legate::Type::Code::INT64) { - ZipToRect1Impl{}(args); + ZipToRect1Impl{context}(args); } else { assert(inputs[0].data().type().code() == legate::Type::Code::UINT64); - ZipToRect1Impl{}(args); + ZipToRect1Impl{context}(args); } } diff --git a/src/legate_sparse/cffi.h b/src/legate_sparse/cffi.h index cdd53926..584311aa 100644 --- a/src/legate_sparse/cffi.h +++ b/src/legate_sparse/cffi.h @@ -45,6 +45,13 @@ enum LegateSparseOpCode { // like iterative linear solvers. LEGATE_SPARSE_AXPBY, + // Sparse direct linear solve + LEGATE_SPARSE_SPSOLVE, + + // Computes GEAM: alpha * A + beta * B = C + LEGATE_SPARSE_GEAM_CSR_CSR_SYMBOLIC, + LEGATE_SPARSE_GEAM_CSR_CSR_COMPUTE, + // nonzero API LEGATE_SPARSE_NONZERO, diff --git a/src/legate_sparse/cudalibs.cu b/src/legate_sparse/cudalibs.cu index 6ec45bd5..d2607fb2 100644 --- a/src/legate_sparse/cudalibs.cu +++ b/src/legate_sparse/cudalibs.cu @@ -22,7 +22,7 @@ namespace sparse { -CUDALibraries::CUDALibraries() : finalized_(false), cusparse_(nullptr) {} +CUDALibraries::CUDALibraries() : finalized_(false), cusparse_(nullptr), cudss_(nullptr) {} CUDALibraries::~CUDALibraries() { finalize(); } @@ -34,6 +34,9 @@ void CUDALibraries::finalize() if (cusparse_ != nullptr) { finalize_cusparse(); } + if (cudss_ != nullptr) { + finalize_cudss(); + } finalized_ = true; } @@ -51,6 +54,20 @@ cusparseHandle_t CUDALibraries::get_cusparse() return this->cusparse_; } +void CUDALibraries::finalize_cudss() +{ + CHECK_CUDSS(cudssDestroy(cudss_)); + cudss_ = nullptr; +} + +cudssHandle_t CUDALibraries::get_cudss() +{ + if (this->cudss_ == nullptr) { + 
CHECK_CUDSS(cudssCreate(&this->cudss_)); + } + return this->cudss_; +} + static CUDALibraries& get_cuda_libraries(legate::Processor proc) { if (proc.kind() != legate::Processor::TOC_PROC) { @@ -63,16 +80,18 @@ static CUDALibraries& get_cuda_libraries(legate::Processor proc) return cuda_libraries[proc_id]; } -legate::cuda::StreamView get_cached_stream() +cusparseHandle_t get_cusparse() { - return legate::cuda::StreamPool::get_stream_pool().get_stream(); + const auto proc = legate::Processor::get_executing_processor(); + auto& lib = get_cuda_libraries(proc); + return lib.get_cusparse(); } -cusparseHandle_t get_cusparse() +cudssHandle_t get_cudss() { const auto proc = legate::Processor::get_executing_processor(); auto& lib = get_cuda_libraries(proc); - return lib.get_cusparse(); + return lib.get_cudss(); } class LoadCUDALibsTask : public SparseTask { @@ -86,6 +105,7 @@ class LoadCUDALibsTask : public SparseTask { const auto proc = legate::Processor::get_executing_processor(); auto& lib = get_cuda_libraries(proc); lib.get_cusparse(); + lib.get_cudss(); } }; diff --git a/src/legate_sparse/cudalibs.h b/src/legate_sparse/cudalibs.h index 5a387200..47596a79 100644 --- a/src/legate_sparse/cudalibs.h +++ b/src/legate_sparse/cudalibs.h @@ -33,13 +33,16 @@ struct CUDALibraries { public: void finalize(); cusparseHandle_t get_cusparse(); + cudssHandle_t get_cudss(); private: void finalize_cusparse(); + void finalize_cudss(); private: bool finalized_; cusparseHandle_t cusparse_; + cudssHandle_t cudss_; }; } // namespace sparse diff --git a/src/legate_sparse/linalg/axpby.cc b/src/legate_sparse/linalg/axpby.cc index 547ad927..43e99520 100644 --- a/src/legate_sparse/linalg/axpby.cc +++ b/src/legate_sparse/linalg/axpby.cc @@ -23,6 +23,9 @@ using namespace legate; template struct AXPBYImplBody { + TaskContext context; + explicit AXPBYImplBody(TaskContext context) : context(context) {} + using VAL_TY = type_of; void operator()(const AccessorRW& y, diff --git 
a/src/legate_sparse/linalg/axpby.cu b/src/legate_sparse/linalg/axpby.cu index 784e77a3..f7ee1feb 100644 --- a/src/legate_sparse/linalg/axpby.cu +++ b/src/legate_sparse/linalg/axpby.cu @@ -48,6 +48,9 @@ __global__ void axpby_kernel(size_t elems, template struct AXPBYImplBody { + TaskContext context; + explicit AXPBYImplBody(TaskContext context) : context(context) {} + using VAL_TY = type_of; void operator()(const AccessorRW& y, @@ -58,7 +61,7 @@ struct AXPBYImplBody { { auto elems = rect.volume(); auto blocks = get_num_blocks_1d(elems); - auto stream = get_cached_stream(); + auto stream = context.get_task_stream(); axpby_kernel <<>>(elems, rect.lo[0], y, x, a, b); LEGATE_SPARSE_CHECK_CUDA_STREAM(stream); diff --git a/src/legate_sparse/linalg/axpby_omp.cc b/src/legate_sparse/linalg/axpby_omp.cc index eb019b5c..c5569fba 100644 --- a/src/legate_sparse/linalg/axpby_omp.cc +++ b/src/legate_sparse/linalg/axpby_omp.cc @@ -23,6 +23,9 @@ using namespace legate; template struct AXPBYImplBody { + TaskContext context; + explicit AXPBYImplBody(TaskContext context) : context(context) {} + using VAL_TY = type_of; void operator()(const AccessorRW& y, diff --git a/src/legate_sparse/linalg/axpby_template.inl b/src/legate_sparse/linalg/axpby_template.inl index 8651672d..d110f6de 100644 --- a/src/legate_sparse/linalg/axpby_template.inl +++ b/src/legate_sparse/linalg/axpby_template.inl @@ -29,6 +29,9 @@ struct AXPBYImplBody; template struct AXPBYImpl { + TaskContext context; + explicit AXPBYImpl(TaskContext context) : context(context) {} + template void operator()(AXPBYArgs& args) const { @@ -42,15 +45,15 @@ struct AXPBYImpl { } if (args.isalpha) { if (args.negate) { - AXPBYImplBody()(y, x, a, b, args.y.shape<1>()); + AXPBYImplBody{context}(y, x, a, b, args.y.shape<1>()); } else { - AXPBYImplBody()(y, x, a, b, args.y.shape<1>()); + AXPBYImplBody{context}(y, x, a, b, args.y.shape<1>()); } } else { if (args.negate) { - AXPBYImplBody()(y, x, a, b, args.y.shape<1>()); + 
AXPBYImplBody{context}(y, x, a, b, args.y.shape<1>()); } else { - AXPBYImplBody()(y, x, a, b, args.y.shape<1>()); + AXPBYImplBody{context}(y, x, a, b, args.y.shape<1>()); } } } @@ -67,7 +70,7 @@ static void axpby_template(TaskContext context) context.scalars()[0].value(), context.scalars()[1].value(), }; - value_type_dispatch(args.y.code(), AXPBYImpl{}, args); + value_type_dispatch(args.y.code(), AXPBYImpl{context}, args); } } // namespace sparse diff --git a/src/legate_sparse/linalg/spsolve.cc b/src/legate_sparse/linalg/spsolve.cc new file mode 100644 index 00000000..446d9e04 --- /dev/null +++ b/src/legate_sparse/linalg/spsolve.cc @@ -0,0 +1,34 @@ +/* Copyright 2022-2024 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + */ + +#include "legate_sparse/linalg/spsolve.h" +#include "legate_sparse/util/dispatch.h" +#include "legate_sparse/util/legate_utils.h" + +namespace sparse { + +using namespace legate; + +namespace // unnamed +{ +static const auto sparse_reg_task_ = []() -> char { + SpSolve::register_variants(); + return 0; +}(); + +} // namespace + +} // namespace sparse diff --git a/src/legate_sparse/linalg/spsolve.cu b/src/legate_sparse/linalg/spsolve.cu new file mode 100644 index 00000000..ace6fe5e --- /dev/null +++ b/src/legate_sparse/linalg/spsolve.cu @@ -0,0 +1,184 @@ +/* Copyright 2022-2024 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + */ + +#include "legate_sparse/linalg/spsolve.h" +#include "legate_sparse/util/cusparse_utils.h" +#include "legate_sparse/util/cudss_utils.h" +#include "legate_sparse/util/dispatch.h" +#include "legate_sparse/util/legate_utils.h" + +namespace sparse { + +struct SpSolveImpl { + TaskContext context; + explicit SpSolveImpl(TaskContext context) : context(context) {} + + template + void operator()(SpSolveArgs& args, int num_gpus) const + { + using INDEX_TY = type_of; + using VAL_TY = type_of; + + auto& A_pos = args.A_pos; + auto& A_crd = args.A_crd; + auto& A_vals = args.A_vals; + auto& b = args.b; + auto& x = args.x; // output + auto comms = args.comms; + uint64_t nrows_g = args.nrows_g; + uint64_t nnz_g = args.nnz_g; + uint64_t ncols_g = nrows_g; + + int hybrid_mode = 0; // 0 = GPU-only execution in cuDSS + + // cuDSS handle and stream set + auto handle = get_cudss(); + auto stream = context.get_task_stream(); + CHECK_CUDSS(cudssSetStream(handle, stream)); + + // create configuration and data objects + cudssConfig_t config; + cudssData_t solverData; + + CHECK_CUDSS(cudssConfigCreate(&config)); + CHECK_CUDSS(cudssConfigSet(config, CUDSS_CONFIG_HYBRID_MODE, &hybrid_mode, sizeof(int))); + CHECK_CUDSS(cudssDataCreate(handle, &solverData)); + + // A x = b + // (m, n) (n, 1) = (m, 1); m = nrows, n = ncols + // _l: local (e.g., shape of the partitioned array) + // _g: global (e.g., global shape of the array) + + int64_t nrows_l = A_pos.domain().get_volume(); + int64_t ncols_l = x.domain().get_volume(); + int64_t nnz_l = A_vals.domain().get_volume(); + + int64_t nrhs = 1; // Number of right-hand side + int64_t ldb = nrows_g; // leading dimension of b + int64_t ldx = ncols_g; // leading dimension of x + + auto A_indptr = CREATE_BUFFER(int64_t, nrows_l + 1, Memory::GPU_FB_MEM, "A_indptr"); + { + auto blocks = get_num_blocks_1d(nrows_l); + convertGlobalPosToLocalIndPtr<<>>( + nrows_l, A_pos.read_accessor, 1>().ptr(A_pos.domain().lo()), A_indptr.ptr(0)); + } + + 
CHECK_CUDSS(cudssSetStream(handle, stream)); + + cudssMatrix_t mat_A, vec_b, vec_x; + CHECK_CUDSS(cudssMatrixCreateCsr(&mat_A, // pointer to the matrix + nrows_g, // number of rows + ncols_g, // number of columns + nnz_g, // number of non-zeros + (void*)A_indptr.ptr(0), // offsets, + nullptr, // end index if start index was used + getPtrFromStore(A_crd), // column indices + getPtrFromStore(A_vals), // values + cudssIndexType(), // indexType + cudssDataType(), // valueType + CUDSS_MTYPE_GENERAL, // matrix type + CUDSS_MVIEW_FULL, // matrix view + CUDSS_BASE_ZERO // indexBase + )); + + // NOTE: + // nrhs should be derived from b (b.shape[1]) and MUST be 1 right now. + // When we support multi-dimensional right-hand sides, we need to + // make sure that a column major order is chosen in the mapper + + auto x_ptr = getPtrFromStore(x); + + // Create dense output vector, x, of shape (ncol_g, nrhs) + CHECK_CUDSS(cudssMatrixCreateDn(&vec_x, + ncols_g, // number of rows + nrhs, // number of RHS, set to 1 + ldx, // Leading dimension of x + (void*)x_ptr, // Values of the dense matrix + cudssDataType(), // Data type of the dense vector + CUDSS_LAYOUT_COL_MAJOR) // Layout + ); + + auto b_ptr = getPtrFromStore(b); + + // Create dense RHS vector, b, of shape (nrows_g, nrhs) + CHECK_CUDSS(cudssMatrixCreateDn(&vec_b, + nrows_g, // number of rows + nrhs, // number of RHS, set to 1 + ldb, // Leading dimension of b + (void*)b_ptr, // Values of the dense matrix + cudssDataType(), // Data type of the dense vector + CUDSS_LAYOUT_COL_MAJOR) // Layout + ); + + // Matrix and Vectors are partitioned row-wise + if (num_gpus > 1) { + ncclComm_t* comm = comms[0].get(); + cudssMatrixSetDistributionRow1d(mat_A, + static_cast(A_pos.domain().lo()[0]), + static_cast(A_pos.domain().hi()[0])); + cudssMatrixSetDistributionRow1d( + vec_b, static_cast(b.domain().lo()[0]), static_cast(b.domain().hi()[0])); + cudssMatrixSetDistributionRow1d( + vec_x, static_cast(x.domain().lo()[0]), 
static_cast(x.domain().hi()[0])); + + // path to libcudss_commlayer_nccl.so is obtained from the env CUDSS_COMM_LIB + CHECK_CUDSS(cudssSetCommLayer(handle, nullptr)); + CHECK_CUDSS(cudssDataSet(handle, solverData, CUDSS_DATA_COMM, comm, sizeof(ncclComm_t*))); + } + + // Solve + CHECK_CUDSS( + cudssExecute(handle, CUDSS_PHASE_ANALYSIS, config, solverData, mat_A, vec_x, vec_b)); + + CHECK_CUDSS( + cudssExecute(handle, CUDSS_PHASE_FACTORIZATION, config, solverData, mat_A, vec_x, vec_b)); + + CHECK_CUDSS(cudssExecute(handle, CUDSS_PHASE_SOLVE, config, solverData, mat_A, vec_x, vec_b)); + + // Destroy matrix, vectors, and setup + CHECK_CUDSS(cudssMatrixDestroy(mat_A)); + CHECK_CUDSS(cudssMatrixDestroy(vec_x)); + CHECK_CUDSS(cudssMatrixDestroy(vec_b)); + CHECK_CUDSS(cudssDataDestroy(handle, solverData)); + CHECK_CUDSS(cudssConfigDestroy(config)); + + LEGATE_SPARSE_CHECK_CUDA(cudaStreamSynchronize(stream)); + } +}; + +/* static */ void SpSolve::gpu_variant(TaskContext context) +{ + auto inputs = context.inputs(); + auto outputs = context.outputs(); + auto comms = context.communicators(); + + SpSolveArgs args{inputs[0], // A_pos + inputs[1], // A_crd + inputs[2], // A_vals + inputs[3], // b + outputs[0], // x + context.scalars()[0].value(), // nrows_g + context.scalars()[1].value(), // nnz_g + comms}; + int num_gpus = static_cast(context.get_launch_domain().hi()[0]) + 1; + index_type_floating_point_value_type_dispatch( + args.A_crd.code(), args.A_vals.code(), SpSolveImpl{context}, args, num_gpus); +} + +using namespace legate; + +} // namespace sparse diff --git a/src/legate_sparse/linalg/spsolve.h b/src/legate_sparse/linalg/spsolve.h new file mode 100644 index 00000000..68908f3e --- /dev/null +++ b/src/legate_sparse/linalg/spsolve.h @@ -0,0 +1,48 @@ +/* Copyright 2022-2024 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +#pragma once + +#include "legate_sparse/sparse.h" +#include "legate_sparse/sparse_c.h" +#include "legate.h" + +namespace sparse { + +struct SpSolveArgs { + const legate::PhysicalStore& A_pos; + const legate::PhysicalStore& A_crd; + const legate::PhysicalStore& A_vals; + const legate::PhysicalStore& b; + const legate::PhysicalStore& x; // output + const uint64_t nrows_g; // global number of rows + const uint64_t nnz_g; // global number of nonzeros + std::vector comms; +}; + +class SpSolve : public SparseTask { + public: + static inline const auto TASK_CONFIG = + legate::TaskConfig{legate::LocalTaskID{LEGATE_SPARSE_SPSOLVE}}; + static constexpr legate::VariantOptions GPU_VARIANT_OPTIONS = + legate::VariantOptions{}.with_has_allocations(true); + +#ifdef LEGATE_USE_CUDA + static void gpu_variant(legate::TaskContext ctx); +#endif +}; + +} // namespace sparse diff --git a/src/legate_sparse/mapper/mapper.cc b/src/legate_sparse/mapper/mapper.cc index 6357d898..8c330b9d 100644 --- a/src/legate_sparse/mapper/mapper.cc +++ b/src/legate_sparse/mapper/mapper.cc @@ -126,6 +126,10 @@ std::optional LegateSparseMapper::allocation_pool_size(const Task& return std::nullopt; } + case LEGATE_SPARSE_SPSOLVE: { + return std::nullopt; + } + default: { // Handle any unhandled enum values LEGATE_ABORT("Unsupported Legate Sparse task_id: " + std::to_string(task_id)); diff --git a/src/legate_sparse/partition/fast_image_partition.cu b/src/legate_sparse/partition/fast_image_partition.cu index 47a79606..1825bce0 100644 --- 
a/src/legate_sparse/partition/fast_image_partition.cu +++ b/src/legate_sparse/partition/fast_image_partition.cu @@ -28,6 +28,9 @@ using namespace legate; template struct FastImageRangeImplBody { + TaskContext context; + explicit FastImageRangeImplBody(TaskContext context) : context(context) {} + using INDEX_TY = type_of; void operator()(const AccessorWO, 1>& out_pos, @@ -37,7 +40,7 @@ struct FastImageRangeImplBody { const Rect<1>& bounds) { ThrustAllocator alloc(Memory::GPU_FB_MEM); - auto stream = get_cached_stream(); + auto stream = context.get_task_stream(); auto thrust_exec_policy = thrust::cuda::par(alloc).on(stream); thrust::pair result = thrust::minmax_element( diff --git a/src/legate_sparse/partition/fast_image_partition_template.inl b/src/legate_sparse/partition/fast_image_partition_template.inl index 4e74731e..2b7cb3e6 100644 --- a/src/legate_sparse/partition/fast_image_partition_template.inl +++ b/src/legate_sparse/partition/fast_image_partition_template.inl @@ -30,6 +30,9 @@ struct FastImageRangeImplBody; template struct FastImageRangeImpl { + TaskContext context; + explicit FastImageRangeImpl(TaskContext context) : context(context) {} + template void operator()(FastImageRangeArgs& args) const { @@ -43,7 +46,7 @@ struct FastImageRangeImpl { if (args.input_crd.domain().empty()) { return; } - FastImageRangeImplBody()( + FastImageRangeImplBody{context}( output_pos, input_pos, input_crd, args.input_pos.shape<1>(), args.input_crd.shape<1>()); } }; @@ -52,7 +55,7 @@ template static void fast_image_range_template(TaskContext context) { FastImageRangeArgs args{context.output(0), context.input(0), context.input(1)}; - index_type_dispatch(args.input_crd.code(), FastImageRangeImpl{}, args); + index_type_dispatch(args.input_crd.code(), FastImageRangeImpl{context}, args); } } // namespace sparse diff --git a/src/legate_sparse/util/cuda_help.h b/src/legate_sparse/util/cuda_help.h index d009f9d6..61e83da0 100644 --- a/src/legate_sparse/util/cuda_help.h +++ 
b/src/legate_sparse/util/cuda_help.h @@ -18,9 +18,13 @@ #include #include "legate.h" -#include "legate/cuda/cuda.h" -#include "legate/cuda/stream_pool.h" + +// For sparse matrix ops like spGEMM and spMv #include + +// For direct solvers +#include + #include #define THREADS_PER_BLOCK 128 @@ -31,6 +35,12 @@ check_cusparse(result, __FILE__, __LINE__); \ } while (false) +#define CHECK_CUDSS(expr) \ + do { \ + cudssStatus_t result = (expr); \ + check_cudss(result, __FILE__, __LINE__); \ + } while (false) + #define CHECK_NCCL(expr) \ do { \ ncclResult_t result = (expr); \ @@ -102,6 +112,24 @@ __host__ inline void check_cusparse(cusparseStatus_t status, const char* file, i } } +__host__ inline void check_cudss(cudssStatus_t status, const char* file, int line) +{ + // TODO: Need to get the equivalent error message from cuDSS + if (status != CUDSS_STATUS_SUCCESS) { + fprintf(stderr, + "Internal CUDSS failure with error code %d in file %s at line %d\n", + status, + // TODO + file, + line); +#ifdef DEBUG_LEGATE_SPARSE + assert(false); +#else + exit(status); +#endif + } +} + __host__ inline void check_nccl(ncclResult_t error, const char* file, int line) { if (error != ncclSuccess) { @@ -118,10 +146,9 @@ __host__ inline void check_nccl(ncclResult_t error, const char* file, int line) } } -// Return a cached stream for the current GPU. -legate::cuda::StreamView get_cached_stream(); - // Method to get the CUSPARSE handle associated with the current GPU. cusparseHandle_t get_cusparse(); +cudssHandle_t get_cudss(); + } // namespace sparse diff --git a/src/legate_sparse/util/cudss_utils.h b/src/legate_sparse/util/cudss_utils.h new file mode 100644 index 00000000..e72d466c --- /dev/null +++ b/src/legate_sparse/util/cudss_utils.h @@ -0,0 +1,73 @@ +/* Copyright 2022-2024 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +#pragma once + +#include "legate_sparse/sparse.h" +#include "legate_sparse/util/cuda_help.h" +#include "legate_sparse/util/legate_utils.h" + +namespace sparse { + +using namespace legate; + +// Template dispatch for value type. +// Note: cuDSS only supports floating-point and complex types. +// Integer and boolean types are not supported by cuDSS. +template +cudaDataType_t cudssDataType(); + +template <> +inline cudaDataType_t cudssDataType() +{ + return CUDA_R_32F; +} + +template <> +inline cudaDataType_t cudssDataType() +{ + return CUDA_R_64F; +} + +template <> +inline cudaDataType_t cudssDataType>() +{ + return CUDA_C_32F; +} + +template <> +inline cudaDataType_t cudssDataType>() +{ + return CUDA_C_64F; +} + +// Template dispatch for the index type. 
+template +cudaDataType_t cudssIndexType(); + +template <> +inline cudaDataType_t cudssIndexType() +{ + return CUDA_R_32I; +} + +template <> +inline cudaDataType_t cudssIndexType() +{ + return CUDA_R_64I; +} + +} // namespace sparse diff --git a/src/legate_sparse/util/cusparse_utils.h b/src/legate_sparse/util/cusparse_utils.h index 6d496a3e..3ea1029b 100644 --- a/src/legate_sparse/util/cusparse_utils.h +++ b/src/legate_sparse/util/cusparse_utils.h @@ -14,9 +14,12 @@ * */ +#pragma once + #include "legate_sparse/sparse.h" #include "legate_sparse/util/cuda_help.h" #include "legate_sparse/util/legate_utils.h" +#include namespace sparse { @@ -75,8 +78,7 @@ void* getPtrFromStore(const legate::PhysicalStore& store) } else if (!store.is_writable() && store.is_readable()) { return const_cast(store.read_accessor().ptr(dom.lo())); } else if (store.is_reducible()) { - return store.reduce_accessor, true /* exclusive */, DIM>().ptr( - dom.lo()); + return store.reduce_accessor, true, DIM>().ptr(dom.lo()); } else { assert(false); return nullptr; @@ -100,13 +102,13 @@ inline cudaDataType cusparseDataType() } template <> -inline cudaDataType cusparseDataType>() +inline cudaDataType cusparseDataType>() { return CUDA_C_32F; } template <> -inline cudaDataType cusparseDataType>() +inline cudaDataType cusparseDataType>() { return CUDA_C_64F; } @@ -133,10 +135,10 @@ template cusparseSpMatDescr_t makeCuSparseCSR(const legate::PhysicalStore& pos, const legate::PhysicalStore& crd, const legate::PhysicalStore& vals, - size_t cols) + size_t cols, + cudaStream_t stream) { cusparseSpMatDescr_t matDescr; - auto stream = get_cached_stream(); auto pos_domain = pos.domain(); auto crd_domain = crd.domain(); @@ -169,10 +171,10 @@ template cusparseSpMatDescr_t makeCuSparseCSC(const legate::PhysicalStore& pos, const legate::PhysicalStore& crd, const legate::PhysicalStore& vals, - size_t rows) + size_t rows, + cudaStream_t stream) { cusparseSpMatDescr_t matDescr; - auto stream = get_cached_stream(); auto 
pos_domain = pos.domain(); auto crd_domain = crd.domain(); @@ -237,7 +239,7 @@ cusparseDnMatDescr_t makeCuSparseDenseMat(const legate::PhysicalStore& mat) valsPtr = const_cast(acc.ptr(d.lo())); ld = acc.accessor.strides[0] / sizeof(VAL_TY); } else if (mat.is_reducible()) { - auto acc = mat.reduce_accessor, true /* exclusive */, 2>(); + auto acc = mat.reduce_accessor, true, 2>(); valsPtr = acc.ptr(d.lo()); ld = acc.accessor.strides[0] / sizeof(VAL_TY); } else { diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index a8b2f17e..0629e734 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -42,7 +42,9 @@ def _create_mask(rows, density=0.3): row_idx = numpy.random.randint(0, rows, size=nnz) col_idx = numpy.random.randint(0, cols, size=nnz) data = numpy.ones(nnz, dtype=bool) - A_scipy = scipy_sparse.csr_array((data, (row_idx, col_idx)), shape=(rows, cols)) + A_scipy = scipy_sparse.csr_array( + (data, (row_idx, col_idx)), shape=(rows, cols) + ) # Sparse A_sparse = sparse.csr_array(A_scipy.todense()) @@ -90,7 +92,9 @@ def create_matrix(): """ def _create_matrix(N, tol=0.5): - _, A_scipy, _ = simple_system_gen(N, N, scipy_sparse.csr_array, tol=tol) + _, A_scipy, _ = simple_system_gen( + N, N, scipy_sparse.csr_array, tol=tol + ) A_sparse = sparse.csr_array(A_scipy) # Verify matrices are equivalent @@ -103,3 +107,318 @@ def _create_matrix(N, tol=0.5): return A_scipy, A_sparse return _create_matrix + + +@pytest.fixture +def create_tridiagonal_complex_hermitian_matrix(): + """Create a tridiagonal complex Hermitian sparse matrix. + + This fixture creates a tridiagonal complex Hermitian sparse matrix suitable + for eigenvalue computations. The matrix has a real main diagonal and complex + conjugate off-diagonals. + + Parameters + ---------- + N : int + Number of rows (and columns) in the square matrix. + + Returns + ------- + scipy.sparse.csr_array + A tridiagonal complex Hermitian sparse matrix in SciPy CSR format. 
+ + Notes + ----- + The matrix is constructed with: + - Main diagonal: 4.0 + - Upper diagonal: -(1.0 + 1.0j) + - Lower diagonal: -(1.0 - 1.0j) (complex conjugate) + + """ + + def _create_tridiagonal_complex_hermitian_matrix(N: int): + """Returns a scipy.sparse csr_array that is tridiagonal Hermitian""" + main_diag_val = 4.0 + off_diag_val = -(1.0 + 1.0j) + + main_diag = numpy.full(N, main_diag_val) + upper_diag = numpy.full(N - 1, off_diag_val) + lower_diag = numpy.full(N - 1, numpy.conjugate(off_diag_val)) + + diagonals = [lower_diag, main_diag, upper_diag] + offsets = [-1, 0, 1] + + A = scipy_sparse.diags( + diagonals, + offsets, + shape=(N, N), + format="csr", + dtype=numpy.complex128, + ) + + return A + + return _create_tridiagonal_complex_hermitian_matrix + + +@pytest.fixture +def create_tridiagonal_real_symmetric_matrix(): + """Create a tridiagonal real symmetric sparse matrix. + + This fixture creates a tridiagonal real symmetric sparse matrix suitable + for eigenvalue computations. The matrix has a constant main diagonal and + constant off-diagonals. + + Parameters + ---------- + N : int + Number of rows (and columns) in the square matrix. + + Returns + ------- + scipy.sparse.csr_array + A tridiagonal real symmetric sparse matrix in SciPy CSR format. 
+ + Notes + ----- + The matrix is constructed with: + - Main diagonal: 4.0 + - Upper diagonal: -1.0 + - Lower diagonal: -1.0 + + """ + + def _create_tridiagonal_real_symmetric_matrix(N: int): + """Returns a scipy.sparse csr_array that is tridiagonal symmetric""" + main_diag_val = 4.0 + off_diag_val = -1.0 + + main_diag = numpy.full(N, main_diag_val) + upper_diag = numpy.full(N - 1, off_diag_val) + lower_diag = numpy.full(N - 1, numpy.conjugate(off_diag_val)) + + diagonals = [lower_diag, main_diag, upper_diag] + offsets = [-1, 0, 1] + + A = scipy_sparse.diags( + diagonals, offsets, shape=(N, N), format="csr", dtype=numpy.float64 + ) + + return A + + return _create_tridiagonal_real_symmetric_matrix + + +@pytest.fixture +def create_sparse_real_symmetric_matrix(): + """Create a generic real symmetric sparse matrix with random sparsity. + + This fixture creates a real symmetric sparse matrix suitable for eigenvalue + computations. The sparsity pattern changes with N, making it suitable for + testing across different matrix sizes. + + Parameters + ---------- + N : int + Number of rows (and columns) in the square matrix. + density : float, optional + Approximate density of non-zero elements. Default is 0.3. + seed : int, optional + Random seed for reproducibility. Default is 42. + + Returns + ------- + scipy.sparse.csr_array + A real symmetric sparse matrix in SciPy CSR format. + + Notes + ----- + The matrix is constructed by: + 1. Generating a random sparse matrix + 2. Making it symmetric: A = (A + A.T) / 2 + 3. 
Adding a diagonal component to ensure positive definiteness + + """ + + def _create_sparse_real_symmetric_matrix(N: int, density=0.3, seed=42): + """Returns a scipy.sparse csr_array that is symmetric with random sparsity""" + numpy.random.seed(seed) + + # Generate random sparse matrix + nnz = int(N * N * density) + row_idx = numpy.random.randint(0, N, size=nnz) + col_idx = numpy.random.randint(0, N, size=nnz) + data = numpy.random.randn(nnz) + + A = scipy_sparse.csr_array((data, (row_idx, col_idx)), shape=(N, N)) + + # Make it symmetric: A = (A + A.T) / 2 + A = (A + A.T) / 2 + + # Add diagonal dominance to ensure well-conditioned matrix + # This helps with convergence in eigenvalue computations + A = A + scipy_sparse.eye(N, format="csr") * N + + return A + + return _create_sparse_real_symmetric_matrix + + +@pytest.fixture +def create_sparse_complex_hermitian_matrix(): + """Create a generic complex Hermitian sparse matrix with random sparsity. + + This fixture creates a complex Hermitian sparse matrix suitable for + eigenvalue computations. The sparsity pattern changes with N, making it + suitable for testing across different matrix sizes. + + Parameters + ---------- + N : int + Number of rows (and columns) in the square matrix. + density : float, optional + Approximate density of non-zero elements. Default is 0.3. + seed : int, optional + Random seed for reproducibility. Default is 42. + + Returns + ------- + scipy.sparse.csr_array + A complex Hermitian sparse matrix in SciPy CSR format. + + Notes + ----- + The matrix is constructed by: + 1. Generating a random complex sparse matrix + 2. Making it Hermitian: A = (A + A.H) / 2 + 3. 
Adding a diagonal component to ensure positive definiteness + + """ + + def _create_sparse_complex_hermitian_matrix(N: int, density=0.3, seed=42): + """Returns a scipy.sparse csr_array that is Hermitian with random sparsity""" + numpy.random.seed(seed) + + # Generate random complex sparse matrix + nnz = int(N * N * density) + row_idx = numpy.random.randint(0, N, size=nnz) + col_idx = numpy.random.randint(0, N, size=nnz) + data_real = numpy.random.randn(nnz) + data_imag = numpy.random.randn(nnz) + data = data_real + 1j * data_imag + + A = scipy_sparse.csr_array( + (data, (row_idx, col_idx)), shape=(N, N), dtype=numpy.complex128 + ) + + # Make it Hermitian: A = (A + A.H) / 2 + A = (A + A.conjugate().T) / 2 + + # Add diagonal dominance to ensure well-conditioned matrix + # This helps with convergence in eigenvalue computations + A = A + scipy_sparse.eye(N, format="csr", dtype=numpy.complex128) * N + + return A + + return _create_sparse_complex_hermitian_matrix + + +@pytest.fixture +def create_matrix_with_zero_diagonal(): + """Create a symmetric/Hermitian matrix with at least one zero diagonal entry. + + This fixture creates a sparse matrix with a missing diagonal element + to test error handling in eigenvalue computations. + + Parameters + ---------- + N : int + Number of rows (and columns) in the square matrix. + dtype : numpy.dtype + Data type of the matrix (numpy.float64 or numpy.complex128). + zero_index : int, optional + Index of the diagonal element to set to zero. Default is N//2. + density : float, optional + Approximate density of non-zero elements. Default is 0.3. + seed : int, optional + Random seed for reproducibility. Default is 42. + + Returns + ------- + scipy.sparse.csr_array + A sparse matrix with a zero diagonal entry. 
+ + """ + + def _create_matrix_with_zero_diagonal( + N: int, dtype=numpy.float64, zero_index=None, density=0.3, seed=42 + ): + """Returns a scipy.sparse csr_array with a zero diagonal entry""" + if zero_index is None: + zero_index = N // 2 + + numpy.random.seed(seed) + + # Generate random sparse matrix + nnz = int(N * N * density) + row_idx = numpy.random.randint(0, N, size=nnz) + col_idx = numpy.random.randint(0, N, size=nnz) + + if dtype == numpy.complex128: + data_real = numpy.random.randn(nnz) + data_imag = numpy.random.randn(nnz) + data = data_real + 1j * data_imag + A = scipy_sparse.csr_array( + (data, (row_idx, col_idx)), shape=(N, N), dtype=dtype + ) + # Make it Hermitian + A = (A + A.conjugate().T) / 2 + # Add diagonal dominance except for the zero index + diag_vals = numpy.full(N, N, dtype=dtype) + diag_vals[zero_index] = 0.0 + A = A + scipy_sparse.diags(diag_vals, 0, format="csr", dtype=dtype) + else: + data = numpy.random.randn(nnz) + A = scipy_sparse.csr_array( + (data, (row_idx, col_idx)), shape=(N, N) + ) + # Make it symmetric + A = (A + A.T) / 2 + # Add diagonal dominance except for the zero index + diag_vals = numpy.full(N, N, dtype=dtype) + diag_vals[zero_index] = 0.0 + A = A + scipy_sparse.diags(diag_vals, 0, format="csr") + + # Remove the zero from the sparse representation + A.eliminate_zeros() + + return A + + return _create_matrix_with_zero_diagonal + + +@pytest.fixture +def create_non_square_matrix(): + """Create a non-square matrix for testing error handling. + + Parameters + ---------- + rows : int + Number of rows in the matrix. + cols : int + Number of columns in the matrix. + dtype : numpy.dtype + Data type of the matrix. + + Returns + ------- + numpy.ndarray + A non-square dense matrix. 
+ + """ + + def _create_non_square_matrix(rows: int, cols: int, dtype=numpy.float64): + """Returns a non-square matrix""" + return numpy.random.randn(rows, cols).astype(dtype) + + return _create_non_square_matrix diff --git a/tests/integration/test_block_array.py b/tests/integration/test_block_array.py new file mode 100644 index 00000000..c4cbaad9 --- /dev/null +++ b/tests/integration/test_block_array.py @@ -0,0 +1,176 @@ +# Copyright 2024 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tests for block_array construction function.""" + +import sys + +import cupynumeric as np +import pytest +import scipy.sparse as sp + +import legate_sparse as sparse + +# Temporary release unblock for a known cupynumeric runtime issue. 
+pytestmark = pytest.mark.skip( + reason=( + "Temporarily disabled for release unblock: " + "https://github.com/nv-legate/cupynumeric/issues/1224" + ) +) + + +class TestBlockArray: + """Tests for the block_array function.""" + + def test_basic_2x2_blocks(self): + """Test basic 2x2 block assembly.""" + A = sparse.csr_array(np.array([[1, 2], [3, 4]])) + B = sparse.csr_array(np.array([[5, 6], [7, 8]])) + C = sparse.csr_array(np.array([[9, 10], [11, 12]])) + D = sparse.csr_array(np.array([[13, 14], [15, 16]])) + + result = sparse.block_array([[A, B], [C, D]]) + + expected = np.array( + [[1, 2, 5, 6], [3, 4, 7, 8], [9, 10, 13, 14], [11, 12, 15, 16]] + ) + assert np.array_equal(result.todense(), expected) + + def test_with_none_blocks(self): + """Test block assembly with None (zero) blocks.""" + A = sparse.csr_array(np.array([[1, 2], [3, 4]])) + B = sparse.csr_array(np.array([[5, 6], [7, 8]])) + + result = sparse.block_array([[A, None], [None, B]]) + + expected = np.array( + [[1, 2, 0, 0], [3, 4, 0, 0], [0, 0, 5, 6], [0, 0, 7, 8]] + ) + assert np.array_equal(result.todense(), expected) + + def test_rectangular_blocks(self): + """Test with rectangular blocks.""" + A = sparse.csr_array(np.array([[1, 2, 3], [4, 5, 6]])) + B = sparse.csr_array(np.array([[7], [8]])) + + result = sparse.block_array([[A, B]]) + + expected = np.array([[1, 2, 3, 7], [4, 5, 6, 8]]) + assert np.array_equal(result.todense(), expected) + + def test_single_block(self): + """Test with a single block.""" + A = sparse.csr_array(np.array([[1, 2], [3, 4]])) + result = sparse.block_array([[A]]) + assert np.array_equal(result.todense(), A.todense()) + + def test_dtype_inference(self): + """Test that dtype is correctly inferred.""" + A = sparse.csr_array(np.array([[1.5, 2.5]])) + B = sparse.csr_array(np.array([[3, 4]])) + result = sparse.block_array([[A], [B]]) + assert result.dtype == np.float64 + + def test_explicit_dtype(self): + """Test explicit dtype specification.""" + A = sparse.csr_array(np.array([[1, 
2]])) + result = sparse.block_array([[A]], dtype=np.float32) + assert result.dtype == np.float32 + + def test_sparse_blocks(self): + """Test with actual sparse blocks (blocks with zeros).""" + # Create sparse matrices with actual zero patterns + data_A = np.array([1, 0, 0, 2]) + A = sparse.csr_array(data_A.reshape(2, 2)) + + data_B = np.array([0, 3, 4, 0]) + B = sparse.csr_array(data_B.reshape(2, 2)) + + result = sparse.block_array([[A, B]]) + + expected = np.array([[1, 0, 0, 3], [0, 2, 4, 0]]) + assert np.array_equal(result.todense(), expected) + + def test_matches_scipy(self): + """Test that output matches SciPy's block_array.""" + np.random.seed(42) + + # Create random sparse blocks + A_dense = np.random.rand(3, 4) + B_dense = np.random.rand(3, 2) + C_dense = np.random.rand(2, 4) + D_dense = np.random.rand(2, 2) + + # SciPy version + A_sp = sp.csr_array(A_dense) + B_sp = sp.csr_array(B_dense) + C_sp = sp.csr_array(C_dense) + D_sp = sp.csr_array(D_dense) + scipy_result = sp.block_array([[A_sp, B_sp], [C_sp, D_sp]]).todense() + + # Legate version + A_lg = sparse.csr_array(A_dense) + B_lg = sparse.csr_array(B_dense) + C_lg = sparse.csr_array(C_dense) + D_lg = sparse.csr_array(D_dense) + legate_result = sparse.block_array( + [[A_lg, B_lg], [C_lg, D_lg]] + ).todense() + + assert np.allclose(legate_result, scipy_result) + + +class TestBlockArrayErrors: + """Tests for block_array error handling.""" + + def test_empty_blocks_raises(self): + """Test that empty blocks raises ValueError.""" + with pytest.raises(ValueError, match="cannot be empty"): + sparse.block_array([]) + + def test_inconsistent_row_count_raises(self): + """Test that inconsistent row counts raise ValueError.""" + A = sparse.csr_array(np.array([[1, 2], [3, 4]])) + B = sparse.csr_array(np.array([[5, 6, 7]])) # Only 1 row + + with pytest.raises(ValueError, match="rows"): + sparse.block_array([[A, B]]) + + def test_inconsistent_col_count_raises(self): + """Test that inconsistent column counts raise 
ValueError.""" + A = sparse.csr_array(np.array([[1, 2], [3, 4]])) + B = sparse.csr_array(np.array([[5], [6], [7]])) # 3 rows, but 1 col + C = sparse.csr_array( + np.array([[8, 9]]) + ) # 1 row, but needs 3 cols below B + + with pytest.raises(ValueError): + sparse.block_array([[A, B], [C, None]]) + + def test_unsupported_format_raises(self): + """Test that unsupported format raises ValueError.""" + A = sparse.csr_array(np.array([[1, 2]])) + with pytest.raises(ValueError, match="csr"): + sparse.block_array([[A]], format="coo") + + def test_non_csr_block_raises(self): + """Test that non-CSR blocks raise TypeError.""" + A = sparse.csr_array(np.array([[1, 2]])) + with pytest.raises(TypeError, match="csr_array"): + sparse.block_array([[A, np.array([[3, 4]])]]) + + +if __name__ == "__main__": + sys.exit(pytest.main(sys.argv)) diff --git a/tests/integration/test_cg_solve.py b/tests/integration/test_cg_solve.py index d8e046e3..abed202b 100644 --- a/tests/integration/test_cg_solve.py +++ b/tests/integration/test_cg_solve.py @@ -50,7 +50,7 @@ def test_cg_solve(): x = sample_dense_vector(D, 0.1, seed) y = A @ x x_pred, iters = linalg.cg(A, y, tol=1e-8) - assert np.allclose((A @ x_pred), y, rtol=1e-8, atol=0.0) + assert np.allclose((A @ x_pred), y, rtol=1e-8) def test_cg_solve_with_callback(): @@ -92,7 +92,7 @@ def callback(x): residuals.append(y - A @ x) x_pred, iters = linalg.cg(A, y, tol=1e-8, callback=callback) - assert np.allclose((A @ x_pred), y, rtol=1e-8, atol=0.0) + assert np.allclose((A @ x_pred), y, rtol=1e-8) assert len(residuals) > 0 @@ -150,7 +150,7 @@ def matvec(x): x_pred, iters = linalg.cg( linalg.LinearOperator(A.shape, matvec=matvec), y, tol=1e-8 ) - assert np.allclose((A @ x_pred), y, rtol=1e-8, atol=0.0) + assert np.allclose((A @ x_pred), y, rtol=1e-8) def matvec(x, out=None): return A.dot(x, out=out) @@ -158,7 +158,7 @@ def matvec(x, out=None): x_pred, iters = linalg.cg( linalg.LinearOperator(A.shape, matvec=matvec), y, tol=1e-8 ) - assert np.allclose((A @ 
x_pred), y, rtol=1e-8, atol=0.0) + assert np.allclose((A @ x_pred), y, rtol=1e-8) if __name__ == "__main__": diff --git a/tests/integration/test_csr_from_csr.py b/tests/integration/test_csr_from_csr.py index 4dd7b2f8..bdfe57b1 100644 --- a/tests/integration/test_csr_from_csr.py +++ b/tests/integration/test_csr_from_csr.py @@ -31,8 +31,12 @@ def test_csr_from_csr_fixed(): 7 0 0 0 2 1 """ row_offsets = np.array([0, 2, 5, 7, 9, 11, 14], dtype=np.int64) - csr_vals = np.array([2, 1, 5, 8, 2, 3, 4, 6, 1, 9, 4, 7, 2, 1], dtype=np.float64) - col_indices = np.array([0, 4, 0, 1, 5, 2, 3, 1, 3, 0, 4, 0, 4, 5], dtype=np.int64) + csr_vals = np.array( + [2, 1, 5, 8, 2, 3, 4, 6, 1, 9, 4, 7, 2, 1], dtype=np.float64 + ) + col_indices = np.array( + [0, 4, 0, 1, 5, 2, 3, 1, 3, 0, 4, 0, 4, 5], dtype=np.int64 + ) matrix_shape = (6, 6) A = sparse.csr_array( # noqa: F841 diff --git a/tests/integration/test_csr_to_dense.py b/tests/integration/test_csr_to_dense.py index 7a444938..5177efa0 100644 --- a/tests/integration/test_csr_to_dense.py +++ b/tests/integration/test_csr_to_dense.py @@ -22,11 +22,17 @@ def test_csr_to_dense(): row_offsets = np.array([0, 2, 5, 7, 9, 11, 14], dtype=np.int64) - csr_vals = np.array([2, 1, 5, 8, 2, 3, 4, 6, 1, 9, 4, 7, 2, 1], dtype=np.float64) - col_indices = np.array([0, 4, 0, 1, 5, 2, 3, 1, 3, 0, 4, 0, 4, 5], dtype=np.int64) + csr_vals = np.array( + [2, 1, 5, 8, 2, 3, 4, 6, 1, 9, 4, 7, 2, 1], dtype=np.float64 + ) + col_indices = np.array( + [0, 4, 0, 1, 5, 2, 3, 1, 3, 0, 4, 0, 4, 5], dtype=np.int64 + ) matrix_shape = (6, 6) - A = sparse.csr_array((csr_vals, col_indices, row_offsets), shape=matrix_shape) + A = sparse.csr_array( + (csr_vals, col_indices, row_offsets), shape=matrix_shape + ) B = A.todense() expected_B = np.array( diff --git a/tests/integration/test_diags.py b/tests/integration/test_diags.py index 04cf7c56..a8fcc936 100644 --- a/tests/integration/test_diags.py +++ b/tests/integration/test_diags.py @@ -23,7 +23,9 @@ @pytest.mark.parametrize("N", 
[12, 34]) @pytest.mark.parametrize("diagonals", [3, 5]) -@pytest.mark.parametrize("dtype", (np.float32, np.float64, np.complex64, np.complex128)) +@pytest.mark.parametrize( + "dtype", (np.float32, np.float64, np.complex64, np.complex128) +) @pytest.mark.parametrize("fmt", ["csr", "dia"]) def test_diags(N, diagonals, dtype, fmt): A = sparse.diags( diff --git a/tests/integration/test_eigsh.py b/tests/integration/test_eigsh.py new file mode 100644 index 00000000..c3819d57 --- /dev/null +++ b/tests/integration/test_eigsh.py @@ -0,0 +1,392 @@ +# Copyright 2024 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import cupynumeric as cn +import numpy +import pytest + +import legate_sparse.linalg as linalg +from legate_sparse import csr_array + + +@pytest.fixture +def check_eigsh_result(): + """Checks if the Eigenvalues match Ax = wx. 
+ + Parameters + ---------- + A : csr_array + Input sparse matrix + w: numpy.ndarray + Eigen values + x: numpy.ndarray + Eigen vectors + res_tol: float, optional + Acceptable residual + """ + + def _check_eigsh_result(A, w, x, res_tol: float = 1e-3): + """Verify eigsh results by checking residual, Ax - wx""" + for i in range(w.size): + # ||Ax - wx|| / ||w|| + Ax = A @ x[:, i] + wx = w[i] * x[:, i] + res = cn.linalg.norm(Ax - wx) / cn.abs(w[i]) + assert res < res_tol, ( + f"Residual {res} exceeds tol of {res_tol} for {i}th eigen value" + ) + + return _check_eigsh_result + + +class TestEigsh: + """Test eigsh with various parameters following CuPy's testing approach.""" + + # ------ Test arguments: N, k, which + + @pytest.mark.parametrize("N", [10, 16]) + @pytest.mark.parametrize("which", ["LM", "LA", "SA"]) + @pytest.mark.parametrize("k", [1, 3]) + def test_eigsh_real_symmetric( + self, + N, + which, + k, + create_tridiagonal_real_symmetric_matrix, + check_eigsh_result, + ): + """Test eigsh with real symmetric tridiagonal matrices.""" + A_scipy = create_tridiagonal_real_symmetric_matrix(N) + A = csr_array(A_scipy.todense()) + + w, x = linalg.eigsh(A, k=k, which=which, return_eigenvectors=True) + + assert cn.allclose(cn.imag(w), 0, atol=1e-6), ( + "Eigenvalues should be real for real symmetric matrices" + ) + assert w.shape == (k,), f"Expected {k} eigenvalues, found {w.shape}" + + check_eigsh_result(A, w, x) + + @pytest.mark.parametrize("N", [10, 16]) + @pytest.mark.parametrize("which", ["LM", "LA", "SA"]) + @pytest.mark.parametrize("k", [1, 3]) + def test_eigsh_complex_hermitian( + self, + N, + which, + k, + create_tridiagonal_complex_hermitian_matrix, + check_eigsh_result, + ): + """Test eigsh with complex Hermitian tridiagonal matrices.""" + A_scipy = create_tridiagonal_complex_hermitian_matrix(N) + A = csr_array(A_scipy.todense()) + + w, x = linalg.eigsh(A, k=k, which=which, return_eigenvectors=True) + + assert cn.allclose(cn.imag(w), 0, atol=1e-6), ( + 
"Eigenvalues should be real for Hermitian matrices" + ) + assert w.shape == (k,), f"Expected {k} eigenvalues" + + check_eigsh_result(A, w, x) + + # ------ Test argument return_eigenvector + + def test_eigsh_eigenvalues_only_real( + self, create_tridiagonal_real_symmetric_matrix + ): + """Test eigsh with return_eigenvectors=False for real matrices.""" + N, k = 10, 2 + which = "LM" + A_scipy = create_tridiagonal_real_symmetric_matrix(N) + A = csr_array(A_scipy.todense()) + + w = linalg.eigsh(A, k=k, which=which, return_eigenvectors=False) + + assert cn.allclose(cn.imag(w), 0, atol=1e-6), ( + "Eigenvalues should be real" + ) + assert w.shape == (k,), f"Expected {k} eigenvalues" + + def test_eigsh_eigenvalues_only_complex( + self, create_tridiagonal_complex_hermitian_matrix + ): + """Test eigsh with return_eigenvectors=False for complex matrices.""" + N, k = 10, 2 + which = "LM" + A_scipy = create_tridiagonal_complex_hermitian_matrix(N) + A = csr_array(A_scipy.todense()) + + w = linalg.eigsh(A, k=k, which=which, return_eigenvectors=False) + + assert cn.allclose(cn.imag(w), 0, atol=1e-6), ( + "Eigenvalues should be real" + ) + assert w.shape == (k,), f"Expected {k} eigenvalues" + + # ------ Test argument v0 + + def test_eigsh_with_v0_real( + self, create_tridiagonal_real_symmetric_matrix, check_eigsh_result + ): + """Test eigsh with user-provided initial vector v0 for real matrices.""" + N, k = 10, 2 + A_scipy = create_tridiagonal_real_symmetric_matrix(N) + A = csr_array(A_scipy.todense()) + + v0 = numpy.array(cn.random.randn(N), dtype=numpy.float64) + + w, x = linalg.eigsh( + A, k=k, which="LM", v0=v0, return_eigenvectors=True + ) + + assert cn.allclose(cn.imag(w), 0, atol=1e-6), ( + "Eigenvalues should be real" + ) + check_eigsh_result(A, w, x) + + def test_eigsh_with_v0_complex( + self, create_tridiagonal_complex_hermitian_matrix, check_eigsh_result + ): + """Test eigsh with user-provided initial vector v0 for complex matrices.""" + N, k = 10, 2 + A_scipy = 
create_tridiagonal_complex_hermitian_matrix(N) + A = csr_array(A_scipy.todense()) + + v0 = cn.array( + numpy.random.randn(N) + 1j * numpy.random.randn(N), + dtype=numpy.complex128, + ) + + w, x = linalg.eigsh( + A, k=k, which="LM", v0=v0, return_eigenvectors=True + ) + + assert cn.allclose(cn.imag(w), 0, atol=1e-10), ( + "Eigenvalues should be real" + ) + check_eigsh_result(A, w, x) + + # ------ Test output sortedness + + def test_eigsh_sorted_eigenvalues( + self, create_tridiagonal_real_symmetric_matrix + ): + """Test that eigenvalues are returned sorted.""" + N, k = 20, 6 + A_scipy = create_tridiagonal_real_symmetric_matrix(N) + A = csr_array(A_scipy.todense()) + + w, _ = linalg.eigsh(A, k=k, which="LM", return_eigenvectors=True) + + # Eigenvalues should be sorted in ascending order + w_sorted = cn.sort(w) + assert cn.allclose(w, w_sorted), "Eigenvalues should be sorted" + + +class TestEigshLargeProblems: + """Test eigsh with larger problem sizes.""" + + @pytest.mark.parametrize("N", [15, 30]) + @pytest.mark.parametrize("k", [3, 6]) + @pytest.mark.parametrize("which", ["LM", "SA"]) + def test_eigsh_large_real_symmetric( + self, + N, + k, + which, + create_tridiagonal_real_symmetric_matrix, + check_eigsh_result, + ): + """Test eigsh with large real symmetric tridiagonal matrices.""" + A_scipy = create_tridiagonal_real_symmetric_matrix(N) + A = csr_array(A_scipy.todense()) + + w, x = linalg.eigsh(A, k=k, which=which, return_eigenvectors=True) + + assert cn.allclose(cn.imag(w), 0, atol=1e-6), ( + "Eigenvalues should be real" + ) + assert w.shape == (k,), f"Expected {k} eigenvalues" + check_eigsh_result(A, w, x) + + @pytest.mark.parametrize("N", [15, 30]) + @pytest.mark.parametrize("k", [3, 6]) + @pytest.mark.parametrize("which", ["LM", "SA"]) + def test_eigsh_large_complex_hermitian( + self, + N, + k, + which, + create_tridiagonal_complex_hermitian_matrix, + check_eigsh_result, + ): + """Test eigsh with large complex Hermitian tridiagonal matrices.""" + A_scipy = 
create_tridiagonal_complex_hermitian_matrix(N) + A = csr_array(A_scipy.todense()) + + w, x = linalg.eigsh(A, k=k, which=which, return_eigenvectors=True) + + assert cn.allclose(cn.imag(w), 0, atol=1e-6), ( + "Eigenvalues should be real" + ) + assert w.shape == (k,), f"Expected {k} eigenvalues" + check_eigsh_result(A, w, x) + + +class TestEigshRandomSparse: + """Test eigsh with random sparse symmetric/Hermitian matrices.""" + + @pytest.mark.parametrize("N", [15, 30]) + @pytest.mark.parametrize("k", [1, 3]) + @pytest.mark.parametrize("which", ["LM", "SA"]) + def test_eigsh_random_real_symmetric( + self, + N, + k, + which, + create_sparse_real_symmetric_matrix, + check_eigsh_result, + ): + """Test eigsh with random sparse real symmetric matrices.""" + A_scipy = create_sparse_real_symmetric_matrix(N, density=0.3, seed=42) + A = csr_array(A_scipy.todense()) + + w, x = linalg.eigsh(A, k=k, which=which, return_eigenvectors=True) + + assert cn.allclose(cn.imag(w), 0, atol=1e-6), ( + "Eigenvalues should be real" + ) + assert w.shape == (k,), f"Expected {k} eigenvalues" + check_eigsh_result(A, w, x) + + @pytest.mark.parametrize("N", [15, 30]) + @pytest.mark.parametrize("k", [1, 3]) + @pytest.mark.parametrize("which", ["LM", "SA"]) + def test_eigsh_random_complex_hermitian( + self, + N, + k, + which, + create_sparse_complex_hermitian_matrix, + check_eigsh_result, + ): + """Test eigsh with random sparse complex Hermitian matrices.""" + A_scipy = create_sparse_complex_hermitian_matrix( + N, density=0.3, seed=42 + ) + A = csr_array(A_scipy.todense()) + + w, x = linalg.eigsh(A, k=k, which=which, return_eigenvectors=True) + + assert cn.allclose(cn.imag(w), 0, atol=1e-6), ( + "Eigenvalues should be real" + ) + assert w.shape == (k,), f"Expected {k} eigenvalues" + check_eigsh_result(A, w, x) + + +class TestEigshLinearOperator: + """Test eigsh with LinearOperator input.""" + + @pytest.mark.parametrize("N", [10, 20]) + @pytest.mark.parametrize("k", [1, 3]) + 
@pytest.mark.parametrize("which", ["LM", "SA"]) + def test_eigsh_linear_operator_real( + self, + N, + k, + which, + create_tridiagonal_real_symmetric_matrix, + check_eigsh_result, + ): + """Test eigsh with LinearOperator wrapping a real symmetric matrix.""" + A_scipy = create_tridiagonal_real_symmetric_matrix(N) + A_dense = cn.array(A_scipy.todense()) + + A_op = linalg.LinearOperator( + shape=(N, N), matvec=lambda v: A_dense @ v, dtype=A_dense.dtype + ) + + w, x = linalg.eigsh(A_op, k=k, which=which, return_eigenvectors=True) + + assert cn.allclose(cn.imag(w), 0.0, atol=1e-6), ( + "Eigenvalues should be real" + ) + assert w.shape == (k,), f"Expected {k} eigenvalues" + check_eigsh_result(A_dense, w, x) + + @pytest.mark.parametrize("N", [10, 20]) + @pytest.mark.parametrize("k", [1, 3]) + @pytest.mark.parametrize("which", ["LM", "SA"]) + def test_eigsh_linear_operator_complex( + self, + N, + k, + which, + create_tridiagonal_complex_hermitian_matrix, + check_eigsh_result, + ): + """Test eigsh with LinearOperator wrapping a complex Hermitian matrix.""" + A_scipy = create_tridiagonal_complex_hermitian_matrix(N) + A_dense = cn.array(A_scipy.todense()) + + A_op = linalg.LinearOperator( + shape=(N, N), matvec=lambda v: A_dense @ v, dtype=A_dense.dtype + ) + + w, x = linalg.eigsh(A_op, k=k, which=which, return_eigenvectors=True) + + assert cn.allclose(cn.imag(w), 0.0, atol=1e-6), ( + "Eigenvalues should be real" + ) + assert w.shape == (k,), f"Expected {k} eigenvalues" + check_eigsh_result(A_dense, w, x) + + +class TestEigshErrors: + """Test eigsh error handling.""" + + def test_non_square_matrix(self): + """Test that non-square matrix raises ValueError.""" + A_rect = csr_array(numpy.random.randn(10, 15)) + with pytest.raises(ValueError, match="expected square matrix"): + linalg.eigsh(A_rect, k=1) + + def test_k_too_large(self): + """Test that k >= n raises ValueError.""" + n = 10 + A = csr_array(numpy.eye(n)) + with pytest.raises(ValueError, match="k must be smaller than 
n"): + linalg.eigsh(A, k=n) + + def test_k_zero_or_negative(self): + """Test that k <= 0 raises ValueError.""" + A = csr_array(numpy.eye(10)) + with pytest.raises(ValueError, match="k must be greater than 0"): + linalg.eigsh(A, k=0) + + def test_invalid_which(self): + """Test that invalid which raises ValueError.""" + A = csr_array(numpy.eye(10)) + with pytest.raises(ValueError, match="which must be"): + linalg.eigsh(A, k=1, which="INVALID") + + +if __name__ == "__main__": + import sys + + sys.exit(pytest.main(sys.argv)) diff --git a/tests/integration/test_geam.py b/tests/integration/test_geam.py new file mode 100644 index 00000000..665c5092 --- /dev/null +++ b/tests/integration/test_geam.py @@ -0,0 +1,269 @@ +# Copyright 2024 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Tests for GEAM API and sparse matrix arithmetic operations.""" + +import sys + +import cupynumeric as np +import pytest +from utils.banded_matrix import banded_matrix +from utils.sample import simple_system_gen + +import legate_sparse as sparse +from legate_sparse.csr import geam + + +# ============================================================================= +# GEAM API Tests - Error Cases +# ============================================================================= + + +def test_geam_sparse_dense_mismatch_A_dense(): + """Test that geam raises error when only one of the arrays is sparse.""" + N = 5 + np.random.seed(42) + A_dense = np.random.rand(N, N) + B_sparse = banded_matrix(N, 3) + + with pytest.raises((TypeError, AttributeError)): + geam(A_dense, B_sparse, 1.0, 2.0) + + with pytest.raises((TypeError, AttributeError)): + geam(B_sparse, A_dense, 1.0, 2.0) + + +def test_geam_wrong_sparsity_pattern_for_C(): + """Providing C with incompatible sparsity pattern leads to incorrect results.""" + N = 5 + np.random.seed(42) + + A = banded_matrix(N, 3) # tri-diagonal + B = banded_matrix(N, 5) # penta-diagonal + + C_correct = geam(A, B, 2.0, 3.0) + C_wrong = banded_matrix(N, 3) # wrong pattern - too few non-zeros + C_result = geam(A, B, 2.0, 3.0, C=C_wrong) + + # Results should NOT match due to incompatible sparsity + assert not np.allclose(C_correct.todense(), C_result.todense()) + + +# ============================================================================= +# GEAM API Tests - Success Cases +# ============================================================================= + + +@pytest.mark.parametrize("N", [5, 15, 30]) +def test_geam_basic_without_C(N): + """Test geam without providing C.""" + np.random.seed(42) + A_dense, A_sparse, _ = simple_system_gen(N, N, sparse.csr_array, tol=0.3) + B_dense, B_sparse, _ = simple_system_gen(N, N, sparse.csr_array, tol=0.3) + + C_sparse = geam(A_sparse, B_sparse, 2.5, -1.5) + C_expected = 2.5 * A_dense + (-1.5) * 
B_dense + + assert np.allclose(C_sparse.todense(), C_expected, rtol=1e-10, atol=1e-12) + + +@pytest.mark.parametrize("N", [5, 15, 30]) +def test_geam_basic_with_C(N): + """Test geam with pre-allocated C, then reuse it.""" + np.random.seed(42) + A_dense, A_sparse, _ = simple_system_gen(N, N, sparse.csr_array, tol=0.3) + B_dense, B_sparse, _ = simple_system_gen(N, N, sparse.csr_array, tol=0.3) + + C_sparse = geam(A_sparse, B_sparse, 2.0, 3.0) + assert np.allclose(C_sparse.todense(), 2.0 * A_dense + 3.0 * B_dense) + + C_sparse = geam(A_sparse, B_sparse, -1.0, 0.5, C=C_sparse) + assert np.allclose(C_sparse.todense(), -1.0 * A_dense + 0.5 * B_dense) + + +@pytest.mark.parametrize( + "alpha,beta", [(1.0, 1.0), (1.0, -1.0), (2.0, 0.0), (0.0, 3.0)] +) +def test_geam_various_scalars(alpha, beta): + """Test geam with various scalar combinations.""" + N = 15 + np.random.seed(42) + A_dense, A_sparse, _ = simple_system_gen(N, N, sparse.csr_array, tol=0.3) + B_dense, B_sparse, _ = simple_system_gen(N, N, sparse.csr_array, tol=0.3) + + C_sparse = geam(A_sparse, B_sparse, alpha, beta) + assert np.allclose(C_sparse.todense(), alpha * A_dense + beta * B_dense) + + +def test_geam_loop_with_C_reuse(): + """Test geam in a loop where C is reused across iterations.""" + N = 15 + np.random.seed(42) + + A_sparse = banded_matrix(N, 3) + B_sparse = banded_matrix(N, 3) + C_sparse = geam(A_sparse, B_sparse, 1.0, 1.0) + + for i in range(1, 5): + A_new = banded_matrix(N, 3, init_with_ones=False) + B_new = banded_matrix(N, 3, init_with_ones=False) + scale_A, scale_B = float(i + 1), float(i + 2) + + C_sparse = geam(A_new, B_new, scale_A, scale_B, C=C_sparse) + C_expected = scale_A * A_new.todense() + scale_B * B_new.todense() + + assert np.allclose(C_sparse.todense(), C_expected) + + +def test_geam_identical_matrices(): + """Test geam when A and B are identical.""" + N = 15 + np.random.seed(42) + A_dense, A_sparse, _ = simple_system_gen(N, N, sparse.csr_array, tol=0.3) + + C_sparse = geam(A_sparse, 
A_sparse, 2.0, 3.0) + assert np.allclose(C_sparse.todense(), 5.0 * A_dense) + + +def test_geam_disjoint_sparsity_patterns(): + """Test geam when A and B have disjoint sparsity patterns.""" + N = 15 + np.random.seed(42) + + A_dense = np.triu(np.random.rand(N, N)) + A_sparse = sparse.csr_array(A_dense) + B_dense = np.tril(np.random.rand(N, N), k=-1) + B_sparse = sparse.csr_array(B_dense) + + C_sparse = geam(A_sparse, B_sparse, 1.5, 2.5) + assert np.allclose(C_sparse.todense(), 1.5 * A_dense + 2.5 * B_dense) + + +# ============================================================================= +# Dunder Method Tests (__add__, __sub__, __radd__, __rsub__) +# ============================================================================= + + +class TestCSRArithmetic: + """Tests for CSR matrix arithmetic dunder methods.""" + + @pytest.fixture + def matrices(self): + """Create test matrices.""" + N = 15 + np.random.seed(42) + A_dense, A_sparse, _ = simple_system_gen( + N, N, sparse.csr_array, tol=0.3 + ) + B_dense, B_sparse, _ = simple_system_gen( + N, N, sparse.csr_array, tol=0.3 + ) + return A_dense, A_sparse, B_dense, B_sparse + + # ------------------------------------------------------------------------- + # Sparse + Sparse, Sparse - Sparse + # ------------------------------------------------------------------------- + + def test_add_sparse_sparse(self, matrices): + """A + B where both are sparse.""" + A_dense, A_sparse, B_dense, B_sparse = matrices + C = A_sparse + B_sparse + assert np.allclose(C.todense(), A_dense + B_dense) + + C = A_sparse - B_sparse + assert np.allclose(C.todense(), A_dense - B_dense) + + # ------------------------------------------------------------------------- + # Sparse + Dense, Dense + Sparse + # ------------------------------------------------------------------------- + + def test_add_sparse_dense(self, matrices): + """sparse + dense returns dense.""" + A_dense, A_sparse, B_dense, _ = matrices + C = A_sparse + B_dense + assert np.allclose(C, 
A_dense + B_dense) + + @pytest.mark.skip( + reason="cupynumeric intercepts dense+sparse before __radd__ is called" + ) + def test_add_dense_sparse(self, matrices): + """dense + sparse should return dense (currently broken in cupynumeric).""" + A_dense, _, B_dense, B_sparse = matrices + C = A_dense + B_sparse + assert np.allclose(C, A_dense + B_dense) + + # ------------------------------------------------------------------------- + # Sparse - Dense, Dense - Sparse + # ------------------------------------------------------------------------- + + def test_sub_sparse_dense(self, matrices): + """sparse - dense returns dense.""" + A_dense, A_sparse, B_dense, _ = matrices + C = A_sparse - B_dense + assert np.allclose(C, A_dense - B_dense) + + @pytest.mark.skip( + reason="cupynumeric intercepts dense-sparse before __rsub__ is called" + ) + def test_sub_dense_sparse(self, matrices): + """dense - sparse should return dense (currently broken in cupynumeric).""" + A_dense, _, B_dense, B_sparse = matrices + C = A_dense - B_sparse + assert np.allclose(C, A_dense - B_dense) + + # ------------------------------------------------------------------------- + # Sparse + Scalar, Scalar + Sparse + # ------------------------------------------------------------------------- + + def test_add_sparse_zero(self, matrices): + """A + 0 should return a copy of A.""" + A_dense, A_sparse, _, _ = matrices + C = A_sparse + 0 + assert np.allclose(C.todense(), A_dense) + + C = 0 + A_sparse + assert np.allclose(C.todense(), A_dense) + + def test_add_sparse_nonzero_scalar_raises(self, matrices): + """A + nonzero scalar should raise NotImplementedError.""" + _, A_sparse, _, _ = matrices + with pytest.raises(NotImplementedError): + _ = A_sparse + 5.0 + with pytest.raises(NotImplementedError): + _ = 5.0 + A_sparse + + # ------------------------------------------------------------------------- + # Sparse - Scalar, Scalar - Sparse + # ------------------------------------------------------------------------- 
+ + def test_sub_sparse_zero(self, matrices): + """A - 0 should return a copy of A.""" + A_dense, A_sparse, _, _ = matrices + C = A_sparse - 0 + assert np.allclose(C.todense(), A_dense) + + C = 0 - A_sparse + assert np.allclose(C.todense(), -A_dense) + + def test_sub_sparse_nonzero_scalar_raises(self, matrices): + """Subtracting a nonzero scalar should raise NotImplementedError.""" + _, A_sparse, _, _ = matrices + with pytest.raises(NotImplementedError): + _ = A_sparse - 5.0 + with pytest.raises(NotImplementedError): + _ = 5.0 - A_sparse + + +if __name__ == "__main__": + sys.exit(pytest.main(sys.argv)) diff --git a/tests/integration/test_indexing.py b/tests/integration/test_indexing.py index 259c7996..01b2dbac 100644 --- a/tests/integration/test_indexing.py +++ b/tests/integration/test_indexing.py @@ -72,7 +72,7 @@ def test_incompatible_mask(self, N, create_matrix, create_mask): # make sure the values are updated correctly A_dense = numpy.asarray(A.todense()) - assert numpy.allclose(A_dense[mask_dense].sum() / num_nonzeros, value) + assert numpy.allclose(A_dense[mask_dense].sum(), value * num_nonzeros) # TODO: Add a check/test for location of nonzeros as well @@ -200,33 +200,11 @@ def test_random_column_order(self): This is important because CSR format requires column indices to be sorted within each row for efficient operations. 
""" - row_indices = cupynumeric.array( - [ - 2, - 4, - 5, - 3, - 5, - 1, - 1, - 5, - 5, - ] + row_indices = cupynumeric.array([2, 4, 5, 3, 5, 1, 1, 5, 5]) + col_indices = cupynumeric.array([3, 1, 2, 2, 5, 1, 4, 1, 3]) + data = cupynumeric.array( + [7.0, 9.0, 3.0, 4.0, 5.0, 19.0, 2.0, 99.0, 109.0] ) - col_indices = cupynumeric.array( - [ - 3, - 1, - 2, - 2, - 5, - 1, - 4, - 1, - 3, - ] - ) - data = cupynumeric.array([7.0, 9.0, 3.0, 4.0, 5.0, 19.0, 2.0, 99.0, 109.0]) # note that the data in row 5 is ordered (2, 5, 1, 3),which will get # sorted to (1, 2, 5, 3) during instantiation, which is needed for indexing diff --git a/tests/integration/test_manual_sorting.py b/tests/integration/test_manual_sorting.py index 7d3ed282..4999c946 100644 --- a/tests/integration/test_manual_sorting.py +++ b/tests/integration/test_manual_sorting.py @@ -15,6 +15,7 @@ import cupynumeric as np import numpy import pytest + from legate_sparse.utils import sort_by_rows_then_cols diff --git a/tests/integration/test_negate.py b/tests/integration/test_negate.py new file mode 100644 index 00000000..ab9a1e21 --- /dev/null +++ b/tests/integration/test_negate.py @@ -0,0 +1,38 @@ +# Copyright 2024 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Tests for sparse matrix negation (__neg__).""" + +import sys + +import cupynumeric as np +import pytest +from utils.sample import simple_system_gen + +import legate_sparse as sparse + + +def test_negate(): + """-A returns a sparse matrix with negated values.""" + N = 15 + np.random.seed(42) + A_dense, A_sparse, _ = simple_system_gen(N, N, sparse.csr_array, tol=0.3) + + C = -A_sparse + + assert np.allclose(C.todense(), -A_dense) + + +if __name__ == "__main__": + sys.exit(pytest.main(sys.argv)) diff --git a/tests/integration/test_spgemm.py b/tests/integration/test_spgemm.py index 5954df79..9e4c725d 100644 --- a/tests/integration/test_spgemm.py +++ b/tests/integration/test_spgemm.py @@ -16,11 +16,11 @@ import cupynumeric as np import pytest -from legate_sparse.runtime import runtime from utils.banded_matrix import banded_matrix from utils.sample import simple_system_gen import legate_sparse as sparse +from legate_sparse.runtime import runtime @pytest.mark.parametrize("N", [5, 29]) diff --git a/tests/integration/test_spmv.py b/tests/integration/test_spmv.py index 0c3590df..1b953150 100644 --- a/tests/integration/test_spmv.py +++ b/tests/integration/test_spmv.py @@ -16,11 +16,11 @@ import cupynumeric as np import pytest -from legate_sparse.runtime import runtime from utils.banded_matrix import banded_matrix from utils.sample import simple_system_gen import legate_sparse as sparse +from legate_sparse.runtime import runtime @pytest.mark.parametrize("N", [5, 29]) @@ -105,5 +105,39 @@ def test_csr_spmv_unsupported_dtype(N, nnz_per_row, unsupported_dtype): y = A.dot(x) # noqa: F841 +@pytest.mark.parametrize("N", [5, 29]) +@pytest.mark.parametrize("M", [7, 17]) +@pytest.mark.parametrize("complex_dtype", [np.complex64, np.complex128]) +def test_csr_spmv_complex(N, M, complex_dtype): + """Test sparse matrix-vector multiplication with complex datatypes. + + This test verifies that sparse matrix-vector multiplication works + correctly for complex64 and complex128 datatypes. 
+ + Parameters + ---------- + N : int + Number of rows in the matrix. + M : int + Number of columns in the matrix. + complex_dtype : dtype + Complex datatype to use (complex64 or complex128). + """ + + # get real and imag parts separately + A_dense_real, _, x_real = simple_system_gen(N, M, sparse.csr_array) + A_dense_imag, _, x_imag = simple_system_gen(N, M, sparse.csr_array) + + A_dense = A_dense_real.astype(complex_dtype) + 1j * A_dense_imag.astype( + complex_dtype + ) + x = x_real.astype(complex_dtype) + 1j * x_imag.astype(complex_dtype) + A = sparse.csr_array(A_dense.astype(complex_dtype)) + + y = A @ x + + assert np.all(np.isclose(y, A_dense @ x)) + + if __name__ == "__main__": sys.exit(pytest.main(sys.argv)) diff --git a/tests/integration/test_spsolve.py b/tests/integration/test_spsolve.py new file mode 100644 index 00000000..f4b03e2e --- /dev/null +++ b/tests/integration/test_spsolve.py @@ -0,0 +1,199 @@ +# Copyright 2024 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import cupynumeric as np +import pytest +import scipy.sparse as scipy_sparse +import scipy.sparse.linalg as scipy_linalg +from utils.sample import sample_dense + +import legate_sparse.linalg as linalg +from legate_sparse import csr_array +from legate_sparse.runtime import runtime + +# Skip all tests in this module if no GPUs are available +# since spsolve is only supported on GPU +pytestmark = pytest.mark.skipif( + runtime.num_gpus == 0, reason="spsolve is only supported on GPU backend" +) + + +@pytest.mark.parametrize("N", [5, 10, 20, 50]) +def test_spsolve_identity_matrix(N): + """Test spsolve with an identity matrix.""" + A = csr_array(np.eye(N)) + b = np.ones(N) + x = linalg.spsolve(A, b) + + # For identity matrix, x should equal b + assert np.allclose(x, b, rtol=1e-10, atol=1e-12), ( + f"Identity matrix solution incorrect: max error = {np.max(np.abs(x - b))}" + ) + + +def test_spsolve_basic_square_matrix(): + """Test spsolve with a basic square matrix.""" + + N = 5 + np.random.seed(42) + A_dense = sample_dense(N, N, 0.3, 42) + A_dense = A_dense + N * np.eye(N) + + A = csr_array(A_dense) + b = np.array([1.0, 2.0, 3.0, 4.0, 5.0]) + x = linalg.spsolve(A, b) + + b_computed = A @ x + assert np.allclose(b_computed, b, rtol=1e-5, atol=1e-6), ( + f"Solution verification failed: max error = {np.max(np.abs(b_computed - b))}" + ) + + A_scipy = scipy_sparse.csr_matrix(np.array(A.todense())) + x_scipy = scipy_linalg.spsolve(A_scipy, np.array(b)) + assert np.allclose(x, x_scipy, rtol=1e-5, atol=1e-6), ( + f"Solution differs from SciPy: max error = {np.max(np.abs(x - x_scipy))}" + ) + + +@pytest.mark.parametrize("N", [5, 10, 20, 50]) +def test_spsolve_diagonal_matrix(N): + """Test spsolve with a diagonal matrix.""" + diag_values = np.arange(1.0, N + 1.0) + A_dense = np.diag(diag_values) + A = csr_array(A_dense) + b = np.ones(N) + x = linalg.spsolve(A, b) + x_expected = b / diag_values + assert np.allclose(x, x_expected, rtol=1e-10, atol=1e-12), ( + f"Diagonal matrix 
solution incorrect: max error = {np.max(np.abs(x - x_expected))}" + ) + + +@pytest.mark.parametrize("N", [5, 10, 20, 50]) +def test_spsolve_tridiagonal_matrix(N): + """Test spsolve with a tridiagonal matrix.""" + main_diag = np.full(N, 4.0) + off_diag = np.full(N - 1, -1.0) + A_dense = np.diag(main_diag) + np.diag(off_diag, 1) + np.diag(off_diag, -1) + A = csr_array(A_dense) + b = np.ones(N) + x = linalg.spsolve(A, b) + + b_computed = A @ x + assert np.allclose(b_computed, b, rtol=1e-5, atol=1e-6), ( + f"Tridiagonal solution verification failed: max error = {np.max(np.abs(b_computed - b))}" + ) + + A_scipy = scipy_sparse.csr_matrix(np.array(A.todense())) + x_scipy = scipy_linalg.spsolve(A_scipy, np.array(b)) + assert np.allclose(x, x_scipy, rtol=1e-5, atol=1e-6), ( + f"Tridiagonal solution differs from SciPy: max error = {np.max(np.abs(x - x_scipy))}" + ) + + +@pytest.mark.parametrize("N", [5, 10, 20, 50]) +def test_spsolve_symmetric_positive_definite(N): + """Test spsolve with a symmetric positive definite matrix. + We create an SPD matrix by A = B^T * B + N * I. + """ + seed = 42 + B_dense = sample_dense(N, N, 0.2, seed) + A_dense = B_dense.T @ B_dense + N * np.eye(N) + A = csr_array(A_dense) + + # make sure it's positive definite + eigenvalues = np.linalg.eigvals(A_dense) + assert np.all(eigenvalues > 0), "Matrix is not positive definite" + + b = np.random.rand(N) + x = linalg.spsolve(A, b) + + b_computed = A @ x + assert np.allclose(b_computed, b, rtol=1e-4, atol=1e-5), ( + f"SPD solution verification failed: max error = {np.max(np.abs(b_computed - b))}" + ) + + +@pytest.mark.parametrize( + "dtype", [np.float32, np.float64, np.complex64, np.complex128] +) +def test_spsolve_all_dtypes(dtype): + """Comprehensive test for spsolve with all cuDSS-supported data types. + + Note: cuDSS only supports floating-point and complex types. 
+ Integer and boolean types are not supported + """ + N = 10 + + # Create a well-conditioned matrix for each dtype + if dtype in [np.complex64, np.complex128]: + # For complex types, create a Hermitian positive definite matrix + seed = 42 + np.random.seed(seed) + B = np.random.randn(N, N) + 1j * np.random.randn(N, N) + A_dense = (B @ B.conj().T + N * np.eye(N)).astype(dtype) + b = np.ones(N, dtype=dtype) + else: + seed = 42 + A_dense = sample_dense(N, N, 0.3, seed).astype(dtype) + A_dense = A_dense + N * np.eye(N, dtype=dtype) + b = np.ones(N, dtype=dtype) + + # Solve the system + A = csr_array(A_dense) + x = linalg.spsolve(A, b) + + b_computed = A @ x + assert np.allclose(b_computed, b, rtol=1e-4, atol=1e-5), ( + f"Solution verification failed for dtype {dtype}: max error = {np.max(np.abs(b_computed - b))}" + ) + + assert x.dtype == b.dtype, ( + f"Output dtype {x.dtype} doesn't match input dtype {b.dtype} for dtype {dtype}" + ) + + +@pytest.mark.parametrize("N", [5, 10, 20, 50]) +def test_spsolve_upper_triangular(N): + """Test spsolve with an upper triangular matrix.""" + A_dense = np.triu(np.random.rand(N, N) + np.eye(N)) + A = csr_array(A_dense) + b = np.ones(N) + x = linalg.spsolve(A, b) + + b_computed = A @ x + assert np.allclose(b_computed, b, rtol=1e-5, atol=1e-6), ( + f"Upper triangular solution verification failed: max error = {np.max(np.abs(b_computed - b))}" + ) + + +@pytest.mark.parametrize("N", [5, 10, 20, 50]) +def test_spsolve_lower_triangular(N): + """Test spsolve with a lower triangular matrix.""" + A_dense = np.tril(np.ones((N, N)) + np.eye(N)) + A = csr_array(A_dense) + b = np.ones(N) + x = linalg.spsolve(A, b) + + b_computed = A @ x + assert np.allclose(b_computed, b, rtol=1e-5, atol=1e-6), ( + f"Lower triangular solution verification failed: max error = {np.max(np.abs(b_computed - b))}" + ) + + +if __name__ == "__main__": + import sys + + ret = pytest.main(sys.argv) + sys.exit(ret) diff --git a/tests/integration/test_unary_operation.py 
b/tests/integration/test_unary_operation.py index f1f3c07d..432381d3 100644 --- a/tests/integration/test_unary_operation.py +++ b/tests/integration/test_unary_operation.py @@ -22,11 +22,17 @@ def test_unary_operation(): row_offsets = np.array([0, 2, 5, 7, 9, 11, 14], dtype=np.int64) - csr_vals = np.array([2, 1, 5, 8, 2, 3, 4, 6, 1, 9, 4, 7, 2, 1], dtype=np.float64) - col_indices = np.array([0, 4, 0, 1, 5, 2, 3, 1, 3, 0, 4, 0, 4, 5], dtype=np.int64) + csr_vals = np.array( + [2, 1, 5, 8, 2, 3, 4, 6, 1, 9, 4, 7, 2, 1], dtype=np.float64 + ) + col_indices = np.array( + [0, 4, 0, 1, 5, 2, 3, 1, 3, 0, 4, 0, 4, 5], dtype=np.int64 + ) matrix_shape = (6, 6) - A = sparse.csr_array((csr_vals, col_indices, row_offsets), shape=matrix_shape) + A = sparse.csr_array( + (csr_vals, col_indices, row_offsets), shape=matrix_shape + ) B = A * 2 Bvalues = np.asarray(B.vals) diff --git a/tests/integration/utils/banded_matrix.py b/tests/integration/utils/banded_matrix.py index fda5ef5f..2467f452 100644 --- a/tests/integration/utils/banded_matrix.py +++ b/tests/integration/utils/banded_matrix.py @@ -90,7 +90,9 @@ def banded_matrix( pred = np.arange(nnz_per_row - half_nnz, nnz_per_row + 1) post = np.flip(pred) - nnz_arr = np.concatenate((pred, np.ones(main_rows) * nnz_per_row, post)) + nnz_arr = np.concatenate( + (pred, np.ones(main_rows) * nnz_per_row, post) + ) if sparse.__name__ == "legate_sparse": row_offsets = np.zeros(N + 1).astype(sparse.coord_ty) diff --git a/tests/testdata/GlossGT.mtx b/tests/testdata/GlossGT.mtx index 27869886..b3bbe5d0 100644 --- a/tests/testdata/GlossGT.mtx +++ b/tests/testdata/GlossGT.mtx @@ -14,15 +14,15 @@ %------------------------------------------------------------------------------- % notes: % ------------------------------------------------------------------------------ -% Pajek network converted to sparse adjacency matrix for inclusion in UF sparse +% Pajek network converted to sparse adjacency matrix for inclusion in UF sparse % matrix collection, Tim 
Davis. For Pajek datasets, See V. Batagelj & A. Mrvar, -% http://vlado.fmf.uni-lj.si/pub/networks/data/. +% http://vlado.fmf.uni-lj.si/pub/networks/data/. % ------------------------------------------------------------------------------ -% Bill Cherowitzo: Graph and Digraph Glossary -% http://www-math.cudenver.edu/~wcherowi/courses/m4408/glossary.html -% Pajek's network: Barbara Zemlji"c, 2. nov 2003 -% The original problem had 3D xyz coordinates, but all values of z were equal -% to 0, and have been removed. This graph has 2D coordinates. +% Bill Cherowitzo: Graph and Digraph Glossary +% http://www-math.cudenver.edu/~wcherowi/courses/m4408/glossary.html +% Pajek's network: Barbara Zemlji"c, 2. nov 2003 +% The original problem had 3D xyz coordinates, but all values of z were equal +% to 0, and have been removed. This graph has 2D coordinates. %------------------------------------------------------------------------------- 72 72 122 3 4 diff --git a/tests/testdata/Ragusa18.mtx b/tests/testdata/Ragusa18.mtx index 2e8bd6ce..24eaa03b 100644 --- a/tests/testdata/Ragusa18.mtx +++ b/tests/testdata/Ragusa18.mtx @@ -14,9 +14,9 @@ %------------------------------------------------------------------------------- % notes: % ------------------------------------------------------------------------------ -% Pajek network converted to sparse adjacency matrix for inclusion in UF sparse +% Pajek network converted to sparse adjacency matrix for inclusion in UF sparse % matrix collection, Tim Davis. For Pajek datasets, See V. Batagelj & A. Mrvar, -% http://vlado.fmf.uni-lj.si/pub/networks/data/. +% http://vlado.fmf.uni-lj.si/pub/networks/data/. 
% ------------------------------------------------------------------------------ %------------------------------------------------------------------------------- 23 23 64 diff --git a/tests/testdata/karate.mtx b/tests/testdata/karate.mtx index 59df7607..9ecdff42 100644 --- a/tests/testdata/karate.mtx +++ b/tests/testdata/karate.mtx @@ -12,14 +12,14 @@ % kind: undirected graph %------------------------------------------------------------------------------- % notes: -% Network collection from M. Newman -% http://www-personal.umich.edu/~mejn/netdata/ -% -% The graph "karate" contains the network of friendships between the 34 -% members of a karate club at a US university, as described by Wayne Zachary +% Network collection from M. Newman +% http://www-personal.umich.edu/~mejn/netdata/ +% +% The graph "karate" contains the network of friendships between the 34 +% members of a karate club at a US university, as described by Wayne Zachary % in 1977. If you use these data in your work, please cite W. W. Zachary, An % information flow model for conflict and fission in small groups, Journal of -% Anthropological Research 33, 452-473 (1977). +% Anthropological Research 33, 452-473 (1977). 
%------------------------------------------------------------------------------- 34 34 78 2 1 From f3bb2df3ec60e349cd74a2b5af692c352eda16ff Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 6 Apr 2026 20:53:20 +0000 Subject: [PATCH 3/3] [pre-commit.ci] pre-commit autoupdate MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/adrienverge/yamllint: v1.37.1 → v1.38.0](https://github.com/adrienverge/yamllint/compare/v1.37.1...v1.38.0) - [github.com/pre-commit/pre-commit-hooks: v5.0.0 → v6.0.0](https://github.com/pre-commit/pre-commit-hooks/compare/v5.0.0...v6.0.0) - [github.com/astral-sh/ruff-pre-commit: v0.12.4 → v0.15.9](https://github.com/astral-sh/ruff-pre-commit/compare/v0.12.4...v0.15.9) - [github.com/pre-commit/mirrors-clang-format: v16.0.6 → v22.1.2](https://github.com/pre-commit/mirrors-clang-format/compare/v16.0.6...v22.1.2) --- .pre-commit-config.yaml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 6083ac22..b42e51d7 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,14 +1,14 @@ --- repos: - repo: https://github.com/adrienverge/yamllint - rev: v1.37.1 + rev: v1.38.0 hooks: - id: yamllint types: [yaml] args: [-c, ./scripts/pre-commit/yamllint.yml] exclude: meta\.yaml$ - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v5.0.0 + rev: v6.0.0 hooks: - id: check-json # checks that all json files have proper syntax - id: check-toml # checks that all toml files have proper syntax @@ -27,13 +27,13 @@ repos: - id: check-merge-conflict - id: check-shebang-scripts-are-executable - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.12.4 + rev: v0.15.9 hooks: - id: ruff-format - id: ruff args: ["--config=./pyproject.toml", "--fix"] - repo: https://github.com/pre-commit/mirrors-clang-format - rev: v16.0.6 # Use the sha / 
tag you want to point at + rev: v22.1.2 # Use the sha / tag you want to point at hooks: - id: clang-format files: \.(cu|cuh|h|cc|inl)$