From d3cde651a7c581e171c7bad60a177954f7e1ef34 Mon Sep 17 00:00:00 2001
From: Alexandru Fikl <alexfikl@gmail.com>
Date: Sat, 18 Apr 2026 16:21:40 +0300
Subject: [PATCH 1/3] feat: implement solve_lu

---
 sumpy/symbolic.py | 142 +++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 140 insertions(+), 2 deletions(-)

diff --git a/sumpy/symbolic.py b/sumpy/symbolic.py
index 0459485f..7bb35ba4 100644
--- a/sumpy/symbolic.py
+++ b/sumpy/symbolic.py
@@ -39,7 +39,7 @@
 
 import logging
 import math
-from typing import TYPE_CHECKING, ClassVar, cast
+from typing import TYPE_CHECKING, Any, ClassVar, cast
 
 from typing_extensions import override
 
@@ -48,6 +48,8 @@
 
 
 if TYPE_CHECKING:
+    from collections.abc import Callable, Sequence
+
     from pymbolic.typing import ArithmeticExpression, Expression
     from pytools import T
     from pytools.obj_array import ObjectArray1D
@@ -95,6 +97,9 @@ def _find_symbolic_backend():
 
 # }}}
 
+
+# {{{ symbolic API classes
+
 if TYPE_CHECKING or not USE_SYMENGINE:
     import sympy as sym
 
@@ -170,6 +175,8 @@ def doit(expr: Expr) -> Expr:
     def unevaluated_pow(a: Expr, b: complex | Expr) -> Expr:
         return Pow(a, b, evaluate=False)
 
+# }}}
+
 
 # {{{ debugging of sympy CSE via Maxima
 
@@ -253,12 +260,125 @@ def checked_cse(exprs, symbols=None):
 # }}}
 
 
+# {{{ solve_lu
+
+def forward_substitution(
+        L: Matrix,  # noqa: N803
+        b: Matrix,
+        postprocess: Callable[[Basic], Basic],
+    ) -> Matrix:
+    """Given a lower triangular matrix *L* and a column vector *b*,
+    solve the system ``Lx = b`` and apply the callable *postprocess_division*
+    on each expression at the end of division calls.
+    """
+    n = len(b)
+    x = Matrix(b)
+
+    for i in range(n):
+        for j in range(i):
+            x[i] -= L[i, j] * x[j]
+        x[i] = postprocess(x[i] / L[i, i])
+
+    return x
+
+
+def backward_substitution(
+        U: Matrix,  # noqa: N803
+        b: Matrix,
+        postprocess: Callable[[Basic], Basic],
+    ) -> Matrix:
+    """Given an upper triangular matrix *U* and a column vector *b*,
+    solve the system ``Ux = b`` and apply the callable *postprocess_division*
+    on each expression at the end of division calls.
+    """
+    n = len(b)
+    x = sym.Matrix(b)
+
+    for i in range(n - 1, -1, -1):
+        for j in range(n - 1, i, -1):
+            x[i] -= U[i, j] * x[j]
+        x[i] = postprocess(x[i] / U[i, i])
+
+    return x
+
+
+def solve_lu(
+        L: Matrix,  # noqa: N803
+        U: Matrix,  # noqa: N803
+        permutation: Sequence[tuple[int, int]],
+        b: Matrix, *,
+        postprocess: Callable[[Basic], Basic] | None = None,
+    ) -> Matrix:
+    if postprocess is None:
+        def default_postprocess(x: Basic) -> Basic:
+            return x
+
+        postprocess = default_postprocess
+
+    # Permute first
+    b = sym.Matrix(b)
+    for p, q in permutation:
+        b[p], b[q] = b[q], b[p]
+
+    return backward_substitution(
+        U,
+        forward_substitution(L, b, postprocess),
+        postprocess,
+    )
+
+# }}}
+
+
+# {{{ helpers
+
+
+if USE_SYMENGINE:
+    def evalf(expr: Expr, prec: int = 100) -> Expr:
+        """Evaluate an expression numerically using ``prec`` number of bits."""
+        return expr.n(prec=prec)
+
+    def simplify(expr: Expr) -> Expr:
+        return expr.simplify()
+else:
+    def evalf(expr: Expr, prec: int = 100) -> Expr:
+        """Evaluate an expression numerically using ``prec`` number of bits."""
+        dps = int(sym.log(2**prec, 10))
+        return expr.n(n=dps)
+
+    def simplify(expr: Expr) -> Expr:
+        return sym.simplify(expr)
+
+
+def chop(expr: Basic, tol: float = 1.0e-8) -> Basic:
+    """Replace numbers in *expr* with their closest integer.
+
+    Given a symbolic expression, this removes all occurrences of numbers
+    with absolute value less than the given tolerance and replaces floating
+    point numbers that are close to an integer up to the given relative
+    tolerance by the integer.
+    """
+    nums = expr.atoms(Number)
+    replace_dict: dict[Any, float] = {}
+    for num in nums:
+        if float(abs(num)) < tol:
+            replace_dict[num] = 0
+        else:
+            new_num = float(num)
+            if abs(new_num) < tol or abs(int(new_num) - new_num) < tol * abs(new_num):
+                new_num = int(new_num)
+
+            replace_dict[num] = new_num
+
+    return expr.xreplace(replace_dict)
+
+
 def sym_real_norm_2(x: Matrix) -> Expr:
     return sqrt((x.T*x)[0, 0])
 
 
 def pymbolic_real_norm_2(
-            x: ObjectArray1D[ArithmeticExpression]) -> ArithmeticExpression:
+        x: ObjectArray1D[ArithmeticExpression]
+    ) -> ArithmeticExpression:
     return prim.Variable("sqrt")(x @ x)
 
 
@@ -292,6 +412,10 @@ def from_sympy(cls, expr: Symbol) -> SpatialConstant:
         """Convert :mod:`sympy` expression to a constant."""
         return cls(expr.name[len(cls.prefix):])
 
+# }}}
+
+
+# {{{ PymbolicToSym mappers
 
 class PymbolicToSympyMapper(PymbolicToSympyMapperBase):
     def map_spatial_constant(self, expr: SpatialConstant) -> Basic:
@@ -361,6 +485,18 @@ def map_call(self, expr: prim.Call) -> sym.Basic:
         return PymbolicToSympyMapper.map_call(self, expr)
 
 
+def to_sympy(expr: Expression) -> Basic:
+    return PymbolicToSympyMapperWithSymbols().to_expr(expr)
+
+
+def to_pymbolic(expr: Basic) -> Expression:
+    return SympyToPymbolicMapper()(expr)
+
+# }}}
+
+
+# {{{ Bessel and Hankel functions
+
 from sympy import Function as SympyFunction
 
 
@@ -399,4 +535,6 @@ def BesselJ(*args):   # noqa: N802
     def Hankel1(*args):   # noqa: N802
         return sympify(_SympyHankel1(*args))
 
+# }}}
+
 # vim: fdm=marker

From cc4d5ce925c3e9bf23d14ec56802967ed627a6d3 Mon Sep 17 00:00:00 2001
From: Alexandru Fikl <alexfikl@gmail.com>
Date: Sat, 18 Apr 2026 16:24:08 +0300
Subject: [PATCH 2/3] feat: implement LU-based kernel rewriting

---
 sumpy/kernel_rewrite.py | 339 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 339 insertions(+)
 create mode 100644 sumpy/kernel_rewrite.py

diff --git a/sumpy/kernel_rewrite.py b/sumpy/kernel_rewrite.py
new file mode 100644
index 00000000..60f16173
--- /dev/null
+++ b/sumpy/kernel_rewrite.py
@@ -0,0 +1,339 @@
+from __future__ import annotations
+
+
+__copyright__ = """
+Copyright (C) 2012 Andreas Kloeckner
+Copyright (C) 2020 Isuru Fernando
+Copyright (C) 2026 Alexandru Fikl
+"""
+
+__license__ = """
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+"""
+
+import logging
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, Any, NamedTuple
+
+import numpy as np
+
+from pytools import (
+    generate_nonnegative_integer_tuples_summing_to_at_most as gnitstam,
+)
+
+import sumpy.symbolic as sym
+
+
+if TYPE_CHECKING:
+    from collections.abc import Sequence
+
+    import optype.numpy as onp
+
+    from pymbolic.typing import ArithmeticExpression
+
+    from sumpy.expansions.diff_op import MultiIndex
+    from sumpy.kernel import Kernel
+
+logger = logging.getLogger(__name__)
+
+
+# {{{ rewrite_using_base_kernel
+
+@dataclass(frozen=True)
+class LinearOperatorRepresentation:
+    r"""Expresses a target kernel as a linear operator acting on a base kernel.
+
+    .. math::
+
+        G(\boldsymbol{r}) = C + \sum_{|\boldsymbol{\alpha}| < p}
+            c_{\boldsymbol{\alpha}}
+            \frac{\partial^{|\boldsymbol{\alpha}|} G_0}
+                 {\partial \boldsymbol{r}^{\boldsymbol{\alpha}}}
+
+    .. autoattribute:: target_kernel
+    .. autoattribute:: base_kernel
+    .. autoattribute:: mis
+    .. autoattribute:: coeffs
+    """
+
+    target_kernel: Kernel
+    """The target kernel expressed in terms of :attr:`base_kernel`."""
+    base_kernel: Kernel
+    """A base kernel used to express the target kernel."""
+
+    mis: Sequence[MultiIndex]
+    """Multi-indices for each non-zero term in the linear combination."""
+    coeffs: Sequence[ArithmeticExpression]
+    """Constant coefficients in the linear combination. Note that the coefficients
+    have length ``len(mis) + 1``, where the first element is always the constant term.
+    """
+
+    def pretty(self) -> str:
+        from sumpy.kernel import AxisTargetDerivative
+
+        terms = []
+        if self.coeffs[0] != 0:
+            terms.append(str(self.coeffs[0]))
+
+        for mi, c in zip(self.mis, self.coeffs[1:], strict=True):
+            expr = self.base_kernel
+            for d, n in enumerate(mi):
+                for _ in range(n):
+                    expr = AxisTargetDerivative(d, expr)
+
+            terms.append(f"{c} * {expr}")
+
+        return f"{self.target_kernel} = " + " + ".join(terms)
+
+
+def rewrite_using_base_kernel(
+        target_kernel: Kernel,
+        base_kernel: Kernel,
+        *,
+        order: int | None = None,
+        atol: float = 1.0e-10,
+    ) -> LinearOperatorRepresentation:
+    pde = base_kernel.get_pde_as_diff_op()
+    if order is None:
+        order = pde.order
+
+    # TODO: pick the best algorithm here
+    return rewrite_using_base_kernel_lu(
+        target_kernel, base_kernel,
+        order=order,
+    )
+
+# }}}
+
+
+# {{{ rewrite_using_base_kernel_lu
+
+INT_MAX = 10 ** 15
+
+
+class FactorizationFailedError(Exception):
+    pass
+
+
+class RewriteFailedError(Exception):
+    pass
+
+
+class _LUDecomposition(NamedTuple):
+    L: sym.Matrix
+    U: sym.Matrix
+    permutation: Sequence[tuple[int, int]]
+
+    mis: Sequence[MultiIndex]
+    """The multi-indices for which the derivatives were computed. These correspond
+    to rows in the matrix and should be used to recover the expansion of the
+    target kernel in terms of the base kernel.
+    """
+    points: onp.Array2D[Any]
+    """An array of shape ``(dim, npoints)`` of points where the base kernel was
+    evaluated to compute the current LU factorization.
+    """
+
+
+def rewrite_using_base_kernel_lu(
+        target_kernel: Kernel,
+        base_kernel: Kernel,
+        *,
+        min_order: int | None = None,
+        retries: int = 5,
+        rng: np.random.Generator | None = None,
+    ) -> LinearOperatorRepresentation:
+    """Find a relation between the *target_kernel* and the *base_kernel* using
+    a numerical LU-based algorithm.
+
+    The algorithm samples the *base_kernel* and its derivatives at random
+    points to get a matrix ``A``. It also samples the target kernel at the same
+    points to get a vector ``b`` and solving for the system ``A c = b`` using
+    an LU factorization of ``A``. The solution ``c`` is the vector of coefficients
+    in the linear combination :class:`DifferentialRepresentation`.
+
+    :arg order: starting maximum derivative order to use when attempting the
+        decomposition. By default, this will be the order of the PDE solved by
+        *base_kernel*.
+    :arg retries: maximum number of retries for each order. If the LU decomposition
+        fails due to a poor choice of random points, it is retried several times.
+    """
+
+    pde = base_kernel.get_pde_as_diff_op()
+    if min_order is None:
+        min_order = pde.order
+
+    if min_order > pde.order:
+        raise NotImplementedError(
+            "Rewriting when the base kernel's derivatives are linearly dependent "
+            "is not implemented")
+
+    if rng is None:
+        rng = np.random.default_rng()
+
+    coeffs: list[sym.Basic] = []
+    mis: list[MultiIndex] = []
+
+    dim = base_kernel.dim
+    dvec = sym.make_sym_vector("d", dim)
+    target_expr = target_kernel.get_expression(dvec)
+
+    target_scaling = target_kernel.get_global_scaling_const()
+    base_scaling = base_kernel.get_global_scaling_const()
+
+    order = min_order
+    while order <= pde.order:
+        try:
+            lu = _get_base_kernel_matrix_lu_factorization(
+                base_kernel, order, rng=rng, retries=retries
+            )
+        except FactorizationFailedError as exc:
+            if order == pde.order:
+                raise RewriteFailedError(
+                    f"failed to compute LU factorization for orders in "
+                    f"[{min_order}, {pde.order}] for base kernel {base_kernel} "
+                    f"after {retries} retries"
+                ) from exc
+
+            order += 1
+            continue
+
+        # evaluate right-hand side
+        b = sym.Matrix([
+            target_expr.xreplace(dict(zip(dvec, lu.points[:, i], strict=True)))
+            for i in range(lu.points.shape[1])
+        ])
+
+        # solve
+        all_coeffs = sym.solve_lu(lu.L, lu.U, lu.permutation, b,
+                                  postprocess=lambda x: x.expand())
+
+        # gather all non-zero coefficients from the result
+        const = sym.Integer(0)
+        coeffs = []
+        mis = []
+        for i, coeff in enumerate(all_coeffs):
+            coeff = sym.simplify(coeff)
+            if coeff == 0:
+                continue
+
+            if i == 0:
+                const = sym.to_pymbolic(sym.simplify(coeff * target_scaling))
+                logger.debug("  %s", coeff)
+            else:
+                mi = lu.mis[i - 1]
+                coeff = sym.simplify(coeff * target_scaling / base_scaling)
+
+                mis.append(mi)
+                coeffs.append(sym.to_pymbolic(coeff))
+                logger.debug("  + %s*%s.diff%s", coeff, base_kernel, mi)
+
+        if coeffs:
+            coeffs.insert(0, const)
+            break
+
+        order += 1
+
+    if not coeffs:
+        raise RewriteFailedError(
+            f"could not express {target_kernel} in terms of {base_kernel}"
+        )
+
+    return LinearOperatorRepresentation(target_kernel, base_kernel, mis, coeffs)
+
+
+def _get_base_kernel_matrix_lu_factorization(
+        base_kernel: Kernel,
+        order: int,
+        *,
+        rng: np.random.Generator,
+        retries: int,
+    ) -> _LUDecomposition:
+    pde = base_kernel.get_pde_as_diff_op()
+    if order > pde.order:
+        raise NotImplementedError(
+            "Rewriting when the base kernel's derivatives are linearly dependent "
+            "is not implemented")
+
+    dim = base_kernel.dim
+
+    mis = list(gnitstam(order, dim))
+    if order == pde.order:
+        pde_mis = [ident.mi for eq in pde.eqs for ident in eq]
+        pde_mis = [mi for mi in pde_mis if sum(mi) == order]
+        mis.remove(pde_mis[-1])
+
+        logger.debug("Removing %s to avoid linear dependent mis", pde_mis[-1])
+
+    # get sympy expression for the base kernel
+    dvec = sym.make_sym_vector("d", dim)
+    base_expr = base_kernel.get_expression(dvec)
+
+    # evaluate all the needed derivatives
+    mi_to_derivative: dict[MultiIndex, sym.Basic] = {}
+    for mi in mis:
+        expr = base_expr
+        for i, nderivs in enumerate(mi):
+            if nderivs == 0:
+                continue
+            expr = expr.diff(dvec[i], nderivs)
+
+        mi_to_derivative[mi] = sym.simplify(expr)
+
+    # try to LU factorize on random points
+    for _ in range(retries):
+        # TODO: is it faster to generate numbers and then sympify them?
+        points = np.empty((dim, len(mis) + 1), dtype=object)
+        for i in range(points.shape[0]):
+            for j in range(points.shape[1]):
+                points[i, j] = sym.Integer(rng.integers(1, INT_MAX)) / INT_MAX
+
+        # evaluate derivatives at points and construct matrix
+        entries: list[list[sym.Basic]] = []
+        for i in range(points.shape[1]):
+            row: list[sym.Basic] = [sym.Integer(1)]
+
+            for mi in mis:
+                expr = mi_to_derivative[mi].replace(
+                    dict(zip(dvec, points[:, i], strict=True))
+                )
+                row.append(sym.simplify(expr))
+
+            entries.append(row)
+        mat = sym.Matrix(entries)
+
+        # TODO: LUdecomposition in symengine is not implemented for non-square matrices
+        try:
+            L, U, perm = mat.LUdecomposition()  # noqa: N806
+        except RuntimeError:
+            # NOTE: symengine seems to throw a SymEngineError -> RuntimeError when
+            # it fails to do the LU factorization due to rank-deficiency
+            continue
+        else:
+            if not sym.USE_SYMENGINE and all(expr == 0 for expr in U[-1, :]):
+                continue
+
+        return _LUDecomposition(L, U, perm, mis, points)
+
+    raise FactorizationFailedError(
+        f"failed to compute LU factorization to order {order} for {base_kernel} "
+        f"after {retries} retries"
+    )
+
+# }}}

From 7b78a116a633aa9619583061ae0119739a520c6b Mon Sep 17 00:00:00 2001
From: Alexandru Fikl <alexfikl@gmail.com>
Date: Fri, 8 May 2026 10:04:44 +0300
Subject: [PATCH 3/3] test: add some tests for rewrite_using_base_kernel_lu

---
 sumpy/test/test_kernel_rewrite.py | 52 +++++++++++++++++++++++++++++++
 1 file changed, 52 insertions(+)
 create mode 100644 sumpy/test/test_kernel_rewrite.py

diff --git a/sumpy/test/test_kernel_rewrite.py b/sumpy/test/test_kernel_rewrite.py
new file mode 100644
index 00000000..ed83df4f
--- /dev/null
+++ b/sumpy/test/test_kernel_rewrite.py
@@ -0,0 +1,52 @@
+from __future__ import annotations
+
+
+__copyright__ = """
+Copyright (C) 2020 Isuru Fernando
+Copyright (C) 2026 Alexandru Fikl
+"""
+
+import logging
+
+import numpy as np
+import pytest
+
+from sumpy.kernel import BiharmonicKernel, StokesletKernel
+from sumpy.kernel_rewrite import rewrite_using_base_kernel_lu
+
+
+logger = logging.getLogger(__name__)
+
+
+@pytest.mark.skip()
+@pytest.mark.parametrize("dim", [2, 3])
+def test_stokeslet_biharmonic_rewrite(dim: int) -> None:
+    from pytools import generate_nonnegative_integer_tuples_below as gnitb
+
+    rng = np.random.default_rng(seed=42)
+
+    base_kernel = BiharmonicKernel(dim)
+    for i, j in gnitb(dim, 2):
+        if not i <= j:
+            continue
+
+        target_kernel = StokesletKernel(dim, i, j, viscosity_mu=1)
+        result = rewrite_using_base_kernel_lu(target_kernel, base_kernel, rng=rng)
+        print(result.pretty())
+
+
+@pytest.mark.skip()
+@pytest.mark.parametrize("dim", [2, 3])
+def test_stresslet_biharmonic_rewrite(dim: int) -> None:
+    pass
+
+
+if __name__ == "__main__":
+    import sys
+
+    if len(sys.argv) > 1:
+        exec(sys.argv[1])
+    else:
+        pytest.main([__file__])
+
+# vim: fdm=marker